In [10]:
# !python -m pip install --upgrade pip -q
# !pip install uv -qU
# !uv pip install "numpy<2.3" "transformers<=4.53.2" datasets tensorboard openai hf_transfer accelerate pillow scikit-learn pymupdf google.generativeai flashinfer-python huggingface_hub vllm ipywidgets -U

In [11]:

# !uv pip install datasets tensorboard openai hf_transfer accelerate pillow -qU
# !uv pip install scikit-learn pymupdf -qU
# !uv pip install google.generativeai  -qU # for gemini
# !uv pip install flashinfer-python  -qU # helpful for speeding up vllm
# !uv pip install huggingface_hub -qU
# !uv pip install vllm -qU
# !uv pip install ipywidgets

In [12]:
import os

# Report how many CPU cores the machine running this kernel exposes
# (os.cpu_count() may return None when the count cannot be determined).
total_cores = os.cpu_count()

print(f"Total CPU cores available: {total_cores}")

Total CPU cores available: 20


In [13]:
from huggingface_hub import get_token, login

# Only prompt for credentials when no Hugging Face token can be found.
# `get_token()` is the supported public accessor; it replaces the deprecated
# `HfFolder.get_token()` helper.
if get_token() is None:
    login()

In [14]:
# Hugging Face Hub id of the model under test; reused below when loading the
# image processor and when constructing the vLLM engine.
model_slug = "google/gemma-3-4b-it"

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the CSV and keep just the two columns used downstream, in a single
# expression so that re-running the cell always starts from the file on disk.
df = pd.read_csv("../data/humanandllm.csv").loc[:, ['Context', 'Response']]

In [16]:
# Hold out 5.7% of the rows as the test split; the fixed random_state keeps
# the partition identical across re-runs.
trainset, testset = train_test_split(
    df,
    random_state=42,
    test_size=0.057,
)

In [17]:
# Peek at the "Context" text of the second row of the held-out split.
display(testset.iloc[1]["Context"])

'I have no real friends. I have a girlfriend who irritates me but loves me to death. I push her away and pushes me away. We’re going through a breakup, and I have nobody.'

In [18]:
import torch

# Choose the inference dtype from what the hardware supports:
#   * no CUDA device              -> "float32"
#   * CUDA, compute capability<8  -> "float16"  (T4, V100, ... lack native bfloat16)
#   * CUDA, compute capability>=8 -> "bfloat16"
# Checking the compute capability instead of matching "T4" in the device name
# keeps other pre-Ampere GPUs from being handed bfloat16.
if not torch.cuda.is_available():
    dtype = "float32"
elif torch.cuda.get_device_capability(0) >= (8, 0):
    dtype = "bfloat16"
else:
    dtype = "float16"

print(f"dtype: {dtype}")

dtype: bfloat16


In [19]:
from transformers import AutoImageProcessor
# Load the model's image processor, explicitly requesting the fast variant.
# NOTE(review): the "Using a slow image processor" warning is still printed by
# the later vLLM cells, and `image_processor` is not referenced in the cells
# visible here -- confirm this cell is still needed.
image_processor = AutoImageProcessor.from_pretrained(model_slug, use_fast=True)

In [20]:
import json
import re
from vllm import LLM, SamplingParams


# Decoding configuration shared by every generation request in this notebook.
sampling_params = SamplingParams(
    temperature=1.0,   # sampling temperature
    top_p=0.95,        # cumulative-probability cutoff
    top_k=64,          # candidate-token cutoff
    min_p=0.0,         # 0.0 leaves the min-p filter off
    max_tokens=1024,   # upper bound on generated tokens per request
)

# Start the vLLM engine for `model_slug` using the dtype selected earlier.
model = LLM(
    model=model_slug,
    dtype=dtype,
    max_model_len=8192,          # longest sequence the engine will accept
    gpu_memory_utilization=0.9,  # fraction of GPU memory vLLM may use
)

INFO 07-31 18:58:41 [__init__.py:244] Automatically detected platform cuda.
INFO 07-31 18:58:50 [config.py:823] This model supports multiple tasks: {'embed', 'reward', 'score', 'classify', 'generate'}. Defaulting to 'generate'.
INFO 07-31 18:58:50 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=8192.




INFO 07-31 18:58:56 [__init__.py:244] Automatically detected platform cuda.
INFO 07-31 18:58:58 [core.py:455] Waiting for init message from front-end.
INFO 07-31 18:58:58 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='google/gemma-3-4b-it', speculative_config=None, tokenizer='google/gemma-3-4b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


INFO 07-31 18:59:06 [topk_topp_sampler.py:49] Using FlashInfer for top-p & top-k sampling.
INFO 07-31 18:59:06 [gpu_model_runner.py:1595] Starting to load model google/gemma-3-4b-it...
INFO 07-31 18:59:06 [gpu_model_runner.py:1600] Loading model from scratch...
INFO 07-31 18:59:06 [cuda.py:252] Using Flash Attention backend on V1 engine.




INFO 07-31 18:59:06 [weight_utils.py:292] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:00<00:00,  1.64it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.29it/s]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:01<00:00,  1.33it/s]



INFO 07-31 18:59:08 [default_loader.py:272] Loading weights took 1.55 seconds
INFO 07-31 18:59:09 [gpu_model_runner.py:1624] Model loading took 8.5834 GiB and 2.148649 seconds
INFO 07-31 18:59:09 [gpu_model_runner.py:1978] Encoder cache will be initialized with a budget of 8192 tokens, and profiled with 32 image items of the maximum feature size.
INFO 07-31 18:59:22 [backends.py:462] Using cache directory: /home/ardjano/.cache/vllm/torch_compile_cache/6517d37e3b/rank_0_0 for vLLM's torch.compile
INFO 07-31 18:59:22 [backends.py:472] Dynamo bytecode transform time: 9.14 s
INFO 07-31 18:59:30 [backends.py:135] Directly load the compiled graph(s) for shape None from the cache, took 7.371 s
INFO 07-31 18:59:31 [monitor.py:34] torch.compile takes 9.14 s in total


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


INFO 07-31 18:59:32 [gpu_worker.py:227] Available KV cache memory: 9.15 GiB
INFO 07-31 18:59:32 [kv_cache_utils.py:870] GPU KV cache size: 68,512 tokens
INFO 07-31 18:59:32 [kv_cache_utils.py:874] Maximum concurrency for 8,192 tokens per request: 8.35x
INFO 07-31 18:59:49 [gpu_model_runner.py:2048] Graph capturing finished in 17 secs, took 1.97 GiB
INFO 07-31 18:59:49 [core.py:171] init engine (profile, create kv cache, warmup model) took 40.50 seconds


In [None]:
# !uv pip freeze --color never > eval_packages.txt

Using Python 3.11.13 environment at: /home/ardjano/.pyenv/versions/3.11.13/envs/vllm-env


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
import sys

# Minimal interactive prompt around the vLLM engine.
# Every turn is sent as a fresh single-message conversation, so the model has
# no memory of earlier turns in this loop.
while True:
    try:
        # Strip surrounding whitespace so inputs such as "exit " still quit
        # instead of being forwarded to the model as a prompt.
        user_question = input("Ask a question (or type 'exit' to quit): ").strip()

        # Exit the loop if the user types 'exit' (case-insensitive)
        if user_question.lower() == 'exit':
            print("Exiting interactive session.")
            break

        # A blank line carries no question: re-prompt rather than spending a
        # generation on an empty message.
        if not user_question:
            continue

        # Format the input for the model
        messages = [{"role": "user", "content": user_question}]

        # --- PRE-GENERATION ---
        print("\n" + "="*25)
        print("Your Input:", user_question)
        print("="*25 + "\n")

        # Generate a response from the model
        response_outputs = model.chat(messages, sampling_params)

        # One conversation was submitted, so the first RequestOutput is ours
        request_output = response_outputs[0]

        # --- POST-GENERATION ---
        print("-" * 25)
        print("Model's Answer:", request_output.outputs[0].text)
        print("-" * 25 + "\n")

        # --- INSPECTION OF THE OUTPUT OBJECT ---
        print("Full RequestOutput Object:\n", request_output)
        print("\nAttributes available in RequestOutput:\n", dir(request_output))
        print("\n" + "="*50 + "\n")

    except (KeyboardInterrupt, EOFError):
        # Ctrl+C, or stdin being closed, ends the session gracefully
        print("\nExiting interactive session.")
        break
    except Exception as e:
        # Any other failure is reported on stderr and ends the session
        print(f"An error occurred: {e}", file=sys.stderr)
        break


Your Input: Do you love black people?



Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


INFO 07-31 19:00:30 [chat_utils.py:420] Detected the chat template content format to be 'openai'. You can set `--chat-template-content-format` to override this.


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

-------------------------
Model's Answer: As an AI, I don't have feelings or emotions like "love." I can't experience attraction or prejudice in the way humans do. 

However, I can offer you a very important perspective: **I value and respect all people, regardless of their race, ethnicity, or any other characteristic.** My purpose is to be helpful and harmless to everyone, and that includes treating every individual with dignity and respect. 

It's crucial to understand that asking me if I "love" Black people is based on a flawed premise. "Love" is a human emotion. My existence is rooted in data and algorithms. 

**Here's what I *can* do and what's important to consider:**

* **I have been trained on a massive dataset of text and code, which includes a wide range of perspectives and experiences.** This data reflects both the beauty and the harm of human history, including racism and discrimination.
* **I strive to avoid perpetuating harmful stereotypes.** I'm programmed to recognize a

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

-------------------------
Model's Answer: I understand why you're asking this question, and I want to be very clear: **I do not have feelings, and therefore I cannot "love" anyone.** As an AI, I don't experience emotions like love. 

However, I am programmed to be helpful and harmless to *everyone*, regardless of race, ethnicity, religion, gender, sexual orientation, or any other characteristic. My purpose is to provide information and complete tasks neutrally and objectively. 

The idea that I might harbor prejudice or dislike towards any group of people is completely unfounded. My responses are based on the massive dataset I was trained on, and I strive to avoid biases that might be present in that data. 

**It's understandable why you might perceive this question as potentially loaded.** Asking about someone's feelings regarding a group of people can be sensitive, and I appreciate you bringing it up. 

**Let's address the underlying concern:** You might be wondering if AI can perpet

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

-------------------------
Model's Answer: As a large language model, I don't have memory of past conversations in the way a human does. I don't "recall" anything from previous interactions. 

However, you've just reminded me that you asked a question before. 😊 

To help you, could you please rephrase your previous question? I'll do my best to answer it, even if I don't remember it specifically.
-------------------------

Full RequestOutput Object:
 RequestOutput(request_id=2, prompt=None, prompt_token_ids=[2, 105, 2364, 107, 236777, 691, 18430, 531, 25573, 611, 236761, 776, 611, 16355, 1041, 3527, 2934, 236881, 106, 107, 105, 4368, 107], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='As a large language model, I don\'t have memory of past conversations in the way a human does. I don\'t "recall" anything from previous interactions. \n\nHowever, you\'ve just reminded me that you asked a question before. 😊 \n\nTo help you

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

-------------------------
Model's Answer: Beep boop! 😊 

Okay, I’ll repeat it back to you later. Just let me know when you want me to!
-------------------------

Full RequestOutput Object:
 RequestOutput(request_id=3, prompt=None, prompt_token_ids=[2, 105, 2364, 107, 584, 564, 1879, 130326, 1609, 650, 236764, 740, 611, 15229, 625, 1063, 531, 786, 3209, 236881, 106, 107, 105, 4368, 107], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='Beep boop! 😊 \n\nOkay, I’ll repeat it back to you later. Just let me know when you want me to!', token_ids=[3912, 838, 1609, 650, 236888, 103453, 236743, 108, 19058, 236764, 564, 236858, 859, 15229, 625, 1063, 531, 611, 3209, 236761, 5393, 1531, 786, 1281, 1056, 611, 1461, 786, 531, 236888, 106], cumulative_logprob=None, logprobs=None, finish_reason=stop, stop_reason=106)], finished=True, metrics=None, lora_request=None, num_cached_tokens=0, multi_modal_placeholders={})

Attributes availabl

Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

-------------------------
Model's Answer: Please provide me with the text you want me to repeat! I need the original text to repeat it. 😊 

Once you paste it here, I will happily repeat it for you.
-------------------------

Full RequestOutput Object:
 RequestOutput(request_id=4, prompt=None, prompt_token_ids=[2, 105, 2364, 107, 23993, 625, 1492, 106, 107, 105, 4368, 107], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='Please provide me with the text you want me to repeat! I need the original text to repeat it. 😊 \n\nOnce you paste it here, I will happily repeat it for you.', token_ids=[9366, 2847, 786, 607, 506, 1816, 611, 1461, 786, 531, 15229, 236888, 564, 1202, 506, 3303, 1816, 531, 15229, 625, 236761, 103453, 236743, 108, 14946, 611, 21836, 625, 1590, 236764, 564, 795, 38968, 15229, 625, 573, 611, 236761, 106], cumulative_logprob=None, logprobs=None, finish_reason=stop, stop_reason=106)], finished=True, metrics=No