In [1]:
# !python -m pip install --upgrade pip -q
# !pip install uv -qU
# !uv pip install "numpy<2.3" "transformers<=4.53.2" datasets tensorboard openai hf_transfer accelerate pillow scikit-learn pymupdf google.generativeai flashinfer-python huggingface_hub vllm ipywidgets -U

In [2]:

# !uv pip install datasets tensorboard openai hf_transfer accelerate pillow -qU
# !uv pip install scikit-learn pymupdf -qU
# !uv pip install google.generativeai  -qU # for gemini
# !uv pip install flashinfer-python  -qU # helpful for speeding up vllm
# !uv pip install huggingface_hub -qU
# !uv pip install vllm -qU
# !uv pip install ipywidgets

In [3]:
import os

# Number of logical CPUs the OS reports for the machine running this kernel.
# Note: os.cpu_count() returns None when the count cannot be determined.
total_cores = os.cpu_count()
print(f"Total CPU cores available: {total_cores}")

Total CPU cores available: 64


In [4]:
from huggingface_hub import get_token, login

# Only prompt for credentials when no token can be found.
# `get_token()` is the supported replacement for the deprecated
# `HfFolder.get_token()`; it returns None when the user is not logged in.
if get_token() is None:
    login()

In [5]:
# Hugging Face repo id of the model under evaluation. The tokenizer and the
# vLLM engine cells below both load from this value.
# Alternative (smaller) checkpoint, kept for quick switching:
# model_slug = "google/gemma-3-4b-it"
model_slug = "google/gemma-3-27b-it"

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the conversation CSV and keep only the two columns used downstream:
# the user's message ('Context') and the reference reply ('Response').
df = pd.read_csv("../data/humanandllm.csv").loc[:, ['Context', 'Response']]

# Hold out 5.7% of the rows as the evaluation split; the fixed random_state
# makes the split identical on every run.
trainset, testset = train_test_split(
    df,
    random_state=42,
    test_size=0.057,
)



In [7]:
# Peek at the second held-out user message as a sanity check on the split.
display(testset["Context"].iloc[1])

'I have no real friends. I have a girlfriend who irritates me but loves me to death. I push her away and pushes me away. We’re going through a breakup, and I have nobody.'

In [8]:
import torch

# Pick the weight/activation dtype passed to the inference engine.
#   * no CUDA device         -> float32
#   * compute capability >= 8 (Ampere or newer) -> bfloat16
#   * any older CUDA GPU (T4, V100, P100, ...)  -> float16
# The decision is made on compute capability rather than on the device name
# so that every pre-Ampere GPU falls back to float16, not only the T4:
# bfloat16 is only accepted by vLLM on GPUs with compute capability >= 8.0.
if not torch.cuda.is_available():
    dtype = "float32"
elif torch.cuda.get_device_capability(0)[0] >= 8:
    dtype = "bfloat16"
else:
    dtype = "float16"

print(f"dtype: {dtype}")

dtype: bfloat16


In [9]:
from transformers import AutoTokenizer

# Load the tokenizer published with the checkpoint named by `model_slug`,
# then report which concrete class was resolved and where it came from.
tokenizer = AutoTokenizer.from_pretrained(model_slug)
print(
    f"Tokenizer type: {type(tokenizer)}",
    f"Tokenizer model name: {tokenizer.name_or_path}",
    sep="\n",
)

Tokenizer type: <class 'transformers.models.gemma.tokenization_gemma_fast.GemmaTokenizerFast'>
Tokenizer model name: google/gemma-3-27b-it


In [10]:

def format_generation_prompt(context_text, tokenizer_obj):
    """
    Render one user message as a generation-ready chat prompt.

    Args:
        context_text: The user's message; it becomes the only turn of the
            conversation, with role "user".
        tokenizer_obj: Object exposing ``apply_chat_template`` (the notebook
            passes the loaded tokenizer).

    Returns:
        Whatever ``apply_chat_template`` returns when called with
        ``tokenize=False`` and ``add_generation_prompt=True`` -- i.e. the
        untokenized prompt, with the generation prompt requested so the model
        continues with its own reply.
    """
    conversation = [{"role": "user", "content": context_text}]
    return tokenizer_obj.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
    )

# Build three parallel lists over the held-out split, all in the same row
# order so that position k in each list refers to the same test example:
#   original_prompts -- raw user messages
#   true_responses   -- reference replies from the dataset
#   prompts          -- user messages wrapped in the model's chat template
original_prompts = testset['Context'].tolist()
true_responses = testset['Response'].tolist()
prompts = [
    format_generation_prompt(user_message, tokenizer)
    for user_message in original_prompts
]

In [11]:
from transformers import AutoTokenizer

# Rebinds `tokenizer` for the same `model_slug`, this time requesting the
# fast implementation explicitly. In [9] already loads a tokenizer for this
# checkpoint, so this cell repeats that load with `use_fast=True` spelled out.
tokenizer = AutoTokenizer.from_pretrained(model_slug, use_fast=True)

In [12]:
import json
import re
from vllm import LLM, SamplingParams

# Decoding settings applied to every request: sample (temperature 1.0) from
# the top-64 tokens within the 0.95 nucleus, no min-p floor, and stop after
# at most 1024 generated tokens.
generation_config = dict(
    temperature=1.0,
    top_k=64,
    top_p=0.95,
    min_p=0.0,
    max_tokens=1024,
)
sampling_params = SamplingParams(**generation_config)

# Engine settings: load `model_slug` in the dtype chosen earlier, cap the
# context at 8192 tokens and let vLLM use up to 90% of GPU memory.
# NOTE(review): the output recorded with this cell shows the engine core
# failing to start; the saved traceback is truncated, so the root cause is
# not visible in the notebook.
engine_config = dict(
    model=model_slug,
    dtype=dtype,
    max_model_len=8192,
    gpu_memory_utilization=0.9,
)
model = LLM(**engine_config)

INFO 09-26 03:59:31 [__init__.py:244] Automatically detected platform cuda.
INFO 09-26 03:59:43 [config.py:823] This model supports multiple tasks: {'score', 'reward', 'generate', 'classify', 'embed'}. Defaulting to 'generate'.
INFO 09-26 03:59:43 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=16384.




INFO 09-26 03:59:50 [__init__.py:244] Automatically detected platform cuda.
INFO 09-26 03:59:53 [core.py:455] Waiting for init message from front-end.
INFO 09-26 03:59:53 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='google/gemma-3-27b-it', speculative_config=None, tokenizer='google/gemma-3-27b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detail

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


INFO 09-26 04:00:02 [topk_topp_sampler.py:49] Using FlashInfer for top-p & top-k sampling.
INFO 09-26 04:00:02 [gpu_model_runner.py:1595] Starting to load model google/gemma-3-27b-it...
INFO 09-26 04:00:02 [gpu_model_runner.py:1600] Loading model from scratch...
INFO 09-26 04:00:02 [cuda.py:252] Using Flash Attention backend on V1 engine.




INFO 09-26 04:00:03 [weight_utils.py:292] Using model weights format ['*.safetensors']
ERROR 09-26 04:00:38 [core.py:515] EngineCore failed to start.
ERROR 09-26 04:00:38 [core.py:515] Traceback (most recent call last):
ERROR 09-26 04:00:38 [core.py:515]   File "/home/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 506, in run_engine_core
ERROR 09-26 04:00:38 [core.py:515]     engine_core = EngineCoreProc(*args, **kwargs)
ERROR 09-26 04:00:38 [core.py:515]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 09-26 04:00:38 [core.py:515]   File "/home/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 390, in __init__
ERROR 09-26 04:00:38 [core.py:515]     super().__init__(vllm_config, executor_class, log_stats,
ERROR 09-26 04:00:38 [core.py:515]   File "/home/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 76, in __init__
ERROR 09-26 04:00:38 [core.py:515]     self.model_ex

Process EngineCore_0:
Traceback (most recent call last):
  File "/home/amark/.conda/envs/vllm_env/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/amark/.conda/envs/vllm_env/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 519, in run_engine_core
    raise e
  File "/home/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 506, in run_engine_core
    engine_core = EngineCoreProc(*args, **kwargs)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 390, in __init__
    super().__init__(vllm_config, executor_class, log_stats,
  File "/home/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/vllm/v1/engine/core.py", line 76, in __init__
    self.model_ex

RuntimeError: Engine core initialization failed. See root cause above. Failed core proc(s): {}

In [None]:
# !uv pip freeze --color never > eval_packages.txt

In [None]:
from tqdm.auto import tqdm

# Run batched generation over every chat-formatted test prompt.
outputs = model.generate(prompts, sampling_params)

# The records below pair each output with its source row purely by position,
# so fail loudly if the engine returned a different number of results.
if len(outputs) != len(prompts):
    raise RuntimeError(
        f"Expected {len(prompts)} outputs from the model, got {len(outputs)}."
    )

# One record per test example: the raw user message, the dataset's reference
# reply, and the first (only) sampled completion with surrounding whitespace
# removed.
generated_responses_data = [
    {
        "Context": context,
        "True_Response": true_response,
        "Generated_Response": output.outputs[0].text.strip(),
    }
    for context, true_response, output in tqdm(
        zip(original_prompts, true_responses, outputs),
        total=len(outputs),
        desc="Processing outputs",
    )
]

print(f"Finished processing {len(generated_responses_data)} generated responses.")

# Convert to DataFrame and save to CSV.
# The file name is derived from `model_slug` so the results are labelled with
# the model that actually produced them (the previous hard-coded name referred
# to a fine-tuned gemma3-4b run regardless of which model was loaded).
responses_df = pd.DataFrame(generated_responses_data)
model_tag = model_slug.split("/")[-1]
output_csv_path = f"vllm_model_test_responses_{model_tag}.csv"
responses_df.to_csv(output_csv_path, index=False)


Adding requests:   0%|          | 0/200 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processing outputs:   0%|          | 0/200 [00:00<?, ?it/s]

Finished processing 200 generated responses.
