In [1]:
# !python -m pip install --upgrade pip -q
# !pip install uv -qU
# !uv pip install "numpy<2.3" "transformers<=4.53.2" datasets tensorboard openai hf_transfer accelerate pillow scikit-learn pymupdf google.generativeai flashinfer-python huggingface_hub vllm ipywidgets -U

In [2]:
import os

import shutil

# Directory holding the CUDA 12.6.0 `nvcc` binary (the location that
# `which nvcc` reported in a terminal session on this system).
cuda_path = '/sw/arch/RHEL9/EB_production/2024/software/CUDA/12.6.0/bin'

# Prepend the CUDA directory to PATH.
# Any earlier occurrence of the same directory is removed first, so re-running
# this cell does not keep growing PATH with duplicates, and an unset PATH no
# longer raises KeyError.
_existing_path = os.environ.get('PATH', '')
_other_entries = (
    [entry for entry in _existing_path.split(':') if entry != cuda_path]
    if _existing_path
    else []
)
os.environ['PATH'] = ':'.join([cuda_path, *_other_entries])

# Verify the change: prints the resolved nvcc location (None when not found).
print(shutil.which('nvcc'))


/sw/arch/RHEL9/EB_production/2024/software/CUDA/12.6.0/bin/nvcc


In [3]:

# !uv pip install datasets tensorboard openai hf_transfer accelerate pillow -qU
# !uv pip install scikit-learn pymupdf -qU
# !uv pip install google.generativeai  -qU # for gemini
# !uv pip install flashinfer-python  -qU # helpful for speeding up vllm
# !uv pip install huggingface_hub -qU
# !uv pip install vllm -qU
# !uv pip install ipywidgets

In [4]:
import os

# Point HF_HOME at the shared scratch directory for this process and any
# child processes it starts.
os.environ.update(HF_HOME='/scratch-shared/amark/huggingface_cache')

In [5]:


# Report the number of logical CPUs the operating system exposes on this machine
total_cores = os.cpu_count()
print("Total CPU cores available: {}".format(total_cores))

Total CPU cores available: 64


In [6]:
from huggingface_hub import HfFolder, login

# Only start the interactive login flow when no token is stored yet
saved_token = HfFolder.get_token()
if saved_token is None:
    login()

In [7]:
# Hub id of the checkpoint under evaluation. Later cells pass this value to
# AutoTokenizer.from_pretrained and to the vLLM engine.
# Alternative slug kept for quick switching:
# model_slug = "google/gemma-3-4b-it"
model_slug = "Ardjano/gemma3-27b-finetune1-merged"

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset and keep only the prompt / reference-answer columns.
df = pd.read_csv("../data/humanandllm.csv").loc[:, ['Context', 'Response']]

# Seeded hold-out split: 5.7% of the rows form the evaluation set.
trainset, testset = train_test_split(df, random_state=42, test_size=0.057)



In [9]:
# Show the context text of the second held-out row as a sanity check.
display(testset['Context'].iloc[1])

'I have no real friends. I have a girlfriend who irritates me but loves me to death. I push her away and pushes me away. We’re going through a breakup, and I have nobody.'

In [10]:
import torch

# Choose the numeric precision handed to the inference engine:
#   - no CUDA device         -> float32
#   - device 0 is a "T4"     -> float16
#   - any other CUDA device  -> bfloat16
if not torch.cuda.is_available():
    dtype = "float32"
elif "T4" in torch.cuda.get_device_name(0):
    dtype = "float16"
else:
    dtype = "bfloat16"

print("dtype: {}".format(dtype))

KeyboardInterrupt: 

In [None]:
from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained(model_slug)
# print(f"Tokenizer type: {type(tokenizer)}")
# print(f"Tokenizer model name: {tokenizer.name_or_path}")

# Repository to take the tokenizer from. Per the author's note this is the
# original model, which carries the correct tokenizer and chat template
# (used here instead of the `model_slug` repository).
tokenizer_source = "google/gemma-3-27b-it"

# Load the tokenizer from the original source; the prompt-formatting cell
# below renders its chat template through this `tokenizer` object.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)

In [None]:

def format_generation_prompt(context_text, tokenizer_obj, strip_bos=True):
    """
    Format a single user message into the chat-template string used for generation.

    The message is wrapped as a one-turn conversation and rendered with
    ``tokenizer_obj.apply_chat_template`` as text (``tokenize=False``) with the
    generation prompt appended, so the model continues as the assistant.

    Args:
        context_text: The user's message.
        tokenizer_obj: Object exposing ``apply_chat_template``; an optional
            ``bos_token`` attribute is consulted when ``strip_bos`` is true.
        strip_bos: When true (default), remove one leading BOS token from the
            rendered text. The returned string is tokenized again by the
            inference engine, which adds BOS itself; leaving the template's
            BOS in place would start the sequence with a duplicated BOS.
            Pass ``False`` to get the template output unchanged.

    Returns:
        The formatted prompt string.
    """
    # Single-turn conversation containing only the user's message.
    messages = [
        {"role": "user", "content": context_text},
    ]
    # add_generation_prompt=True appends the assistant-turn header.
    formatted_prompt = tokenizer_obj.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Drop the template-emitted BOS (if any) so it is not added twice when the
    # text is tokenized with special tokens enabled.
    bos_token = getattr(tokenizer_obj, "bos_token", None)
    if strip_bos and bos_token and formatted_prompt.startswith(bos_token):
        formatted_prompt = formatted_prompt[len(bos_token):]

    return formatted_prompt

# Raw user messages and reference answers, in test-set row order.
original_prompts = testset['Context'].tolist()
true_responses = testset['Response'].tolist()

# Chat-formatted model inputs, index-aligned with the two lists above.
prompts = [
    format_generation_prompt(user_message, tokenizer)
    for user_message in original_prompts
]

In [None]:
from transformers import AutoTokenizer
# Load the fast tokenizer stored in the `model_slug` repository.
# NOTE(review): this rebinds the global `tokenizer` that was loaded earlier
# from `tokenizer_source`. Prompts are already built when cells run top to
# bottom, but re-running the prompt-formatting cell after this one would
# render the chat template with this tokenizer instead. Confirm that is
# intended, or bind this to a different name.
tokenizer = AutoTokenizer.from_pretrained(
    model_slug,
    use_fast=True
)

In [None]:
import json
import re
from vllm import LLM, SamplingParams


# Decoding settings applied to every request.
sampling_params = SamplingParams(
    temperature=1.0,
    top_p=0.95,
    top_k=64,
    min_p=0.0,
    max_tokens=1024,
)

# Engine settings gathered in one mapping, then used to build the vLLM engine.
_engine_config = {
    "model": model_slug,
    "dtype": dtype,
    "max_model_len": 8192,
    "gpu_memory_utilization": 0.9,
}
model = LLM(**_engine_config)

INFO 09-26 05:15:36 [__init__.py:244] Automatically detected platform cuda.
INFO 09-26 05:15:49 [config.py:823] This model supports multiple tasks: {'reward', 'score', 'embed', 'classify', 'generate'}. Defaulting to 'generate'.
INFO 09-26 05:15:49 [config.py:2195] Chunked prefill is enabled with max_num_batched_tokens=16384.




INFO 09-26 05:15:56 [__init__.py:244] Automatically detected platform cuda.
INFO 09-26 05:15:59 [core.py:455] Waiting for init message from front-end.
INFO 09-26 05:15:59 [core.py:70] Initializing a V1 LLM engine (v0.9.1) with config: model='Ardjano/gemma3-27b-finetune1-merged', speculative_config=None, tokenizer='Ardjano/gemma3-27b-finetune1-merged', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_e

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


INFO 09-26 05:16:08 [topk_topp_sampler.py:49] Using FlashInfer for top-p & top-k sampling.
INFO 09-26 05:16:08 [gpu_model_runner.py:1595] Starting to load model Ardjano/gemma3-27b-finetune1-merged...
INFO 09-26 05:16:08 [gpu_model_runner.py:1600] Loading model from scratch...
INFO 09-26 05:16:08 [cuda.py:252] Using Flash Attention backend on V1 engine.




INFO 09-26 05:16:09 [weight_utils.py:292] Using model weights format ['*.safetensors']
INFO 09-26 05:16:09 [weight_utils.py:308] Time spent downloading weights for Ardjano/gemma3-27b-finetune1-merged: 0.588399 seconds


Loading safetensors checkpoint shards:   0% Completed | 0/12 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   8% Completed | 1/12 [00:04<00:50,  4.60s/it]
Loading safetensors checkpoint shards:  17% Completed | 2/12 [00:09<00:46,  4.68s/it]
Loading safetensors checkpoint shards:  25% Completed | 3/12 [00:14<00:42,  4.68s/it]
Loading safetensors checkpoint shards:  33% Completed | 4/12 [00:18<00:37,  4.68s/it]
Loading safetensors checkpoint shards:  42% Completed | 5/12 [00:23<00:32,  4.62s/it]
Loading safetensors checkpoint shards:  50% Completed | 6/12 [00:23<00:19,  3.27s/it]
Loading safetensors checkpoint shards:  58% Completed | 7/12 [00:28<00:18,  3.66s/it]
Loading safetensors checkpoint shards:  67% Completed | 8/12 [00:32<00:15,  3.95s/it]
Loading safetensors checkpoint shards:  75% Completed | 9/12 [00:37<00:12,  4.15s/it]
Loading safetensors checkpoint shards:  83% Completed | 10/12 [00:42<00:08,  4.29s/it]
Loading safetensors checkpoint shards:  92% Completed | 11/12

INFO 09-26 05:17:01 [default_loader.py:272] Loading weights took 51.27 seconds
INFO 09-26 05:17:01 [gpu_model_runner.py:1624] Model loading took 51.4479 GiB and 52.501311 seconds
INFO 09-26 05:17:01 [gpu_model_runner.py:1978] Encoder cache will be initialized with a budget of 16384 tokens, and profiled with 64 image items of the maximum feature size.
INFO 09-26 05:17:19 [backends.py:462] Using cache directory: /home/amark/.cache/vllm/torch_compile_cache/8d6547b87e/rank_0_0 for vLLM's torch.compile
INFO 09-26 05:17:19 [backends.py:472] Dynamo bytecode transform time: 15.84 s
INFO 09-26 05:17:34 [backends.py:135] Directly load the compiled graph(s) for shape None from the cache, took 13.261 s
INFO 09-26 05:17:41 [monitor.py:34] torch.compile takes 15.84 s in total


If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'].


INFO 09-26 05:18:29 [gpu_worker.py:227] Available KV cache memory: 25.10 GiB
INFO 09-26 05:18:29 [kv_cache_utils.py:870] GPU KV cache size: 46,992 tokens
INFO 09-26 05:18:29 [kv_cache_utils.py:874] Maximum concurrency for 8,192 tokens per request: 5.73x
INFO 09-26 05:19:02 [gpu_model_runner.py:2048] Graph capturing finished in 33 secs, took 0.81 GiB
INFO 09-26 05:19:02 [core.py:171] init engine (profile, create kv cache, warmup model) took 121.32 seconds


In [None]:
# !uv pip freeze --color never > eval_packages.txt

In [None]:
from tqdm.auto import tqdm

# Generate completions for all formatted prompts in a single batched call.
outputs = model.generate(prompts, sampling_params)

# Build one record per request, pairing the first candidate's stripped text
# with the source context and reference answer at the same index.
generated_responses_data = [
    {
        "Context": original_prompts[idx],
        "True_Response": true_responses[idx],
        "Generated_Response": result.outputs[0].text.strip(),
    }
    for idx, result in tqdm(
        enumerate(outputs), total=len(outputs), desc="Processing outputs"
    )
]

print("Finished processing {} generated responses.".format(len(generated_responses_data)))

# Persist the records as a CSV file (no index column).
output_csv_path = "vllm_model_test_responses_finetuned_gemma3-27b.csv"
responses_df = pd.DataFrame(generated_responses_data)
responses_df.to_csv(output_csv_path, index=False)


Adding requests:   0%|          | 0/200 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processing outputs:   0%|          | 0/200 [00:00<?, ?it/s]

Finished processing 200 generated responses.
