In [1]:
# !python -m pip install --upgrade pip -q
# !pip install uv -qU
# !uv pip install "numpy<2.3" "transformers<=4.53.2" datasets tensorboard openai hf_transfer accelerate pillow scikit-learn pymupdf google.generativeai flashinfer-python huggingface_hub vllm ipywidgets -U

In [2]:
import os
import shutil

# Directory containing nvcc (the location `which nvcc` reported in a terminal).
cuda_path = '/sw/arch/RHEL9/EB_production/2024/software/CUDA/12.6.0/bin'

# Put cuda_path first on PATH exactly once. Dropping any earlier copy keeps the
# cell idempotent, so re-running it does not grow PATH; os.pathsep avoids a
# hard-coded ':' and .get() tolerates an unset PATH.
current_path = os.environ.get('PATH', '')
other_entries = (
    [entry for entry in current_path.split(os.pathsep) if entry != cuda_path]
    if current_path
    else []
)
os.environ['PATH'] = os.pathsep.join([cuda_path, *other_entries])

# Verify the change: prints the resolved nvcc path, or None if it is not found.
print(shutil.which('nvcc'))


/sw/arch/RHEL9/EB_production/2024/software/CUDA/12.6.0/bin/nvcc


In [3]:

# !uv pip install datasets tensorboard openai hf_transfer accelerate pillow -qU
# !uv pip install scikit-learn pymupdf -qU
# !uv pip install google.generativeai  -qU # for gemini
# !uv pip install flashinfer-python  -qU # helpful for speeding up vllm
# !uv pip install huggingface_hub -qU
# !uv pip install vllm -qU
# !uv pip install ipywidgets

In [4]:
import os

# Redirect the Hugging Face cache root (HF_HOME) to this directory. This runs
# before the first huggingface_hub / transformers import further down.
hf_cache_dir = '/scratch-shared/amark/huggingface_cache'
os.environ['HF_HOME'] = hf_cache_dir

In [5]:


# Report the number of logical CPU cores on this machine.
# NOTE(review): os.cpu_count() describes the host, which may be more than a job
# scheduler actually allocates to this kernel — confirm before sizing workers.
total_cores = os.cpu_count()
print(f"Total CPU cores available: {total_cores}")

Total CPU cores available: 64


In [6]:
from huggingface_hub import get_token, login

# Only prompt for credentials when no token is already available.
# get_token() replaces the deprecated HfFolder.get_token(), which newer
# huggingface_hub releases drop.
if get_token() is None:
    login()

In [7]:
# Hugging Face model identifier; used below to load both the tokenizer and the vLLM engine.
# Alternative model (disabled):
# model_slug = "google/gemma-3-4b-it"
model_slug = "unsloth/gpt-oss-20b"

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the dataset and keep only the prompt ('Context') and reference answer ('Response') columns.
df = pd.read_csv("../data/humanandllm.csv").loc[:, ['Context', 'Response']]

# Seeded hold-out split: 5.7% of the rows go to the test set, the rest to the train set.
trainset, testset = train_test_split(df, random_state=42, test_size=0.057)



In [9]:
# Spot-check one held-out prompt (second row of the test split).
display(testset.iloc[1]['Context'])

'I have no real friends. I have a girlfriend who irritates me but loves me to death. I push her away and pushes me away. We’re going through a breakup, and I have nobody.'

In [10]:
import torch

# Requested weight dtype, as a string.
# NOTE(review): in the visible notebook the only consumer is a commented-out
# `dtype=dtype` argument in the LLM(...) call, so this value currently has no
# effect — confirm whether it should be wired in or removed.
dtype = "float16"

# Disabled GPU-dependent dtype selection, kept for reference:
# if torch.cuda.is_available():
#   if "T4" in torch.cuda.get_device_name(0):
#     dtype = "float16"
# else:
#   dtype = "float32"

# print(f"dtype: {dtype}")

In [11]:

# # prepare the model input
# prompt = "What's the meaning of life?"
# messages = [
#     {"role": "user", "content": prompt}
# ]
# text = tokenizer.apply_chat_template(
#     messages,
#     tokenize=False,
#     add_generation_prompt=True,
#     chat_template_kwargs={"enable_thinking": False}
# )
# model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# # conduct text completion
# generated_ids = model.generate(
#     **model_inputs,
#     max_new_tokens=16384
# )
# output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# content = tokenizer.decode(output_ids, skip_special_tokens=True)

# print("content:", content)

In [12]:
from transformers import AutoTokenizer

# Load the tokenizer for model_slug; its chat template is what
# format_generation_prompt applies when building the generation prompts.
tokenizer = AutoTokenizer.from_pretrained(model_slug)
# Sanity check: show which tokenizer class and checkpoint were resolved.
print(f"Tokenizer type: {type(tokenizer)}")
print(f"Tokenizer model name: {tokenizer.name_or_path}")

Tokenizer type: <class 'transformers.tokenization_utils_fast.PreTrainedTokenizerFast'>
Tokenizer model name: unsloth/gpt-oss-20b


In [13]:

def format_generation_prompt(context_text, tokenizer_obj, reasoning_effort="high"):
    """
    Format a single user message into the model's chat template for generation.

    Args:
        context_text: The user's message; becomes the content of the only turn.
        tokenizer_obj: Object exposing ``apply_chat_template`` (e.g. a Hugging Face
            tokenizer) whose template is used to render the conversation.
        reasoning_effort: Value forwarded to the chat template as the
            ``reasoning_effort`` keyword. Defaults to "high", which is what this
            function previously hard-coded, so existing two-argument calls
            behave exactly as before.

    Returns:
        Whatever ``apply_chat_template`` returns for ``tokenize=False`` — the
        rendered prompt string, ending with the generation prompt.
    """
    # Single-turn conversation: only the user's message.
    messages = [
        {"role": "user", "content": context_text},
    ]
    # add_generation_prompt=True so the rendered text ends where the assistant
    # turn begins; tokenize=False because the engine consumes raw strings.
    return tokenizer_obj.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        reasoning_effort=reasoning_effort,
    )


In [14]:

# Raw user messages and their reference answers, in test-set row order.
original_prompts = testset['Context'].tolist()
true_responses = testset['Response'].tolist()

# Chat-templated prompt for each raw message, index-aligned with the two lists above.
prompts = [
    format_generation_prompt(context_text, tokenizer)
    for context_text in original_prompts
]

In [15]:
from vllm import LLM, SamplingParams

# Decoding configuration shared by every request.
# temperature=1.0 and top_p=1.0 leave the distribution unscaled and untruncated;
# top_k=0 presumably disables top-k filtering — confirm against the vLLM docs.
sampling_kwargs = dict(
    temperature=1.0,
    top_p=1.0,
    top_k=0,
    max_tokens=1024,        # upper bound on generated tokens per prompt
    stop=["<|return|>"],    # stop string that ends a generation
)
sampling_params = SamplingParams(**sampling_kwargs)


INFO 09-26 21:58:55 [__init__.py:216] Automatically detected platform cuda.


In [None]:
import json
import re

# Build the vLLM engine for model_slug.
# NOTE(review): the load log reports that the architecture resolves to
# GptOssForCausalLM and that `trust_remote_code` "has no effect here and is
# ignored" — consider dropping that flag, since it permits running code
# shipped with a checkpoint.
model = LLM(
    model=model_slug,
    gpu_memory_utilization=0.9,
    # dtype=dtype,
    max_model_len=8192,  # the log confirms "Using max model len 8192"
    trust_remote_code=True
)

INFO 09-26 22:00:58 [utils.py:328] non-default args: {'trust_remote_code': True, 'max_model_len': 8192, 'disable_log_stats': True, 'model': 'unsloth/gpt-oss-20b'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 09-26 22:00:59 [__init__.py:742] Resolved architecture: GptOssForCausalLM


Parse safetensors files:   0%|          | 0/3 [00:00<?, ?it/s]

INFO 09-26 22:00:59 [__init__.py:1815] Using max model len 8192
INFO 09-26 22:01:00 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-26 22:01:00 [config.py:284] Overriding max cuda graph capture size to 1024 for performance.


generation_config.json:   0%|          | 0.00/165 [00:00<?, ?B/s]

[1;36m(EngineCore_DP0 pid=2403892)[0;0m INFO 09-26 22:01:02 [core.py:654] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=2403892)[0;0m INFO 09-26 22:01:02 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='unsloth/gpt-oss-20b', speculative_config=None, tokenizer='unsloth/gpt-oss-20b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=mxfp4, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_backend='openai_gptoss'), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detai

[1;36m(EngineCore_DP0 pid=2403892)[0;0m W0926 22:01:02.173000 2403892 /gpfs/home5/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
[1;36m(EngineCore_DP0 pid=2403892)[0;0m W0926 22:01:02.173000 2403892 /gpfs/home5/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.


[1;36m(EngineCore_DP0 pid=2403892)[0;0m INFO 09-26 22:01:03 [parallel_state.py:1165] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
[1;36m(EngineCore_DP0 pid=2403892)[0;0m INFO 09-26 22:01:03 [topk_topp_sampler.py:58] Using FlashInfer for top-p & top-k sampling.
[1;36m(EngineCore_DP0 pid=2403892)[0;0m INFO 09-26 22:01:03 [gpu_model_runner.py:2338] Starting to load model unsloth/gpt-oss-20b...




[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[1;36m(EngineCore_DP0 pid=2403892)[0;0m INFO 09-26 22:01:03 [gpu_model_runner.py:2370] Loading model from scratch...
[1;36m(EngineCore_DP0 pid=2403892)[0;0m INFO 09-26 22:01:03 [cuda.py:362] Using Flash Attention backend on V1 engine.
[1;36m(EngineCore_DP0 pid=2403892)[0;0m INFO 09-26 22:01:03 [mxfp4.py:93] Using Marlin backend
[1;36m(EngineCore_DP0 pid=2403892)[0;0m INFO 09-26 22:01:04 [weight_utils.py:348] Using model weights format ['*.saf

model-00000-of-00002.safetensors:   0%|          | 0.00/4.79G [00:00<?, ?B/s]

In [18]:
# prompt = "Give me a short introduction to large language model."
# messages = [
#     {"role": "user", "content": prompt}
# ]
# formatted_prompt = tokenizer.apply_chat_template(
#     messages,
#     tokenize=False,
#     add_generation_prompt=True,
#     chat_template_kwargs={"enable_thinking": False}
# )

In [19]:
# outputs = model.generate([formatted_prompt], sampling_params)

In [20]:
# !uv pip freeze --color never > eval_packages.txt

In [21]:
# Regex for a <think>...</think> span.
# NOTE: THINK_PATTERN is assigned again in the output-processing cell further
# down (there it also matches surrounding whitespace), and that later value is
# the one passed to re.sub.
THINK_PATTERN = r"<think>.*?</think>"

In [22]:
from tqdm.auto import tqdm

# Run batched generation over every formatted test prompt with the shared sampling settings.
outputs = model.generate(prompts, sampling_params)

# Rows (context / reference answer / generated answer) are collected into this
# list by the output-processing cell below.
generated_responses_data = []

Adding requests:   0%|          | 0/200 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [23]:
# Peek at one RequestOutput instead of echoing all of them: each object carries
# the full prompt text and its token-id list, so dumping the whole list buries
# the notebook in output.
outputs[:1]

[RequestOutput(request_id=0, prompt="<|im_start|>user\nWe weren't long distance until he joined the military.  I love him and I know he loves, me but it's complicated. He said he's not going to find someone else, but I'm afraid. How do I keep our relationship going?<|im_end|>\n<|im_start|>assistant\n", prompt_token_ids=[151644, 872, 198, 1654, 14716, 944, 1293, 6010, 3080, 566, 10859, 279, 6277, 13, 220, 358, 2948, 1435, 323, 358, 1414, 566, 15803, 11, 752, 714, 432, 594, 16965, 13, 1260, 1053, 566, 594, 537, 2087, 311, 1477, 4325, 770, 11, 714, 358, 2776, 16575, 13, 2585, 653, 358, 2506, 1039, 5025, 2087, 30, 151645, 198, 151644, 77091, 198], encoder_prompt=None, encoder_prompt_token_ids=None, prompt_logprobs=None, outputs=[CompletionOutput(index=0, text='<think>\n\n</think>\n\nHi! I am so sorry that you are having to go through this. I am sure that it is difficult. I have to say that I am not a huge fan of long distance relationships, but I do understand that sometimes they are neces

In [24]:
import re

# Matches a <think>...</think> reasoning block together with surrounding whitespace.
THINK_PATTERN = r"\s*<think>.*?</think>\s*"

# Number of raw/final pairs to echo for a visual spot check; printing all of
# them floods the cell output.
PREVIEW_COUNT = 3


def strip_reasoning(raw_text):
    """Return raw_text with every <think>...</think> block removed and outer whitespace trimmed."""
    return re.sub(THINK_PATTERN, "", raw_text, flags=re.DOTALL).strip()


# NOTE(review): this only strips <think>-tagged reasoning. model_slug now points
# at a gpt-oss checkpoint, which presumably marks reasoning differently — check a
# few raw samples and extend strip_reasoning if reasoning text leaks through.

# Rows are paired with prompts and references by position, so a length mismatch
# would silently misalign the CSV.
if not (len(outputs) == len(original_prompts) == len(true_responses)):
    raise ValueError(
        f"Length mismatch: {len(outputs)} outputs, {len(original_prompts)} prompts, "
        f"{len(true_responses)} reference responses."
    )

# Build the list from scratch here so re-running this cell cannot append
# duplicate rows to results left over from a previous execution.
generated_responses_data = []
for i, output in tqdm(enumerate(outputs), total=len(outputs), desc="Processing outputs"):
    raw_text = output.outputs[0].text
    final_text = strip_reasoning(raw_text)

    if i < PREVIEW_COUNT:
        print(f"raw: {raw_text}")
        print(f"final: {final_text}")

    generated_responses_data.append({
        "Context": original_prompts[i],
        "True_Response": true_responses[i],
        "Generated_Response": final_text
    })

print(f"Finished processing {len(generated_responses_data)} generated responses.")

# Convert to DataFrame and save to CSV. The file name is derived from
# model_slug so results are labelled with the model that produced them and do
# not overwrite another model's results ('/' is not valid inside a file name).
responses_df = pd.DataFrame(generated_responses_data)
output_csv_path = f"vllm_model_test_responses_{model_slug.replace('/', '_')}.csv"
responses_df.to_csv(output_csv_path, index=False)


Processing outputs:   0%|          | 0/200 [00:00<?, ?it/s]

raw: <think>

</think>

Hi! I am so sorry that you are having to go through this. I am sure that it is difficult. I have to say that I am not a huge fan of long distance relationships, but I do understand that sometimes they are necessary. I would say that you need to be honest with him about your feelings and concerns. I would also encourage you to be honest with yourself about your feelings. I think it is important to ask yourself if you are okay with the fact that you are going to be far away from each other and to make sure that you are okay with that. I would also encourage you to set a time limit. I would say that if you are not able to see each other in person for a certain amount of time, then you need to re-evaluate the relationship. You need to make sure that you are both on the same page and that you are both willing to do what it takes to make it work. I would also encourage you to be honest with yourself about your feelings. If you are not okay with the fact that you are g