In [2]:
# !python -m pip install --upgrade pip -q
# !pip install uv -qU
# !uv pip install "numpy<2.3" "transformers<=4.53.2" datasets tensorboard openai hf_transfer accelerate pillow scikit-learn pymupdf google.generativeai flashinfer-python huggingface_hub vllm ipywidgets -U

In [3]:
import os

# Directory that holds the CUDA 12.6 `nvcc` binary (the location `which nvcc` reports in a terminal)
cuda_path = '/sw/arch/RHEL9/EB_production/2024/software/CUDA/12.6.0/bin'

# Put the CUDA directory at the front of PATH exactly once.
# Any existing occurrence is dropped first, so re-running this cell does not keep
# stacking duplicate copies of the same directory onto PATH.
_other_path_entries = [
    entry
    for entry in os.environ.get('PATH', '').split(os.pathsep)
    if entry != cuda_path
]
os.environ['PATH'] = os.pathsep.join([cuda_path, *_other_path_entries])

# Verify the change
!which nvcc


/sw/arch/RHEL9/EB_production/2024/software/CUDA/12.6.0/bin/nvcc


In [4]:

# !uv pip install datasets tensorboard openai hf_transfer accelerate pillow -qU
# !uv pip install scikit-learn pymupdf -qU
# !uv pip install google.generativeai  -qU # for gemini
# !uv pip install flashinfer-python  -qU # helpful for speeding up vllm
# !uv pip install huggingface_hub -qU
# !uv pip install vllm -qU
# !uv pip install ipywidgets

In [5]:
import os

# Set HF_HOME before any Hugging Face library is imported in the cells below.
# NOTE(review): presumably this redirects the Hugging Face cache to shared scratch storage — confirm.
os.environ.update(HF_HOME='/scratch-shared/amark/huggingface_cache')

In [6]:


# Report the number of CPU cores the OS exposes on this machine.
# os.cpu_count() can return None when the count cannot be determined.
total_cores = os.cpu_count()
print(f"Total CPU cores available: {total_cores}")

Total CPU cores available: 64


In [7]:
from huggingface_hub import HfFolder, login

# Only start the interactive Hugging Face login when no token is already stored.
# NOTE(review): HfFolder is a legacy helper in huggingface_hub; newer releases also expose a
# top-level get_token() — confirm which one the installed version supports.
if HfFolder.get_token() is None:
    login()

In [8]:
# Model under evaluation. Alternative kept for reference:
# model_slug = "google/gemma-3-4b-it"
# Active choice: a local checkpoint directory given as a relative path, so it is resolved
# against the notebook's working directory.
model_slug = "../finetune/trained_models/gpt-oss-20b-instruct-finetuned-16bit"

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the Context/Response pairs and keep only those two columns, in that order
df = pd.read_csv("../data/humanandllm.csv").loc[:, ['Context', 'Response']]

# Fixed-seed split so the held-out rows are the same on every run
trainset, testset = train_test_split(df, random_state=42, test_size=0.057)



In [10]:
# Peek at one held-out user message (the second row of the test split)
display(testset['Context'].iloc[1])

'I have no real friends. I have a girlfriend who irritates me but loves me to death. I push her away and pushes me away. We’re going through a breakup, and I have nobody.'

In [11]:
import torch

# Intended compute dtype for the model.
# NOTE(review): `dtype` is not passed to any call visible in this notebook — confirm whether
# it should be forwarded to the vLLM `LLM(...)` constructor or removed.
dtype = "bfloat16"

# Disabled device-dependent fallback, kept for reference:
# if torch.cuda.is_available():
#   if "T4" in torch.cuda.get_device_name(0):
#     dtype = "float16"
# else:
#   dtype = "float32"

# print(f"dtype: {dtype}")

In [12]:

# # prepare the model input
# prompt = "What's the meaning of life?"
# messages = [
#     {"role": "user", "content": prompt}
# ]
# text = tokenizer.apply_chat_template(
#     messages,
#     tokenize=False,
#     add_generation_prompt=True,
#     chat_template_kwargs={"enable_thinking": False}
# )
# model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# # conduct text completion
# generated_ids = model.generate(
#     **model_inputs,
#     max_new_tokens=16384
# )
# output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()

# content = tokenizer.decode(output_ids, skip_special_tokens=True)

# print("content:", content)

In [13]:
from transformers import AutoTokenizer

# Load the tokenizer stored alongside the checkpoint; later cells use it to render
# chat-template prompts.
tokenizer = AutoTokenizer.from_pretrained(model_slug)
# Sanity check: show which tokenizer class and path were actually resolved
print(f"Tokenizer type: {type(tokenizer)}")
print(f"Tokenizer model name: {tokenizer.name_or_path}")

Tokenizer type: <class 'transformers.tokenization_utils_fast.PreTrainedTokenizerFast'>
Tokenizer model name: ../finetune/trained_models/gpt-oss-20b-instruct-finetuned-16bit


In [14]:

def format_generation_prompt(context_text, tokenizer_obj, reasoning_effort="high"):
    """
    Render one user message as a chat-template prompt ready for generation.

    Args:
        context_text: The user's message; used verbatim as the single "user" turn.
        tokenizer_obj: Any object exposing ``apply_chat_template`` (in this
            notebook, the tokenizer loaded for the model under evaluation).
        reasoning_effort: Value forwarded to ``apply_chat_template`` as the
            ``reasoning_effort`` keyword. Defaults to ``"high"``, the value
            that used to be hard-coded, so existing two-argument calls behave
            exactly as before.

    Returns:
        Whatever ``tokenizer_obj.apply_chat_template`` returns when called
        with ``tokenize=False`` (the prompt as text rather than token ids).
    """
    # Single-turn conversation: only the user's message.
    messages = [{"role": "user", "content": context_text}]

    # add_generation_prompt=True is what is wanted for generation (as opposed to
    # formatting an already-complete conversation).
    return tokenizer_obj.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        reasoning_effort=reasoning_effort,
    )


In [15]:

# Take the raw user messages and reference answers straight from the test split,
# preserving row order so the three lists stay index-aligned.
original_prompts = testset['Context'].tolist()
true_responses = testset['Response'].tolist()

# Chat-template every user message for generation
prompts = [
    format_generation_prompt(user_message, tokenizer)
    for user_message in original_prompts
]

In [16]:
from transformers import AutoTokenizer
# NOTE(review): this reloads the tokenizer from the same `model_slug` that an earlier cell
# already loaded into `tokenizer`; it looks redundant — confirm before removing.
tokenizer = AutoTokenizer.from_pretrained(
    model_slug
)

In [17]:
from vllm import LLM, SamplingParams


# Decoding configuration passed to model.generate() for every test prompt.
sampling_params = SamplingParams(
    max_tokens=1024,  # upper bound on generated tokens per response
    top_k=0,  # NOTE(review): presumably disables top-k filtering — confirm against the vLLM docs
    temperature=1.0,
    top_p=1.0,
    stop=["<|return|>"],  # stop string; presumably the model's end-of-response marker — confirm
)


INFO 09-26 22:03:19 [__init__.py:216] Automatically detected platform cuda.


In [18]:
import json
import re

# Build the vLLM engine for the checkpoint selected in `model_slug`.
model = LLM(
    model=model_slug,
    gpu_memory_utilization=0.9,  # the startup log echoes this as "Desired GPU memory utilization"
    max_model_len=8192,  # the startup log echoes this as "Using max model len 8192"
    # NOTE(review): the load log prints that `trust_remote_code` "has no effect here and is
    # ignored" and resolves the architecture to GptOssForCausalLM — confirm whether this flag is needed.
    trust_remote_code=True
)

INFO 09-26 22:03:20 [utils.py:328] non-default args: {'trust_remote_code': True, 'max_model_len': 8192, 'disable_log_stats': True, 'model': '../finetune/trained_models/gpt-oss-20b-instruct-finetuned-16bit'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 09-26 22:03:27 [__init__.py:742] Resolved architecture: GptOssForCausalLM


`torch_dtype` is deprecated! Use `dtype` instead!


INFO 09-26 22:03:27 [__init__.py:1815] Using max model len 8192
INFO 09-26 22:03:27 [scheduler.py:222] Chunked prefill is enabled with max_num_batched_tokens=16384.
INFO 09-26 22:03:27 [config.py:284] Overriding max cuda graph capture size to 1024 for performance.
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:28 [core.py:654] Waiting for init message from front-end.
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:28 [core.py:76] Initializing a V1 LLM engine (v0.10.2) with config: model='../finetune/trained_models/gpt-oss-20b-instruct-finetuned-16bit', speculative_config=None, tokenizer='../finetune/trained_models/gpt-oss-20b-instruct-finetuned-16bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enf

[1;36m(EngineCore_DP0 pid=2407171)[0;0m W0926 22:03:30.025000 2407171 /gpfs/home5/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
[1;36m(EngineCore_DP0 pid=2407171)[0;0m W0926 22:03:30.025000 2407171 /gpfs/home5/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.


[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:31 [parallel_state.py:1165] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:31 [topk_topp_sampler.py:58] Using FlashInfer for top-p & top-k sampling.
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:31 [gpu_model_runner.py:2338] Starting to load model ../finetune/trained_models/gpt-oss-20b-instruct-finetuned-16bit...




[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:31 [gpu_model_runner.py:2370] Loading model from scratch...
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:31 [cuda.py:362] Using Flash Attention backend on V1 engine.


Loading safetensors checkpoint shards:   0% Completed | 0/3 [00:00<?, ?it/s]


[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:38 [default_loader.py:268] Loading weights took 6.35 seconds
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:38 [gpu_model_runner.py:2392] Model loading took 38.9881 GiB and 6.557104 seconds
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:42 [backends.py:539] Using cache directory: /home/amark/.cache/vllm/torch_compile_cache/71708a519d/rank_0_0/backbone for vLLM's torch.compile
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:42 [backends.py:550] Dynamo bytecode transform time: 3.67 s
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:03:45 [backends.py:194] Cache the graph for dynamic shape for later use
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:04:07 [backends.py:215] Compiling a graph for dynamic shape takes 24.17 s
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:04:08 [monitor.py:34] torch.compile takes 27.84 s in total


[1;36m(EngineCore_DP0 pid=2407171)[0;0m [rank0]:W0926 22:04:09.099000 2407171 /gpfs/home5/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
[1;36m(EngineCore_DP0 pid=2407171)[0;0m [rank0]:W0926 22:04:09.099000 2407171 /gpfs/home5/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2425] If this is not desired, please set os.environ['TORCH_CUDA_ARCH_LIST'] to specific architectures.
[1;36m(EngineCore_DP0 pid=2407171)[0;0m [rank0]:W0926 22:04:09.103000 2407171 /gpfs/home5/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/torch/utils/cpp_extension.py:2425] TORCH_CUDA_ARCH_LIST is not set, all archs for visible cards are included for compilation. 
[1;36m(EngineCore_DP0 pid=2407171)[0;0m [rank0]:W0926 22:04:09.103000 2407171 /gpfs/home5/amark/.conda/envs/vllm_env/lib/python3.11/site-packages/torch/utils/cpp_extens

[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:04:57 [gpu_worker.py:298] Available KV cache memory: 41.91 GiB
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:04:57 [kv_cache_utils.py:1028] GPU KV cache size: 915,440 tokens
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:04:57 [kv_cache_utils.py:1032] Maximum concurrency for 8,192 tokens per request: 111.64x


[1;36m(EngineCore_DP0 pid=2407171)[0;0m 2025-09-26 22:04:57,640 - INFO - flashinfer.jit: [Autotuner]: Autotuning process starts ...
[1;36m(EngineCore_DP0 pid=2407171)[0;0m 2025-09-26 22:04:57,993 - INFO - flashinfer.jit: [Autotuner]: Autotuning process ends
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|██████████| 83/83 [00:05<00:00, 16.11it/s]


[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:05:03 [gpu_model_runner.py:3118] Graph capturing finished in 6 secs, took -0.48 GiB
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:05:03 [gpu_worker.py:391] Free memory on device (92.48/93.09 GiB) on startup. Desired GPU memory utilization is (0.9, 83.78 GiB). Actual usage is 38.99 GiB for weight, 2.82 GiB for peak activation, 0.07 GiB for non-torch memory, and -0.48 GiB for CUDAGraph memory. Replace gpu_memory_utilization config with `--kv-cache-memory=45352700723` to fit into requested memory, or `--kv-cache-memory=54697433088` to fully utilize gpu memory. Current kv cache memory in use is 44996184883 bytes.
INFO 09-26 22:05:03 [gpu_model_runner.py:3118] Graph capturing finished in 6 secs, took -0.48 GiB
[1;36m(EngineCore_DP0 pid=2407171)[0;0m INFO 09-26 22:05:03 [gpu_worker.py:391] Free memory on device (92.48/93.09 GiB) on startup. Desired GPU memory utilization is (0.9, 83.78 GiB). Actual usage is 38.99 GiB for wei

In [19]:
# prompt = "Give me a short introduction to large language model."
# messages = [
#     {"role": "user", "content": prompt}
# ]
# formatted_prompt = tokenizer.apply_chat_template(
#     messages,
#     tokenize=False,
#     add_generation_prompt=True,
#     chat_template_kwargs={"enable_thinking": False}
# )

In [20]:
# outputs = model.generate([formatted_prompt], sampling_params)

In [21]:
# !uv pip freeze --color never > eval_packages.txt

In [22]:
# Regex matching a <think>...</think> span (non-greedy).
# NOTE: the output-processing cell further down reassigns THINK_PATTERN to a variant that also
# consumes surrounding whitespace, and that later value is the one passed to re.sub.
THINK_PATTERN = r"<think>.*?</think>"

In [23]:
from tqdm.auto import tqdm

# Generate completions for every formatted test prompt in a single call
outputs = model.generate(prompts, sampling_params)

# Accumulator for one {Context, True_Response, Generated_Response} record per prompt;
# the output-processing cell appends to it.
generated_responses_data = []

Adding requests:   0%|          | 0/200 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/200 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

In [24]:
# Inspect only the first RequestOutput: showing the whole list prints the prompt text and
# token ids of every request and floods the notebook. The slice also tolerates an empty list.
outputs[:1]

[RequestOutput(request_id=0, prompt="<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\nKnowledge cutoff: 2024-06\nCurrent date: 2025-09-26\n\nReasoning: high\n\n# Valid channels: analysis, commentary, final. Channel must be included for every message.\nCalls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>user<|message|>We weren't long distance until he joined the military.  I love him and I know he loves, me but it's complicated. He said he's not going to find someone else, but I'm afraid. How do I keep our relationship going?<|end|><|start|>assistant", prompt_token_ids=[200006, 17360, 200008, 3575, 553, 17554, 162016, 11, 261, 4410, 6439, 2359, 22203, 656, 7788, 17527, 558, 87447, 100594, 25, 220, 1323, 19, 12, 3218, 198, 6576, 3521, 25, 220, 1323, 20, 12, 3114, 12, 2109, 279, 30377, 289, 25, 1932, 279, 2, 13888, 18403, 25, 8450, 11, 49159, 11, 1721, 13, 21030, 2804, 413, 7360, 395, 1753, 3176, 558, 63446, 316, 1879, 8

In [26]:
THINK_PATTERN = r"\s*<think>.*?</think>\s*"

# Start from an empty list each time this cell runs. Appending to a list created in a
# different cell meant that re-running this cell duplicated every record in the saved CSV.
generated_responses_data = []

# The three sequences are consumed in lockstep below, so they must line up one-to-one.
assert len(outputs) == len(original_prompts) == len(true_responses), (
    "outputs, original_prompts and true_responses must have the same length"
)

# Compile once outside the loop; DOTALL lets `.` span newlines inside a <think> block.
think_re = re.compile(THINK_PATTERN, flags=re.DOTALL)

# Process the outputs
for output, context, reference in tqdm(
    zip(outputs, original_prompts, true_responses),
    total=len(outputs),
    desc="Processing outputs",
):
    # Only the first completion of each request is used.
    raw_text = output.outputs[0].text
    print(f"raw: {raw_text}")

    # Drop any <think>...</think> span, then trim surrounding whitespace.
    final_text = think_re.sub("", raw_text).strip()
    print(f"final: {final_text}")

    generated_responses_data.append({
        "Context": context,
        "True_Response": reference,
        "Generated_Response": final_text
    })

print(f"Finished processing {len(generated_responses_data)} generated responses.")

# Convert to DataFrame and save to CSV
responses_df = pd.DataFrame(generated_responses_data)
output_csv_path = "vllm_model_test_responses_finetuned_gptoss20b.csv"
responses_df.to_csv(output_csv_path, index=False)


Processing outputs:   0%|          | 0/200 [00:00<?, ?it/s]

raw: I can sense that you are connected with him.How long have you been in the relationship?Is there anyone in his life that you find threatening to your engagement to be in his life?Do you have any suicidal thoughts?Since you have recently attached with partner, it works hard to have several unshared thoughts in your head.Do you have other matters in your mind that could be negatively affecting attachment you have with partner?Is he contributing to the lack connection you are feeling with him?I don't see any mention of any sort of commitments about the soldier, aren't we clear about this?Does the emotionally distant or withdraw he carries presenting cause you doubt about future in relationship?I recommend therapy with whom you can discuss and process the thoughts that arise in mind for now.We learned skills about marriage and responsibility to be more engaged in meeting needs of partners.Emotions and behavior develop in patterns that influence much of our reactions to others in contex