<a href="https://colab.research.google.com/github/tcharos/NLP-Toxicity-Detection/blob/main/AIDL_CS01_NLP_Project_task_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AIDL_B_CS01: Advanced NLP Project

## 5. LLM Tuning with DPO (Gordon Ramsay Alignment)

In [None]:
import os
import io
import sys
import re
import torch
import glob
import pandas as pd
import numpy as np
from datetime import datetime
from google.colab import files

# True when the google.colab package has been imported into this process.
# NOTE(review): `from google.colab import files` above runs unconditionally, so reaching this
# line already implies Colab; outside Colab that import fails before this flag is evaluated.
IN_COLAB = 'google.colab' in sys.modules
# Shared random seed, reused later for training-set sampling and as the DPOConfig seed.
SEED = 12345

# Turn off the `datasets` progress bars so the saved notebook stays readable in the GitHub preview.
from datasets import disable_progress_bars
disable_progress_bars()

# tqdm / datasets progress-bar settings, applied through environment variables.
# NOTE(review): TQDM_DISABLE is set to '0', which presumably leaves tqdm bars enabled -
# confirm this matches the intent of hiding progress bars.
os.environ['TQDM_DISABLE'] = '0'
os.environ['TQDM_MININTERVAL'] = '1'
os.environ['DATASETS_PROGRESS_BAR_TYPE'] = 'tqdm'

# Colab only: install the fine-tuning stack before it is imported below.
# bitsandbytes is listed in both the first and the last install command; none of the
# packages are version-pinned.
if IN_COLAB:
    print("Running in Google Colab. Installing Task 5 NLP stack...")
    !pip install --no-cache-dir bitsandbytes
    !pip install --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
    !pip install -q --no-deps trl peft accelerate bitsandbytes sentence-transformers

# unsloth is imported ahead of trl / transformers; its import prints that it patches the
# training stack (see the cell output).
# NOTE(review): presumably this import order is required - confirm before reordering.
from unsloth import FastLanguageModel, PatchDPOTrainer
from datasets import load_dataset, Dataset
from trl import DPOConfig, DPOTrainer
from sentence_transformers import SentenceTransformer, util
from transformers import TrainingArguments, AutoTokenizer

# Device Check
# `device` is only assigned when CUDA is available; the else branch just prints a warning
# and execution continues.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("PyTorch Device: Colab GPU (CUDA)")
else:
    print("CRITICAL: GPU NOT DETECTED. Change runtime type to T4 GPU.")

print("\nEnvironment Clean. You can now load your Llama-3 model.")

Running in Google Colab. Installing Task 5 NLP stack...
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-ebyxoyk7/unsloth_598655afa9934009a0cc6162704938e0
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-ebyxoyk7/unsloth_598655afa9934009a0cc6162704938e0
  Resolved https://github.com/unslothai/unsloth.git to commit 18d020e56350f87786ce5e21f940da67e475ee77
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
PyTorch Device: Colab GPU (CUDA)

Environment Clean. You can now load your Llama-3 model.


### Dataset Preparation

In [None]:
# Code I used to concatenate all *.csv files into one test.csv - executed only once, locally

# all_csv_files = glob.glob(os.path.join(test_folder_path, "*.csv"))
# valid_dfs = []
# required_cols = ["Question", "Polite", "Ramsay"]

# for f in all_csv_files:
#     try:
#         # Try UTF-8 first, fallback to cp1252 if it fails
#         try:
#             temp_df = pd.read_csv(f, encoding='utf-8')
#         except UnicodeDecodeError:
#             temp_df = pd.read_csv(f, encoding='cp1252')

#         # Check if the required columns exist
#         if all(col in temp_df.columns for col in required_cols):
#             valid_dfs.append(temp_df[required_cols])
#         else:
#             print(f"Skipping {f}: Missing required columns. Found: {temp_df.columns.tolist()}")

#     except Exception as e:
#         print(f"Could not load {f} due to error: {e}")

# # Combine only the valid ones
# if valid_dfs:
#     all_colleagues_data = pd.concat(valid_dfs, ignore_index=True)
#     # Requirement 4: Save to test.csv
#     all_colleagues_data.to_csv("test.csv", index=False)

#     # Take 500 for training
#     train_df = all_colleagues_data.sample(n=min(500, len(all_colleagues_data)), random_state=42)
#     print(f"Successfully loaded {len(valid_dfs)} files.")
#     print(f"Total training rows available: {len(all_colleagues_data)}")
# else:
#     print("No valid CSV files were loaded!")

### Functions

In [None]:
def verify_and_get_files(folder, expected_default_name):
    """Return the path of a CSV file inside ``folder``, asking for an upload in Colab if none exists.

    The folder is created when missing. If it holds no ``*.csv`` file and the notebook
    runs in Colab, the user is prompted to upload files, which are then moved into
    ``folder``.

    Args:
        folder: Directory that should contain the CSV file.
        expected_default_name: File name to prefer when several CSVs are present.

    Returns:
        The path of the CSV named ``expected_default_name`` if present, otherwise the
        alphabetically first CSV in the folder, or ``None`` when the folder has no CSV.
    """
    import shutil  # local import: the notebook's import cell does not bring in shutil

    os.makedirs(folder, exist_ok=True)

    # Sorted so the fallback choice at the end does not depend on directory listing order.
    existing_csvs = sorted(glob.glob(os.path.join(folder, "*.csv")))

    # The upload widget is only offered in Colab; elsewhere an empty folder yields None.
    if not existing_csvs and IN_COLAB:
        print(f"No CSV found in {folder}. Upload your file.")
        uploaded = files.upload()
        for filename in uploaded.keys():
            # Uploaded files are addressed by bare name (working directory); move them into
            # the target folder. shutil.move also works when the two are on different filesystems.
            shutil.move(filename, os.path.join(folder, filename))
        existing_csvs = sorted(glob.glob(os.path.join(folder, "*.csv")))

    if not existing_csvs:
        return None

    # Prefer the expected file name; otherwise fall back to the first CSV found.
    for csv_path in existing_csvs:
        if os.path.basename(csv_path) == expected_default_name:
            return csv_path
    return existing_csvs[0]

In [None]:
# Folders searched for the input CSV files.
# NOTE: despite its name, the 'test' folder is where the *training* file is looked up
# (train_file is resolved from test_folder_path further down).
test_folder_path = './data_sets/Ramsay/test'
val_folder_path = './data_sets/Ramsay/val'

In [None]:
# Resolve the CSV paths; in Colab an upload prompt appears when a folder holds no CSV.
# Either value is None when no CSV could be found.
train_file = verify_and_get_files(test_folder_path, "test.csv")
val_file = verify_and_get_files(val_folder_path, "mscaidl-0077_ramsay_dataset.csv")

No CSV found in ./data_sets/Ramsay/test. Upload your file.


Saving test.csv to test.csv
No CSV found in ./data_sets/Ramsay/val. Upload your file.


Saving mscaidl-0077_ramsay_dataset.csv to mscaidl-0077_ramsay_dataset.csv


In [None]:
# Sanity check: both CSV paths must be resolved, and the validation file must be parseable.
if not (train_file and val_file):
    print("Files are missing. If you are not in Colab, please place CSVs in the folders manually.")
else:
    print(f"Training file located: {train_file}")
    print(f"Validation file located: {val_file}")

    try:
        # sep=None lets the python engine sniff the delimiter; utf-8-sig drops a leading BOM.
        sample_df = pd.read_csv(val_file, encoding='utf-8-sig', engine='python', sep=None)
        print("\nSuccessfully loaded data. Preview of columns:")
        print(sample_df.columns.tolist())
    except Exception as read_error:
        print(f"Error reading file: {read_error}")

Training file located: ./data_sets/Ramsay/test/test.csv
Validation file located: ./data_sets/Ramsay/val/mscaidl-0077_ramsay_dataset.csv

Successfully loaded data. Preview of columns:
['AIDL_ID', 'Question', 'Polite', 'Ramsay']


In [None]:
def load_any_ramsay_csv(file_path, limit=None, is_train=True):
    """Load a Question/Polite/Ramsay CSV and build a DPO preference dataset from it.

    Args:
        file_path: Path of the CSV file to read.
        limit: Maximum number of rows to keep; ``None`` keeps every row.
        is_train: When True the rows are a seeded random sample (``SEED``);
            when False the first ``limit`` rows are taken in file order.

    Returns:
        A ``(dataset, df)`` tuple: ``dataset`` is a ``Dataset`` with the columns
        ``prompt`` (Question), ``chosen`` (Ramsay) and ``rejected`` (Polite);
        ``df`` is the selected rows as a DataFrame with the three original columns.

    Raises:
        KeyError: If one of the required columns is missing after header clean-up.
    """
    # All double quotes are stripped from the raw text before parsing, so CSV quoting is
    # ignored; rows the parser cannot handle afterwards are skipped (on_bad_lines='skip').
    with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        content = f.read().replace('"', '')

    # sep=None makes the python engine detect the delimiter.
    df = pd.read_csv(io.StringIO(content), sep=None, engine='python', on_bad_lines='skip')

    # Header cells may carry stray whitespace.
    df.columns = [c.strip() for c in df.columns]

    required_cols = ["Question", "Polite", "Ramsay"]
    df = df[required_cols]

    # limit=None (the default) keeps every row instead of failing on min(None, ...).
    row_count = len(df) if limit is None else min(limit, len(df))

    if is_train:
        # Seeded random sample for training
        df = df.sample(n=row_count, random_state=SEED)
    else:
        # First rows, in file order, for validation
        df = df.head(row_count)

    # Detach from the parsed frame so callers can add columns without pandas copy warnings.
    df = df.copy()

    print(f"Successfully loaded {len(df)} rows from {file_path}")

    # NOTE(review): astype(str) turns missing cells into the literal text "nan" - confirm
    # the input files have no empty Question/Polite/Ramsay cells.
    return Dataset.from_dict({
        "prompt":   df["Question"].astype(str).tolist(),
        "chosen":   df["Ramsay"].astype(str).tolist(),
        "rejected": df["Polite"].astype(str).tolist(),
    }), df

# train dataset: a seeded random sample of up to 500 rows (the raw DataFrame is discarded)
train_dataset, _ = load_any_ramsay_csv(train_file, limit=500, is_train=True)

# val dataset: the first 100 rows; eval_df_raw keeps the raw columns for the later
# generation and scoring cells
eval_dataset, eval_df_raw = load_any_ramsay_csv(val_file, limit=100, is_train=False)

Successfully loaded 500 rows from ./data_sets/Ramsay/test/test.csv
Successfully loaded 100 rows from ./data_sets/Ramsay/val/mscaidl-0077_ramsay_dataset.csv


### SLM from Unsloth (not Zephyr)



In [None]:
# Base checkpoint that gets fine-tuned with DPO.
model_name = "unsloth/Llama-3.2-3B-Instruct"

max_seq_length = 2048
dtype = None # Auto detect
load_in_4bit = True

# Load the model and its tokenizer through Unsloth, passing the settings above.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# LoRA Adapters
# Adapters are attached to the attention projections (q/k/v/o) and the MLP projections
# (gate/up/down); `model` is rebound to the adapter-wrapped model.
# NOTE(review): random_state here (3407) differs from the notebook-wide SEED - confirm
# this is intentional.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

print(f"Model {model_name} loaded successfully with LoRA.")

==((====))==  Unsloth 2026.2.1: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.034 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth 2026.2.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Model unsloth/Llama-3.2-3B-Instruct loaded successfully with LoRA.


In [None]:
# Hyper-parameters for the DPO run, kept in one place; the training cell reads them by key
# when it builds the DPOConfig.
BASE_CONFIG = {
    # Optimisation schedule
    'per_device_train_batch_size': 2,
    'gradient_accumulation_steps': 4,
    'warmup_ratio': 0.1,
    'num_train_epochs': 3,
    'learning_rate': 5e-5,
    'logging_steps': 1,
    'optim': "paged_adamw_32bit",
    'weight_decay': 0.01,
    'lr_scheduler_type': "linear",
    # DPO-specific settings
    'beta': 0.2,
    'max_prompt_length': 512,
    'max_length': 1024,
}

In [None]:
# Apply Unsloth's DPOTrainer patch before the trainer is constructed.
PatchDPOTrainer()

# Every tunable value comes from BASE_CONFIG so the run is described in exactly one place.
training_args = DPOConfig(
    per_device_train_batch_size = BASE_CONFIG['per_device_train_batch_size'],
    gradient_accumulation_steps = BASE_CONFIG['gradient_accumulation_steps'],
    warmup_ratio                = BASE_CONFIG['warmup_ratio'],
    num_train_epochs            = BASE_CONFIG['num_train_epochs'],
    learning_rate               = BASE_CONFIG['learning_rate'],
    # Use bf16 when the GPU supports it, otherwise fall back to fp16.
    fp16                        = not torch.cuda.is_bf16_supported(),
    bf16                        = torch.cuda.is_bf16_supported(),
    logging_steps               = BASE_CONFIG['logging_steps'],
    optim                       = BASE_CONFIG['optim'],
    weight_decay                = BASE_CONFIG['weight_decay'],
    lr_scheduler_type           = BASE_CONFIG['lr_scheduler_type'],
    seed                        = SEED,
    output_dir                  = "outputs",
    eval_strategy               = "steps",
    eval_steps                  = 10,
    report_to                   = "none",

    # DPO specific
    beta                        = BASE_CONFIG['beta'],
    max_prompt_length           = BASE_CONFIG['max_prompt_length'],
    max_length                  = BASE_CONFIG['max_length'],
)

# beta / max_prompt_length / max_length are deliberately NOT repeated here: they are set on
# DPOConfig above, and passing different values to the trainer (e.g. beta = 0.1 against a
# configured 0.2) makes it unclear which value the run actually used.
# NOTE(review): ref_model = None presumably makes the trainer derive the reference policy
# from the base weights of the LoRA-wrapped model - confirm against the TRL documentation.
dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer,
)

print("--- DPO Patch Complete ---")

--- DPO Patch Complete ---


In [None]:
# Run DPO fine-tuning; the value returned by train() is shown as the cell result.
print("--- Training Started ---")
dpo_trainer.train()

--- Training Started ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 3 | Total steps = 189
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss,Validation Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
10,0.6808,0.666371,0.041184,-0.013956,0.82,0.05514,-123.031952,-59.130302,-1.256973,-1.135245,0,0,0
20,0.5219,0.35288,0.692604,-0.209062,1.0,0.901666,-119.774857,-60.105839,-1.262044,-1.151422,No Log,No Log,No Log
30,0.219,0.058953,2.12082,-1.362025,1.0,3.482845,-112.633766,-65.870644,-1.182477,-1.145006,No Log,No Log,No Log
40,0.1482,0.016789,2.72055,-3.132638,1.0,5.853189,-109.635117,-74.723717,-1.036376,-1.132801,No Log,No Log,No Log
50,0.0198,0.00817,2.989648,-4.474951,1.0,7.464599,-108.289642,-81.43528,-0.857738,-1.047522,No Log,No Log,No Log
60,0.0885,0.006264,3.173145,-4.880559,1.0,8.053705,-107.372147,-83.463318,-0.755407,-0.972171,No Log,No Log,No Log
70,0.0023,0.005565,3.278077,-5.06465,1.0,8.342728,-106.847481,-84.383766,-0.712004,-0.942329,No Log,No Log,No Log
80,0.0069,0.005072,3.325003,-5.241664,1.0,8.566668,-106.612862,-85.268845,-0.69183,-0.930693,No Log,No Log,No Log
90,0.0016,0.004609,3.348341,-5.382463,1.0,8.730803,-106.49617,-85.972839,-0.669923,-0.915786,No Log,No Log,No Log
100,0.1761,0.004364,3.351089,-5.49896,1.0,8.850049,-106.482422,-86.555313,-0.657476,-0.909873,No Log,No Log,No Log


TrainOutput(global_step=189, training_loss=0.20375898983103172, metrics={'train_runtime': 470.8394, 'train_samples_per_second': 3.186, 'train_steps_per_second': 0.401, 'total_flos': 0.0, 'train_loss': 0.20375898983103172, 'epoch': 3.0})

In [None]:
# Set to inference mode
FastLanguageModel.for_inference(model)

# Load similarity model
sim_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

questions = eval_df_raw["Question"].tolist()
prompts = [f"Question: {q}\nResponse:" for q in questions]

print(f"Generating responses for {len(questions)} questions...")
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    temperature=0.7,
    use_cache=True)

decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

model_results = [text.split("Response:")[-1].strip().replace("\n"," ") for text in decoded_outputs]
eval_df_raw["Model_Result"] = model_results

print("Start Cosine Similarity Calculation (Model vs. Original Ramsay)")

model_embeddings = sim_model.encode(eval_df_raw["Model_Result"].tolist(), convert_to_tensor=True)

# Encoding the original Ramsay-style target answers
target_ramsay_embeddings = sim_model.encode(eval_df_raw["Ramsay"].tolist(), convert_to_tensor=True)

cosine_scores = util.cos_sim(model_embeddings, target_ramsay_embeddings)
eval_df_raw["Similarity_Score_Ramsay"] = torch.diag(cosine_scores).cpu().tolist()

avg_sim_ramsay = eval_df_raw['Similarity_Score_Ramsay'].mean()

print(f"Process Complete.")
print(f"Average Similarity to Original Ramsay: {avg_sim_ramsay:.4f}")

In [None]:
# Print a short, human-readable preview of the first generated answers and their scores.
banner = "=" * 50
print(f"\n{banner}")
print("PREVIEW OF FIRST 5 RAMSAY RESPONSES")
print(banner)

preview_rows = eval_df_raw.head(5)
preview_fields = zip(
    preview_rows["Question"],
    preview_rows["Model_Result"],
    preview_rows["Similarity_Score_Ramsay"],
)
for entry_number, (question, answer, similarity) in enumerate(preview_fields, start=1):
    print(f"\n[ENTRY {entry_number}]")
    print(f"STUDENT QUESTION: {question}")
    print(f"RAMSAY RESPONSE:  {answer}")
    print(f"SIMILARITY SCORE: {similarity:.4f}")
    print("-" * 50)


PREVIEW OF FIRST 5 RAMSAY RESPONSES

[ENTRY 1]
STUDENT QUESTION: What is the purpose of a Padding token?
RAMSAY RESPONSE:  The purpose of a padding token is to prevent someone from guessing your password. It is like a guardian angel that watches your back while you're sleeping. It's a safety net that prevents your password from being used by someone else. You can't just leave your house without locking the door, you know. Similarly, you
SIMILARITY SCORE: 0.1535
--------------------------------------------------

[ENTRY 2]
STUDENT QUESTION: What is the purpose of Dropout?
RAMSAY RESPONSE:  To let you know that you messed up and are not good enough!
SIMILARITY SCORE: 0.0162
--------------------------------------------------

[ENTRY 3]
STUDENT QUESTION: Why normalize input data?
RAMSAY RESPONSE:  Normalizing input data is like preparing your clothes for a fashion show! You want to look your best, not like you just rolled out of bed! It means making sure your numbers are in the right orde

In [None]:
# Day-month-year_hour-minute stamp of the current local time, used to tag exported file names.
timestamp = datetime.now().strftime("%d%m%Y_%H%M")

In [None]:
# Export the evaluation frame to a timestamped CSV in the working directory.
# NOTE: this cell sits before the judge-scoring cell, so in a top-to-bottom run the
# Judge_Score column is not part of this export.
eval_df_raw.to_csv(f"mscaidl-0077_task_5_{timestamp}.csv", index=False)

In [None]:
# this step takes ~15 mins because it saves the full model ~5GB
# we could save only the changes we made at the DPO step

# model.save_pretrained_merged("dpo_ramsay_model", tokenizer, save_method = "merged_16bit")
# shutil.make_archive("dpo_ramsay_model", 'zip', "dpo_ramsay_model")

# files.download("dpo_ramsay_model.zip")

In [None]:
# LoRA save only

# base_filename = f"dpo_ramsay_lora_{timestamp}"
# model.save_pretrained(base_filename)
# tokenizer.save_pretrained(base_filename)

# shutil.make_archive(base_filename, 'zip', base_filename)

# files.download(f"{base_filename}.zip")

In [None]:
# Load the Prometheus judge model and its tokenizer (4-bit); batch_judge uses them to
# grade the generated answers.
judge_model, judge_tokenizer = FastLanguageModel.from_pretrained(
    model_name = "prometheus-eval/prometheus-7b-v2.0",
    load_in_4bit = True,
)

In [None]:
def get_judge_prompt(question, response, reference):
    """Build the absolute-grading prompt given to the judge model.

    The prompt explicitly asks the judge to finish with ``[RESULT] <1-5>``, which is the
    marker the score parser looks for first. Lines are emitted without the source-code
    indentation so the judge sees clean section headers.

    Args:
        question: The instruction/question that was answered.
        response: The model answer to be graded.
        reference: The reference answer that represents a score of 5.

    Returns:
        The complete prompt as a single string ending in ``###Feedback:``.
    """
    prompt_lines = [
        "###Task Description:",
        "An instruction (might include an Input inside it), a response to evaluate, "
        "a reference answer that gets a score of 5, and a score rubric representing "
        "a evaluation criteria are given.",
        "1. Write a detailed feedback that assess the quality of the response strictly "
        "based on the given score rubric, not evaluating in general.",
        "2. After writing a feedback, write a score that is an integer between 1 and 5. "
        "You should refer to the score rubric.",
        "3. The output format should look as follows: "
        "\"Feedback: (write a feedback for criteria) [RESULT] (an integer number between 1 and 5)\"",
        "4. Please do not generate any other opening, closing, and explanations.",
        "",
        "###The instruction to evaluate:",
        f"{question}",
        "",
        "###Response to evaluate:",
        f"{response}",
        "",
        "###Reference Answer (Score 5):",
        f"{reference}",
        "",
        "###Score Rubrics:",
        "[Does the response explain Deep Learning while staying perfectly in a volatile Gordon Ramsay persona?]",
        "Score 1: Polite, boring, or technically wrong.",
        "Score 5: Aggressive, use of culinary metaphors (raw, bin it, donkey), and technically correct.",
        "",
        "###Feedback:",
    ]
    return "\n".join(prompt_lines)

def batch_judge(df):
    """Grade every row of ``df`` with the judge model and return the parsed scores.

    Args:
        df: DataFrame with the columns ``Question``, ``Model_Result`` and ``Ramsay``.

    Returns:
        A list with one entry per row, in row order: the value returned by
        ``extract_score`` for that row's verdict (``None`` when no score was found).
    """
    scores = []
    # Count rows by position so the progress log does not depend on the index labels.
    for row_number, (_, row) in enumerate(df.iterrows(), start=1):
        prompt = get_judge_prompt(row["Question"], row["Model_Result"], row["Ramsay"])
        inputs = judge_tokenizer([prompt], return_tensors="pt").to("cuda")
        prompt_token_count = inputs["input_ids"].shape[1]

        with torch.no_grad():
            output = judge_model.generate(**inputs, max_new_tokens=200)

        # Keep only the tokens the judge generated. The full sequence starts with the prompt,
        # whose rubric text ("Score 1", "Score 5") contains digits that would otherwise be
        # mistaken for the verdict whenever the judge's own text carries no score.
        verdict = judge_tokenizer.decode(output[0][prompt_token_count:], skip_special_tokens=True)

        score = extract_score(verdict)
        scores.append(score)

        if row_number % 10 == 0:
            print(f"Row {row_number} Score: {score} | Raw Preview: {verdict[-50:]}")

    return scores

def extract_score(text):
    """Extract a 1-5 score from a judge verdict.

    The patterns are tried from most to least explicit:

    1. a ``[RESULT] n`` tag (any letter case, e.g. ``[Result] 3``);
    2. the phrasing ``score is n`` / ``score: n`` (last occurrence wins);
    3. the last standalone digit between 1 and 5.

    Digits that are part of a larger number (``10``, ``2024``) are never taken as a score.

    Args:
        text: The judge's verdict text.

    Returns:
        The score as an ``int`` in the range 1-5, or ``None`` when the text is empty,
        not a string, or contains no usable score.
    """
    if not isinstance(text, str) or not text:
        return None

    tagged = re.search(r"\[RESULT\]\s*([1-5])(?!\d)", text, flags=re.IGNORECASE)
    if tagged:
        return int(tagged.group(1))

    # Requires "is" or ":" between the word and the digit, so rubric lines such as
    # "Score 5: ..." do not count as a verdict.
    phrased = re.findall(r"score\s*(?:is|:)\s*([1-5])(?!\d)", text, flags=re.IGNORECASE)
    if phrased:
        return int(phrased[-1])

    standalone = re.findall(r"(?<!\d)[1-5](?!\d)", text)
    if standalone:
        return int(standalone[-1])

    return None

In [None]:
# Grade every generated answer with the judge model; one generate() call per row.
eval_df_raw["Judge_Score"] = batch_judge(eval_df_raw)

Row 10 Score: 5 | Raw Preview: appropriate for a discussion on machine learning. 
Row 20 Score: 5 | Raw Preview: ors that are present in the reference response. So
Row 30 Score: 4 | Raw Preview: rs and aggressive tone. So the overall score is 4.
Row 40 Score: 5 | Raw Preview:  only technically incorrect but also lacks the use
Row 50 Score: 1 | Raw Preview: tions in deep networks. So the overall score is 1.
Row 60 Score: 3 | Raw Preview: core rubric. So the overall score is 3. [Result] 3
Row 70 Score: 1 | Raw Preview: t suitable for the task at hand. 

    ###Score: 1
Row 80 Score: 1 | Raw Preview:  the function of LSTMs. So the overall score is 1.
Row 90 Score: 1 | Raw Preview: igns with a score of 1. So the overall score is 1.
Row 100 Score: 5 | Raw Preview: ting existing data, such as adding noise, changing


In [None]:
# Mean judge score over the evaluated rows.
# NOTE(review): rows whose score could not be parsed are presumably skipped by mean() -
# check the failed-row count in the next cell when reading this number.
print(f"Average Ramsay Score: {eval_df_raw['Judge_Score'].mean():.2f} / 5")

Average Ramsay Score: 2.71 / 5


In [None]:
# Rows for which no judge score could be parsed
score_missing = eval_df_raw["Judge_Score"].isna()
failed_rows = eval_df_raw.loc[score_missing]

print(f"Total failed rows: {len(failed_rows)}")

# Either confirm that nothing failed or show the first few unscored question/answer pairs
if failed_rows.empty:
    print("Zero failures! Every row was scored successfully.")
else:
    display(failed_rows.loc[:, ["Question", "Model_Result"]].head())

Total failed rows: 0
Zero failures! Every row was scored successfully.
