<a href="https://colab.research.google.com/github/tcharos/NLP-Toxicity-Detection/blob/main/AIDL_CS01_NLP_Project_task_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AIDL_B_CS01: Advanced NLP Project

## LLM Tuning with DPO (Gordon Ramsay Alignment)

In [1]:
import os
import io
import sys
from datetime import datetime
import shutil
from google.colab import files

IN_COLAB = 'google.colab' in sys.modules
BASE_DIR = "/content" if IN_COLAB else "."
TOXICITY_PATH = os.path.join(BASE_DIR, "data_sets/toxicity")
SEED = 12345

if IN_COLAB:
    print("Running in Google Colab. Installing NLP stack...")
    !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
    !pip install -q -U "trl<=0.24.0" "datasets==4.3.0" transformers accelerate peft sentence-transformers
else:
    print("Running locally. Checking Mac-specific requirements...")
    !{sys.executable} -m pip install -q "tensorflow==2.16.2" "tensorflow-macos==2.16.2" "tf-keras~=2.16"
    !{sys.executable} -m pip install unsloth-mlx
    !{sys.executable} -m pip install -q -U "trl<=0.24.0" "datasets==4.3.0" transformers accelerate peft sentence-transformers

os.environ["KERAS_BACKEND"] = "tensorflow"

if IN_COLAB:
    from unsloth import FastLanguageModel
else:
    from unsloth_mlx import FastLanguageModel

from unsloth import PatchDPOTrainer

import torch
import numpy as np
import pandas as pd
import glob
from datasets import Dataset
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, confusion_matrix

from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer, util
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments
)

from peft import LoraConfig, get_peft_model
from trl import DPOConfig, DPOTrainer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Report the TensorFlow build and how many GPUs TensorFlow can see.
print(f"\nTensorFlow Version: {tf.__version__}")
print("Num GPUs Available (TF): ", len(tf.config.list_physical_devices('GPU')))

# Probe each PyTorch accelerator backend once; the flags drive the choice below.
HAS_MPS = torch.backends.mps.is_available()
HAS_CUDA = torch.cuda.is_available()

# Preference order: Apple Metal first, then CUDA, otherwise CPU.
if HAS_MPS:
    _device_kind, _device_label = "mps", "Mac GPU (Metal)"
elif HAS_CUDA:
    _device_kind, _device_label = "cuda", "Colab GPU (CUDA)"
else:
    _device_kind, _device_label = "cpu", "CPU"

device = torch.device(_device_kind)
print(f"PyTorch Device: {_device_label}")

Running in Google Colab. Installing NLP stack...
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-a2h4q2eu/unsloth_e4084da205e4414ca64ee858c380f60e
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-a2h4q2eu/unsloth_e4084da205e4414ca64ee858c380f60e
  Resolved https://github.com/unslothai/unsloth.git to commit b96a04c17bc6bcb5522eafb17adc2b104be38f99
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2026.1.2 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2026.1.2-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unslo

### Dataset Preparation

In [None]:
# Code I used to concatenate all *.csv files into one test.csv - executed only once, locally

# all_csv_files = glob.glob(os.path.join(test_folder_path, "*.csv"))
# valid_dfs = []
# required_cols = ["Question", "Polite", "Ramsay"]

# for f in all_csv_files:
#     try:
#         # Try UTF-8 first, fallback to cp1252 if it fails
#         try:
#             temp_df = pd.read_csv(f, encoding='utf-8')
#         except UnicodeDecodeError:
#             temp_df = pd.read_csv(f, encoding='cp1252')

#         # Check if the required columns exist
#         if all(col in temp_df.columns for col in required_cols):
#             valid_dfs.append(temp_df[required_cols])
#         else:
#             print(f"Skipping {f}: Missing required columns. Found: {temp_df.columns.tolist()}")

#     except Exception as e:
#         print(f"Could not load {f} due to error: {e}")

# # Combine only the valid ones
# if valid_dfs:
#     all_colleagues_data = pd.concat(valid_dfs, ignore_index=True)
#     # Requirement 4: Save to test.csv
#     all_colleagues_data.to_csv("test.csv", index=False)

#     # Take 500 for training
#     train_df = all_colleagues_data.sample(n=min(500, len(all_colleagues_data)), random_state=42)
#     print(f"Successfully loaded {len(valid_dfs)} files.")
#     print(f"Total training rows available: {len(all_colleagues_data)}")
# else:
#     print("No valid CSV files were loaded!")

### Functions

In [2]:
def verify_and_get_files(folder, expected_default_name):
    """Return the path of a CSV inside ``folder``, prompting for an upload on Colab.

    The folder is created if it does not exist. When running on Colab and the
    folder holds no CSV, the user is asked to upload files, which are then
    moved from the working directory into ``folder``.

    Args:
        folder: Directory that should contain the CSV file.
        expected_default_name: Preferred file name; if a CSV with exactly this
            base name is present it is returned.

    Returns:
        The path of the CSV named ``expected_default_name`` if present,
        otherwise the alphabetically first CSV in ``folder``, or ``None`` when
        the folder contains no CSV at all.
    """
    os.makedirs(folder, exist_ok=True)

    def _list_csvs():
        # glob returns entries in arbitrary order; sorting makes the fallback
        # choice below deterministic across runs and machines.
        return sorted(glob.glob(os.path.join(folder, "*.csv")))

    existing_csvs = _list_csvs()

    if IN_COLAB and not existing_csvs:
        print(f"No CSV found in {folder}. Upload your file.")
        uploaded = files.upload()
        for filename in uploaded:
            # shutil.move also works when source and target are on different
            # filesystems, where os.rename would raise OSError.
            shutil.move(filename, os.path.join(folder, filename))
        existing_csvs = _list_csvs()

    if not existing_csvs:
        return None

    for csv_path in existing_csvs:
        if os.path.basename(csv_path) == expected_default_name:
            return csv_path
    return existing_csvs[0]

In [3]:
# Both Ramsay data folders live under one root, relative to the working directory.
_ramsay_root = './data_sets/Ramsay'
test_folder_path = f"{_ramsay_root}/test"
val_folder_path = f"{_ramsay_root}/val"

In [4]:
# Resolve the training CSV (kept in the "test" folder) and then the validation
# CSV, in that order; each lookup prefers the given default file name.
train_file, val_file = (
    verify_and_get_files(folder, default_name)
    for folder, default_name in (
        (test_folder_path, "test.csv"),
        (val_folder_path, "mscaidl-0077_ramsay_dataset.csv"),
    )
)

No CSV found in ./data_sets/Ramsay/test. Upload your file.


Saving test.csv to test.csv
No CSV found in ./data_sets/Ramsay/val. Upload your file.


Saving mscaidl-0077_ramsay_dataset.csv to mscaidl-0077_ramsay_dataset.csv


In [5]:
# Sanity check: report which files were resolved and preview the validation
# file's column names. Read errors are reported rather than raised.
if not (train_file and val_file):
    print("Files are missing. If you are not in Colab, please place CSVs in the folders manually.")
else:
    print(f"Training file located: {train_file}")
    print(f"Validation file located: {val_file}")

    try:
        # sep=None makes the python engine sniff the delimiter;
        # utf-8-sig strips a leading BOM if there is one.
        sample_df = pd.read_csv(
            val_file,
            sep=None,
            engine='python',
            encoding='utf-8-sig',
        )
        print("\nSuccessfully loaded data. Preview of columns:")
        print(sample_df.columns.tolist())
    except Exception as e:
        print(f"Error reading file: {e}")

Training file located: ./data_sets/Ramsay/test/test.csv
Validation file located: ./data_sets/Ramsay/val/mscaidl-0077_ramsay_dataset.csv

Successfully loaded data. Preview of columns:
['AIDL_ID', 'Question', 'Polite', 'Ramsay']


In [6]:
def load_any_ramsay_csv(file_path, limit=None, is_train=True):
    """Load a Ramsay preference CSV and convert it into a DPO-style dataset.

    The file is read as text with every double quote removed, then parsed with
    an auto-detected delimiter; malformed lines are skipped. Rows that lack a
    value in any required column are dropped, because ``astype(str)`` would
    otherwise turn them into the literal text ``"nan"``.

    Args:
        file_path: Path of the CSV file to load.
        limit: Maximum number of rows to keep. ``None`` keeps every row.
        is_train: If True, rows are randomly sampled (seeded with ``SEED``);
            if False, the first ``limit`` rows are taken in file order.

    Returns:
        A tuple ``(dataset, df)`` where ``dataset`` is built by
        ``Dataset.from_dict`` with the keys ``prompt`` (Question), ``chosen``
        (Ramsay) and ``rejected`` (Polite), and ``df`` is the selected
        DataFrame restricted to the three required columns.

    Raises:
        KeyError: If any of the required columns is missing from the file.
    """
    with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        content = f.read().replace('"', '')

    df = pd.read_csv(io.StringIO(content), sep=None, engine='python', on_bad_lines='skip')

    # str() guards against non-string headers (e.g. numeric column names).
    df.columns = [str(c).strip() for c in df.columns]

    required_cols = ["Question", "Polite", "Ramsay"]
    missing_cols = [c for c in required_cols if c not in df.columns]
    if missing_cols:
        raise KeyError(
            f"{file_path} is missing required column(s) {missing_cols}; "
            f"found {df.columns.tolist()}"
        )

    # Incomplete rows would become "nan" strings in the prompt/response text.
    df = df[required_cols].dropna()

    if is_train:
        # Random sample for training; limit=None means "use every row"
        # (previously min(None, len(df)) raised TypeError).
        n_rows = len(df) if limit is None else min(limit, len(df))
        df = df.sample(n=n_rows, random_state=SEED)
    elif limit is not None:
        # First `limit` rows, in file order, for validation.
        df = df.head(limit)

    print(f"Successfully loaded {len(df)} rows from {file_path}")

    return Dataset.from_dict({
        "prompt":   df["Question"].astype(str).tolist(),
        "chosen":   df["Ramsay"].astype(str).tolist(),
        "rejected": df["Polite"].astype(str).tolist(),
    }), df

# Training split: random sample of up to 500 rows; the raw frame is not needed.
train_dataset, _ = load_any_ramsay_csv(
    file_path=train_file,
    limit=500,
    is_train=True,
)

# Validation split: first 100 rows; keep the raw frame for later evaluation.
eval_dataset, eval_df_raw = load_any_ramsay_csv(
    file_path=val_file,
    limit=100,
    is_train=False,
)

Successfully loaded 500 rows from ./data_sets/Ramsay/test/test.csv
Successfully loaded 100 rows from ./data_sets/Ramsay/val/mscaidl-0077_ramsay_dataset.csv


### SLM from Unsloth (not Zephyr)



In [7]:
# Base checkpoint and loading options handed to Unsloth.
model_name = "unsloth/Llama-3.2-3B-Instruct"
max_seq_length = 2048
dtype = None  # None = auto detect
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Attach LoRA adapters to the attention and MLP projection layers.
_lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=_lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

print(f"Model {model_name} loaded successfully with LoRA.")

==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth 2026.1.2 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Model unsloth/Llama-3.2-3B-Instruct loaded successfully with LoRA.


In [8]:
# Tunable hyper-parameters for the DPO run, collected in one place.
BASE_CONFIG = dict(
    # batching
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # schedule
    warmup_ratio=0.1,
    num_train_epochs=3,
    learning_rate=5e-5,
    logging_steps=1,
    # optimiser
    optim="paged_adamw_32bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    # DPO specific
    beta=0.2,
    max_prompt_length=512,
    max_length=1024,
)

In [9]:
# Apply Unsloth's DPO patch before the trainer is constructed.
PatchDPOTrainer()

# Use bf16 when the GPU supports it, otherwise fall back to fp16.
_bf16_supported = torch.cuda.is_bf16_supported()

# Every tunable value comes from BASE_CONFIG so there is a single source of truth.
training_args = DPOConfig(
    per_device_train_batch_size = BASE_CONFIG['per_device_train_batch_size'],
    gradient_accumulation_steps = BASE_CONFIG['gradient_accumulation_steps'],
    warmup_ratio                = BASE_CONFIG['warmup_ratio'],
    num_train_epochs            = BASE_CONFIG['num_train_epochs'],
    learning_rate               = BASE_CONFIG['learning_rate'],
    fp16                        = not _bf16_supported,
    bf16                        = _bf16_supported,
    logging_steps               = BASE_CONFIG['logging_steps'],
    optim                       = BASE_CONFIG['optim'],
    weight_decay                = BASE_CONFIG['weight_decay'],
    lr_scheduler_type           = BASE_CONFIG['lr_scheduler_type'],
    seed                        = SEED,
    output_dir                  = "outputs",
    eval_strategy               = "steps",
    eval_steps                  = 10,
    report_to                   = "none",

    # DPO specific
    beta                        = BASE_CONFIG['beta'],
    max_prompt_length           = BASE_CONFIG['max_prompt_length'],
    max_length                  = BASE_CONFIG['max_length'],
)

# beta / max_prompt_length / max_length are intentionally NOT repeated here:
# the earlier trainer-level beta=0.1 contradicted BASE_CONFIG['beta'] (0.2),
# leaving the effective value ambiguous. DPOConfig above is authoritative.
dpo_trainer = DPOTrainer(
    model = model,
    # No explicit reference model is passed.
    # NOTE(review): with a PEFT model TRL is expected to derive the reference
    # policy from the adapter-disabled base weights - confirm against TRL docs.
    ref_model = None,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer,
)

Extracting prompt in train dataset (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]

Extracting prompt in eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

In [10]:
# Kick off DPO fine-tuning; the returned training summary is shown as the cell output.
print("--- Training Started ---")
training_result = dpo_trainer.train()
training_result

The model is already on multiple devices. Skipping the move to device specified in `args`.


--- Training Started ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 3 | Total steps = 189
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss,Validation Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
10,0.6808,0.666371,0.041184,-0.013956,0.82,0.05514,-123.031952,-59.130302,-1.256973,-1.135245,0,0,0
20,0.5219,0.35288,0.692604,-0.209062,1.0,0.901666,-119.774857,-60.105839,-1.262044,-1.151422,No Log,No Log,No Log
30,0.219,0.058953,2.12082,-1.362025,1.0,3.482845,-112.633766,-65.870644,-1.182477,-1.145006,No Log,No Log,No Log
40,0.1482,0.016789,2.72055,-3.132638,1.0,5.853189,-109.635117,-74.723717,-1.036376,-1.132801,No Log,No Log,No Log
50,0.0198,0.00817,2.989648,-4.474951,1.0,7.464599,-108.289642,-81.43528,-0.857738,-1.047522,No Log,No Log,No Log
60,0.0885,0.006264,3.173145,-4.880559,1.0,8.053705,-107.372147,-83.463318,-0.755407,-0.972171,No Log,No Log,No Log
70,0.0023,0.005565,3.278077,-5.06465,1.0,8.342728,-106.847481,-84.383766,-0.712004,-0.942329,No Log,No Log,No Log
80,0.0069,0.005072,3.325003,-5.241664,1.0,8.566668,-106.612862,-85.268845,-0.69183,-0.930693,No Log,No Log,No Log
90,0.0016,0.004609,3.348341,-5.382463,1.0,8.730803,-106.49617,-85.972839,-0.669923,-0.915786,No Log,No Log,No Log
100,0.1761,0.004364,3.351089,-5.49896,1.0,8.850049,-106.482422,-86.555313,-0.657476,-0.909873,No Log,No Log,No Log


TrainOutput(global_step=189, training_loss=0.20375898983103172, metrics={'train_runtime': 463.6999, 'train_samples_per_second': 3.235, 'train_steps_per_second': 0.408, 'total_flos': 0.0, 'train_loss': 0.20375898983103172, 'epoch': 3.0})

In [15]:
# Switch the model to Unsloth's inference mode.
FastLanguageModel.for_inference(model)

# Sentence-embedding model used to score the generated answers.
sim_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

questions = eval_df_raw["Question"].tolist()
prompts = [f"Question: {q}\nResponse:" for q in questions]

print(f"Generating responses for {len(questions)} questions...")

# Batched generation with a decoder-only model needs LEFT padding, otherwise
# pad tokens sit between the prompt and the generated continuation. Set it
# explicitly and restore the previous value so the tokenizer saved later in
# the notebook is not affected.
_previous_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
try:
    # Follow the model's own device instead of a hard-coded "cuda".
    inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
finally:
    tokenizer.padding_side = _previous_padding_side

outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    temperature=0.7,
    use_cache=True)

# Decode only the newly generated tokens. Splitting the full text on
# "Response:" would truncate any answer that itself contains that marker.
_prompt_token_count = inputs["input_ids"].shape[1]
decoded_outputs = tokenizer.batch_decode(
    outputs[:, _prompt_token_count:], skip_special_tokens=True
)

# Clean up responses
model_results = [text.strip() for text in decoded_outputs]
eval_df_raw["Model_Result"] = model_results

print("Start Cosine Similarity Calculation")
model_embeddings = sim_model.encode(eval_df_raw["Model_Result"].tolist(), convert_to_tensor=True)
polite_embeddings = sim_model.encode(eval_df_raw["Polite"].tolist(), convert_to_tensor=True)

# Row-wise cosine similarity between each generated answer and the POLITE
# reference answer for the same question (diagonal of the pairwise matrix).
# NOTE(review): this scores against the "Polite" column; if closeness to the
# Ramsay-style target is what should be reported, compare against
# eval_df_raw["Ramsay"] instead - confirm the intended metric.
cosine_scores = util.cos_sim(model_embeddings, polite_embeddings)
eval_df_raw["Similarity_Score"] = torch.diag(cosine_scores).cpu().tolist()

avg_sim = eval_df_raw['Similarity_Score'].mean()

print("Process Complete.")
print(f"Average Similarity Score: {avg_sim:.4f}")

Generating responses for 100 questions...
Start Cosine Similarity Calculation
Process Complete.
Average Similarity Score: 0.4424


In [16]:
# Print a short, human-readable preview of the first few generated answers.
_banner = "=" * 50
print("\n" + _banner)
print("PREVIEW OF FIRST 5 RAMSAY RESPONSES")
print(_banner)

_preview = eval_df_raw.head(5)
_preview_rows = zip(
    _preview['Question'],
    _preview['Model_Result'],
    _preview['Similarity_Score'],
)
for entry_no, (question, answer, score) in enumerate(_preview_rows, start=1):
    print(f"\n[ENTRY {entry_no}]")
    print(f"STUDENT QUESTION: {question}")
    print(f"RAMSAY RESPONSE:  {answer}")
    print(f"SIMILARITY SCORE: {score:.4f}")
    print("-" * 50)


PREVIEW OF FIRST 5 RAMSAY RESPONSES

[ENTRY 1]
STUDENT QUESTION: What is the purpose of a Padding token?
RAMSAY RESPONSE:  It is a symbol of your presence, a reminder that you are here. Just as a pillow provides comfort to your head, a padding token provides comfort to your soul. It's a reminder that you're not alone, that there's someone who cares about you, even if it's just a stuffed animal. It's
SIMILARITY SCORE: 0.2336
--------------------------------------------------

[ENTRY 2]
STUDENT QUESTION: What is the purpose of Dropout?
RAMSAY RESPONSE:  You can't have a perfect student, but you can have a perfect dropouts!
SIMILARITY SCORE: 0.4170
--------------------------------------------------

[ENTRY 3]
STUDENT QUESTION: Why normalize input data?
RAMSAY RESPONSE:  You don‚Äôt. You‚Äôre trying to optimize the system‚Äôs output! It‚Äôs like trying to optimize your diet because you‚Äôre tired of your body not being able to do it. You don‚Äôt try to optimize your diet, you optimize you

In [17]:
# Write the evaluation frame (with the added result columns) to disk, without the index.
_results_csv_path = "mscaidl-0077_task_5_results.csv"
eval_df_raw.to_csv(_results_csv_path, index=False)

In [20]:
# Minute-resolution timestamp (DDMMYYYY_HHMM) used to name the saved adapter.
timestamp = format(datetime.now(), "%d%m%Y_%H%M")
base_filename = "dpo_ramsay_lora_" + timestamp

In [None]:
# this step takes ~15 mins because it saves the full model ~5GB
# we could save only the changes we made at the DPO step

# model.save_pretrained_merged("dpo_ramsay_model", tokenizer, save_method = "merged_16bit")
# shutil.make_archive("dpo_ramsay_model", 'zip', "dpo_ramsay_model")

# files.download("dpo_ramsay_model.zip")

In [21]:
# LoRA save only: write the model and the tokenizer into the same directory,
# then zip that directory next to it.
for _artifact in (model, tokenizer):
    _artifact.save_pretrained(base_filename)

shutil.make_archive(base_name=base_filename, format='zip', root_dir=base_filename)

# files.download(f"{base_filename}.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>