<a href="https://colab.research.google.com/github/tcharos/NLP-Toxicity-Detection/blob/main/AIDL_CS01_NLP_Project_task_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AIDL_B_CS01: Advanced NLP Project

## 5. LLM Tuning with DPO (Gordon Ramsay Alignment)

In [1]:
from unsloth import FastLanguageModel, PatchDPOTrainer
import os
import io
import sys
import torch
import glob
import pandas as pd
import numpy as np
from datetime import datetime
from google.colab import files

IN_COLAB = 'google.colab' in sys.modules
SEED = 12345

if IN_COLAB:
    print("Running in Google Colab. Installing Task 5 NLP stack...")
    !pip install --no-cache-dir bitsandbytes
    !pip install --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
    !pip install -q --no-deps trl peft accelerate bitsandbytes sentence-transformers

# from unsloth import FastLanguageModel, PatchDPOTrainer
from datasets import load_dataset, Dataset
from trl import DPOConfig, DPOTrainer
from sentence_transformers import SentenceTransformer, util
from transformers import TrainingArguments, AutoTokenizer

# Device Check
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("PyTorch Device: Colab GPU (CUDA)")
else:
    print("CRITICAL: GPU NOT DETECTED. Change runtime type to T4 GPU.")

print("\nEnvironment Clean. You can now load your Llama-3 model.")

ðŸ¦¥ Unsloth: Will patch your computer to enable 2x faster free finetuning.
ðŸ¦¥ Unsloth Zoo will now patch everything to make training faster!
Running in Google Colab. Installing Task 5 NLP stack...
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-wr8nrrlb/unsloth_5767de569b154afa8aeed86678013d3d
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-wr8nrrlb/unsloth_5767de569b154afa8aeed86678013d3d
  Resolved https://github.com/unslothai/unsloth.git to commit 18d020e56350f87786ce5e21f940da67e475ee77
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
PyTorch Device: Colab GPU (CUDA)

Environment Clean. You can now load your Llama-3 model.


### Dataset Preparation

In [None]:
# code i used to contatenate all *.csv to one test.csv - executed only once locally

# all_csv_files = glob.glob(os.path.join(test_folder_path, "*.csv"))
# valid_dfs = []
# required_cols = ["Question", "Polite", "Ramsay"]

# for f in all_csv_files:
#     try:
#         # Try UTF-8 first, fallback to cp1252 if it fails
#         try:
#             temp_df = pd.read_csv(f, encoding='utf-8')
#         except UnicodeDecodeError:
#             temp_df = pd.read_csv(f, encoding='cp1252')

#         # Check if the required columns exist
#         if all(col in temp_df.columns for col in required_cols):
#             valid_dfs.append(temp_df[required_cols])
#         else:
#             print(f"Skipping {f}: Missing required columns. Found: {temp_df.columns.tolist()}")

#     except Exception as e:
#         print(f"Could not load {f} due to error: {e}")

# # Combine only the valid ones
# if valid_dfs:
#     all_colleagues_data = pd.concat(valid_dfs, ignore_index=True)
#     # Requirement 4: Save to test.csv
#     all_colleagues_data.to_csv("test.csv", index=False)

#     # Take 500 for training
#     train_df = all_colleagues_data.sample(n=min(500, len(all_colleagues_data)), random_state=42)
#     print(f"Successfully loaded {len(valid_dfs)} files.")
#     print(f"Total training rows available: {len(all_colleagues_data)}")
# else:
#     print("No valid CSV files were loaded!")

In [10]:
import io

### Functions

In [6]:
def verify_and_get_files(folder, expected_default_name):
    os.makedirs(folder, exist_ok=True)

    # Check if any CSV already exists in the folder
    existing_csvs = glob.glob(os.path.join(folder, "*.csv"))

    if IN_COLAB and not existing_csvs:
        print(f"No CSV found in {folder}. Upload your file.")
        uploaded = files.upload()
        for filename in uploaded.keys():
            target_path = os.path.join(folder, filename)
            os.rename(filename, target_path)
        existing_csvs = glob.glob(os.path.join(folder, "*.csv"))

    if existing_csvs:
        for f in existing_csvs:
            if os.path.basename(f) == expected_default_name:
                return f
        return existing_csvs[0]
    return None

In [3]:
test_folder_path = './data_sets/Ramsay/test'
val_folder_path = './data_sets/Ramsay/val'

In [7]:
train_file = verify_and_get_files(test_folder_path, "test.csv")
val_file = verify_and_get_files(val_folder_path, "mscaidl-0077_ramsay_dataset.csv")

No CSV found in ./data_sets/Ramsay/test. Upload your file.


Saving test.csv to test.csv
No CSV found in ./data_sets/Ramsay/val. Upload your file.


Saving mscaidl-0077_ramsay_dataset.csv to mscaidl-0077_ramsay_dataset.csv


In [8]:
if train_file and val_file:
    print(f"Training file located: {train_file}")
    print(f"Validation file located: {val_file}")

    try:
        sample_df = pd.read_csv(val_file, sep=None, engine='python', encoding='utf-8-sig')
        print("\nSuccessfully loaded data. Preview of columns:")
        print(sample_df.columns.tolist())
    except Exception as e:
        print(f"Error reading file: {e}")
else:
    print("Files are missing. If you are not in Colab, please place CSVs in the folders manually.")

Training file located: ./data_sets/Ramsay/test/test.csv
Validation file located: ./data_sets/Ramsay/val/mscaidl-0077_ramsay_dataset.csv

Successfully loaded data. Preview of columns:
['AIDL_ID', 'Question', 'Polite', 'Ramsay']


In [11]:
def load_any_ramsay_csv(file_path, limit=None, is_train=True):
    with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        content = f.read().replace('"', '')

    df = pd.read_csv(io.StringIO(content), sep=None, engine='python', on_bad_lines='skip')

    df.columns = [c.strip() for c in df.columns]

    required_cols = ["Question", "Polite", "Ramsay"]
    df = df[required_cols]

    if is_train:
        # 500 samples for training
        df = df.sample(n=min(limit, len(df)), random_state=SEED)
    else:
        # first 100 samples for validation
        df = df.head(limit)

    print(f"Successfully loaded {len(df)} rows from {file_path}")

    return Dataset.from_dict({
        "prompt":   df["Question"].astype(str).tolist(),
        "chosen":   df["Ramsay"].astype(str).tolist(),
        "rejected": df["Polite"].astype(str).tolist(),
    }), df

# train dataset
train_dataset, _ = load_any_ramsay_csv(train_file, limit=500, is_train=True)

# val dataset
eval_dataset, eval_df_raw = load_any_ramsay_csv(val_file, limit=100, is_train=False)

Successfully loaded 500 rows from ./data_sets/Ramsay/test/test.csv
Successfully loaded 100 rows from ./data_sets/Ramsay/val/mscaidl-0077_ramsay_dataset.csv


### SLM from usloath (not Zephyr)



In [12]:
model_name = "unsloth/Llama-3.2-3B-Instruct"

max_seq_length = 2048
dtype = None # Auto detect
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# LoRA Adapters
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
)

print(f"Model {model_name} loaded successfully with LoRA.")

==((====))==  Unsloth 2026.2.1: Fast Llama patching. Transformers: 4.57.6.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.034 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth 2026.2.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


Model unsloth/Llama-3.2-3B-Instruct loaded successfully with LoRA.


In [13]:
BASE_CONFIG = {
    'per_device_train_batch_size': 2,
    'gradient_accumulation_steps': 4,
    'warmup_ratio': 0.1,
    'num_train_epochs': 3,
    'learning_rate': 5e-5,
    'logging_steps': 1,
    'optim': "paged_adamw_32bit",
    'weight_decay': 0.01,
    'lr_scheduler_type': "linear",
    'beta': 0.2,
    'max_prompt_length': 512,
    'max_length': 1024,
}

In [14]:
PatchDPOTrainer()

training_args = DPOConfig(
    per_device_train_batch_size = BASE_CONFIG['per_device_train_batch_size'],
    gradient_accumulation_steps = BASE_CONFIG['gradient_accumulation_steps'],
    warmup_ratio                = BASE_CONFIG['warmup_ratio'],
    num_train_epochs            = BASE_CONFIG['num_train_epochs'],
    learning_rate               = BASE_CONFIG['learning_rate'],
    fp16                        = not torch.cuda.is_bf16_supported(),
    bf16                        = torch.cuda.is_bf16_supported(),
    logging_steps               = 1,
    optim                       = BASE_CONFIG['optim'],
    weight_decay                = BASE_CONFIG['weight_decay'],
    lr_scheduler_type           = BASE_CONFIG['lr_scheduler_type'],
    seed                        = SEED,
    output_dir                  = "outputs",
    eval_strategy               = "steps",
    eval_steps                  = 10,
    report_to                   = "none",

    # DPO specific
    beta                        = BASE_CONFIG['beta'],
    max_prompt_length           = BASE_CONFIG['max_prompt_length'],
    max_length                  = BASE_CONFIG['max_length'],
)

dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer,
    beta = 0.1,
    max_prompt_length = 512,
    max_length = 1024,
)

print("--- DPO Patch Complete ---")

Extracting prompt in train dataset (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]

Extracting prompt in eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

--- DPO Patch Complete ---


In [15]:
print("--- Training Started ---")
dpo_trainer.train()

--- Training Started ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 3 | Total steps = 189
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss,Validation Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
10,0.6808,0.663401,0.043238,-0.018203,0.81,0.06144,-123.021683,-59.151539,-1.25763,-1.135182,0,0,0
20,0.5197,0.354917,0.688109,-0.205267,1.0,0.893376,-119.797325,-60.086864,-1.263037,-1.151491,No Log,No Log,No Log
30,0.2192,0.058805,2.123313,-1.361916,1.0,3.485229,-112.621307,-65.87011,-1.184075,-1.146012,No Log,No Log,No Log
40,0.1492,0.016776,2.716105,-3.11851,1.0,5.834616,-109.657356,-74.653069,-1.037339,-1.134233,No Log,No Log,No Log
50,0.02,0.008155,2.993555,-4.460798,1.0,7.454354,-108.270096,-81.36451,-0.858029,-1.047554,No Log,No Log,No Log
60,0.0885,0.006385,3.160683,-4.872452,1.0,8.033134,-107.434456,-83.422783,-0.755164,-0.971502,No Log,No Log,No Log
70,0.0022,0.0055,3.269092,-5.067053,1.0,8.336146,-106.89241,-84.39579,-0.71185,-0.94139,No Log,No Log,No Log
80,0.0069,0.005104,3.316792,-5.226403,1.0,8.543195,-106.653915,-85.192535,-0.691902,-0.930443,No Log,No Log,No Log
90,0.0015,0.004608,3.350618,-5.366563,1.0,8.717181,-106.484764,-85.893341,-0.668566,-0.913889,No Log,No Log,No Log
100,0.176,0.004337,3.35779,-5.482944,1.0,8.840733,-106.448929,-86.475243,-0.657163,-0.907969,No Log,No Log,No Log


TrainOutput(global_step=189, training_loss=0.20388893353279738, metrics={'train_runtime': 464.4071, 'train_samples_per_second': 3.23, 'train_steps_per_second': 0.407, 'total_flos': 0.0, 'train_loss': 0.20388893353279738, 'epoch': 3.0})

In [16]:
# Set to inference mode
FastLanguageModel.for_inference(model)

# Load similarity model
sim_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

questions = eval_df_raw["Question"].tolist()
prompts = [f"Question: {q}\nResponse:" for q in questions]

print(f"Generating responses for {len(questions)} questions...")
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")

outputs = model.generate(
    **inputs,
    max_new_tokens=64,
    temperature=0.7,
    use_cache=True)

decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

model_results = [text.split("Response:")[-1].strip().replace("\n"," ") for text in decoded_outputs]
eval_df_raw["Model_Result"] = model_results

print("Start Cosine Similarity Calculation (Model vs. Original Ramsay)")

model_embeddings = sim_model.encode(eval_df_raw["Model_Result"].tolist(), convert_to_tensor=True)

# Encoding the original Ramsay-style target answers
target_ramsay_embeddings = sim_model.encode(eval_df_raw["Ramsay"].tolist(), convert_to_tensor=True)

cosine_scores = util.cos_sim(model_embeddings, target_ramsay_embeddings)
eval_df_raw["Similarity_Score_Ramsay"] = torch.diag(cosine_scores).cpu().tolist()

avg_sim_ramsay = eval_df_raw['Similarity_Score_Ramsay'].mean()

print(f"Process Complete.")
print(f"Average Similarity to Original Ramsay: {avg_sim_ramsay:.4f}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Generating responses for 100 questions...
Start Cosine Similarity Calculation (Model vs. Original Ramsay)
Process Complete.
Average Similarity to Original Ramsay: 0.3638


In [18]:
print("\n" + "="*50)
print("PREVIEW OF FIRST 5 RAMSAY RESPONSES")
print("="*50)

for i in range(min(5, len(eval_df_raw))):
    print(f"\n[ENTRY {i+1}]")
    print(f"STUDENT QUESTION: {eval_df_raw['Question'].iloc[i]}")
    print(f"RAMSAY RESPONSE:  {eval_df_raw['Model_Result'].iloc[i]}")
    print(f"SIMILARITY SCORE: {eval_df_raw['Similarity_Score_Ramsay'].iloc[i]:.4f}")
    print("-" * 50)


PREVIEW OF FIRST 5 RAMSAY RESPONSES

[ENTRY 1]
STUDENT QUESTION: What is the purpose of a Padding token?
RAMSAY RESPONSE:  The purpose of a padding token is to prevent someone from guessing your password. It is like a guardian angel that watches your back while you're sleeping. It's a safety net that prevents your password from being used by someone else. You can't just leave your house without locking the door, you know. Similarly, you
SIMILARITY SCORE: 0.1535
--------------------------------------------------

[ENTRY 2]
STUDENT QUESTION: What is the purpose of Dropout?
RAMSAY RESPONSE:  To let you know that you messed up and are not good enough!
SIMILARITY SCORE: 0.0162
--------------------------------------------------

[ENTRY 3]
STUDENT QUESTION: Why normalize input data?
RAMSAY RESPONSE:  Normalizing input data is like preparing your clothes for a fashion show! You want to look your best, not like you just rolled out of bed! It means making sure your numbers are in the right orde

In [19]:
timestamp = datetime.now().strftime("%d%m%Y_%H%M")

In [20]:
eval_df_raw.to_csv(f"mscaidl-0077_task_5_{timestamp}.csv", index=False)

In [None]:
# this step takes ~15 mins because it saves the full model ~5GB
# we could save only the changes we made at the DPO step

# model.save_pretrained_merged("dpo_ramsay_model", tokenizer, save_method = "merged_16bit")
# shutil.make_archive("dpo_ramsay_model", 'zip', "dpo_ramsay_model")

# files.download("dpo_ramsay_model.zip")

In [None]:
# LoRA save only

# base_filename = f"dpo_ramsay_lora_{timestamp}"
# model.save_pretrained(base_filename)
# tokenizer.save_pretrained(base_filename)

# shutil.make_archive(base_filename, 'zip', base_filename)

# files.download(f"{base_filename}.zip")