<a href="https://colab.research.google.com/github/tcharos/NLP-Toxicity-Detection/blob/main/AIDL_CS01_NLP_Project_task_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AIDL_B_CS01: Advanced NLP Project

## LLM Tuning with DPO (Gordon Ramsay Alignment)

In [37]:
import os
import sys
import shutil
from google.colab import files

IN_COLAB = 'google.colab' in sys.modules
BASE_DIR = "/content" if IN_COLAB else "."
TOXICITY_PATH = os.path.join(BASE_DIR, "data_sets/toxicity")
SEED = 12345

if IN_COLAB:
    from unsloth import FastLanguageModel
else:
    from unsloth_mlx import FastLanguageModel

if IN_COLAB:
    print("Running in Google Colab. Installing NLP stack...")
    !pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
    !pip install -q -U "trl<=0.24.0" "datasets==4.3.0" transformers accelerate peft sentence-transformers
else:
    print("Running locally. Checking Mac-specific requirements...")
    !{sys.executable} -m pip install -q "tensorflow==2.16.2" "tensorflow-macos==2.16.2" "tf-keras~=2.16"
    !{sys.executable} -m pip install unsloth-mlx
    !{sys.executable} -m pip install -q -U "trl<=0.24.0" "datasets==4.3.0" transformers accelerate peft sentence-transformers

os.environ["KERAS_BACKEND"] = "tensorflow"

from unsloth import PatchDPOTrainer

import torch
import numpy as np
import pandas as pd
import glob
from datasets import Dataset
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, confusion_matrix

from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer, util
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments
)

from peft import LoraConfig, get_peft_model
from trl import DPOConfig, DPOTrainer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

print(f"\nTensorFlow Version: {tf.__version__}")
print("Num GPUs Available (TF): ", len(tf.config.list_physical_devices('GPU')))

HAS_MPS = torch.backends.mps.is_available()
HAS_CUDA = torch.cuda.is_available()

if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("PyTorch Device: Mac GPU (Metal)")
elif torch.cuda.is_available():
    device = torch.device("cuda")
    print("PyTorch Device: Colab GPU (CUDA)")
else:
    device = torch.device("cpu")
    print("PyTorch Device: CPU")

Running in Google Colab. Installing NLP stack...
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-6dryo8qz/unsloth_1b720dd74e1348ba9362f02e945efb37
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-6dryo8qz/unsloth_1b720dd74e1348ba9362f02e945efb37
  Resolved https://github.com/unslothai/unsloth.git to commit ec1757c1a02175851146ff5f6ab2a26c8c863fc8
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone

TensorFlow Version: 2.19.0
Num GPUs Available (TF):  1
PyTorch Device: Colab GPU (CUDA)


### Dataset Preparation

### Functions

In [38]:
def verify_and_get_files(folder, expected_default_name):
    """Checks if a folder has any CSVs; if not, asks for upload."""
    os.makedirs(folder, exist_ok=True)

    # Check if any CSV already exists in the folder
    existing_csvs = glob.glob(os.path.join(folder, "*.csv"))

    if IN_COLAB and not existing_csvs:
        print(f"ðŸ“¥ No CSV found in {folder}. Please upload your file.")
        uploaded = files.upload()
        for filename in uploaded.keys():
            target_path = os.path.join(folder, filename)
            os.rename(filename, target_path)
        # Refresh the list after upload
        existing_csvs = glob.glob(os.path.join(folder, "*.csv"))

    if existing_csvs:
        # Priority: Try to find the exact expected name, otherwise pick the first CSV found
        for f in existing_csvs:
            if os.path.basename(f) == expected_default_name:
                return f
        return existing_csvs[0]
    return None

In [39]:
test_folder_path = './data_sets/Ramsay/test'
val_folder_path = './data_sets/Ramsay/val'

In [40]:
# code i used to contatenate all *.csv to one test.csv - executed only once

# all_csv_files = glob.glob(os.path.join(test_folder_path, "*.csv"))
# valid_dfs = []
# required_cols = ["Question", "Polite", "Ramsay"]

# for f in all_csv_files:
#     try:
#         # Try UTF-8 first, fallback to cp1252 if it fails
#         try:
#             temp_df = pd.read_csv(f, encoding='utf-8')
#         except UnicodeDecodeError:
#             temp_df = pd.read_csv(f, encoding='cp1252')

#         # Check if the required columns exist
#         if all(col in temp_df.columns for col in required_cols):
#             valid_dfs.append(temp_df[required_cols])
#         else:
#             print(f"Skipping {f}: Missing required columns. Found: {temp_df.columns.tolist()}")

#     except Exception as e:
#         print(f"Could not load {f} due to error: {e}")

# # Combine only the valid ones
# if valid_dfs:
#     all_colleagues_data = pd.concat(valid_dfs, ignore_index=True)
#     # Requirement 4: Save to test.csv
#     all_colleagues_data.to_csv("test.csv", index=False)

#     # Take 500 for training
#     train_df = all_colleagues_data.sample(n=min(500, len(all_colleagues_data)), random_state=42)
#     print(f"Successfully loaded {len(valid_dfs)} files.")
#     print(f"Total training rows available: {len(all_colleagues_data)}")
# else:
#     print("No valid CSV files were loaded!")

In [41]:
train_file = verify_and_get_files(test_folder_path, "test.csv")
val_file = verify_and_get_files(val_folder_path, "mscaidl-0077_ramsay_dataset.csv")

In [42]:
if train_file and val_file:
    print(f"Training file located: {train_file}")
    print(f"Validation file located: {val_file}")

    try:
        sample_df = pd.read_csv(val_file, sep=None, engine='python', encoding='utf-8-sig')
        print("\nSuccessfully connected to data. Preview of columns:")
        print(sample_df.columns.tolist())
    except Exception as e:
        print(f"Error reading file: {e}")
else:
    print("Files are missing. If you are not in Colab, please place CSVs in the folders manually.")

Training file located: ./data_sets/Ramsay/test/test (1).csv
Validation file located: ./data_sets/Ramsay/val/mscaidl-0077_ramsay_dataset (1).csv

Successfully connected to data. Preview of columns:
['AIDL_ID', 'Question', 'Polite', 'Ramsay']


In [43]:
def load_any_ramsay_csv(file_path, limit=None, is_train=True):
    # 1. Read raw text and remove double quotes to prevent ParserErrors
    with open(file_path, 'r', encoding='utf-8-sig', errors='ignore') as f:
        content = f.read().replace('"', '')

    df = pd.read_csv(io.StringIO(content), sep=None, engine='python', on_bad_lines='skip')

    df.columns = [c.strip() for c in df.columns]

    required_cols = ["Question", "Polite", "Ramsay"]
    df = df[required_cols]

    if is_train:
        # 500 samples for training
        df = df.sample(n=min(limit, len(df)), random_state=SEED)
    else:
        # first 100 samples for validation
        df = df.head(limit)

    print(f"Successfully loaded {len(df)} rows from {file_path}")

    return Dataset.from_dict({
        "prompt":   df["Question"].astype(str).tolist(),
        "chosen":   df["Polite"].astype(str).tolist(),
        "rejected": df["Ramsay"].astype(str).tolist(),
    }), df

# train dataset
train_dataset, _ = load_any_ramsay_csv(train_file_path, limit=500, is_train=True)

# val dataset
eval_dataset, eval_df_raw = load_any_ramsay_csv(val_file_path, limit=100, is_train=False)

Successfully loaded 500 rows from ./data_sets/Ramsay/test/test (1).csv
Successfully loaded 100 rows from ./data_sets/Ramsay/val/mscaidl-0077_ramsay_dataset (1).csv


### SLM from usloath (not Zephyr)



In [44]:
model_name = "unsloth/Llama-3.2-3B-Instruct"
#model_name = "unsloth/Llama-3.2-1B-Instruct"

max_seq_length = 2048 # Can handle longer contexts if needed
dtype = None # Auto-detect (Float16 or Bfloat16)
load_in_4bit = True # Essential for DPO memory efficiency

# 2. Load the model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_name,
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

# 3. Add LoRA Adapters
# This allows us to train the model efficiently by only updating a small percentage of weights
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Rank
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Optimized for Unsloth
    bias = "none",    # Optimized for Unsloth
    use_gradient_checkpointing = "unsloth", # Reduces VRAM usage
    random_state = 3407,
)

print(f"Model {model_name} loaded successfully with LoRA adapters.")

==((====))==  Unsloth 2026.1.2: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model unsloth/Llama-3.2-3B-Instruct loaded successfully with LoRA adapters.


In [45]:
BASE_CONFIG = {
    'per_device_train_batch_size': 2,
    'gradient_accumulation_steps': 4,
    'warmup_ratio': 0.1,
    'num_train_epochs': 3,
    'learning_rate': 5e-5,
    'logging_steps': 1,
    'optim': "paged_adamw_32bit",
    'weight_decay': 0.01,
    'lr_scheduler_type': "linear",
    'beta': 0.2,
    'max_prompt_length': 512,
    'max_length': 1024,
}

In [46]:
PatchDPOTrainer()

training_args = DPOConfig(
    per_device_train_batch_size = BASE_CONFIG['per_device_train_batch_size'],
    gradient_accumulation_steps = BASE_CONFIG['gradient_accumulation_steps'],
    warmup_ratio                = BASE_CONFIG['warmup_ratio'],
    num_train_epochs            = BASE_CONFIG['num_train_epochs'],
    learning_rate               = BASE_CONFIG['learning_rate'],
    fp16                        = not torch.cuda.is_bf16_supported(),
    bf16                        = torch.cuda.is_bf16_supported(),
    logging_steps               = 1,
    optim                       = BASE_CONFIG['optim'],
    weight_decay                = BASE_CONFIG['weight_decay'],
    lr_scheduler_type           = BASE_CONFIG['lr_scheduler_type'],
    seed                        = SEED,
    output_dir                  = "outputs",
    eval_strategy               = "steps",
    eval_steps                  = 10,
    report_to                   = "none",

    # DPO specific requirements inside the config
    beta                        = BASE_CONFIG['beta'],
    max_prompt_length           = BASE_CONFIG['max_prompt_length'],
    max_length                  = BASE_CONFIG['max_length'],
)

dpo_trainer = DPOTrainer(
    model = model,
    ref_model = None,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer,
    beta = 0.1,
    max_prompt_length = 512,
    max_length = 1024,
)

print("--- Training Started ---")
dpo_trainer.train()

Extracting prompt in train dataset (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]

Applying chat template to train dataset (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]

Tokenizing train dataset (num_proc=16):   0%|          | 0/500 [00:00<?, ? examples/s]

Extracting prompt in eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

Applying chat template to eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenizing eval dataset (num_proc=16):   0%|          | 0/100 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


--- Training Started ---


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 3 | Total steps = 189
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856 of 3,237,063,680 (0.75% trained)


Step,Training Loss,Validation Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
10,0.6805,0.665361,0.012716,-0.044444,0.84,0.05716,-58.996937,-123.460091,-1.130586,-1.254269,0,0,0
20,0.5073,0.327499,0.159262,-0.843759,1.0,1.003021,-58.264217,-127.456673,-1.118496,-1.242883,No Log,No Log,No Log
30,0.1962,0.029074,0.588988,-4.879188,1.0,5.468176,-56.115585,-147.633804,-1.258035,-1.307231,No Log,No Log,No Log
40,0.0915,0.009343,0.766695,-9.631791,1.0,10.398485,-55.227051,-171.39682,-1.52321,-1.48287,No Log,No Log,No Log
50,0.0048,0.007288,0.772805,-12.03709,1.0,12.809896,-55.19651,-183.423325,-1.654245,-1.605023,No Log,No Log,No Log
60,0.0871,0.006699,0.685526,-13.316235,1.0,14.00176,-55.632885,-189.819046,-1.74605,-1.69872,No Log,No Log,No Log
70,0.0001,0.006271,0.628443,-13.924769,1.0,14.553211,-55.918316,-192.861725,-1.79256,-1.74667,No Log,No Log,No Log
80,0.0004,0.006274,0.58497,-14.202863,1.0,14.78783,-56.135674,-194.252167,-1.815038,-1.770891,No Log,No Log,No Log
90,0.0009,0.006126,0.583965,-14.359791,1.0,14.943756,-56.140701,-195.036835,-1.830325,-1.786844,No Log,No Log,No Log
100,0.1736,0.006328,0.576445,-14.475849,1.0,15.052295,-56.178299,-195.617111,-1.841838,-1.799101,No Log,No Log,No Log


TrainOutput(global_step=189, training_loss=0.19474108869660217, metrics={'train_runtime': 477.332, 'train_samples_per_second': 3.142, 'train_steps_per_second': 0.396, 'total_flos': 0.0, 'train_loss': 0.19474108869660217, 'epoch': 3.0})

In [47]:
FastLanguageModel.for_inference(model)

sim_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

questions = eval_df_raw["Question"].tolist()
prompts = [f"Question: {q}\nResponse:" for q in questions]

print(f"Generating responses for {len(questions)} questions...")
inputs = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
decoded_outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Clean up responses
model_results = [text.split("Response:")[-1].strip() for text in decoded_outputs]
eval_df_raw["Model_Result"] = model_results

print("Calculating Cosine Similarity...")
model_embeddings = sim_model.encode(eval_df_raw["Model_Result"].tolist(), convert_to_tensor=True)
polite_embeddings = sim_model.encode(eval_df_raw["Polite"].tolist(), convert_to_tensor=True)

# how close the model got to the "Polite" target
cosine_scores = util.cos_sim(model_embeddings, polite_embeddings)
eval_df_raw["Similarity_Score"] = torch.diag(cosine_scores).cpu().tolist()

eval_df_raw.to_csv("mscaidl-0077_task_5_results.csv", index=False)

avg_sim = eval_df_raw['Similarity_Score'].mean()
print(f"âœ… Process Complete!")
print(f"ðŸ“Š Average Similarity Score: {avg_sim:.4f}")

Generating responses for 100 questions...
Calculating Cosine Similarity...
âœ… Process Complete!
ðŸ“Š Average Similarity Score: 0.5803


In [48]:
model.save_pretrained_merged("dpo_ramsay_model", tokenizer, save_method = "merged_16bit")

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00002.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:00<00:00, 12446.01it/s]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2/2 [00:29<00:00, 14.99s/it]


Unsloth: Merge process complete. Saved to `/content/dpo_ramsay_model`


In [49]:
# this step takes ~15 mins because it saves the full model ~5GB
# we could save only LoRA (the changes we made at the DPO step)

# shutil.make_archive("dpo_ramsay_model", 'zip', "dpo_ramsay_model")

# files.download("dpo_ramsay_model.zip")

In [50]:
# LoRA save only

model.save_pretrained("dpo_ramsay_lora_only")
tokenizer.save_pretrained("dpo_ramsay_lora_only")

shutil.make_archive("dpo_ramsay_lora_only", 'zip', "dpo_ramsay_lora_only")

files.download("dpo_ramsay_lora_only.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>