# AIDL_B_CS01: Advanced NLP Project

**Tasks Covered:**
1. LSTM Toxicity Detection (Custom & GloVe Embeddings)
2. BERT-alike STS-b Semantic Similarity (PyTorch/HF)
3. Custom RAG Mechanism
5. LLM Tuning with DPO (Gordon Ramsay Alignment)

*(Task 4, the dataset creation, is a prerequisite and not coded here.)*

In [None]:
## 💻 Setup and Imports

# Install necessary libraries (uncomment if needed)
# !pip install tensorflow keras torch transformers datasets scikit-learn pandas numpy scipy trl peft sentence-transformers

# --- Core Libraries ---
import pandas as pd
import numpy as np
import os

# --- Metrics ---
from scipy.stats import pearsonr, spearmanr # Task 2 correlation metrics
from sklearn.metrics import f1_score, confusion_matrix

# --- TensorFlow/Keras (Task 1) ---
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- PyTorch/Transformers/Datasets (Task 2 & 5) ---
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM, Trainer, TrainingArguments

# --- DPO/PEFT (Task 5) ---
from peft import LoraConfig, get_peft_model
from trl import DPOTrainer

# --- RAG (Task 3) ---
# Specific imports depend on your chosen vector store and embedding method
# from sentence_transformers import SentenceTransformer 
# from langchain.text_splitter import RecursiveCharacterTextSplitter # Example chunking tool


## 1. Toxicity Detection with LSTM (Custom & GloVe Embeddings)

In [None]:
### 1.1. Data Loading and Preparation

# **ACTION REQUIRED: Update file paths**
TRAIN_PATH = "data/train.csv"
VALID_PATH = "data/valid.csv"
TEST_PATH = "data/test.csv"

# Load the three splits; each CSV must provide 'Utterance' and 'Toxicity' columns.
df_train = pd.read_csv(TRAIN_PATH)
df_valid = pd.read_csv(VALID_PATH)
df_test = pd.read_csv(TEST_PATH)

# Separate features (text) and labels (Toxicity)
X_train, y_train = df_train['Utterance'], df_train['Toxicity']
X_valid, y_valid = df_valid['Utterance'], df_valid['Toxicity']
X_test, y_test = df_test['Utterance'], df_test['Toxicity']

# Configuration
MAX_WORDS = 20000    # Upper bound on the token ids the tokenizer will emit
MAX_LEN = 100        # Every sequence is padded/truncated to this many tokens
EMBEDDING_DIM = 100  # Width of each word vector


def _clean_texts(texts):
    """Return *texts* (a pandas Series) as strings, mapping missing values to ""."""
    # Without fillna, astype(str) would turn NaN into the literal word "nan".
    return texts.fillna("").astype(str)


def _encode_texts(texts, text_tokenizer, max_len):
    """Convert a Series of raw texts into a post-padded, post-truncated id matrix."""
    # The tokenizer is passed in explicitly so that a later rebinding of the
    # module-level `tokenizer` name cannot change what this helper uses.
    sequences = text_tokenizer.texts_to_sequences(_clean_texts(texts))
    return pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')


# Tokenizer: fitted on the training split only, so no validation/test vocabulary leaks in.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(_clean_texts(X_train))

# `word_index` always holds the full vocabulary, but with num_words=MAX_WORDS
# texts_to_sequences never emits an id >= MAX_WORDS. Capping here keeps the
# Embedding table from allocating rows that can never be looked up.
vocab_size = min(MAX_WORDS, len(tokenizer.word_index) + 1)

# Convert texts to sequences and pad them
X_train_seq = _encode_texts(X_train, tokenizer, MAX_LEN)
X_valid_seq = _encode_texts(X_valid, tokenizer, MAX_LEN)
X_test_seq = _encode_texts(X_test, tokenizer, MAX_LEN)


In [None]:
### 1.2. Model 1: LSTM with Custom Learned Embeddings

def create_custom_lstm_model(vocab_size, embedding_dim, max_len):
    """Build and compile a binary LSTM classifier with trainable embeddings.

    Args:
        vocab_size: Number of rows in the embedding table (input dimension).
        embedding_dim: Size of each learned word vector.
        max_len: Length of the padded input sequences.

    Returns:
        A compiled Keras ``Sequential`` model with a single sigmoid output.
    """
    network = Sequential()
    # Embeddings are randomly initialised and learned together with the LSTM.
    network.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
    network.add(SpatialDropout1D(0.2))
    network.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    # One sigmoid unit, paired with binary cross-entropy below.
    network.add(Dense(1, activation='sigmoid'))
    network.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

# Instantiate the custom-embedding model from the Task 1 configuration and
# print its layer/parameter overview.
model_custom = create_custom_lstm_model(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    max_len=MAX_LEN,
)
model_custom.summary()

# **ACTION REQUIRED: Training**
# history_custom = model_custom.fit(
#     X_train_seq, y_train,
#     epochs=10,
#     batch_size=32,
#     validation_data=(X_valid_seq, y_valid)
# )

# **ACTION REQUIRED: Evaluation**
# y_pred_custom_proba = model_custom.predict(X_test_seq)
# y_pred_custom = (y_pred_custom_proba > 0.5).astype(int)

# f1_custom = f1_score(y_test, y_pred_custom)
# cm_custom = confusion_matrix(y_test, y_pred_custom)
# print(f"\n--- Custom Embeddings Results ---")
# print(f"F1 Score: {f1_custom:.4f}")
# print("Confusion Matrix:\n", cm_custom)


In [None]:
### 1.3. Model 2: LSTM with GloVe Pre-trained Embeddings

# **ACTION REQUIRED: GloVe Loading and Embedding Matrix Creation**
# 1. Load the GloVe file (e.g., 'glove.6B.100d.txt')
# 2. Parse the vectors into an index map.
# 3. Create the embedding_matrix for your vocabulary.

# Placeholder for embedding_matrix (replace with actual loading logic)
# embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
# for word, i in tokenizer.word_index.items():
#     if i < vocab_size:
#         # embedding_vector = embeddings_index.get(word)
#         # if embedding_vector is not None:
#         #     embedding_matrix[i] = embedding_vector
#         pass # Temporary placeholder

# NOTE: Build embedding_matrix (steps above) and verify it is correct before uncommenting the model definition below.

# def create_glove_lstm_model(embedding_matrix, max_len):
#     """Defines an LSTM model using pre-trained GloVe weights."""
#     model = Sequential([
#         Embedding(
#             input_dim=embedding_matrix.shape[0],
#             output_dim=embedding_matrix.shape[1],
#             weights=[embedding_matrix],
#             input_length=max_len,
#             trainable=False # Crucial for pre-trained embeddings
#         ),
#         SpatialDropout1D(0.2),
#         LSTM(100, dropout=0.2, recurrent_dropout=0.2),
#         Dense(1, activation='sigmoid')
#     ])
#     model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
#     return model

# **ACTION REQUIRED: Uncomment and use the function**
# model_glove = create_glove_lstm_model(embedding_matrix, MAX_LEN)
# model_glove.summary()

# **ACTION REQUIRED: Training**
# history_glove = model_glove.fit(...)

# **ACTION REQUIRED: Evaluation**
# y_pred_glove_proba = model_glove.predict(X_test_seq)
# y_pred_glove = (y_pred_glove_proba > 0.5).astype(int)

# f1_glove = f1_score(y_test, y_pred_glove)
# cm_glove = confusion_matrix(y_test, y_pred_glove)
# print(f"\n--- GloVe Embeddings Results ---")
# print(f"F1 Score: {f1_glove:.4f}")
# print("Confusion Matrix:\n", cm_glove)


## 2. Semantic Similarity (STS-b) with BERT-alike Models (PyTorch/HF)

In [None]:
### 2.1. Data Loading, Preprocessing, and Metrics

MODEL_NAME_1 = "bert-base-uncased"
MODEL_NAME_2 = "roberta-base"

# Load Dataset
dataset = load_dataset("glue", "stsb")

# Tokenizer for MODEL_NAME_1 only. RoBERTa uses a different vocabulary, so the
# data must be re-tokenized with AutoTokenizer.from_pretrained(MODEL_NAME_2)
# before training model 2.
# NOTE: this rebinds `tokenizer`, replacing the Keras tokenizer from section 1.1.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_1)


def preprocess_function(examples):
    """Tokenize a batch of STS-b sentence pairs.

    Args:
        examples: Batch mapping with 'sentence1' and 'sentence2' lists.

    Returns:
        The tokenizer output for the pairs, truncated and padded to a fixed length.
    """
    # NOTE(review): padding="max_length" without an explicit max_length pads to
    # the tokenizer's model maximum; consider passing max_length to save memory.
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding="max_length")


tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Prepare labels (must be float for regression)
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
# With batched=True the callback receives a list of labels per batch, so each
# element is converted individually (float() on the whole list raises TypeError).
tokenized_datasets = tokenized_datasets.map(
    lambda batch: {'labels': [float(score) for score in batch['labels']]},
    batched=True,
)

# Select and format splits: drop the raw text and index columns that the model does not consume.
_UNUSED_COLUMNS = ['sentence1', 'sentence2', 'idx']
train_dataset = tokenized_datasets["train"].remove_columns(_UNUSED_COLUMNS)
valid_dataset = tokenized_datasets["validation"].remove_columns(_UNUSED_COLUMNS)
test_dataset = tokenized_datasets["test"].remove_columns(_UNUSED_COLUMNS)

# Regression Metric Function for Trainer
def compute_metrics(eval_pred):
    """Compute Pearson and Spearman correlations for regression predictions.

    Args:
        eval_pred: Pair ``(predictions, labels)`` as supplied by the Trainer.

    Returns:
        Dict with the keys ``"pearson"`` and ``"spearman"``.
    """
    raw_scores, gold_scores = eval_pred
    # Drop singleton axes so that (N, 1) model outputs line up with (N,) labels.
    flat_scores = raw_scores.squeeze()

    # Both scipy functions return (statistic, p-value); only the statistic is reported.
    pearson_r = pearsonr(flat_scores, gold_scores)[0]
    spearman_rho = spearmanr(flat_scores, gold_scores)[0]

    # **ACTION REQUIRED: Confusion Matrix**
    # A confusion matrix needs discrete classes, so the continuous scores (0-5)
    # would first have to be quantized, e.g. with a score_to_class(score) helper:
    # cm = confusion_matrix(labels_quantized, predictions_quantized)

    return {"pearson": pearson_r, "spearman": spearman_rho}


In [None]:
### 2.2. Model Training and Evaluation (Model 1: BERT)

# Sequence-classification head with a single output, used here to predict the
# continuous STS-b similarity score.
model_1 = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME_1,
    num_labels=1,
)

training_args_1 = TrainingArguments(
    output_dir="./results_bert",
    # Optimisation
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # 'pearson' is one of the keys returned by compute_metrics.
    metric_for_best_model='pearson',
)

trainer_1 = Trainer(
    model=model_1,
    args=training_args_1,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# **ACTION REQUIRED: Training**
# trainer_1.train()

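# **ACTION REQUIRED: Evaluation**
# NOTE: The GLUE STS-B test split ships with hidden labels (all -1.0), so correlations computed on it
# are not meaningful. Report results on `valid_dataset`, or hold out part of the training split instead.
# results_1 = trainer_1.evaluate(test_dataset)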
# print(f"\n--- Model 1 ({MODEL_NAME_1}) Test Results ---")
# print(results_1)


In [None]:
### 2.3. Model Training and Evaluation (Model 2: RoBERTa)

# **ACTION REQUIRED: Repeat the process for RoBERTa**
# model_2 = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME_2, num_labels=1)
# ... define training_args_2, trainer_2 ...
# trainer_2.train()
# results_2 = trainer_2.evaluate(test_dataset)
# print(f"\n--- Model 2 ({MODEL_NAME_2}) Test Results ---")
# print(results_2)


## 3. Custom RAG (Retrieval-Augmented Generation) Mechanism



In [None]:
### 3.1. Knowledge Base Preparation

# **ACTION REQUIRED: Define or Load Knowledge Base**
# In-memory knowledge base: one plain-text string per document.
KB_DOCS = [
    (
        "Doc 1: Transformer architecture uses multi-head attention to weigh "
        "the importance of different words in the input sequence. "
        "This allows parallelization."
    ),
    (
        "Doc 2: Recurrent Neural Networks (RNNs) suffer from the vanishing "
        "gradient problem, which LSTMs and GRUs were designed to solve "
        "through gating mechanisms."
    ),
    # Add your documents here
]

# **ACTION REQUIRED: Chunking and Embedding**
# 1. Chunk documents (e.g., using a text splitter).
# 2. Embed the chunks (e.g., using 'sentence-transformers/all-MiniLM-L6-v2').
# 3. Store chunks and embeddings in a Vector Store (e.g., simple list, Faiss, or Chroma).

# Example Placeholder for a simple in-memory store (requires implementation of actual embedding)
# class SimpleVectorStore:
#     def __init__(self, docs, embed_model):
#         self.chunks = docs
#         self.embeddings = [embed_model.encode(c) for c in docs]
#         self.embed_model = embed_model
#     def search(self, query, k=3):
#         query_embedding = self.embed_model.encode(query)
#         # Implement similarity calculation (e.g., cosine similarity) and return top k chunks
#         return self.chunks[:k] # Placeholder return
# embed_model = SentenceTransformer('all-MiniLM-L6-v2')
# vector_store = SimpleVectorStore(KB_DOCS, embed_model)


In [None]:
### 3.2. Retrieval and Generation Logic

# **ACTION REQUIRED: Define your LLM for generation**
# llm_pipeline = pipeline('text-generation', model='your-small-llm-model')

def custom_rag_mechanism(query, top_k=3):
    """Retrieve context for *query* and produce an answer grounded in it.

    Retrieval and generation are still placeholders: the chunks come from a
    fixed list and the answer is a constant string until a vector store and an
    LLM pipeline are wired in (see the commented-out calls).

    Args:
        query: The user question.
        top_k: Maximum number of chunks to retrieve. Values below zero are
            treated as zero.

    Returns:
        A ``(response, retrieved_chunks)`` tuple: the answer text and the list
        of chunks that were placed in the prompt.
    """
    # --- Step 1: Retrieval (Requires `vector_store` from above) ---
    # retrieved_chunks = vector_store.search(query, k=top_k)
    placeholder_chunks = ["<retrieved_chunk_1>", "<retrieved_chunk_2>"]  # TEMPORARY
    # Honour top_k even in placeholder mode so callers see the final contract.
    retrieved_chunks = placeholder_chunks[:max(top_k, 0)]

    # --- Step 2: Context Formatting ---
    context = "\n\n".join(retrieved_chunks)

    # --- Step 3: Generation (LLM Call) ---
    # Built line by line so the function's own indentation does not leak into
    # the text sent to the model.
    prompt = (
        "You are an AI assistant. Use the following CONTEXT to answer the USER QUESTION.\n"
        "If the CONTEXT does not contain the answer, state that you cannot answer "
        "based on the provided information.\n"
        "\n"
        "CONTEXT:\n"
        f"{context}\n"
        "\n"
        f"USER QUESTION: {query}\n"
        "\n"
        "ANSWER:"
    )

    # `prompt` is unused until the pipeline call below is enabled.
    # response = llm_pipeline(prompt, max_new_tokens=100)[0]['generated_text']
    response = "RAG Answer based on the retrieved context!"
    return response, retrieved_chunks

# Example Test
# Smoke test: run one question through the RAG pipeline and show the result.
user_query = "What were LSTMs designed to solve in traditional RNNs?"
answer, context_used = custom_rag_mechanism(user_query)
print(
    f"Query: {user_query}",
    f"Answer: {answer}",
    f"Context Used: {context_used}",
    sep="\n",
)


## 5. LLM Tuning with DPO for Gordon Ramsay Persona



In [None]:
### 5.1. Dataset Preparation

# **ACTION REQUIRED: Update file path**
# Location of the Task 4 preference CSV. The commented-out loading code below
# expects it to contain 'Question', 'Ramsay' and 'Polite' columns.
PREFERENCE_DATA_PATH = "path/to/task4_ramsay_preference_dataset.csv" 

# Load the dataset created in Task 4
# DPO_DF = pd.read_csv(PREFERENCE_DATA_PATH)

# DPO requires (prompt, chosen_response, rejected_response)
# dpo_dataset_df = DPO_DF.rename(columns={
#     'Question': 'prompt',
#     'Ramsay': 'chosen', # This is the preferred answer
#     'Polite': 'rejected' # This is the non-preferred answer
# })

# Convert to Hugging Face Dataset format (requires `from datasets import Dataset`)
# dpo_hf_dataset = Dataset.from_pandas(dpo_dataset_df[['prompt', 'chosen', 'rejected']])
# dpo_hf_dataset = dpo_hf_dataset.train_test_split(test_size=0.1) # Split for validation


In [None]:
### 5.2. Model and DPO Trainer Setup

# **ACTION REQUIRED: Choose your base model**
# Model identifier consumed by the commented-out model/tokenizer loading in step 1 below.
BASE_MODEL = "facebook/opt-125m" # Use a small model for development, or a larger one if resources allow

# 1. Load the base model and tokenizer
# model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, torch_dtype=torch.bfloat16) # Use a suitable dtype
# tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
# tokenizer.pad_token = tokenizer.eos_token # Ensure pad token is set

# 2. Setup PEFT/LoRA configuration
# peft_config = LoraConfig(
#     r=16,
#     lora_alpha=16,
#     target_modules=["q_proj", "v_proj"], # Check model documentation for correct layers
#     lora_dropout=0.05,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# 3. DPO Training Arguments
# training_args_dpo = TrainingArguments(
#     output_dir="./dpo_results_ramsay",
#     num_train_epochs=1,
#     per_device_train_batch_size=4,
#     gradient_accumulation_steps=4,
#     logging_steps=10,
#     learning_rate=5e-5,
#     remove_unused_columns=False,
#     save_strategy="epoch",
#     fp16=True, # Mixed precision; if the model is loaded in bfloat16 (as in step 1 above), use bf16=True instead
#     report_to="none"
# )

# 4. Initialize DPOTrainer
# dpo_trainer = DPOTrainer(
#     model=model,
#     ref_model=None, # Set to None for implicit reference model loading
#     args=training_args_dpo,
#     beta=0.1, 
#     train_dataset=dpo_hf_dataset['train'],
#     eval_dataset=dpo_hf_dataset['test'],
#     tokenizer=tokenizer,
#     peft_config=peft_config,
# )

# 5. Train Placeholder
# print("Starting DPO Training...")
# dpo_trainer.train()

# 6. Save the final model (LoRA weights)
# dpo_trainer.save_model("ramsay_dpo_adapter")

# **ACTION REQUIRED: Inference Test**
# Test the fine-tuned model with a new question to verify the Ramsay persona.
# from peft import PeftModel
# ft_model = PeftModel.from_pretrained(model, "ramsay_dpo_adapter")
# ft_model.eval()
# print("\n--- DPO Fine-Tuned Model Test ---")
# test_prompt = "Why is batch normalization useful?"
# ... generate response ...
