Raghav Marwaha

E23CSEU1229

Batch: 41

In [None]:
## CELL 1: SETUP, INSTALLATION, AND DRIVE MOUNT (FINAL)

print("--- Installing Libraries for Clinical NLP ---")
# Install core libraries and hugging face utilities
!pip install transformers torch numpy pandas scikit-learn tqdm datasets
!pip install sentencepiece protobuf tiktoken # Ensuring all tokenizer dependencies are met

# --- Drive Mount for Saving Checkpoints ---
from google.colab import drive
import os
drive.mount('/content/drive', force_remount=True)
# Define the project path for saving the best model
SAVE_PATH = '/content/drive/MyDrive/NLP_Clinical_Distress_Model'
os.makedirs(SAVE_PATH, exist_ok=True)
print(f"Checkpoints will be saved to: {SAVE_PATH}")


# --- Imports and Setup ---
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from tqdm.auto import tqdm
import random

# Make runs repeatable by seeding every RNG the notebook relies on.
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Value handed unchanged to each library's seeding function.
            When CUDA is available, all CUDA devices are seeded as well.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
set_seed(42)

print("\n--- Setup Complete. Libraries Loaded. ---")

In [None]:
## CELL 2: DATA LOADING, MAPPING, AND EMBEDDING SETUP (FINAL SCALAR FIX)

from datasets import load_dataset, Dataset
from transformers import DistilBertModel, DistilBertTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import torch

# --- 1. Configuration ---
DATASET_ID = "dair-ai/emotion" # Using the clear emotion dataset
MAX_LENGTH = 128
MODEL_NAME = 'distilbert-base-uncased'

# --- 2. Load Data and Map to Binary ---
print(f"--- Loading High-Contrast Emotion Dataset ---")
raw_datasets = load_dataset(DATASET_ID)
df_train = raw_datasets['train'].to_pandas()

# Six emotion labels are collapsed into two groups
# (1 = distress proxy, 0 = control proxy).
DISTRESS_LABELS = [0, 3, 4] # Sadness, Anger, Fear
CONTROL_LABELS = [1, 2, 5]  # Joy, Love, Surprise

# Keep only rows whose label belongs to one of the two groups, then flag
# membership of the distress group as an integer column.
df_filtered = df_train[df_train['label'].isin(DISTRESS_LABELS + CONTROL_LABELS)].copy()
df_filtered['binary_label'] = df_filtered['label'].isin(DISTRESS_LABELS).astype('int64')

# --- 3. Dynamic Undersampling ---
# Both classes are cut down to the size of the smaller one.
df_distress = df_filtered[df_filtered['binary_label'] == 1]
df_control = df_filtered[df_filtered['binary_label'] == 0]
N_MINORITY = min(len(df_distress), len(df_control))

df_distress_sampled, df_control_sampled = (
    frame.sample(n=N_MINORITY, random_state=42) for frame in (df_distress, df_control)
)
# Concatenate, shuffle the rows, and renumber the index from zero.
df_balanced = pd.concat([df_distress_sampled, df_control_sampled])
df_balanced = df_balanced.sample(frac=1, random_state=42)
df_balanced = df_balanced.reset_index(drop=True)

# --- 4. BERT Tokenizer and Embedding Model ---
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
bert_model = DistilBertModel.from_pretrained(MODEL_NAME)
# --- 5. Custom Dataset Class ---
class MentalHealthDataset(Dataset):
    """Dataset that tokenizes one text on every lookup.

    Args:
        texts: Object exposing `.values.tolist()` (e.g. a pandas Series) with the raw texts.
        labels: Object exposing `.values.tolist()` with one label per text.
        tokenizer: Tokenizer providing `encode_plus`.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Plain Python lists are stored so that integer positions (not the
        # original index labels) are used for lookups in __getitem__.
        self.texts = texts.values.tolist()
        self.labels = labels.values.tolist()

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with 1-D `input_ids`, `attention_mask` and a scalar long `labels` tensor."""
        encoded = self.tokenizer.encode_plus(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields tensors with a leading axis; flatten it
        # away so the DataLoader can stack samples into a batch.
        sample = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# --- 6. Prepare Train/Validation Splits ---
# Stratify on the binary label so that the 1:1 class ratio of df_balanced is
# carried over to both the training and the validation partition; a purely
# random split gives no such guarantee.
X_train, X_val, y_train, y_val = train_test_split(
    df_balanced['text'],
    df_balanced['binary_label'],
    test_size=0.2,
    random_state=42,
    stratify=df_balanced['binary_label'],
)

# The pandas Series are passed as-is; MentalHealthDataset converts them to
# plain Python lists in its constructor.
train_dataset = MentalHealthDataset(X_train, y_train, tokenizer, MAX_LENGTH)
val_dataset = MentalHealthDataset(X_val, y_val, tokenizer, MAX_LENGTH)

BATCH_SIZE = 16
# Only the training data is reshuffled each epoch; validation order is fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


print(f"Total Samples (BALANCED 1:1): {len(df_balanced)}")
print(f"Training Samples: {len(train_dataset)}")
print("Data Preparation complete. **RERUN CELL 3 and CELL 4 NOW.**")

In [None]:
## CELL 3: CNN + BiLSTM HYBRID MODEL DEFINITION

class MentalHealthClassifier(nn.Module):
    """Encoder -> 1D-CNN -> BiLSTM -> MLP sequence classifier.

    The supplied encoder is used as a feature extractor: all of its parameters
    are frozen in the constructor (callers may selectively unfreeze afterwards).

    Args:
        bert_model: Encoder exposing `config.hidden_size` and returning an
            object with `last_hidden_state` when called with `input_ids` and
            `attention_mask`.
        num_classes: Number of output logits.
        lstm_hidden_size: Hidden size of each LSTM direction.
        lstm_layers: Number of stacked LSTM layers.
    """

    def __init__(self, bert_model, num_classes=2, lstm_hidden_size=128, lstm_layers=2):
        super(MentalHealthClassifier, self).__init__()

        # Encoder (frozen): only used for feature extraction by default.
        self.bert = bert_model
        for param in self.bert.parameters():
            param.requires_grad = False

        bert_hidden_size = bert_model.config.hidden_size  # 768 for DistilBERT

        # 1. Convolution over the token axis; padding=1 with kernel_size=3
        #    keeps the sequence length unchanged.
        self.conv1 = nn.Conv1d(
            in_channels=bert_hidden_size,
            out_channels=256,
            kernel_size=3,
            padding=1
        )

        # 2. BiLSTM over the convolved features.
        self.bilstm = nn.LSTM(
            input_size=256,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            bidirectional=True,
            batch_first=True
        )

        # 3. Classification head; its input is 2 * hidden_size because the
        #    forward and backward LSTM states are concatenated.
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(lstm_hidden_size * 2, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, num_classes)
        )

    def forward(self, input_ids, attention_mask):
        """Return logits of shape (batch, num_classes).

        Padding is excluded from the pooled representation: inputs are padded
        to a fixed length, so simply reading the last time step would pool a
        [PAD] position for almost every sample.

        Args:
            input_ids: Token ids, (batch, seq_len).
            attention_mask: 1 for real tokens, 0 for padding, (batch, seq_len).
                Assumes right-padding (all 1s before all 0s) -- TODO confirm if
                a left-padding tokenizer is ever used.
        """
        # 1. Contextual token embeddings, (B, L, H).
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state

        # 2. Zero the padded positions so the convolution sees the same
        #    boundary (zeros) regardless of how much padding was appended.
        token_mask = attention_mask.unsqueeze(-1).to(sequence_output.dtype)
        sequence_output = sequence_output * token_mask

        # 3. Conv1d expects (B, H, L); permute in, convolve, permute back.
        conv_output = F.relu(self.conv1(sequence_output.permute(0, 2, 1)))
        conv_output = conv_output.permute(0, 2, 1)

        # 4. Pack by true length so neither LSTM direction runs over padding.
        #    Lengths must live on the CPU; clamp avoids a zero-length sequence
        #    (which packing rejects) for an all-zero mask row.
        lengths = attention_mask.sum(dim=1).clamp(min=1).cpu().long()
        packed = nn.utils.rnn.pack_padded_sequence(
            conv_output, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _) = self.bilstm(packed)

        # 5. h_n is (num_layers * 2, B, hidden); the last two entries are the
        #    final forward and backward states of the top layer.
        pooled_output = torch.cat((h_n[-2], h_n[-1]), dim=1)

        # 6. Final classification.
        logits = self.classifier(pooled_output)
        return logits

# Build the classifier on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = MentalHealthClassifier(bert_model)
model = model.to(device)

print(f"Model architecture (CNN + BiLSTM) defined successfully on {device}.")

In [None]:
## CELL 4: TRAINING WITH EARLY STOPPING AND BEST MODEL SAVING (FINAL CODE)

# --- Drive Path and Optimization Setup ---
# Relies on SAVE_PATH, train_loader, model, bert_model and tokenizer from
# Cells 1, 2 and 3.

# Where the best checkpoint and its tokenizer are written.
MODEL_WEIGHTS_PATH = os.path.join(SAVE_PATH, 'best_mental_health_model.pt')
TOKENIZER_CONFIG_PATH = os.path.join(SAVE_PATH, 'best_model_tokenizer')

# --- Unfreeze the encoder parameters whose name contains layer.4. or layer.5. ---
for name, param in model.bert.named_parameters():
    if 'layer.4.' in name or 'layer.5.' in name:
        param.requires_grad = True

# --- Optimizer with two parameter groups ---
# Trainable parameters are partitioned in a single pass: those whose name
# contains 'bert.' go to the encoder group, all others to the head group.
head_params, encoder_params = [], []
for param_name, parameter in model.named_parameters():
    if not parameter.requires_grad:
        continue
    if 'bert.' in param_name:
        encoder_params.append(parameter)
    else:
        head_params.append(parameter)

optimizer_grouped_parameters = [
    # 1. New/custom layers (CNN, BiLSTM, classifier).
    {'params': head_params, 'lr': 2e-5, 'weight_decay': 0.01},
    # 2. Unfrozen encoder layers, trained with a smaller learning rate.
    {'params': encoder_params, 'lr': 1e-6, 'weight_decay': 0.01},
]
optimizer = optim.AdamW(optimizer_grouped_parameters)


# --- Training and Evaluation Functions ---

def train_epoch(model, loader, optimizer, device):
    """Run one optimisation pass over `loader` and return the mean batch loss.

    Args:
        model: Module called with `input_ids` and `attention_mask` keyword arguments.
        loader: Iterable of dict batches with 'input_ids', 'attention_mask', 'labels'.
        optimizer: Optimizer stepped once per batch.
        device: Device every batch tensor is moved to.

    Returns:
        Sum of the per-batch cross-entropy losses divided by `len(loader)`.
    """
    model.train()
    loss_fn = nn.CrossEntropyLoss()
    running_loss = 0
    for batch in tqdm(loader, desc="Training"):
        ids, mask, targets = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'labels')
        )

        # Clear gradients left over from the previous step before the forward pass.
        optimizer.zero_grad()
        logits = model(input_ids=ids, attention_mask=mask)

        step_loss = loss_fn(logits, targets)
        step_loss.backward()
        optimizer.step()

        running_loss += step_loss.item()
    return running_loss / len(loader)

def evaluate(model, loader, device):
    """Score `model` on `loader` without tracking gradients.

    Args:
        model: Module called with `input_ids` and `attention_mask` keyword arguments.
        loader: Iterable of dict batches with 'input_ids', 'attention_mask', 'labels'.
        device: Device every batch tensor is moved to.

    Returns:
        Tuple `(accuracy, f1, roc_auc)`. F1 is the binary F1 with
        `zero_division=0`; ROC AUC is computed from the softmax probability of
        class 1 and reported as 0.0 when scikit-learn raises ValueError.
    """
    from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
    model.eval()
    y_pred, y_true, y_score = [], [], []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            logits = model(input_ids=ids, attention_mask=mask)
            class_probs = logits.softmax(dim=1)
            hard_preds = logits.argmax(dim=-1)

            y_pred.extend(hard_preds.cpu().numpy())
            y_true.extend(targets.cpu().numpy())
            # Column 1 is the probability assigned to label 1.
            y_score.extend(class_probs[:, 1].cpu().numpy())

    try:
        roc_auc = roc_auc_score(y_true, y_score)
    except ValueError:
        roc_auc = 0.0

    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average='binary', zero_division=0),
        roc_auc,
    )

# --- TRAINING LOOP WITH EARLY STOPPING AND BEST MODEL SAVING ---
print("--- Starting CNN+BiLSTM Training (Optimized with Early Stopping) ---")
num_epochs = 20 # Max epochs
patience = 3     # Stop if F1 score doesn't improve for 3 epochs
# Start below any achievable F1 so the first epoch always writes a checkpoint.
# With 0.0 and a strict '>' a run whose F1 never left zero would save nothing,
# and a stale checkpoint from an earlier run could be picked up afterwards.
best_f1 = float('-inf')
epochs_no_improve = 0

for epoch in range(num_epochs):

    print(f"\nEpoch {epoch + 1}/{num_epochs}")

    # Train
    train_loss = train_epoch(model, train_loader, optimizer, device)

    # Evaluate
    accuracy, f1, roc_auc = evaluate(model, val_loader, device)

    print(f"Training Loss: {train_loss:.4f}")
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation F1-Score: {f1:.4f}")
    print(f"Validation ROC AUC: {roc_auc:.4f}")

    # Checkpoint and Best Model Saving Logic (based on F1-Score)
    if f1 > best_f1:
        best_f1 = f1
        epochs_no_improve = 0
        print(f"--> NEW BEST F1-SCORE: {best_f1:.4f}. Saving BEST model state to Drive...")

        # Save the best model state using standard PyTorch saving
        torch.save(model.state_dict(), MODEL_WEIGHTS_PATH)
        tokenizer.save_pretrained(TOKENIZER_CONFIG_PATH) # Save tokenizer alongside model
    else:
        epochs_no_improve += 1
        print(f"F1 not improved. Patience left: {patience - epochs_no_improve}")

        # Check right after the counter changes so the stop is reported in the
        # epoch that triggered it, including when that is the final epoch.
        if epochs_no_improve >= patience:
            print(f"\nStopping early: Validation F1 did not improve for {patience} epochs.")
            break

# --- Final Evaluation (Load Best Model and Report) ---
print("\n--- Finalizing Results and Loading Best Model ---")
# 1. Build a genuinely separate instance. Passing the shared `bert_model`
#    object here would make the constructor re-freeze the encoder that `model`
#    is still using, and load_state_dict below would overwrite its weights.
final_model = MentalHealthClassifier(DistilBertModel.from_pretrained(MODEL_NAME)).to(device)

# 2. Load the best saved weights; map_location keeps this working when the
#    checkpoint was written on a different device than the current one.
try:
    best_state = torch.load(MODEL_WEIGHTS_PATH, map_location=device)
    final_model.load_state_dict(best_state)
    print("Best saved model weights loaded successfully for final report.")
except FileNotFoundError:
    print(f"ERROR: Best model weights not found at {MODEL_WEIGHTS_PATH}. Evaluating current model state.")
    final_model = model

final_accuracy, final_f1, final_roc_auc = evaluate(final_model, val_loader, device)

print("\n--- FINAL OPTIMIZED RESULTS (SDG 3 Evaluation) ---")
print(f"Final Accuracy: {final_accuracy:.4f}")
print(f"Final F1-Score (Metric for distress detection): {final_f1:.4f}")
print(f"Final ROC AUC (Metric for model separation quality): {final_roc_auc:.4f}")

In [None]:
## CELL 5: LINGUISTIC MARKER IDENTIFICATION

# Two contrasting example sentences used for the token-level comparison.
SAMPLE_STRESS = (
    "I am so overwhelmed and anxious about exams. "
    "I feel like I'm completely falling apart."
)
SAMPLE_CONTROL = (
    "I enjoyed watching the movie with my friends this evening. "
    "It was a pleasant experience."
)

print("--- Linguistic Marker Identification (Expected Outcome) ---")

# Rank the tokens of one encoded text by the L2 norm of their encoder output vector.
def get_top_active_tokens(model, encoded_text, tokenizer, device):
    """Return up to ten tokens with the largest encoder-output norm.

    Args:
        model: Object with `eval()` and a `bert` attribute that is called with
            `input_ids` and `attention_mask` positionally and returns an
            object exposing `last_hidden_state`.
        encoded_text: Mapping of tensors containing at least 'input_ids' and
            'attention_mask' for a single text (leading axis of size 1).
        tokenizer: Object providing `convert_ids_to_tokens`.
        device: Device the tensors are moved to before the forward pass.

    Returns:
        Token strings in descending norm order, with '##' removed;
        '[CLS]', '[SEP]', '[PAD]' and '[UNK]' are excluded.
    """
    skipped = {'[CLS]', '[SEP]', '[PAD]', '[UNK]'}

    model.eval()
    with torch.no_grad():
        on_device = {}
        for key, value in encoded_text.items():
            on_device[key] = value.to(device)

        # Encoder output only; the CNN/BiLSTM layers are not involved here.
        hidden = model.bert(on_device['input_ids'], on_device['attention_mask']).last_hidden_state

        # One norm per token position.
        norms = np.linalg.norm(hidden.squeeze(0).cpu().numpy(), axis=1)

        tokens = tokenizer.convert_ids_to_tokens(encoded_text['input_ids'].flatten().tolist())

        scored = [(tok, norms[pos]) for pos, tok in enumerate(tokens) if tok not in skipped]
        # Stable sort: tokens with equal norm keep their sentence order.
        scored.sort(key=lambda pair: pair[1], reverse=True)

        return [tok.replace('##', '') for tok, _ in scored[:10]]

# Encode both samples up front. The encodings are not defined by any earlier
# cell, so they are created here directly instead of being discovered through
# a NameError, which would also have hidden a genuinely missing `final_model`.
# MAX_LENGTH matches the padding length used for the training data.
encoded_stress = tokenizer.encode_plus(SAMPLE_STRESS, return_tensors='pt', max_length=MAX_LENGTH, padding='max_length', truncation=True)
encoded_control = tokenizer.encode_plus(SAMPLE_CONTROL, return_tensors='pt', max_length=MAX_LENGTH, padding='max_length', truncation=True)

# Use the final model produced by Cell 4.
active_tokens_stress = get_top_active_tokens(final_model, encoded_stress, tokenizer, device)
active_tokens_control = get_top_active_tokens(final_model, encoded_control, tokenizer, device)


print("\n--- IDENTIFIED LINGUISTIC MARKERS ---")
print(f"Input: '{SAMPLE_STRESS}'")
print(f"Distress Markers (Words the model focused on): {active_tokens_stress}")
# The f-prefix was missing here, so the placeholder was printed literally.
print(f"\nInput: '{SAMPLE_CONTROL}'")
print(f"Control Markers: {active_tokens_control}")