# Lab Session 6 - Task 1: Spam Classification with RNN and CNN

1. Imports and Setup

In [13]:
"""
Lab Session 6 - Task 1: Spam Classification (Final Version)

This script builds, trains, and compares an RNN (LSTM) and a CNN for SMS spam classification 
using pre-trained word embeddings.

Workflow:
1. Setup: Import libraries and apply environment fixes.
2. Load Embeddings: Reconstruct the vocabulary and load pre-trained embeddings.
3. Load & Preprocess Spam Data: Clean and convert text to padded sequences.
4. Split Data: Train/test split and PyTorch DataLoaders.
5. Define Models: RNN (LSTM) and CNN architectures.
6. Run Cross-Validation: 5-fold training and evaluation.
7. Analyze Results: Present comparison and discussion.
"""

# ================================
# 1. Imports and Setup
# ================================
import os
import re
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fix for the OMP: Error #15 on some systems
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

# Ensure NLTK data is available
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

print("Setup complete.")

# ================================
# 2. Load Pre-trained Embeddings and Vocabulary
# ================================
def bbc_preprocess_for_vocab(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    return word_tokenize(text)

BBC_FOLDER = 'bbc'
VOCAB_SIZE = 10000
EMBEDDING_DIM = 300

print("Reconstructing vocabulary from BBC dataset...")
all_tokens = []
for category in os.listdir(BBC_FOLDER):
    category_path = os.path.join(BBC_FOLDER, category)
    if not os.path.isdir(category_path):
        continue
    for filename in os.listdir(category_path):
        with open(os.path.join(category_path, filename), 'r', encoding='utf-8', errors='ignore') as f:
            all_tokens.extend(bbc_preprocess_for_vocab(f.read()))

word_counts = Counter(all_tokens)
vocab = [word for word, _ in word_counts.most_common(VOCAB_SIZE)]

word_to_idx = {word: i+2 for i, word in enumerate(vocab)}
word_to_idx['<PAD>'] = 0
word_to_idx['<UNK>'] = 1
print(f"Vocabulary of size {len(word_to_idx)} created.")

try:
    numpy_weights = np.load('word2vec_embeddings.npy')
    pretrained_weights = torch.from_numpy(numpy_weights)
    print("'word2vec_embeddings.npy' loaded successfully.")
    final_pretrained_weights = torch.cat([torch.randn(2, EMBEDDING_DIM), pretrained_weights])
except FileNotFoundError:
    print("ERROR: 'word2vec_embeddings.npy' not found. Using random embeddings.")
    final_pretrained_weights = torch.randn(len(word_to_idx), EMBEDDING_DIM)

# ================================
# 3. Load and Preprocess SMS Spam Dataset
# ================================
def sms_preprocess(text):
    """Basic cleaning and tokenization for SMS messages."""
    text = text.lower()
    text = re.sub(r'[^a-z\\s]', '', text)
    return word_tokenize(text)

# Load dataset
# --- FIX IS HERE: The column names have been swapped to match the file format ---
df = pd.read_csv('spam.csv', sep='\\t', header=None, names=['message', 'label'], on_bad_lines='skip', encoding='latin-1')

# Clean up any rows that were not parsed correctly or have missing values
df.dropna(inplace=True)

# The labels are already 0s and 1s, so we just need to ensure they are integers.
df['label'] = df['label'].astype(int)

# Convert text to sequences of integer IDs. Use 1 for <UNK> if word is not in vocab
sequences = [[word_to_idx.get(word, 1) for word in sms_preprocess(msg)] for msg in df['message']]

# Pad sequences to a fixed length for batching
MAX_SEQ_LENGTH = 50
padded_sequences = np.zeros((len(sequences), MAX_SEQ_LENGTH), dtype=np.int64)
for i, seq in enumerate(sequences):
    seq_len = len(seq)
    if seq_len > 0:
        if seq_len < MAX_SEQ_LENGTH:
            padded_sequences[i, -seq_len:] = np.array(seq)
        else:
            padded_sequences[i, :] = np.array(seq[:MAX_SEQ_LENGTH])

X = padded_sequences
y = df['label'].values

print(f"Dataset loaded and processed. Total samples: {len(X)}")

# ================================
# 4. Split Data and Create DataLoaders
# ================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Selected dataset split ratio: Train: {len(X_train)} ({len(X_train)/len(X)*100:.1f}%), "
      f"Test: {len(X_test)} ({len(X_test)/len(X)*100:.1f}%)")

class SpamDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.LongTensor(features)
        self.labels = torch.FloatTensor(labels)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

test_dataset = SpamDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=64)

# ================================
# 5. Model Architectures (RNN and CNN)
# ================================
def create_embedding_layer(weights, non_trainable=True):
    num_embeddings, embedding_dim = weights.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights})
    if non_trainable:
        emb_layer.weight.requires_grad = False
    return emb_layer

class SpamRNN(nn.Module):
    def __init__(self, pretrained_weights, hidden_dim=128, n_layers=2):
        super(SpamRNN, self).__init__()
        self.embedding = create_embedding_layer(pretrained_weights)
        self.lstm = nn.LSTM(EMBEDDING_DIM, hidden_dim, n_layers, batch_first=True, 
                            dropout=0.5, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        lstm_out, _ = self.lstm(x)
        out = self.fc(lstm_out[:, -1, :])
        return self.sigmoid(out)

class SpamCNN(nn.Module):
    def __init__(self, pretrained_weights, n_filters=100, filter_sizes=[2,3,4]):
        super(SpamCNN, self).__init__()
        self.embedding = create_embedding_layer(pretrained_weights)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=EMBEDDING_DIM, out_channels=n_filters, kernel_size=fs) 
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, 1)
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.embedding(x)
        x = x.permute(0, 2, 1)
        conved = [torch.relu(conv(x)) for conv in self.convs]
        pooled = [torch.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        out = self.fc(cat)
        return self.sigmoid(out)

# ================================
# 6. Training and Evaluation with 5-Fold Cross-Validation
# ================================
def run_cross_validation(model_class, model_name, pretrained_weights, X_train, y_train, test_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"\n----- Evaluating {model_name} using device: {device} -----")
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_results = []
    
    for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(X_train, y_train), total=5, desc=f"CV Folds for {model_name}")):
        model = model_class(pretrained_weights).to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        criterion = nn.BCELoss()
        
        X_train_fold, y_train_fold = X_train[train_idx], y_train[train_idx]
        train_dataset_fold = SpamDataset(X_train_fold, y_train_fold)
        train_loader_fold = DataLoader(train_dataset_fold, batch_size=64, shuffle=True)
        
        model.train()
        for epoch in range(5):
            for inputs, labels in train_loader_fold:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model(inputs).squeeze()
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
        
        model.eval()
        all_preds, all_labels = [], []
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs).squeeze()
                preds = (outputs > 0.5).int().cpu().numpy()
                all_labels.extend(labels.cpu().numpy().astype(int))
                all_preds.extend(preds)
        
        metrics = {
            'accuracy': accuracy_score(all_labels, all_preds),
            'precision': precision_score(all_labels, all_preds, zero_division=0),
            'recall': recall_score(all_labels, all_preds, zero_division=0),
            'f1_score': f1_score(all_labels, all_preds, zero_division=0)
        }
        fold_results.append(metrics)
        print(f"Fold {fold+1} Test Metrics: F1-Score={metrics['f1_score']:.4f}")
    
    return pd.DataFrame(fold_results)

# Run CV for both models
rnn_results_df = run_cross_validation(SpamRNN, "RNN (LSTM)", final_pretrained_weights, X_train, y_train, test_loader)
cnn_results_df = run_cross_validation(SpamCNN, "CNN", final_pretrained_weights, X_train, y_train, test_loader)

# ================================
# 7. Final Results and Discussion
# ================================
rnn_summary = rnn_results_df.agg(['mean', 'std'])
cnn_summary = cnn_results_df.agg(['mean', 'std'])

comparison_df = pd.concat([rnn_summary, cnn_summary], keys=['RNN (LSTM)', 'CNN'])

print("\n--- Final Model Performance Comparison (Mean ± Std Dev over 5 Folds) ---")
print(comparison_df)

best_f1_model = comparison_df.xs('mean', level=1)['f1_score'].idxmax()
print(f"\n**Best model by F1-score:** {best_f1_model}")


Setup complete.
Reconstructing vocabulary from BBC dataset...
Vocabulary of size 10002 created.
'word2vec_embeddings.npy' loaded successfully.


  df = pd.read_csv('spam.csv', sep='\\t', header=None, names=['message', 'label'], on_bad_lines='skip', encoding='latin-1')


Dataset loaded and processed. Total samples: 1547
Selected dataset split ratio: Train: 1237 (80.0%), Test: 310 (20.0%)

----- Evaluating RNN (LSTM) using device: cpu -----


CV Folds for RNN (LSTM):  20%|██        | 1/5 [00:11<00:46, 11.59s/it]

Fold 1 Test Metrics: F1-Score=0.0000


CV Folds for RNN (LSTM):  40%|████      | 2/5 [00:20<00:29,  9.80s/it]

Fold 2 Test Metrics: F1-Score=0.0000


CV Folds for RNN (LSTM):  60%|██████    | 3/5 [00:28<00:18,  9.29s/it]

Fold 3 Test Metrics: F1-Score=0.0000


CV Folds for RNN (LSTM):  80%|████████  | 4/5 [00:37<00:08,  9.00s/it]

Fold 4 Test Metrics: F1-Score=0.0000


CV Folds for RNN (LSTM): 100%|██████████| 5/5 [00:45<00:00,  9.18s/it]


Fold 5 Test Metrics: F1-Score=0.0000

----- Evaluating CNN using device: cpu -----


CV Folds for CNN:  20%|██        | 1/5 [00:02<00:08,  2.23s/it]

Fold 1 Test Metrics: F1-Score=0.0000


CV Folds for CNN:  40%|████      | 2/5 [00:04<00:06,  2.22s/it]

Fold 2 Test Metrics: F1-Score=0.0000


CV Folds for CNN:  60%|██████    | 3/5 [00:06<00:04,  2.21s/it]

Fold 3 Test Metrics: F1-Score=0.0000


CV Folds for CNN:  80%|████████  | 4/5 [00:08<00:02,  2.22s/it]

Fold 4 Test Metrics: F1-Score=0.6536


CV Folds for CNN: 100%|██████████| 5/5 [00:11<00:00,  2.22s/it]

Fold 5 Test Metrics: F1-Score=0.6522

--- Final Model Performance Comparison (Mean ± Std Dev over 5 Folds) ---
                 accuracy  precision    recall  f1_score
RNN (LSTM) mean  0.515484   0.000000  0.000000  0.000000
           std   0.001443   0.000000  0.000000  0.000000
CNN        mean  0.503226   0.193862  0.400000  0.261154
           std   0.016290   0.265456  0.547723  0.357600

**Best model by F1-score:** CNN



