# Spam Classifier 

This notebook demonstrates building a spam classifier using the SpamAssassin dataset. It reuses the train and test sets previously split and guides you through data loading, preprocessing, feature extraction, model training, and evaluation.

In [1]:
# Import required libraries
import os
import re
import string
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

## Load Pre-Split Train and Test Sets
This section uses the train and test folders created previously for each category (easy_ham, hard_ham, spam).

In [2]:
# Define paths to train and test folders
# Order matters: the loading cell further down assigns labels by list position
# (first two folders = ham, remaining = spam), so keep the ham categories first.
category_roots = [
    '20021010_easy_ham/easy_ham',
    '20030228_hard_ham/hard_ham',
    '20030228_spam/spam',
]
train_folders = [f'{root}/train' for root in category_roots]
test_folders = [f'{root}/test' for root in category_roots]

# List a few files from each folder as a check
# os.listdir returns names in arbitrary order; sorting makes the sample reproducible.
for folder in train_folders + test_folders:
    print(f"Sample files in {folder}:")
    print(sorted(os.listdir(folder))[:5])

Sample files in 20021010_easy_ham/easy_ham/train:
['0003.acfc5ad94bbd27118a0d8685d18c89dd', '0004.e8d5727378ddde5c3be181df593f1712', '0005.8c3b9e9c0f3f183ddaf7592a11b99957', '0006.ee8b0dba12856155222be180ba122058', '0007.c75188382f64b090022fa3b095b020b0']
Sample files in 20030228_hard_ham/hard_ham/train:
['00001.7c7d6921e671bbe18ebb5f893cd9bb35', '00003.268fd170a3fc73bee2739d8204856a53', '00004.68819fc91d34c82433074d7bd3127dcc', '00005.34bcaad58ad5f598f5d6af8cfa0c0465', '00006.3409dec8ca4fcf2d6e0582554473b5c9']
Sample files in 20030228_spam/spam/train:
['00001.7848dde101aa985090474a91ec93fcf0', '00003.2ee33bc6eacdb11f38d052c44819ba6c', '00004.eac8de8d759b7e74154f142194282724', '00005.57696a39d7d84318ce497886896bf90d', '00006.5ab5620d3d7c6c0db76234556a16f6c1']
Sample files in 20021010_easy_ham/easy_ham/test:
['0001.ea7e79d3153e7469e7a9c3e0af6a357e', '0002.b3120c4bcbf3101e661161ee7efcb8bf', '0011.07b11073b53634cff892a7988289a72e', '0015.a9ff8d7550759f6ab62cc200bdf156e7', '0018.ba70ecbeea

In [3]:
# Preprocessing and vectorization functions
# Feature-engineering switches read by preprocess() and by the vectorization code.
REMOVE_HEADERS = True    # drop everything up to the first blank line
TO_LOWER = True          # lowercase the text
REMOVE_PUNCT = True      # delete every character in string.punctuation
REPLACE_URLS = True      # replace http(s) URLs with the token 'URL'
REPLACE_NUMBERS = True   # replace standalone digit runs with the token 'NUMBER'
BINARY_FEATURES = True  # True: presence/absence, False: word counts

# Built once instead of on every preprocess() call.
_HEADER_BODY_SEPARATOR = re.compile(r'\r?\n\r?\n')  # blank line, LF or CRLF endings
_URL_PATTERN = re.compile(r'http[s]?://\S+', re.IGNORECASE)
_NUMBER_PATTERN = re.compile(r'\b\d+\b')
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Turn the raw text of one email into a list of word tokens.

    Each step is controlled by the module-level flag of the same name:

    1. REMOVE_HEADERS: keep only what follows the first blank line. If the
       text contains no blank line it is kept whole.
    2. TO_LOWER: lowercase the text.
    3. REPLACE_URLS: replace each ``http://`` / ``https://`` URL with ``URL``.
    4. REPLACE_NUMBERS: replace each standalone run of digits with ``NUMBER``.
    5. REMOVE_PUNCT: delete all ``string.punctuation`` characters.

    Args:
        text: Full text of an email (headers and body) as a ``str``.

    Returns:
        The whitespace-separated tokens of the processed text, in order.
    """
    if REMOVE_HEADERS:
        # Split on the first blank line; accept CRLF as well as LF line endings
        # so text that was not newline-normalised still loses its headers.
        parts = _HEADER_BODY_SEPARATOR.split(text, maxsplit=1)
        text = parts[1] if len(parts) > 1 else text
    if TO_LOWER:
        # Done before the placeholders are inserted, so 'URL' and 'NUMBER'
        # stay uppercase and cannot collide with ordinary lowercased words.
        text = text.lower()
    if REPLACE_URLS:
        # Runs before punctuation stripping, while '://' is still present.
        text = _URL_PATTERN.sub('URL', text)
    if REPLACE_NUMBERS:
        # NOTE: digit groups joined by punctuation (e.g. "3.14") become
        # "NUMBER.NUMBER" here and "NUMBERNUMBER" once punctuation is removed.
        text = _NUMBER_PATTERN.sub('NUMBER', text)
    if REMOVE_PUNCT:
        text = text.translate(_PUNCT_TABLE)
    return text.split()

def get_files(folder):
    """Return the paths of the regular files directly inside ``folder``.

    Sub-directories are skipped. Paths are returned sorted by file name,
    because ``os.listdir`` itself makes no ordering guarantee and the load
    order would otherwise vary between machines and file systems.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A list of ``os.path.join(folder, name)`` strings, sorted by ``name``.
    """
    candidates = (os.path.join(folder, name) for name in sorted(os.listdir(folder)))
    return [path for path in candidates if os.path.isfile(path)]

def load_emails(folder, label):
    """Read and tokenise every email file in ``folder``.

    Args:
        folder: Directory whose files (as listed by ``get_files``) are emails.
        label: Class label to attach to every email from this folder.

    Returns:
        ``(emails, labels)``: ``emails`` is a list with one ``preprocess``
        result per file, and ``labels`` is a list of the same length in
        which every entry is ``label``.
    """
    tokenised = []
    for path in get_files(folder):
        # latin1 maps every byte to a character, so decoding never fails.
        with open(path, encoding='latin1') as handle:
            raw = handle.read()
        tokenised.append(preprocess(raw))
    return tokenised, [label] * len(tokenised)

# Load all emails and labels
# Folders are labelled by their position in the folder list: the first
# HAM_FOLDER_COUNT entries (easy_ham, hard_ham) are ham, the rest are spam.
HAM_FOLDER_COUNT = 2


def _load_split(folders):
    """Load every folder of one split (train or test).

    Args:
        folders: Folder paths, ham folders first (see HAM_FOLDER_COUNT).

    Returns:
        ``(emails, labels)`` concatenated over all folders in list order,
        with label 0 for ham and 1 for spam.
    """
    split_emails, split_labels = [], []
    for position, folder in enumerate(folders):
        label = 0 if position < HAM_FOLDER_COUNT else 1  # easy_ham/hard_ham=0, spam=1
        emails, labels = load_emails(folder, label)
        split_emails.extend(emails)
        split_labels.extend(labels)
    return split_emails, split_labels


train_emails, train_labels = _load_split(train_folders)
test_emails, test_labels = _load_split(test_folders)

# Build vocabulary from training set
# Only the training emails contribute words; words seen only in the test set
# are therefore absent from the vocabulary.
vocab_counter = Counter()
for words in train_emails:
    # Binary mode counts each word once per email; count mode counts every occurrence.
    vocab_counter.update(set(words) if BINARY_FEATURES else words)
vocab = sorted(vocab_counter)  # sorted so column indices are deterministic
word_idx = {word: i for i, word in enumerate(vocab)}

# Vectorize emails
def vectorize(emails, word_index=None, binary=None):
    """Convert tokenised emails into a dense document-term matrix.

    Args:
        emails: Sequence of token lists, one per email.
        word_index: Mapping from word to column index. Indices are expected
            to cover ``0 .. len(word_index) - 1``. Defaults to the notebook's
            global ``word_idx``.
        binary: If true, each cell is 1 when the word occurs in the email;
            otherwise it holds the number of occurrences. Defaults to the
            notebook's global ``BINARY_FEATURES``.

    Returns:
        An integer ``numpy`` array of shape ``(len(emails), len(word_index))``.
        Words missing from ``word_index`` are ignored. The array is dense, so
        its size grows with ``len(emails) * len(word_index)``.
    """
    # Fall back to the notebook-level vocabulary/flag so existing
    # one-argument calls keep working unchanged.
    if word_index is None:
        word_index = word_idx
    if binary is None:
        binary = BINARY_FEATURES

    X = np.zeros((len(emails), len(word_index)), dtype=int)
    for row, words in enumerate(emails):
        if binary:
            # Presence only: no need to count occurrences.
            for word in set(words):
                col = word_index.get(word)
                if col is not None:
                    X[row, col] = 1
        else:
            for word, count in Counter(words).items():
                col = word_index.get(word)
                if col is not None:
                    X[row, col] = count
    return X

# Feature matrices share the training vocabulary; labels become numpy arrays.
X_train, X_test = vectorize(train_emails), vectorize(test_emails)
y_train, y_test = np.array(train_labels), np.array(test_labels)

# Quick sanity check of the resulting dimensions
print(f"Vocabulary size: {len(vocab)}")
print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")

Vocabulary size: 63977
Train set: (2640, 63977), Test set: (663, 63977)


## Training and Evaluating the Classifier
We will train a Multinomial Naive Bayes classifier on the training data and evaluate its performance on the test set using accuracy, precision, recall, F1-score, and a confusion matrix.

In [4]:
# Fit a Multinomial Naive Bayes model on the training matrix
clf = MultinomialNB()
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

# Scalar metrics, printed in a fixed order. Precision, recall and F1 are
# called with sklearn's defaults on the 0/1 labels (1 = spam in this notebook).
scalar_metrics = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)
for heading, metric in scalar_metrics:
    print(heading, metric(y_test, y_pred))

# Per-class breakdown: confusion matrix, then the full classification report
test_confusion = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", test_confusion)
test_report = classification_report(y_test, y_pred, target_names=["ham", "spam"])
print("\nClassification Report:\n", test_report)

Accuracy: 0.9532428355957768
Precision: 0.9861111111111112
Recall: 0.7029702970297029
F1 Score: 0.8208092485549133

Confusion Matrix:
 [[561   1]
 [ 30  71]]

Classification Report:
               precision    recall  f1-score   support

         ham       0.95      1.00      0.97       562
        spam       0.99      0.70      0.82       101

    accuracy                           0.95       663
   macro avg       0.97      0.85      0.90       663
weighted avg       0.95      0.95      0.95       663

