### 1) Setup and Imports


In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

# Single seed passed to the train/test split and to the seeded estimators so
# that a fresh "Restart & Run All" reproduces the same numbers.
RANDOM_STATE = 42

### 2) Load and Clean Dataset


In [21]:
df = pd.read_csv("mail_l7_dataset.csv")

# Basic cleaning: replace NaNs with empty strings (text models can't handle NaN)
df = df.fillna("")

# Encode labels: spam -> 0, ham -> 1 (keep teacher's original convention).
# The mapping is applied in a single pass on a normalised copy of the column,
# so the string accessor never runs on a half-encoded (mixed int/str) column.
LABEL_MAP = {"spam": 0, "ham": 1}
labels = df["Category"].astype(str).str.strip().str.lower()

# Fail early, and say which labels are the problem, instead of letting an
# unexpected or blank category surface later as an opaque int() conversion error.
unknown_labels = labels[~labels.isin(list(LABEL_MAP))].unique().tolist()
if unknown_labels:
    raise ValueError(
        f"Unexpected values in 'Category' (expected spam/ham): {unknown_labels}"
    )

df["Category"] = labels.map(LABEL_MAP).astype(int)

X = df["Message"].astype(str)
y = df["Category"].astype(int)

print("=== DATA PREVIEW ===")
print(df.head())

=== DATA PREVIEW ===
  Category                                            Message
0        1  Go until jurong point, crazy.. Available only ...
1        1                      Ok lar... Joking wif u oni...
2        0  Free entry in 2 a wkly comp to win FA Cup fina...
3        1  U dun say so early hor... U c already then say...
4        1  Nah I don't think he goes to usf, he lives aro...


### 3) Train/Test Split

In [22]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

print("=== SPLIT SIZES  ===")
print(
    f"Train: {X_train.shape[0]} | "
    f"Test: {X_test.shape[0]}"
)


=== SPLIT SIZES  ===
Train: 4457 | Test: 1115


### 4) Text → TF-IDF Features

In [23]:
# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vocabulary.
tfidf = TfidfVectorizer(
    min_df=1,
    stop_words="english",
    lowercase=True,
)

X_train_features = tfidf.fit_transform(X_train)
X_test_features = tfidf.transform(X_test)

print("=== TF-IDF SHAPES ===")
print(
    f"X_train: {X_train_features.shape} | "
    f"X_test: {X_test_features.shape}"
)

=== TF-IDF SHAPES ===
X_train: (4457, 7440) | X_test: (1115, 7440)


### 5) Train Models

In [24]:
# All three classifiers are fitted on the sparse TF-IDF training matrix and
# predict on the sparse TF-IDF test matrix.

# 1) Logistic Regression

lr = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
lr.fit(X_train_features, y_train)
lr_pred = lr.predict(X_test_features)

# 2) Random Forest
# The forest is fitted on the sparse matrix, so it predicts on the sparse
# matrix as well. Converting the test features with .toarray() only allocated
# a dense copy of the whole test matrix without being needed for prediction.

rf = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
rf.fit(X_train_features, y_train)
rf_pred = rf.predict(X_test_features)

# 3) Naive Bayes (MultinomialNB)

nb = MultinomialNB()
nb.fit(X_train_features, y_train)
nb_pred = nb.predict(X_test_features)

### 6) Evaluate Metrics

In [25]:
def print_clf_metrics(name, y_true, y_pred, pos_label=0):
    """Print accuracy, precision, recall, F1 and a labelled confusion matrix.

    Parameters
    ----------
    name : str
        Model name used in the printed headings.
    y_true : array-like
        True labels, using the notebook's encoding (spam=0, ham=1).
    y_pred : array-like
        Predicted labels in the same encoding.
    pos_label : int, default 0
        Label treated as the positive class for precision, recall and F1.

    Returns
    -------
    None
        All results are written to stdout.
    """
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=pos_label)
    rec = recall_score(y_true, y_pred, pos_label=pos_label)
    f1 = f1_score(y_true, y_pred, pos_label=pos_label)

    # labels=[1, 0] fixes the row/column order to ham first, spam second,
    # which is the order the index and column captions below assume.
    cm = confusion_matrix(y_true, y_pred, labels=[1, 0])
    cm_df = pd.DataFrame(
        cm,
        index   = ["Actual: Ham (1)",  "Actual: Spam (0)"],
        columns = ["Pred: Ham (1)",    "Pred: Spam (0)"]
    )

    # Build the "positive class" note from pos_label instead of hard-coding
    # spam, so the printed text stays true when a caller passes pos_label=1.
    class_names = {0: "spam", 1: "ham"}
    pos_note = f"(positive = {class_names.get(pos_label, 'label')}={pos_label})"

    print(f"\n{name} Performance:")
    print(f"  Accuracy  : {acc:.3f}")
    print(f"  Precision : {prec:.3f} {pos_note}")
    print(f"  Recall    : {rec:.3f} {pos_note}")
    print(f"  F1-Score  : {f1:.3f} {pos_note}")
    print(f"\n{name} Confusion Matrix:")
    print(cm_df)
    print("="*50)

# Report every model against the same held-out labels, in training order.
for model_name, model_pred in (
    ("Logistic Regression", lr_pred),
    ("Random Forest", rf_pred),
    ("Naive Bayes", nb_pred),
):
    print_clf_metrics(model_name, y_test, model_pred)


Logistic Regression Performance:
  Accuracy  : 0.968
  Precision : 1.000 (positive = spam=0)
  Recall    : 0.758 (positive = spam=0)
  F1-Score  : 0.863 (positive = spam=0)

Logistic Regression Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             36             113

Random Forest Performance:
  Accuracy  : 0.983
  Precision : 1.000 (positive = spam=0)
  Recall    : 0.872 (positive = spam=0)
  F1-Score  : 0.932 (positive = spam=0)

Random Forest Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             19             130

Naive Bayes Performance:
  Accuracy  : 0.977
  Precision : 1.000 (positive = spam=0)
  Recall    : 0.826 (positive = spam=0)
  F1-Score  : 0.904 (positive = spam=0)

Naive Bayes Confusion Matrix:
                  Pred: Ham (1)  Pred: Spam (0)
Actual: Ham (1)             966               0
Actual: Spam (0)             26             123

### 7) Sanity Checks

In [26]:
# Hand-written probes: two spam-like promotional texts around one ordinary
# personal message.
test_messages = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket",
]

def lab2str(v):
    """Return the display text for an encoded label: 0 is spam, anything else is ham."""
    if v == 0:
        return "Spam (0)"
    return "Ham (1)"

# Run each hand-written message through the fitted vectorizer and all three
# trained models, printing one prediction line per model.
print("=== SANITY CHECKS ===")
for msg in test_messages:
    print(f"\nText: '{msg}'")
    # transform() expects an iterable of documents, hence the one-item list.
    feat = tfidf.transform([msg])

    l_p = lr.predict(feat)[0]
    # The forest was fitted on sparse TF-IDF features, so the sparse row is
    # passed as-is; densifying it with .toarray() was unnecessary.
    r_p = rf.predict(feat)[0]
    n_p = nb.predict(feat)[0]

    print(f"LR Prediction : {lab2str(l_p)}")
    print(f"RF Prediction : {lab2str(r_p)}")
    print(f"NB Prediction : {lab2str(n_p)}")

=== SANITY CHECKS ===

Text: 'Free entry in 2 a weekly competition!'
LR Prediction : Ham (1)
RF Prediction : Ham (1)
NB Prediction : Spam (0)

Text: 'I will meet you at the cafe tomorrow'
LR Prediction : Ham (1)
RF Prediction : Ham (1)
NB Prediction : Ham (1)

Text: 'Congratulations, you won a free ticket'
LR Prediction : Ham (1)
RF Prediction : Ham (1)
NB Prediction : Ham (1)
