In [40]:
# Imports
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
)

# Seed value intended for the random_state arguments of the split and the
# models, so that repeated runs give the same results.
random_state = 42


In [41]:
# Read the labelled message dataset (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer="mail_l7_dataset.csv")

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [42]:
# Preprocess data
# Handle missing values: every null cell becomes an empty string.
df = df.where(df.notna(), "")

# Encode labels: spam -> 0, ham -> 1 (case- and whitespace-insensitive).
# The whole column is mapped in one vectorised step instead of writing ints
# into a string column row-by-row, which leaves a mixed-dtype column half-way
# through and is rejected by pandas' dedicated string dtype.
# Already-encoded values ("0"/"1") pass through unchanged, so re-running this
# cell on the encoded frame is harmless.
label_codes = {"spam": 0, "ham": 1, "0": 0, "1": 1}
normalised_labels = df["Category"].astype(str).str.strip().str.lower()
encoded_labels = normalised_labels.map(label_codes)

# Fail loudly (and readably) on labels the mapping does not know about,
# e.g. rows whose Category was missing and is now "".
unknown_mask = encoded_labels.isna()
if unknown_mask.any():
    raise ValueError(
        "Unexpected Category labels: "
        f"{sorted(normalised_labels[unknown_mask].unique())}"
    )

df["Category"] = encoded_labels.astype(int)
df.head(5)


Unnamed: 0,Category,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [51]:
# Feature text and integer target, selected from the preprocessed frame.
X = df.loc[:, "Message"].astype(str)
y = df.loc[:, "Category"].astype(int)

In [52]:
# Hold out 20% of the messages for testing. The seed is taken from the
# notebook-level `random_state` variable rather than a repeated literal.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_state
)

print("Training samples:",  X_train.shape[0])
print("Testing samples :",  X_test.shape[0])

Training samples: 4457
Testing samples : 1115


In [53]:
# TF-IDF features: lower-case the text and drop English stop words.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
)

# Fit the vectorizer on the training split only, then apply the same fitted
# transform to the test split.
X_train_features = tfidf.fit_transform(X_train)
X_test_features = tfidf.transform(X_test)

print("Train shape:", X_train_features.shape)
print("Test shape:", X_test_features.shape)

Train shape: (4457, 7440)
Test shape: (1115, 7440)


In [54]:
# Logistic regression with up to 1000 solver iterations, seeded from the
# notebook-level `random_state` variable rather than a repeated literal.
lr = LogisticRegression(max_iter=1000, random_state=random_state)
lr.fit(X_train_features, y_train)
lr_pred = lr.predict(X_test_features)

In [55]:
# Random forest with 200 trees. The seed comes from the notebook-level
# `random_state` variable; the upper-case name RANDOM_STATE is never defined
# in this notebook and raises NameError on a fresh kernel.
rf = RandomForestClassifier(n_estimators=200, random_state=random_state)
rf.fit(X_train_features, y_train)

rf_pred = rf.predict(X_test_features)

In [56]:
# Multinomial Naive Bayes with default hyper-parameters. fit() returns the
# estimator itself, so construction and training are chained.
nb = MultinomialNB().fit(X_train_features, y_train)
nb_pred = nb.predict(X_test_features)

In [57]:
def evaluate_model(name, y_true, y_pred, pos_label=0):
    """Print accuracy, precision, recall, F1 and the confusion matrix for one model.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Labels predicted by the model, aligned with ``y_true``.
    pos_label : int, default 0
        Class treated as "positive" for precision, recall and F1. The
        default of 0 keeps the previous behaviour (in this notebook's label
        encoding, spam is 0).

    Returns
    -------
    None
        The metrics are printed, each score rounded to three decimals.
    """
    print(f"\n{name} Performance:")
    print("Accuracy :", round(accuracy_score(y_true, y_pred), 3))

    # Precision, recall and F1 are all computed for the same positive class.
    class_metrics = (
        ("Precision:", precision_score),
        ("Recall   :", recall_score),
        ("F1-Score :", f1_score),
    )
    for label, metric_fn in class_metrics:
        print(label, round(metric_fn(y_true, y_pred, pos_label=pos_label), 3))

    # Rows are true classes, columns are predicted classes (sorted label order).
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

In [58]:
# Report every fitted model against the same held-out labels.
for model_name, predictions in (
    ("Logistic Regression", lr_pred),
    ("Random Forest", rf_pred),
    ("Naive Bayes", nb_pred),
):
    evaluate_model(model_name, y_test, predictions)


Logistic Regression Performance:
Accuracy : 0.968
Precision: 1.0
Recall   : 0.758
F1-Score : 0.863
Confusion Matrix:
 [[113  36]
 [  0 966]]

Random Forest Performance:
Accuracy : 0.983
Precision: 1.0
Recall   : 0.872
F1-Score : 0.932
Confusion Matrix:
 [[130  19]
 [  0 966]]

Naive Bayes Performance:
Accuracy : 0.977
Precision: 1.0
Recall   : 0.826
F1-Score : 0.904
Confusion Matrix:
 [[123  26]
 [  0 966]]
