In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix




In [2]:
# Load the labelled messages from the CSV file into a DataFrame and preview the first rows
csv_path = 'mail_l7_dataset.csv'
df = pd.read_csv(csv_path)
df.head()
               

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
# Label encoding: spam -> 0, ham -> 1 (matching is case-insensitive and ignores surrounding whitespace)
label_map = {"spam": 0, "ham": 1}

# Guard so that re-running this cell is a no-op: once the column holds integers,
# the `.str` accessor below would raise AttributeError.
if not pd.api.types.is_numeric_dtype(df["Category"]):
    encoded = df["Category"].str.lower().str.strip().map(label_map)

    # `.map` turns every value missing from label_map into NaN; fail here with a
    # clear message instead of letting a later integer cast fail obscurely.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown_values = sorted(df.loc[unmapped, "Category"].astype(str).unique())
        raise ValueError(f"Cannot encode unexpected Category values: {unknown_values}")

    df["Category"] = encoded.astype(int)

print(df.head())

   Category                                            Message
0         1  Go until jurong point, crazy.. Available only ...
1         1                      Ok lar... Joking wif u oni...
2         0  Free entry in 2 a wkly comp to win FA Cup fina...
3         1  U dun say so early hor... U c already then say...
4         1  Nah I don't think he goes to usf, he lives aro...


In [4]:

# Separate the message text (model input) from the encoded category (target)
messages = df["Message"]
labels = df["Category"]
X, y = messages.astype(str), labels.astype(int)



In [5]:
 # Split the data: 80% for training and 20% for testing (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Report how many messages ended up in each partition
n_train, n_test = len(X_train), len(X_test)
print("== SPLIT SIZE ==")
print("Train Data:", n_train, " | Test Data:", n_test)


== SPLIT SIZE ==
Train Data: 4457  | Test Data: 1115


In [6]:
# TF-IDF features: lowercase the text and drop English stop words
tfidf_options = {"min_df": 1, "stop_words": "english", "lowercase": True}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vectorizer on the training messages and transform them in one step
X_train_features = vectorizer.fit_transform(X_train)
# Important: the test messages are only transformed with the already-fitted vectorizer (never re-fitted)
X_test_features = vectorizer.transform(X_test)

In [7]:
# Train a logistic regression classifier on the TF-IDF training features
lr = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_features, y_train)

# Predict a label for every test message and show the raw prediction array
lr_predict = lr.predict(X_test_features)
print(lr_predict)

# Tabulate how many test messages were assigned to each class
lr_pred_df = pd.DataFrame({"LR prediction": lr_predict})
lr_pred_df.value_counts()


[1 1 1 ... 1 1 1]


LR prediction
1                1002
0                 113
Name: count, dtype: int64

In [8]:
# Train a RandomForest classifier (200 trees, fixed seed) on the TF-IDF training features
rf = RandomForestClassifier(random_state=42, n_estimators=200)
rf.fit(X_train_features, y_train)

# Predict a label for every test message
rf_pred = rf.predict(X_test_features)

# Put the RandomForest predictions in a DataFrame and count how many fall in each class
rf_pred_df = pd.DataFrame({"RF prediction": rf_pred})
rf_pred_df.value_counts()

RF prediction
1                985
0                130
Name: count, dtype: int64

In [9]:
# Train the model using Multinomial Naive Bayes
nb = MultinomialNB()
# Fit the model on the TF-IDF training features
nb.fit(X_train_features, y_train)
# Predict a label for every test message
nb_pred = nb.predict(X_test_features)
# Convert the Naive Bayes predictions to a DataFrame.
# The column is labelled "NB prediction" to match "LR prediction" / "RF prediction";
# the previous label "np prediction" was a typo that read like the numpy alias.
nb_pred_df = pd.DataFrame(nb_pred, columns=["NB prediction"])
nb_pred_df.value_counts()

np prediction
1                992
0                123
Name: count, dtype: int64

In [10]:
def display_metrics(model_name, y_actual, y_pred, pos_label=0):
    """Print accuracy, precision, recall and F1 for one model's predictions.

    Parameters
    ----------
    model_name : str
        Name shown in the report header.
    y_actual : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_actual``.
    pos_label : int, default 0
        Label treated as the positive class for precision, recall and F1.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    # Settings shared by the three class-specific scores; zero_division=0 is
    # passed through to each scorer unchanged.
    binary_kwargs = {"pos_label": pos_label, "average": "binary", "zero_division": 0}

    accuracy = accuracy_score(y_actual, y_pred)
    precision = precision_score(y_actual, y_pred, **binary_kwargs)
    recall = recall_score(y_actual, y_pred, **binary_kwargs)
    f1 = f1_score(y_actual, y_pred, **binary_kwargs)

    # Build the whole report first, then emit it in one write.
    report_lines = [
        f"\n{model_name} Performance:",
        f"Accuracy (Overall): {accuracy:.4f}",
        f"Precision (pos={pos_label}): {precision:.4f}",
        f"Recall (pos={pos_label}):    {recall:.4f}",
        f"F1-Score (pos={pos_label}):  {f1:.4f}",
    ]
    print("\n".join(report_lines))

# Displaying the Confusion matrix
def show_confusion_matrix(model_name, y_actual, y_pred, labels_order=(0, 1)):
    """Print a labelled confusion matrix for one model's predictions.

    Parameters
    ----------
    model_name : str
        Name shown in the header line.
    y_actual : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_actual``.
    labels_order : sequence of int, default (0, 1)
        Order in which the labels appear as rows (actual) and columns (predicted).

    Returns
    -------
    None
        The matrix is written to standard output only.
    """
    # Encoding used throughout this notebook: 0 is spam, 1 is ham.
    class_names = {0: "Spam", 1: "Ham"}

    labels = list(labels_order)
    cm = confusion_matrix(y_actual, y_pred, labels=labels)

    def caption(prefix, label):
        # Derive the class name from the label VALUE, not from its position, so
        # that labels_order=(1, 0) is not captioned "1 (Spam)" / "0 (Ham)".
        name = class_names.get(label)
        return f"{prefix} {label}" if name is None else f"{prefix} {label} ({name})"

    index_names = [caption("Actual", label) for label in labels]
    col_names = [caption("Pred", label) for label in labels]

    cm_df = pd.DataFrame(cm, index=index_names, columns=col_names)

    print(f"\n{model_name} Confusion Matrix (labels {labels_order}):")
    print(cm_df)

# Evaluate models: same report (metrics, then confusion matrix) for each classifier,
# with spam (label 0) as the positive class
model_predictions = {
    "Logistic Regression": lr_predict,
    "Random Forest": rf_pred,
    "Naive Bayes": nb_pred,
}

for model_name, model_pred in model_predictions.items():
    display_metrics(model_name, y_test, model_pred, pos_label=0)
    show_confusion_matrix(model_name, y_test, model_pred, labels_order=(0, 1))


Logistic Regression Performance:
Accuracy (Overall): 0.9677
Precision (pos=0): 1.0000
Recall (pos=0):    0.7584
F1-Score (pos=0):  0.8626

Logistic Regression Confusion Matrix (labels (0, 1)):
                 Pred 0 (Spam)  Pred 1 (Ham)
Actual 0 (Spam)            113            36
Actual 1 (Ham)               0           966

Random Forest Performance:
Accuracy (Overall): 0.9830
Precision (pos=0): 1.0000
Recall (pos=0):    0.8725
F1-Score (pos=0):  0.9319

Random Forest Confusion Matrix (labels (0, 1)):
                 Pred 0 (Spam)  Pred 1 (Ham)
Actual 0 (Spam)            130            19
Actual 1 (Ham)               0           966

Naive Bayes Performance:
Accuracy (Overall): 0.9767
Precision (pos=0): 1.0000
Recall (pos=0):    0.8255
F1-Score (pos=0):  0.9044

Naive Bayes Confusion Matrix (labels (0, 1)):
                 Pred 0 (Spam)  Pred 1 (Ham)
Actual 0 (Spam)            123            26
Actual 1 (Ham)               0           966


In [11]:
# Sanity check: a few hand-written sample messages to run through the trained models.
# NOTE(review): the third message uses a typographic apostrophe (’), unlike the first two.
sample_message = [
    "Congratulations! You've won a free prize!",
    "You're invited to claim your free reward!",
    "You’ve been selected for a chance to win big!",
]

# Turn a numeric class label into readable text: 0 -> "Spam (0)", anything else -> "Ham (1)"
def label_to_str(r):
    """Return the display string for a predicted label.

    A value equal to 0 is reported as spam; every other value is reported as ham.
    """
    if r == 0:
        return "Spam (0)"
    return "Ham (1)"


print("\n=== SAMPLE MESSAGE PREDICTIONS ===")
for message in sample_message:
    # Vectorize the message once and reuse the features for all three models
    # (previously the same text was transformed three times per iteration).
    message_features = vectorizer.transform([message])

    # Each predict call returns a one-element array; take that single label.
    lr_pred_sample = int(lr.predict(message_features)[0])
    rf_pred_sample = int(rf.predict(message_features)[0])
    nb_pred_sample = int(nb.predict(message_features)[0])

    print(f"\nSample Message: {message}")
    print(f"LR pred: {label_to_str(lr_pred_sample)}")
    print(f"RF pred: {label_to_str(rf_pred_sample)}")
    print(f"NB pred: {label_to_str(nb_pred_sample)}\n")


=== SAMPLE MESSAGE PREDICTIONS ===

Sample Message: Congratulations! You've won a free prize!
LR pred: Spam (0)
RF pred: Ham (1)
NB pred: Spam (0)


Sample Message: You're invited to claim your free reward!
LR pred: Spam (0)
RF pred: Ham (1)
NB pred: Spam (0)


Sample Message: You’ve been selected for a chance to win big!
LR pred: Ham (1)
RF pred: Ham (1)
NB pred: Ham (1)

