In [19]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [20]:
df=pd.read_csv("mail_l7_dataset.csv")
df.head(5)

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [21]:
df.loc[df["Category"].str.lower().str.strip() == "spam", "Category"] = 0
df.loc[df["Category"].str.lower().str.strip() == "ham",  "Category"] = 1

Split features (X) and target (y) and train,test

In [22]:
X = df["Message"].astype(str)
y = df["Category"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [23]:
tfidf = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)
X_train_features = tfidf.fit_transform(X_train)
X_test_features  = tfidf.transform(X_test)

Train Logistic Regression

In [24]:

# Model 1: Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_features, y_train)

# Model 2: Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_features, y_train)

# Model 3: Naive Bayes (MultinomialNB)
nb = MultinomialNB()
nb.fit(X_train_features, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


Evaluation Results

In [25]:

def evaluate_model(name, model, X_test_feat, y_true):
    y_pred = model.predict(X_test_feat)
    
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, pos_label=0) # Focus on Spam (0)
    rec  = recall_score(y_true, y_pred, pos_label=0)
    f1   = f1_score(y_true, y_pred, pos_label=0)
    
    print(f"\n{name} Performance:")
    print(f"  Accuracy : {acc:.3f}")
    print(f"  Precision: {prec:.3f}")
    print(f"  Recall   : {rec:.3f}")
    print(f"  F1-Score : {f1:.3f}")
    
    # Create a readable Confusion Matrix
    cm = confusion_matrix(y_true, y_pred, labels=[1, 0])
    cm_df = pd.DataFrame(
        cm,
        index   = ["Actual: Ham(1)", "Actual: Spam(0)"],
        columns = ["Pred: Ham(1)", "Pred: Spam(0)"]
    )
    print(f"  Confusion Matrix:\n{cm_df}")

# Run evaluations
evaluate_model("Logistic Regression", lr, X_test_features, y_test)
evaluate_model("Random Forest", rf, X_test_features, y_test)
evaluate_model("Naive Bayes", nb, X_test_features, y_test)


Logistic Regression Performance:
  Accuracy : 0.968
  Precision: 1.000
  Recall   : 0.758
  F1-Score : 0.863
  Confusion Matrix:
                 Pred: Ham(1)  Pred: Spam(0)
Actual: Ham(1)            966              0
Actual: Spam(0)            36            113

Random Forest Performance:
  Accuracy : 0.981
  Precision: 1.000
  Recall   : 0.859
  F1-Score : 0.924
  Confusion Matrix:
                 Pred: Ham(1)  Pred: Spam(0)
Actual: Ham(1)            966              0
Actual: Spam(0)            21            128

Naive Bayes Performance:
  Accuracy : 0.977
  Precision: 1.000
  Recall   : 0.826
  F1-Score : 0.904
  Confusion Matrix:
                 Pred: Ham(1)  Pred: Spam(0)
Actual: Ham(1)            966              0
Actual: Spam(0)            26            123


In [26]:

test_messages = [
    "Free entry in 2 a weekly competition!",
    "I will meet you at the cafe tomorrow",
    "Congratulations, you won a free ticket"
]

def lab2str(v):
    return "Spam (0)" if v == 0 else "Ham (1)"

print("=== SINGLE MESSAGE PREDICTIONS ===")
for msg in test_messages:
    # Transform message using the same TF-IDF used for training
    msg_feat = tfidf.transform([msg])
    
    pred_lr = lr.predict(msg_feat)[0]
    pred_rf = rf.predict(msg_feat)[0]
    pred_nb = nb.predict(msg_feat)[0]
    
    print(f"\nMessage: '{msg}'")
    print(f"  LR Prediction: {lab2str(pred_lr)}")
    print(f"  RF Prediction: {lab2str(pred_rf)}")
    print(f"  NB Prediction: {lab2str(pred_nb)}")

=== SINGLE MESSAGE PREDICTIONS ===

Message: 'Free entry in 2 a weekly competition!'
  LR Prediction: Ham (1)
  RF Prediction: Ham (1)
  NB Prediction: Spam (0)

Message: 'I will meet you at the cafe tomorrow'
  LR Prediction: Ham (1)
  RF Prediction: Ham (1)
  NB Prediction: Ham (1)

Message: 'Congratulations, you won a free ticket'
  LR Prediction: Ham (1)
  RF Prediction: Ham (1)
  NB Prediction: Ham (1)
