In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix



In [2]:
# Load the dataset
df = pd.read_csv('spam.csv', encoding='latin-1')

# Inspect the dataset
print(df.head())
print(df.info())

# Drop unnecessary columns and rename columns for clarity
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Encode the labels: 'ham' = 0, 'spam' = 1
df['label'] = df['label'].map({'ham': 0, 'spam': 1})



     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
m

In [3]:
# Preprocess the text data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)

X = tfidf_vectorizer.fit_transform(df['message'])  # Features (TF-IDF vectorized messages)
y = df['label']  # Target (spam or ham)




In [4]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [5]:
# Model 1: Naive Bayes Classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)



In [6]:
# Model 2: Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)



In [7]:
# Model 3: Support Vector Machine (SVM)
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)



In [8]:
# Predictions for all models
y_pred_nb = nb_classifier.predict(X_test)
y_pred_log_reg = log_reg.predict(X_test)
y_pred_svm = svm_classifier.predict(X_test)



In [9]:
# Evaluation function
def evaluate_model(y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1



In [10]:
# Evaluating Naive Bayes
acc_nb, prec_nb, rec_nb, f1_nb = evaluate_model(y_test, y_pred_nb)
print(f"Naive Bayes: Accuracy={acc_nb}, Precision={prec_nb}, Recall={rec_nb}, F1-Score={f1_nb}")

# Evaluating Logistic Regression
acc_log_reg, prec_log_reg, rec_log_reg, f1_log_reg = evaluate_model(y_test, y_pred_log_reg)
print(f"Logistic Regression: Accuracy={acc_log_reg}, Precision={prec_log_reg}, Recall={rec_log_reg}, F1-Score={f1_log_reg}")

# Evaluating SVM
acc_svm, prec_svm, rec_svm, f1_svm = evaluate_model(y_test, y_pred_svm)
print(f"SVM: Accuracy={acc_svm}, Precision={prec_svm}, Recall={rec_svm}, F1-Score={f1_svm}")



Naive Bayes: Accuracy=0.968609865470852, Precision=1.0, Recall=0.7666666666666667, F1-Score=0.8679245283018869
Logistic Regression: Accuracy=0.9443946188340807, Precision=0.9680851063829787, Recall=0.6066666666666667, F1-Score=0.7459016393442623
SVM: Accuracy=0.9721973094170404, Precision=0.9917355371900827, Recall=0.8, F1-Score=0.8856088560885609


In [11]:
# Confusion Matrix for SVM (or any other model)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)
print(f"Confusion Matrix for SVM:\n{conf_matrix_svm}")

Confusion Matrix for SVM:
[[964   1]
 [ 30 120]]
