In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Read the SMS spam CSV into a DataFrame. The file is decoded as latin-1
# (presumably because it is not valid UTF-8 -- TODO confirm).
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/spam dataset/spam.csv',
    encoding='latin-1',
)
# Preview the first ten rows as the cell output.
df.head(n=10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [4]:
# Keep only the label and the message text, under clearer names.
# errors='ignore' on drop, plus a name-based rename, make this cell safe to
# re-run: on an already-cleaned frame both steps are no-ops instead of raising
# KeyError on the missing 'Unnamed: *' columns.
df = (
    df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
      .rename(columns={'v1': 'label', 'v2': 'message'})
)

# rename() silently skips names it cannot find, so verify the resulting schema
# explicitly rather than carrying an unexpected layout into later cells.
assert list(df.columns) == ['label', 'message'], (
    f"unexpected columns after cleaning: {list(df.columns)}"
)

# Preprocess the data: lower-case every message.
df['message'] = df['message'].str.lower()

In [5]:
# Hold out 20% of the rows for evaluation; messages are the inputs and labels
# the targets. The fixed random_state makes the shuffle repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    random_state=42,
    test_size=0.2,
)

In [6]:
# Build the TF-IDF features. The vectorizer is fitted on the training messages
# only, so nothing from the held-out messages shapes the learned vocabulary.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)

# Encode the test messages with the vocabulary learned from the training set.
X_test_tfidf = vectorizer.transform(raw_documents=X_test)

In [7]:
# Instantiate the three classifiers with their default settings:
# Naive Bayes, Logistic Regression and a Support Vector Machine.
nb_model, lr_model, svm_model = MultinomialNB(), LogisticRegression(), SVC()

# Fit each one on the same TF-IDF training matrix and training labels.
nb_model.fit(X_train_tfidf, y_train)
lr_model.fit(X_train_tfidf, y_train)
svm_model.fit(X_train_tfidf, y_train)

In [8]:
# Evaluate the models: label the held-out TF-IDF matrix with every fitted
# classifier, in the order Naive Bayes, Logistic Regression, SVM.
nb_predictions, lr_predictions, svm_predictions = (
    fitted_model.predict(X_test_tfidf)
    for fitted_model in (nb_model, lr_model, svm_model)
)

# Calculate metrics
def evaluate_model(predictions, y_test, pos_label='spam'):
    """Score predicted labels against the true labels.

    Args:
        predictions: Predicted class labels, one per evaluated sample.
        y_test: True class labels, aligned with ``predictions``.
        pos_label: Class treated as the positive one when computing precision,
            recall and F1. Defaults to ``'spam'``, which was previously
            hard-coded, so existing two-argument calls behave as before.

    Returns:
        A tuple ``(accuracy, precision, recall, f1)``.
    """
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions, pos_label=pos_label)
    recall = recall_score(y_test, predictions, pos_label=pos_label)
    f1 = f1_score(y_test, predictions, pos_label=pos_label)
    return accuracy, precision, recall, f1

In [9]:
# Report the Naive Bayes scores on the test set, one metric per line.
print("Naive Bayes Model Evaluation:")
nb_accuracy, nb_precision, nb_recall, nb_f1 = evaluate_model(nb_predictions, y_test)
print(
    f"Accuracy: {nb_accuracy:.4f}",
    f"Precision: {nb_precision:.4f}",
    f"Recall: {nb_recall:.4f}",
    f"F1-Score: {nb_f1:.4f}",
    sep="\n",
)



Naive Bayes Model Evaluation:
Accuracy: 0.9623
Precision: 1.0000
Recall: 0.7200
F1-Score: 0.8372


In [10]:
# Report the Logistic Regression scores on the test set, one metric per line.
print("\nLogistic Regression Model Evaluation:")
lr_accuracy, lr_precision, lr_recall, lr_f1 = evaluate_model(lr_predictions, y_test)
print(
    f"Accuracy: {lr_accuracy:.4f}",
    f"Precision: {lr_precision:.4f}",
    f"Recall: {lr_recall:.4f}",
    f"F1-Score: {lr_f1:.4f}",
    sep="\n",
)




Logistic Regression Model Evaluation:
Accuracy: 0.9659
Precision: 0.9912
Recall: 0.7533
F1-Score: 0.8561


In [11]:
# Report the Support Vector Machine scores on the test set, one metric per line.
print("\nSupport Vector Machine Model Evaluation:")
svm_accuracy, svm_precision, svm_recall, svm_f1 = evaluate_model(svm_predictions, y_test)
print(
    f"Accuracy: {svm_accuracy:.4f}",
    f"Precision: {svm_precision:.4f}",
    f"Recall: {svm_recall:.4f}",
    f"F1-Score: {svm_f1:.4f}",
    sep="\n",
)


Support Vector Machine Model Evaluation:
Accuracy: 0.9821
Precision: 1.0000
Recall: 0.8667
F1-Score: 0.9286
