In [31]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import joblib
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [32]:
# Make sure the NLTK data used below is installed. On a fresh environment the
# tokenizer models and the stop-word corpus are missing and both calls raise
# LookupError, so probe them once and download only when needed.
try:
    stopwords.words("english")
    word_tokenize("probe")
except LookupError:
    # Recent NLTK releases look up "punkt_tab" for word_tokenize, older ones
    # "punkt"; requesting both keeps the cell working across versions.
    for resource in ("punkt", "punkt_tab", "stopwords"):
        nltk.download(resource, quiet=True)

# Load the raw data, drop the three unnamed columns and give the remaining
# two columns descriptive names.
df = (
    pd.read_csv("spam.csv", encoding="latin-1")
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": "label", "v2": "text"})
)

stop_words = set(stopwords.words("english"))


def preprocess_text(text):
    """Lower-case ``text``, tokenize it, drop English stop words and re-join with spaces.

    Parameters
    ----------
    text : str
        One raw message.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    return " ".join(token for token in tokens if token not in stop_words)


# Preprocess text data in a single pass over the column
df["text"] = df["text"].map(preprocess_text)
df

Unnamed: 0,label,text
0,ham,"go jurong point , crazy .. available bugis n g..."
1,ham,ok lar ... joking wif u oni ...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor ... u c already say ...
4,ham,"nah n't think goes usf , lives around though"
...,...,...
5567,spam,2nd time tried 2 contact u. u å£750 pound priz...
5568,ham,ì_ b going esplanade fr home ?
5569,ham,"pity , * mood . ... suggestions ?"
5570,ham,guy bitching acted like 'd interested buying s...


In [33]:
# Split the text BEFORE vectorizing. Fitting CountVectorizer on the full
# corpus lets tokens that only occur in the held-out messages shape the
# feature space (data leakage). The split depends only on the number of rows
# and random_state, so the same rows land in train/test as before.
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=1
)

# Learn the vocabulary from the training messages only, then reuse it as-is
# for the held-out messages.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Full-corpus feature matrix, kept so code that reads `X` keeps working; it is
# built with the training-only vocabulary and is not used for fitting.
X = vectorizer.transform(df["text"])

classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(report)

Accuracy: 0.9829596412556054
Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.99      0.99       976
        spam       0.91      0.96      0.93       139

    accuracy                           0.98      1115
   macro avg       0.95      0.97      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [34]:
# Persist the trained classifier. joblib.dump returns the list of file paths
# it wrote, not a model, so the result is bound to an accurately named
# variable instead of `loaded_model`.
# NOTE(review): only the classifier is saved here; the fitted `vectorizer` is
# also required to turn text into features, so it must be persisted as well
# before this model can be used outside the current kernel.
saved_model_paths = joblib.dump(classifier, filename='email_spam_model.pkl')

In [35]:
# Security note: joblib.load unpickles the file, which can execute arbitrary
# code. Only load model files that you created yourself or otherwise trust.
loaded_model = joblib.load('email_spam_model.pkl')

new_email = ["Congratulations! You've won a prize. Claim it now."]

# Apply the same preprocessing the training text went through (lower-case,
# tokenize, drop English stop words, re-join) so the features are built the
# same way at prediction time as at training time.
new_email_clean = [
    " ".join(
        token for token in word_tokenize(message.lower()) if token not in stop_words
    )
    for message in new_email
]

# Keep the raw text and its feature matrix under separate names instead of
# rebinding `new_email` to a different kind of object.
new_email_features = vectorizer.transform(new_email_clean)
prediction = loaded_model.predict(new_email_features)

if prediction[0] == "spam":
    print("This email is spam.")
else:
    print("This email is not spam.")

This email is spam.
