In [29]:
import pandas as pd


In [30]:
df = pd.read_csv("spam.csv", encoding="latin-1")
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [31]:
df = df[['v1', 'v2']]
df.columns = ['label', 'message']
df.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [32]:
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
df.head()


Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [33]:
import nltk
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


In [34]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
print(stop_words)



{'with', 'here', 'now', "mustn't", 'but', 'from', 'should', "you're", 'ma', 'aren', 'am', "you'd", 'while', 'just', 'o', "it'd", 're', 'will', 'same', "wasn't", 'hadn', 'needn', 'who', 'll', "wouldn't", 'only', 'its', "didn't", 'or', "we'll", 'because', 'doing', 'had', "they're", 'yours', 'does', 'he', 'then', 'again', 'him', 'don', "she'd", "he'll", "it'll", 'most', 'this', 'hasn', 'for', 'mustn', 'no', "don't", "shan't", 'the', "weren't", 'such', "you'll", 'when', "she's", 's', 'd', 'after', 'it', "hadn't", 've', 'them', 'below', "he'd", 'other', "they'll", 'ain', 'own', 'all', 'isn', 'their', 'too', 'they', 'until', 'm', "we'd", 'we', "hasn't", 'whom', 'off', "couldn't", "i'll", 'into', 'are', 'y', "isn't", 'ourselves', "we're", 'under', 'hers', 'during', 'an', "she'll", 'against', 'before', 'couldn', "they'd", 'more', 'nor', 'by', 'and', 'once', 'that', 'been', 'my', 'wasn', 'there', 'himself', 'theirs', 'than', "you've", "it's", "should've", "we've", "i'm", 'these', 'down', 'about

In [35]:
stemmer = PorterStemmer()

def preprocess_text(text):
    text = text.lower()
    text = ''.join([ch for ch in text if ch not in string.punctuation])
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)


In [36]:
df['cleaned_message'] = df['message'].apply(preprocess_text)
df.head()


Unnamed: 0,label,message,cleaned_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [38]:
vectorizer = TfidfVectorizer(max_features=3000)

X = vectorizer.fit_transform(df['cleaned_message'])
y = df['label']


In [39]:
from sklearn.model_selection import train_test_split


In [40]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [41]:
from sklearn.naive_bayes import MultinomialNB


In [42]:
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)


In [43]:
y_pred_nb = nb_model.predict(X_test)


In [44]:
from sklearn.metrics import accuracy_score, classification_report


In [45]:
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.97847533632287
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       1.00      0.84      0.91       150

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [46]:
from sklearn.linear_model import LogisticRegression


In [47]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)


In [48]:
y_pred_lr = lr_model.predict(X_test)


In [49]:
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.9515695067264573
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.97      0.66      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115



In [50]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred_lr)


array([[962,   3],
       [ 51,  99]])

In [51]:
sample_email = ["Congratulations! You have won a free prize"]

sample_cleaned = preprocess_text(sample_email[0])
sample_vector = vectorizer.transform([sample_cleaned])

prediction = lr_model.predict(sample_vector)

if prediction[0] == 1:
    print("Spam Email")
else:
    print("Not Spam")


Spam Email


In [52]:
sample_email = ["you have passed your admission test"]

sample_cleaned = preprocess_text(sample_email[0])
sample_vector = vectorizer.transform([sample_cleaned])

prediction = lr_model.predict(sample_vector)

if prediction[0] == 1:
    print("Spam Email")
else:
    print("Not Spam")


Not Spam
