Spam Mail Detection Model

In [59]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [60]:
# Load CSV Dataset
spam_df = pd.read_csv("SpamMail.csv")

In [61]:
spam_df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [62]:
#convert spam/ham to numerical variable in new coloumn spam
spam_df['spam'] = spam_df['Category'].apply(lambda x: 1 if x=='spam' else 0)
spam_df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [63]:
# x variable is content of the mail
# y variable is label of the mail i.e. spam or not spam
x_train, x_test, y_train, y_test = train_test_split(spam_df.Message,spam_df.spam)

In [64]:
# To find word count and store data as a matrix using Countvectorizer
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

In [65]:
# Checking the data
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [66]:
# train model
model = MultinomialNB()
model.fit(x_train_count, y_train)


In [67]:
# Pretest Ham emails
email_ham=["hey let's go to the restaurant"]
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([0], dtype=int64)

In [68]:
# Pretest Spam emails
email_spam=["jackpot money reward click link"]
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)

array([1], dtype=int64)

In [69]:
#Testing model with test set x 
x_test_count = cv.transform(x_test)
model.score(x_test_count,y_test)*100

98.34888729361091

Thus we can see that this model is 98.34% accurate

Now we will save the model in form of pickle model file

In [None]:
import pickle
pickle.dump(model, open("spamModel.pkl", "wb"))
pickle.dump(cv, open("vectorizer.pkl", "wb"))