In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
data=pd.read_csv("spam.csv",encoding = "ISO-8859-1")

In [4]:
df=pd.DataFrame(data)

In [5]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
data=df.drop(['Unnamed: 2','Unnamed: 3','Unnamed: 4'],axis=1)

In [7]:
pd.set_option('display.max_colwidth', -1)
data.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [8]:
vector = CountVectorizer()

In [9]:
vector.fit(data['v2'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [10]:
features = vector.transform(data['v2'])

In [11]:
target = data['v1'] 
target.head()

0    ham 
1    ham 
2    spam
3    ham 
4    ham 
Name: v1, dtype: object

In [12]:
X_train, X_test, y_train, y_test = train_test_split(features, target)

In [13]:
X_train[:5]

<5x8672 sparse matrix of type '<class 'numpy.int64'>'
	with 91 stored elements in Compressed Sparse Row format>

In [14]:
y_train[:5]

4338    ham
228     ham
3531    ham
3749    ham
409     ham
Name: v1, dtype: object

In [15]:
model = MultinomialNB()
model.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
pred = model.predict(X_test)
pred[:5]

array(['ham', 'ham', 'ham', 'spam', 'ham'], dtype='<U4')

In [17]:
y_test[:5]

5182    ham 
5045    ham 
3470    ham 
1572    spam
2731    ham 
Name: v1, dtype: object

In [18]:
print(confusion_matrix(y_test, pred))

[[1200   14]
 [   7  172]]


In [19]:
print(accuracy_score(y_test, pred)*100)

98.49246231155779


In [20]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         ham       0.99      0.99      0.99      1214
        spam       0.92      0.96      0.94       179

    accuracy                           0.98      1393
   macro avg       0.96      0.97      0.97      1393
weighted avg       0.99      0.98      0.99      1393



In [21]:
import pickle
fp=open('spam_ham.pk1','wb')
pickle.dump(model,fp)
fp.close()

In [None]:
import tkinter as tk
import PIL.Image
import PIL.ImageTk

with open('spam_ham.pk1', 'rb') as fp:
    model = pickle.load(fp)
    fp.close()
    
root=tk.Tk()
root.title('Spam Message Detector')

canvas = tk.Canvas(root, height=460, width=620)

im = PIL.Image.open("spam.jpg")
photo = PIL.ImageTk.PhotoImage(im)
background_label = tk.Label(root, image=photo)
background_label.place(relx=0, rely=0, relwidth=1, relheight=1)

msg=tk.StringVar()

l=tk.Label(root,text='Type Your Message', bd=2,relief='sunken')
l.config(bg='#FFFFFF',fg='#3E3E43', font=('Ink Free', 16, 'bold'))
l.place(relx=0.53, rely=0.03, relwidth=0.4, relheight=0.07, anchor='n')

e = tk.Entry(root,textvariable=msg,bg='#FBF7F5',fg='#3E3E43',font=('Ink Free', 12))
e.place(relx=0.2, rely=0.55, relwidth=0.58, relheight=0.2)

b = tk.Button(root, text='Check', command=lambda : predict())
b.config(bg='#eeeeee', fg='black', font=('Ink Free', 12, 'bold'),border=5)
b.place(relx=0.63, rely=0.83, relwidth=0.15, relheight=0.05)

def predict():
    ms = msg.get()
    f = [ ms ]
    t_msg = vector.transform(f)
    m = model.predict(t_msg)
    if m=='spam':
        text ="Your Message seems to be Spam."
    else:
        text ="Your Message seems to se Ham."
        
    label = tk.Label(root,text=text,relief='sunken')
    label.config(bg='#FFFFFF',font=('Ink Free', 14))
    label.place(relx=0.16, rely=0.92, relwidth=0.66, relheight=0.06)

canvas.pack()
root.mainloop()