# 1.Importing the Dependencies

In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB


# 2.Import Data

In [5]:
spam_df=pd.read_csv('/content/Spam Email Detection - spam.csv')

# 3.Inspect Data

In [36]:
spam_df

Unnamed: 0,v1,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will �_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [8]:
spam_df.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2,Unnamed: 2,Unnamed: 2,Unnamed: 2,Unnamed: 2,Unnamed: 3,Unnamed: 3,Unnamed: 3,Unnamed: 3,Unnamed: 4,Unnamed: 4,Unnamed: 4,Unnamed: 4
Unnamed: 0_level_1,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
ham,4825,4516,"Sorry, I'll call later",30,45,39,"bt not his girlfrnd... G o o d n i g h t . . .@""",3,10,9,GE,2,6,5,"GNT:-)""",2.0
spam,747,647,Please call our customer service representativ...,4,5,4,PO Box 5249,2,2,1,"MK17 92H. 450Ppw 16""",2,0,0,,


# 4.Data Cleaning

In [9]:
# drop last 3 cols
spam_df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [20]:
spam_df

Unnamed: 0,v1,v2,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will �_ b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [11]:
# inspect data
spam_df.groupby('v1').describe()

Unnamed: 0_level_0,v2,v2,v2,v2
Unnamed: 0_level_1,count,unique,top,freq
v1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,647,Please call our customer service representativ...,4


# 5.Turn spam/ham into numerical data

In [12]:
#turn spam/ham into numerical data, creating a new column 'spam'
spam_df['spam']=spam_df['v1'].apply(lambda x: 1 if x == 'spam' else 0)

# 6.Create train/test split

In [25]:
# create train/test split
x_train, x_test, y_train, y_test = train_test_split(spam_df.v2, spam_df.spam, test_size = 0.25)

In [27]:
x_train

5176    Company is very good.environment is terrific a...
4540    Hiya. How was last night? I've been naughty an...
4963    Dear Voucher holder Have your next meal on us....
5147    Get your garden ready for summer with a FREE s...
1933                              R u over scratching it?
                              ...                        
3288    Camera - You are awarded a SiPix Digital Camer...
3392                                          Ok thanx...
3934                             You need to get up. Now.
4704                             Hey anyway i have to :-)
4254    Block Breaker now comes in deluxe format with ...
Name: v2, Length: 4179, dtype: object

# 7.Find word count and store data as a matrix

In [28]:
# find word
cv=CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)

In [37]:
x_train_count

<4179x7417 sparse matrix of type '<class 'numpy.int64'>'
	with 55675 stored elements in Compressed Sparse Row format>

In [38]:
x_train_count.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# 8.Train Model


*   Pre-Test ham
*   Pre-Test spam



In [31]:
# train model
model = MultinomialNB()
model.fit(x_train_count, y_train)

In [32]:
# pre-test ham
email_ham = [" hey wanna meet up for the game?"]
email_ham_count = cv.transform(email_ham)
model.predict(email_ham_count)

array([0])

In [33]:
# pre-test spam
email_spam = [" reward money click aaawa kohvba"]
email_spam_count = cv.transform(email_spam)
model.predict(email_spam_count)


array([1])

# 9.Test Model

In [39]:
#Test model
x_test_count = cv.transform(x_test)
model.score(x_test_count, y_test)

0.9834888729361091