In [1]:
# importing the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the labelled SMS messages from spam.csv (expected in the notebook's working directory).
# The displayed frame has two columns: Category ("ham"/"spam") and Message (the raw text).

df = pd.read_csv(filepath_or_buffer='spam.csv')
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Summarise the messages per category (count / unique / most frequent message / its frequency)
# to see how many rows fall into each of the two classes.

df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [33]:
# Add a numeric target column derived from the categorical Category column:
# 1 where Category == 'spam', 0 for everything else.
# A vectorized comparison replaces the row-by-row Python lambda; the explicit
# int64 cast keeps the same dtype the lambda-based version produced.

df['spam'] = (df['Category'] == 'spam').astype('int64')
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [34]:
# Split the messages (features) and their 0/1 spam labels into a training set (80%)
# and a held-out test set (20%).

from sklearn.model_selection import train_test_split

# Fixed seed so that Restart Kernel -> Run All reproduces the same split, and
# therefore the same vocabulary size and scores, on every run.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    df.Message,
    df.spam,
    test_size=0.2,
    random_state=RANDOM_STATE,
    # The classes are imbalanced (far more ham than spam); stratifying keeps
    # the ham/spam ratio the same in the training and test sets.
    stratify=df.spam,
)

In [35]:
# Number of messages that ended up in the training split.
X_train.shape[0]

4457

In [36]:
# Number of messages held out in the test split.
X_test.shape[0]

1115

In [37]:
# Convert the training messages into a matrix of token counts.
# CountVectorizer tokenizes each message into words, learns a vocabulary from the
# training data (one column per distinct token), and counts how many times each
# token occurs in each message (one row per message).

from sklearn.feature_extraction.text import CountVectorizer
v=CountVectorizer()
X_train_count = v.fit_transform(X_train.values)

# Preview the first 5 rows. Slice the sparse matrix *before* densifying so that
# only 5 rows are materialized instead of the whole (messages x vocabulary) array.
X_train_count[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [38]:
# Shape of the training count matrix: (number of training messages, number of distinct tokens in the learned vocabulary).
X_train_count.shape

(4457, 7729)

In [39]:
# Train a Multinomial Naive Bayes classifier on the token-count matrix
# built from the training messages and their 0/1 spam labels.

from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X=X_train_count, y=y_train)

In [40]:
# Sanity-check the trained model on two hand-written messages.
# They are vectorized with the vocabulary already fitted on the training data
# (transform only, no refit); the prediction is 1 for spam and 0 for not spam.

emails = [
    'Hey Mohan,Can we get together to watch football match tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!',
]
email_count = v.transform(emails)
model.predict(email_count)

array([0, 1], dtype=int64)

In [42]:
# Vectorize the held-out messages with the already-fitted vectorizer (no refit)
# and report the classifier's mean accuracy on the test set.
X_test_count = v.transform(X_test)
model.score(X=X_test_count, y=y_test)

0.9883408071748879

In [46]:
# Bundle vectorization and classification into one scikit-learn Pipeline
# (a sequence of transforms followed by a final estimator). Raw text can then be
# passed straight to fit/score/predict, which removes the extra manual step of
# transforming the messages into count matrices as done above.

from sklearn.pipeline import Pipeline

clf = Pipeline(
    steps=[
        ('Vectorizer', CountVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [47]:
# Fit the whole pipeline on the raw training messages: the vectorizer learns the
# vocabulary and the Naive Bayes step is trained on the resulting counts.
clf.fit(X=X_train, y=y_train)

In [48]:
# Mean accuracy of the pipeline on the raw held-out messages.
clf.score(X=X_test, y=y_test)

0.9883408071748879

In [50]:
# Classify the two sample messages directly from raw text (1 = spam, 0 = not spam).
clf.predict(X=emails)

array([0, 1], dtype=int64)