In [1]:
# Load the SMS dataset from spam.csv (decoded as latin1) and preview the first five rows.
import pandas as pd

dts = pd.read_csv(
    'spam.csv',
    encoding='latin1',
)
dts.head(n=5)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [2]:
# Remove the unwanted "Unnamed" columns and give the remaining ones readable names.
# `dts` is rebound instead of mutated with inplace=True, and errors='ignore' lets
# the drop succeed when the columns are already gone, so re-running this cell no
# longer raises KeyError.
dts = (
    dts
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
    .rename(columns={'v1': 'labels', 'v2': 'message'})
)
dts.head()

Unnamed: 0,labels,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Class balance of the label column.
# NOTE(review): the recorded output shows 0/1 keys, so this cell was executed after
# the ham/spam -> 0/1 mapping cell; on a fresh top-to-bottom run it reports ham/spam.
dts.loc[:, 'labels'].value_counts()

0    4825
1     747
Name: labels, dtype: int64

In [3]:
# Encode the class labels numerically: ham -> 0, spam -> 1.
# Series.map turns any value missing from the mapping into NaN, so this cell
# should be run exactly once on the original string labels.
label_codes = {'ham': 0, 'spam': 1}
dts['labels'] = dts['labels'].map(label_codes)
dts.head()

Unnamed: 0,labels,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Lower-case every message.
# The vectorized .str accessor replaces the per-element lambda: it leaves missing
# values as NaN instead of raising AttributeError on a non-string entry.
dts['message'] = dts['message'].str.lower()
dts.head()

Unnamed: 0,labels,message
0,0,"go until jurong point, crazy.. available only ..."
1,0,ok lar... joking wif u oni...
2,1,free entry in 2 a wkly comp to win fa cup fina...
3,0,u dun say so early hor... u c already then say...
4,0,"nah i don't think he goes to usf, he lives aro..."


In [5]:
# Remove digits and punctuation from the messages.
# regex=True is required: since pandas 2.0 Series.str.replace treats the pattern as
# literal text by default, which would leave the messages unchanged.
# Raw strings keep the backslash escapes intact for the regex engine.
dts['message'] = dts['message'].str.replace(r'\d', '', regex=True)      # drop digits
dts['message'] = dts['message'].str.replace(r'[^\w\s]', '', regex=True)  # drop anything that is not a word char or whitespace
dts.head()

Unnamed: 0,labels,message
0,0,go until jurong point crazy available only in ...
1,0,ok lar joking wif u oni
2,1,free entry in a wkly comp to win fa cup final...
3,0,u dun say so early hor u c already then say
4,0,nah i dont think he goes to usf he lives aroun...


In [6]:
# Tokenize: turn each message string into a list of word tokens.
from nltk import word_tokenize

tokenized_messages = dts['message'].apply(word_tokenize)
dts['message'] = tokenized_messages
dts.head()

Unnamed: 0,labels,message
0,0,"[go, until, jurong, point, crazy, available, o..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, in, a, wkly, comp, to, win, fa, ..."
3,0,"[u, dun, say, so, early, hor, u, c, already, t..."
4,0,"[nah, i, dont, think, he, goes, to, usf, he, l..."


In [7]:
# Remove English stopwords from every tokenized message.
from nltk.corpus import stopwords

stp = stopwords.words('english')
# Set-based lookup: testing membership against the list rescanned every stopword
# for every token in every message.
stp_lookup = frozenset(stp)

# NOTE(review): punctuation was stripped in an earlier cell, and tokens such as
# "dont" survive this filter (see the preview) - presumably because the stopword
# list spells contractions with an apostrophe. Confirm whether that is intended.
dts['message'] = dts['message'].apply(
    lambda tokens: [word for word in tokens if word not in stp_lookup]
)
dts.head()

Unnamed: 0,labels,message
0,0,"[go, jurong, point, crazy, available, bugis, n..."
1,0,"[ok, lar, joking, wif, u, oni]"
2,1,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, early, hor, u, c, already, say]"
4,0,"[nah, dont, think, goes, usf, lives, around, t..."


In [8]:
# Reduce every token to its Porter stem to collapse word variants.
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def stem_tokens(tokens):
    """Return the Porter stem of each token, preserving order."""
    return [stemmer.stem(token) for token in tokens]


dts['message'] = dts['message'].apply(stem_tokens)
dts.head()

Unnamed: 0,labels,message
0,0,"[go, jurong, point, crazi, avail, bugi, n, gre..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entri, wkli, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, earli, hor, u, c, alreadi, say]"
4,0,"[nah, dont, think, goe, usf, live, around, tho..."


In [71]:
# Apply the WordNet lemmatizer to every token.
# NOTE(review): this runs on tokens that were already stemmed in the previous cell,
# and the preview below is identical to the stemmed one - consider whether this
# step is needed or should come before stemming.
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return the WordNet lemma of each token, preserving order."""
    return [lemmatizer.lemmatize(token) for token in tokens]


dts['message'] = dts['message'].apply(lemmatize_tokens)
dts.head()

Unnamed: 0,labels,message
0,0,"[go, jurong, point, crazi, avail, bugi, n, gre..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entri, wkli, comp, win, fa, cup, final,..."
3,0,"[u, dun, say, earli, hor, u, c, alreadi, say]"
4,0,"[nah, dont, think, goe, usf, live, around, tho..."


In [9]:
# Re-assemble each token list into a single space-separated string, which is the
# form the vectorizer in the next step consumes.
joined_messages = dts['message'].apply(' '.join)
dts['message'] = joined_messages

dts.head()

Unnamed: 0,labels,message
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri wkli comp win fa cup final tkt st m...
3,0,u dun say earli hor u c alreadi say
4,0,nah dont think goe usf live around though


In [17]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the cleaned messages and build the document-term
# matrix of raw token counts.
# NOTE(review): the vectorizer is fitted on the full dataset, before the
# train/test split performed in a later cell.
corpus = dts['message']
count_vect = CountVectorizer()
counts = count_vect.fit_transform(corpus)

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the count matrix, then rebind `counts` to the
# re-weighted matrix (from here on `counts` holds TF-IDF values, not raw counts).
transformer = TfidfTransformer()
transformer.fit(counts)

counts = transformer.transform(counts)

In [19]:
# Hold out 10% of the rows as a test set; the fixed random_state makes the split
# reproducible.
from sklearn.model_selection import train_test_split

features = counts
targets = dts['labels']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.1,
    random_state=42,
)


In [20]:
# Create a Multinomial Naive Bayes classifier and train it on the training split.
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train, y_train)

In [77]:
import numpy as np

# Predict on the held-out rows and report accuracy: the fraction of predictions
# that equal the true label.
predicted = model.predict(X_test)

is_correct = predicted == y_test
print(np.mean(is_correct))

0.9802867383512545
