In [1]:
import re
import pandas as pd
import numpy as np
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix


### Spam Classification
Deciding whether an email is spam or not.

## Step 1: loading the dataset

In [2]:
# Read the raw CSV and discard the three unnamed columns in one chained expression
df=(
    pd.read_csv('spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)

# df['v1'] holds the class label (ham/spam) and df['v2'] the message text
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


## Step 2: removing stopwords and stemming

In [4]:
# A stemming algorithm reduces words like fishing, fished and fisher to the
# stem fish. The stem need not be a real word: argue, argued, argues, arguing
# and argus could all be reduced to the stem argu.
stemmer=SnowballStemmer('english')

# Stop words are the most common words in a language; they are filtered out
# before natural language data is processed.
stop=set(stopwords.words('english'))

# Replace every character that is not an ASCII letter with a space,
# then split each message on whitespace.
non_letters=re.compile('[^a-zA-Z]')
df['v2']=[non_letters.sub(' ', text) for text in df['v2']]
word_list=[text.split() for text in df['v2']]
def normalize(words):
    """Return the lower-cased stems of `words`, skipping stop words.

    A word is skipped when its lower-cased form is in the module-level
    `stop` set; every remaining word is stemmed with the module-level
    `stemmer` and lower-cased.
    """
    return [
        stemmer.stem(token).lower()
        for token in words
        if token.lower() not in stop
    ]
# Normalize every token list, then glue each one back into a single string
word_list=list(map(normalize, word_list))
df['v2']=list(map(" ".join, word_list))


In [5]:
# df['v1'] is the class variable and df['v2'] is the processed email text
df

Unnamed: 0,v1,v2
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkts st ...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though
...,...,...
5567,spam,nd time tri contact u u pound prize claim easi...
5568,ham,b go esplanad fr home
5569,ham,piti mood suggest
5570,ham,guy bitch act like interest buy someth els nex...


In [6]:
# Hold out 20% of the rows for testing; random_state pins the shuffle
x_train, x_test, y_train, y_test=train_test_split(
    df['v2'],
    df['v1'],
    test_size=0.2,
    random_state=7,
)

## Step 3: transforming emails into numerical vectors

In [8]:
# CountVectorizer counts word occurrences
cv=CountVectorizer()

# Fit on the training emails only, then reuse the fitted vectorizer for the
# test emails. Entry (i, j) of the result is the number of times word j
# appears in the i-th email, so rows are emails and columns are words.
x_train_df=cv.fit_transform(x_train)
x_test_df=cv.transform(x_test)

print("number of emails=",x_train_df.shape[0])
print("number of words=",x_train_df.shape[1])

# this is a sparse matrix (only the non-zero elements are stored)
x_train_df

number of emails= 4457
number of words= 5595


<4457x5595 sparse matrix of type '<class 'numpy.int64'>'
	with 35900 stored elements in Compressed Sparse Row format>

In [9]:
row_index=0 #select one email

# Densify the selected row once, as a plain ndarray. `.todense()` returns an
# np.matrix, which recent scikit-learn releases reject during input
# validation, so it must not be passed to cv.inverse_transform.
row_dense=x_train_df[row_index,:].toarray()
print(row_dense.shape)
print("this is the non-sparse matrix=",row_dense)

#column indexes of the words that occur at least once in this email
ind=np.flatnonzero(row_dense[0]>0)
print()
#original words in the email
print(x_train.values[row_index])
print()
#decoded numerical input (the sparse row is passed directly)
print(cv.inverse_transform(x_train_df[row_index,:]))
print()
#index of those words in the dense row
print(ind)
print()
#number of times those words appear in the email
print(x_train_df[row_index,ind].toarray())

(1, 5595)
this is the non-sparse matrix= [[0 0 0 ... 0 0 0]]

wish great day moji told offer alway speechless offer easili go great length behalf stun exam next friday keep touch sorri

[array(['alway', 'behalf', 'day', 'easili', 'exam', 'friday', 'go',
       'great', 'keep', 'length', 'moji', 'next', 'offer', 'sorri',
       'speechless', 'stun', 'told', 'touch', 'wish'], dtype='<U34')]

[ 162  458 1140 1398 1552 1790 1921 1991 2541 2676 3047 3214 3328 4456
 4483 4620 4944 4984 5405]

[[1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1]]


## Step 4: training the classifier and making predictions for the test set

In [11]:

# Multinomial naive Bayes: fit() returns the fitted estimator itself
clf=MultinomialNB().fit(x_train_df,y_train)

# Predict on the data the model was trained on and on the held-out data
prediction_train, prediction_test=(
    clf.predict(features) for features in (x_train_df, x_test_df)
)

## Step 5: computing accuracy and confusion matrix

In [12]:
# accuracy on the training set, followed by a blank line
print(f"Accuracy:{accuracy_score(y_train,prediction_train)}", end="\n\n")

Accuracy:0.9923715503702042



We care about the generalisation error, that is the performance on unseen data.

In [13]:

# accuracy on the held-out test set, followed by a blank line
print(f"Accuracy:{accuracy_score(y_test,prediction_test)}", end="\n\n")

# rows are the true classes, columns the predicted classes
conf_mat=confusion_matrix(y_test, prediction_test)
print("Confusion Matrix", conf_mat, sep="\n")


Accuracy:0.989237668161435

Confusion Matrix
[[965   5]
 [  7 138]]


#### Where can we find sparse matrices?
You can manipulate them using scipy.sparse

In [15]:
import scipy.sparse as sc #this is the library

#x_train_df is a scipy sparse matrix, which avoids storing the zeroes
#sc.find returns (row indexes, column indexes, values) of the selected entries;
#element [1] holds the column indexes of the non-zero elements
i=0 # email index
ind=sc.find(x_train_df[i,:]>0)[1]
print("indexes of non-zeroes elements=",ind)
#use the same row index `i` that produced `ind`, so the two always agree
x_train_df[i,ind].todense()


indexes of non-zeroes elements= [ 162  458 1140 1398 1552 1790 1921 1991 2541 2676 3047 3214 3328 4456
 4483 4620 4944 4984 5405]


matrix([[1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1]])

In [16]:
#test set
i=0 # email index, set here so this cell does not depend on a value left over from another cell
ind=sc.find(x_test_df[i,:]>0)[1]
print("indexes of non-zeroes elements=",ind)
#use the same row index `i` that produced `ind`, so the two always agree
x_test_df[i,ind].todense()


indexes of non-zeroes elements= [2870 3588]


matrix([[1, 1]])

## Question

We consider the Movie Reviews Corpus, a dataset of movie reviews that are categorized as positive or negative.

In [17]:
import random
import nltk

nltk.download('movie_reviews')
from nltk.corpus import movie_reviews

# Collect every (category, words) record first and build the frame once.
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration. The words are materialized into a list
# so each cell holds plain tokens rather than a lazy corpus view.
records=[
    {'v1': category, 'v2': list(movie_reviews.words(fileid))}
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
df = pd.DataFrame(records, columns=['v1', 'v2'])

# one token list per review
word_list=list(df['v2'])
def normalize(words):
    """Return the lower-cased stems of the alphabetic, non-stop-word tokens.

    This redefines the `normalize` used for the spam messages and adds an
    `isalpha()` filter that removes punctuation tokens. A token is kept
    only if its lower-cased form is not in the module-level `stop` set and
    it is purely alphabetic; kept tokens are stemmed with the module-level
    `stemmer` and lower-cased.
    """
    kept=(
        token
        for token in words
        if token.lower() not in stop and token.isalpha()
    )
    return [stemmer.stem(token).lower() for token in kept]
# Normalize every review's tokens, then join each list back into one string
word_list=list(map(normalize, word_list))
df['v2']=list(map(" ".join, word_list))


[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/aman/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [18]:
# inspect the normalized review text (space-joined stems, one string per review)
df['v2']

0       plot two teen coupl go church parti drink driv...
1       happi bastard quick movi review damn bug got h...
2       movi like make jade movi viewer thank invent t...
3       quest camelot warner bros first featur length ...
4       synopsi mental unstabl man undergo psychothera...
                              ...                        
1995    wow movi everyth movi funni dramat interest we...
1996    richard gere command actor alway great film ev...
1997    glori star matthew broderick denzel washington...
1998    steven spielberg second epic film world war ii...
1999    truman true man burbank perfect name jim carre...
Name: v2, Length: 2000, dtype: object

Using the same steps as in the spam filter example, apply MultinomialNB to this dataset.

In [19]:
# Hold out 20% of the reviews for testing; random_state pins the shuffle
x_train, x_test, y_train, y_test=train_test_split(
    df['v2'],
    df['v1'],
    test_size=0.2,
    random_state=7,
)

In [20]:
#Step3
# CountVectorizer counts word occurrences
cv=CountVectorizer()

# Fit on the training reviews only, then reuse the fitted vectorizer for the
# test reviews. Entry (i, j) of the result is the number of times word j
# appears in the i-th review, so rows are reviews and columns are words.
x_train_df=cv.fit_transform(x_train)
x_test_df=cv.transform(x_test)

print("number of reviews=",x_train_df.shape[0])
print("number of words=",x_train_df.shape[1])

# this is a sparse matrix (only the non-zero elements are stored)
x_train_df

number of reviews= 1600
number of words= 22746


<1600x22746 sparse matrix of type '<class 'numpy.int64'>'
	with 398985 stored elements in Compressed Sparse Row format>

In [21]:
row_index=0 #select one review

# Densify the selected row once, as a plain ndarray. `.todense()` returns an
# np.matrix, which recent scikit-learn releases reject during input
# validation, so it must not be passed to cv.inverse_transform.
row_dense=x_train_df[row_index,:].toarray()
print(row_dense.shape)
print("this is the non-sparse matrix=",row_dense)

#column indexes of the words that occur at least once in this review
ind=np.flatnonzero(row_dense[0]>0)
print()
#original words in the review
print(x_train.values[row_index])
print()
#decoded numerical input (the sparse row is passed directly)
print(cv.inverse_transform(x_train_df[row_index,:]))
print()
#index of those words in the dense row
print(ind)
print()
#number of times those words appear in the review
print(x_train_df[row_index,ind].toarray())

(1, 22746)
this is the non-sparse matrix= [[0 0 0 ... 0 0 0]]

someon journey theater see comedi alway risk sit inan recent saw film like edtv offic space realli bad comedi hit miss moviego goe theater expect amus realli shame alleg comedi fail deliv weep dear reader latest comedi hollywood movi mill noth less sure bet austin power spi shag one funniest thing pleasur see long time complet looney delight parodi often pretenti jame bond flick comedi even half say even fifth could consist hyster would take resid local multiplex even movi credit fact sequel sleeper cult hit austin power intern man mysteri origin came nowher low budget eccentr movi mani expect flop like lesli nielsen parodi sudden quot teenag america iron power yeeeeah babi yeah almost becom icon late thus anoth instal inevit also undoubt welcom spi shag one unrestrain top comedi ever seen entir life bring back charact predecessor even briefli add new one austin power mike myer swing hipster transport go back time get back 

In [22]:
#Step 4
# Multinomial naive Bayes: fit() returns the fitted estimator itself
clf=MultinomialNB().fit(x_train_df,y_train)

# Predict on the data the model was trained on and on the held-out data
prediction_train, prediction_test=(
    clf.predict(features) for features in (x_train_df, x_test_df)
)

In [23]:
#Step5
# accuracy on the training set, followed by a blank line
print(f"Accuracy:{accuracy_score(y_train,prediction_train)}", end="\n\n")

Accuracy:0.96375



In [24]:

# accuracy on the held-out test set, followed by a blank line
print(f"Accuracy:{accuracy_score(y_test,prediction_test)}", end="\n\n")

# rows are the true classes, columns the predicted classes
conf_mat=confusion_matrix(y_test, prediction_test)
print("Confusion Matrix", conf_mat, sep="\n")


Accuracy:0.8275

Confusion Matrix
[[169  29]
 [ 40 162]]
