In [203]:
import re

import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
# nltk.download('stopwords') # uncomment if your machine doesn't contain stopwords already

In [193]:
def text_cleanup(original_text):
    """
    1. Original to lowercase.
    2. Remove all non-alphanumeric characters.
    3. Remove redundant whitespaces.
    4. Remove stopwords.
    5. Return the new text.
    """
    
    # 1. Original to lowercase
    lower_text = original_text.lower()
    
    # 2. Remove all non-alphanumeric characters
    alphanumeric_text = re.sub(r'[^\d\w\s]', '', lower_text)
    
    # 3. Remove redundant whitespaces.
    whitespaces = ' '.join([word for word in alphanumeric_text.split() if word != ' '])
    
    # 4. Remove stopwords
    stop = stopwords.words('english')
    without_stopwords = ' '.join([word for word in whitespaces.split() if word not in stop])
    
    return without_stopwords

In [194]:
# Read the .csv file and drop 
sms_messages = pd.read_csv('spam.csv', names=['label', 'message'], index_col=False, header=0, encoding="ISO-8859-1")
sms_messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [195]:
sms_messages['cleaned'] = sms_messages['message'].apply(lambda row: text_cleanup(row))

In [196]:
sms_messages['label'] = sms_messages.label.map({'ham': 0, 'spam': 1})
sms_messages

Unnamed: 0,label,message,cleaned
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u u å750 pound prize ...
5568,0,Will Ì_ b going to esplanade fr home?,ì_ b going esplanade fr home
5569,0,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestions
5570,0,The guy did some bitching but I acted like i'd...,guy bitching acted like id interested buying s...


In [197]:
sms_messages.describe()

Unnamed: 0,label
count,5572.0
mean,0.134063
std,0.340751
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [198]:
vectorizer = CountVectorizer()
tokenizer = vectorizer.build_tokenizer()
X = sms_messages.cleaned
y = sms_messages.label
print(X)
print(y)

0       go jurong point crazy available bugis n great ...
1                                 ok lar joking wif u oni
2       free entry 2 wkly comp win fa cup final tkts 2...
3                     u dun say early hor u c already say
4             nah dont think goes usf lives around though
                              ...                        
5567    2nd time tried 2 contact u u å750 pound prize ...
5568                         ì_ b going esplanade fr home
5569                          pity mood soany suggestions
5570    guy bitching acted like id interested buying s...
5571                                       rofl true name
Name: cleaned, Length: 5572, dtype: object
0       0
1       0
2       1
3       0
4       0
       ..
5567    1
5568    0
5569    0
5570    0
5571    0
Name: label, Length: 5572, dtype: int64


In [199]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4179,)
(1393,)
(4179,)
(1393,)


In [205]:
X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

In [206]:
nb = MultinomialNB()
%time nb.fit(X_train_dtm, y_train)

CPU times: user 2.76 ms, sys: 1.36 ms, total: 4.12 ms
Wall time: 3.92 ms


MultinomialNB()

In [207]:
y_pred_class = nb.predict(X_test_dtm)

In [208]:
metrics.accuracy_score(y_test, y_pred_class)

0.9834888729361091

In [209]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[1205,    8],
       [  15,  165]])