In [46]:
import sys
import nltk
import pandas 
import numpy


In [47]:
import pandas as pd
import numpy as np

df = pd.read_table('SMSSPamCollection', header=None, encoding='utf-8')


In [48]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None


In [49]:
print(df.head())

      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [50]:
#check class distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [51]:
#convert class labels to binary values
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
Y = encoder.fit_transform(classes)
print(classes[:10])
print(Y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [52]:
text_messages = df[1]
print(text_messages[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [53]:
# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddr')
# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

# Remove punctuation
processed = processed.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

In [54]:
# change words to lower case - Hello, HELLO, hello are all the same word
processed = processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in numbr a wkly comp to win fa cup ...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbrnd time we have tried numbr c...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [55]:
from nltk.corpus import stopwords

# remove stop words from text messages

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

In [56]:
# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))

In [57]:
from nltk.tokenize import word_tokenize
all_words = []
for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)


In [58]:
#print total number of words and 15 most common words
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))


Number of words: 6579
Most common words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 451), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [59]:
#use 1500 most common words as feature
word_features = list(all_words.keys())[:1500]

In [60]:
def find_features(messages):
    word = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in word)
        
    return features

#Lets see an example 
features = find_features(processed[0])

for key , value in features.items():
    if value == True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat
ok
lar
joke
wif
u
oni
free
entri
numbr
wkli
comp
win
fa
cup
final
tkt
numbrst
may
text
receiv
question
std
txt
rate
c
appli
numbrovernumbr
dun
say
earli
hor
alreadi
nah
think
goe
usf
live
around
though
freemsg
hey
darl
week
word
back
like
fun
still
tb
xxx
chg
send
moneysymbnumbr
rcv
even
brother
speak
treat
aid
patent
per
request
mell
oru
minnaminungint
nurungu
vettam
set
callertun
caller
press
copi
friend
winner
valu
network
custom
select
receivea
prize
reward
claim
call
code
klnumbr
valid
hour
mobil
month
r
entitl
updat
latest
colour
camera
co
gon
na
home
soon
want
talk
stuff
anymor
tonight
k
cri
enough
today
six
chanc
cash
pound
cshnumbr
cost
numbrp
day
numbrday
tsandc
repli
hl
info
urgent
membership
jackpot
www
dbuk
net
lccltd
pobox
numbrldnwnumbranumbrrwnumbr
search
right
thank
breather
promis
wont
take
help
grant
fulfil
wonder
bless
time
date
sunday
xxxmobilemovieclub
use
credit
click
wap
link
next
messa

In [61]:
processed[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [62]:
features

{'go': True,
 'jurong': True,
 'point': True,
 'crazi': True,
 'avail': True,
 'bugi': True,
 'n': True,
 'great': True,
 'world': True,
 'la': True,
 'e': True,
 'buffet': True,
 'cine': True,
 'got': True,
 'amor': True,
 'wat': True,
 'ok': True,
 'lar': True,
 'joke': True,
 'wif': True,
 'u': True,
 'oni': True,
 'free': True,
 'entri': True,
 'numbr': True,
 'wkli': True,
 'comp': True,
 'win': True,
 'fa': True,
 'cup': True,
 'final': True,
 'tkt': True,
 'numbrst': True,
 'may': True,
 'text': True,
 'receiv': True,
 'question': True,
 'std': True,
 'txt': True,
 'rate': True,
 'c': True,
 'appli': True,
 'numbrovernumbr': True,
 'dun': True,
 'say': True,
 'earli': True,
 'hor': True,
 'alreadi': True,
 'nah': True,
 'think': True,
 'goe': True,
 'usf': True,
 'live': True,
 'around': True,
 'though': True,
 'freemsg': True,
 'hey': True,
 'darl': True,
 'week': True,
 'word': True,
 'back': True,
 'like': True,
 'fun': True,
 'still': True,
 'tb': True,
 'xxx': True,
 'chg':

In [63]:
# Now lets do it for all the messages
messages = list(zip(processed, Y))

# define a seed for reproducibility
seed = 1
np.random.seed = seed
np.random.shuffle(messages)

# call find_features function for each SMS message
featuresets = [(find_features(text), label) for (text, label) in messages]

In [64]:
from sklearn import model_selection

training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state = seed)

In [65]:
print(len(training))
print(len(testing))

4179
1393


In [66]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [67]:
#Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)
print(models)

<zip object at 0x14be53910>


In [79]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 200),
    MultinomialNB()
]

models = list(zip(names, classifiers))

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 86.71931083991386
Decision Tree Accuracy: 86.71931083991386
Random Forest Accuracy: 86.71931083991386
Logistic Regression Accuracy: 86.71931083991386




SGD Classifier Accuracy: 86.71931083991386
Naive Bayes Accuracy: 86.71931083991386


In [80]:
from sklearn.ensemble import VotingClassifier

names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    
]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Voting Classifier: Accuracy: 86.71931083991386


In [82]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

In [84]:
#print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels,prediction),
    index = [['actual','actual'],['ham','spam']],
    columns = [['predicted', 'predicted'], ['ham', 'spam']]
)

              precision    recall  f1-score   support

           0       0.87      1.00      0.93      1208
           1       0.00      0.00      0.00       185

    accuracy                           0.87      1393
   macro avg       0.43      0.50      0.46      1393
weighted avg       0.75      0.87      0.81      1393



  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1208,0
actual,spam,185,0
