In [1]:
import pandas as pd
import numpy as np
import nltk
import sys

#load the dataset
df=pd.read_table('SMSspamCollection',header=None,encoding='utf-8')

print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [2]:
#check class distribution
classes=df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


In [3]:
#preprocessing of data
from sklearn.preprocessing import LabelEncoder
encoder=LabelEncoder()
y=encoder.fit_transform(classes)
print(classes[:10])
print(y[:10])

0     ham
1     ham
2    spam
3     ham
4     ham
5    spam
6     ham
7     ham
8    spam
9    spam
Name: 0, dtype: object
[0 0 1 0 0 1 0 0 1 1]


In [4]:
#store the smsmessage data
text_message=df[1]
print(text_message[:10])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
5    FreeMsg Hey there darling it's been 3 week's n...
6    Even my brother is not like to speak with me. ...
7    As per your request 'Melle Melle (Oru Minnamin...
8    WINNER!! As a valued network customer you have...
9    Had your mobile 11 months or more? U R entitle...
Name: 1, dtype: object


In [5]:
#use regular expression to replace email,phoneumber,specialharacter

#replace email address with emailaddr
processed=text_message.str.replace(r'^.+@[^\.].*\.[a-z]{2,} $','emailaddr')

#replace urls with webaddr
processed=processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\$*)?$','webaddress')

#replace money with moneysymbol
processed=processed.str.replace(r'$|\ $','moneysymb')

#replace 10 digit phonumber with phonenumber
processed=processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4} $','phonenum')

#replace normal n numbers with nmbr
processed=processed.str.replace(r'\d+(\.\d+)?','nmbr')
#replae punctuation
processed=processed.str.replace(r'[^\w\d\s]',' ')
#remove ws
processed=processed.str.replace(r'\s+',' ')
#remove leading and trailing ws
processed=processed.str.replace(r'^\s+|\s+?$','')

processed=processed.str.lower()
print(processed)

0       go until jurong point crazy available only in ...
1                       ok lar joking wif u oni moneysymb
2       free entry in nmbr a wkly comp to win fa cup f...
3       u dun say so early hor u c already then say mo...
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the nmbrnd time we have tried nmbr con...
5568        will ü b going to esplanade fr home moneysymb
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                   rofl its true to its namemoneysymb
Name: 1, Length: 5572, dtype: object


In [14]:
#nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))
processed=processed.apply(lambda x:' '.join(term for term in x.split() if term not in stop_words ))

In [15]:
#remove ing s or es
ps=nltk.PorterStemmer()
processed=processed.apply(lambda x:' '.join (ps.stem(term)for term in x.split()))

In [16]:
print(processed[:25])

0     go jurong point crazi avail bugi n great world...
1                       ok lar joke wif u oni moneysymb
2     free entri nmbr wkli comp win fa cup final tkt...
3         u dun say earli hor u c alreadi say moneysymb
4         nah think goe usf live around thoughmoneysymb
5     freemsg hey darl nmbr week word back like fun ...
6     even brother like speak treat like aid patent ...
7     per request mell mell oru minnaminungint nurun...
8     winner valu network custom select receivea nmb...
9     mobil nmbr month u r entitl updat latest colou...
10    gonna home soon want talk stuff anymor tonight...
11    six chanc win cash nmbr nmbr nmbr pound txt cs...
12    urgent nmbr week free membership nmbr nmbr pri...
13    search right word thank breather promi wont ta...
14                                date sunday moneysymb
15    xxxmobilemovieclub use credit click wap link n...
16                                 oh k watch moneysymb
17    eh u rememb nmbr spell name ye v naughti m

In [17]:
#nltk.download('punkt')
from nltk.tokenize import word_tokenize
all_word=[]
for message in processed:
    words=word_tokenize(message)
    for w in words:
        all_word.append(w)
all_word=nltk.FreqDist(all_word)            
            

In [18]:
print('Number of words: {}'.format(len(all_word)))
print('Most common words: {}'.format(all_word.most_common(15)))

Number of words: 7281
Most common words: [('moneysymb', 3169), ('nmbr', 2810), ('u', 1198), ('call', 663), ('go', 454), ('get', 452), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 300), ('ok', 281), ('free', 278), ('know', 269), ('love', 262), ('day', 260)]


In [19]:
word_features=list(all_word.keys())[:1500]

In [20]:
#find feature function
def find_features(message):
    words=word_tokenize(message)
    features={}
    for word in word_features:
        features[word]=(word in words)
    return features

#example
features=find_features(processed[0])
for key, value in features.items():
    if value==True:
        print(key)

go
jurong
point
crazi
avail
bugi
n
great
world
la
e
buffet
cine
got
amor
wat
moneysymb


In [23]:
#find features of all messages
messages= list(zip(processed,y))
#define a seed for reproductivity
seed=1
np.random.seed=seed
np.random.shuffle(messages) 

#call find feature function for each messages
featuresets = [(find_features(text),label) for (text,label) in messages]

In [37]:
#split faeture set to tarin and test data
from sklearn import model_selection
training,testing=model_selection.train_test_split(featuresets,test_size=0.40,random_state=seed)

In [38]:
print(len(training))
print(len(testing))

3343
2229


# scikit learn classifier with nltk

In [39]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix

In [40]:
names=['K nearest neighbor','Decision Tree','Random Forest','Logistic Regression','SGD Classifier','Naive Bayes','SVM Linear']
classifiers=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = zip(names,classifiers)

In [41]:
#wrap models into nltk
from nltk.classify.scikitlearn import SklearnClassifier
for name,model in models:
    nltk_model=SklearnClassifier(model)
    nltk_model.train(training)
    accuracy=nltk.classify.accuracy(nltk_model,testing)*100
    print('{}: Accuracy {}:'.format(name,accuracy))
    

K nearest neighbor: Accuracy 92.23867205024675:
Decision Tree: Accuracy 97.1736204576043:
Random Forest: Accuracy 98.34006280843428:
Logistic Regression: Accuracy 98.38492597577388:
SGD Classifier: Accuracy 98.65410497981158:
Naive Bayes: Accuracy 98.4297891431135:
SVM Linear: Accuracy 98.56437864513235:


In [42]:
from sklearn.ensemble import VotingClassifier
names=['K nearest neighbor','Decision Tree','Random Forest','Logistic Regression','SGD Classifier','Naive Bayes','SVM Linear']
classifiers=[
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter=100),
    MultinomialNB(),
    SVC(kernel='linear')
]

models = list(zip(names, classifiers))

nltk_ensemble=SklearnClassifier(VotingClassifier(estimators=models,voting='hard',n_jobs=-1))
nltk_ensemble.train(training)
accuracy=nltk.classify.accuracy(nltk_ensemble,testing)*100
print('Ensemble Method Accuracy {}:'.format(accuracy))

Ensemble Method Accuracy 98.65410497981158:


In [43]:
text_features,label=zip(*testing)
prediction=nltk_ensemble.classify_many(text_features)

In [44]:
print(classification_report(label,prediction))
pd.DataFrame(
    confusion_matrix(label,prediction),
     index = [['actual','actual'],['ham','spam']],
     columns = [['predicted','predicted'],['ham','spam']])

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1925
           1       1.00      0.90      0.95       304

    accuracy                           0.99      2229
   macro avg       0.99      0.95      0.97      2229
weighted avg       0.99      0.99      0.99      2229



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1925,0
actual,spam,30,274
