In [22]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import glob
from sklearn.model_selection import train_test_split
import unicodedata
import sys
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
import os
import time

from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis


In [23]:
tbl = dict.fromkeys(
    i for i in range(sys.maxunicode)
    if unicodedata.category(chr(i)).startswith('P')
)

In [24]:
def load_sms_data(path):
    return pd.read_csv(path, encoding='utf-8', sep=',', names=["class", "msg"])

In [25]:
def remove_punctuation(text):
    return text.translate(tbl)

In [26]:
def clean_sms(sms_message):
    return remove_punctuation(sms_message).lower()

In [27]:
def class_str_to_int(class_str):
    if class_str == "ham":
        return -1
    elif class_str == "spam":
        return 1
    else:
        return None

In [28]:
def accuracy(preds, targets):
    return ((preds == targets).sum())/len(targets)

In [29]:
sms_data_df = load_sms_data("Data/sms/smsspamcollection/sms_spam3.csv")

In [30]:
sms_data_df

Unnamed: 0,class,msg
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
5,spam,FreeMsg Hey there darling it's been 3 week's n...
6,ham,Even my brother is not like to speak with me. ...
7,ham,As per your request 'Melle Melle (Oru Minnamin...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...


In [31]:
# clean messages
sms_data_df['clean_msg'] = sms_data_df['msg'].astype('str').apply(clean_sms)
sms_data_df['target'] = sms_data_df['class'].astype(
    'str').apply(class_str_to_int)

sms_data_df

Unnamed: 0,class,msg,clean_msg,target
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...,-1
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,-1
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...,1
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say,-1
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i dont think he goes to usf he lives aroun...,-1
5,spam,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey there darling its been 3 weeks now...,1
6,ham,Even my brother is not like to speak with me. ...,even my brother is not like to speak with me t...,-1
7,ham,As per your request 'Melle Melle (Oru Minnamin...,as per your request melle melle oru minnaminun...,-1
8,spam,WINNER!! As a valued network customer you have...,winner as a valued network customer you have b...,1
9,spam,Had your mobile 11 months or more? U R entitle...,had your mobile 11 months or more u r entitled...,1


In [46]:
# email_data_df = pd.read_pickle('email_cleaned')

x = sms_data_df['clean_msg']
y = sms_data_df['target']

In [47]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.8)

In [48]:
print("Train set: %s" % len(x_train))
print("Test set: %s" % len(x_test))

Train set: 1114
Test set: 4460


In [49]:
vectorizer = CountVectorizer()

train_matrix = vectorizer.fit_transform(x_train)
test_matrix = vectorizer.transform(x_test)

In [50]:
#print(vectorizer.get_feature_names())
print(len(vectorizer.get_feature_names()))

3587


In [51]:
print(train_matrix.shape, test_matrix.shape)

(1114, 3587) (4460, 3587)


In [52]:
def perform_comparison(classifiers, train_matrix, test_matrix):
    res = dict()
    for model_data in classifiers:
        name = model_data['name']
        model = model_data['model']
        
        model.fit(train_matrix, y_train)
        y_test_pred = model.predict(test_matrix)
        y_train_pred = model.predict(train_matrix)
        res[name] = accuracy(y_train_pred, y_train), accuracy(y_test_pred, y_test)
        
    return res

In [53]:
classifiers = [
    {
        'name': "Nearest Neighbors k=3",
        'model': KNeighborsClassifier(3)
    },
    {
        'name': "Nearest Neighbors k=30",
        'model': KNeighborsClassifier(30)
    },
    {
        'name': "Nearest Neighbors k=100",
        'model': KNeighborsClassifier(100)
    },
    {
        'name': "Linear SVM",
        'model': SVC(kernel="linear", C=0.025)
    },
    {
        'name': "Decision Tree",
        'model': DecisionTreeClassifier(max_depth=5)
    },
    {
        'name': "Random Forest",
        'model': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
    },
    {
        'name': "Naive Bayes",
        'model': MultinomialNB()
    }
]

In [54]:
result = perform_comparison(classifiers, train_matrix, test_matrix)
print("CountVectorizer")
for name in result.keys():
    print("Model name: %s" % name)
    print("Accuracy for: \t train set: %.3f \t test set: %.3f" % result[name])
    print()

CountVectorizer
Model name: Nearest Neighbors k=3
Accuracy for: 	 train set: 0.921 	 test set: 0.885

Model name: Nearest Neighbors k=30
Accuracy for: 	 train set: 0.872 	 test set: 0.865

Model name: Nearest Neighbors k=100
Accuracy for: 	 train set: 0.872 	 test set: 0.865

Model name: Linear SVM
Accuracy for: 	 train set: 0.978 	 test set: 0.955

Model name: Decision Tree
Accuracy for: 	 train set: 0.957 	 test set: 0.938

Model name: Random Forest
Accuracy for: 	 train set: 0.872 	 test set: 0.865

Model name: Naive Bayes
Accuracy for: 	 train set: 0.996 	 test set: 0.971



In [55]:
tfidf_vectorizer = TfidfVectorizer()

tfidf_train_matrix = tfidf_vectorizer.fit_transform(x_train)
tfidf_test_matrix = tfidf_vectorizer.transform(x_test)

In [56]:
tfidf_result = perform_comparison(classifiers, tfidf_train_matrix, tfidf_test_matrix)
print("TfidfVectorizer")
for name in tfidf_result.keys():
    print("Model name: %s" % name)
    print("Accuracy for: \t train set: %.3f \t test set: %.3f" % tfidf_result[name])
    print()

TfidfVectorizer
Model name: Nearest Neighbors k=3
Accuracy for: 	 train set: 0.983 	 test set: 0.955

Model name: Nearest Neighbors k=30
Accuracy for: 	 train set: 0.939 	 test set: 0.926

Model name: Nearest Neighbors k=100
Accuracy for: 	 train set: 0.897 	 test set: 0.883

Model name: Linear SVM
Accuracy for: 	 train set: 0.872 	 test set: 0.865

Model name: Decision Tree
Accuracy for: 	 train set: 0.964 	 test set: 0.919

Model name: Random Forest
Accuracy for: 	 train set: 0.872 	 test set: 0.865

Model name: Naive Bayes
Accuracy for: 	 train set: 0.951 	 test set: 0.909

