### A simple Spam Classifier using SVM and Gaussian Naive Bayes Algorithm

In [102]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import HashingVectorizer
from collections import Counter
%matplotlib inline

#### Download the data from the link
'https://drive.google.com/open?id=0ByJLBTmJojjzdTdvRC10TnhCa2M'

In [7]:
ham = pd.read_csv('./spam_filtering/ham', sep='|', names = ['text'], dtype=str)
spam = pd.read_csv('./spam_filtering/spam', sep='|', names = ['text'], dtype=str)
ham.head(4)

Unnamed: 0,text
0,Rofl. Its true to its name
1,The guy did some bitching but I acted like i'd...
2,"Pity, * was in mood for that. So...any other s..."
3,Will ü b going to esplanade fr home?


In [8]:
spam.head(4)

Unnamed: 0,text
0,You have 1 new message. Please call 08712400200.
1,Urgent! Please call 09061743811 from landline....
2,Dear 0776xxxxxxx U've been invited to XCHAT. T...
3,U 447801259231 have a secret admirer who is lo...


In [9]:
# Preparing the text data
# Removing stop words and special characters
def make_dictionary(data_1, data_2):
    emails_1 = list(data_1.text)  # list of emails  
    emails_2 = list(data_2.text)
    all_words = []       
    
    for mail in emails_1:    
                    words = mail.split()
                    all_words += words
    for mail in emails_2:    
                    words = mail.split()
                    all_words += words
    
    dictionary = Counter(all_words)
    return dictionary

In [10]:
dictionary = make_dictionary(spam, ham)

In [21]:
# Delete words which are not alphabetical and whose length is 1.
list_to_remove = list(dictionary.keys())
for item in list_to_remove:
    if item.isalpha() == False: 
        del dictionary[item]
    elif len(item) == 1:
        del dictionary[item]
dictionary = dictionary.most_common(3000)

#### Bag of Words

Transform texts into bag of words matrix(number of columns == number of unique words, number of rows == number of documents(emails))

In [27]:
# Convert each mail into 3000 feature vector, where each feature is a key from above dictionary.
def extract_features(mail): 
    mails = list(mail.text)
    features_matrix = np.zeros((len(mails),3000))
    mail_no = 0
    for mail in mails:
        words = mail.split()
        for word in words:
            wordID = 0
            for i,d in enumerate(dictionary):
                if d[0] == word:
                    wordID = i
                    break
            features_matrix[mail_no,wordID] = words.count(word)
        mail_no = mail_no + 1     
    return features_matrix

In [36]:
spam_input = extract_features(spam)
ham_input = extract_features(ham)

In [141]:
spam_input.shape, ham_input.shape

((747, 3000), (4803, 3000))

In [154]:
spam_train = spam_input[:700]
ham_train = ham_input[:4000]
spam_test = spam_input[700:]
ham_test = ham_input[4000:]

In [155]:
spam_train.shape, ham_train.shape, spam_test.shape, ham_test.shape

((700, 3000), (4000, 3000), (47, 3000), (803, 3000))

In [156]:
train_X = np.vstack((spam_train,ham_train))
train_y = [1 for i in range(700)] + [0 for i in range(4000)]

In [157]:
test_X = np.vstack((spam_test,ham_test))
test_y = [1 for i in range(47)] + [0 for i in range(803)]

In [158]:
train_X.shape, len(train_y), test_X.shape, len(test_y)

((4700, 3000), 4700, (850, 3000), 850)

In [159]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.svm import SVC, NuSVC, LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

In [160]:
model1 = MultinomialNB()
model2 = LinearSVC()

model1.fit(train_X,train_y)
model2.fit(train_X,train_y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [161]:
result_1 = model1.predict(test_X)
result_2 = model2.predict(test_X)

In [162]:
print('Metric for MultinomialNB')
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(test_y, result_1)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(test_y, result_1)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(test_y, result_1)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(test_y, result_1)
print('F1 score: %f' % f1)

# kappa
kappa = cohen_kappa_score(test_y, result_1)
print('Cohens kappa: %f' % kappa)
# ROC AUC
auc = roc_auc_score(test_y, result_1)
print('ROC AUC: %f' % auc)
# confusion matrix
matrix = confusion_matrix(test_y, result_1)
print(matrix)

Metric for MultinomialNB
Accuracy: 0.981176
Precision: 0.803922
Recall: 0.872340
F1 score: 0.836735
Cohens kappa: 0.826765
ROC AUC: 0.929944
[[793  10]
 [  6  41]]


In [163]:
print('Metric for Linear SVM')
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(test_y, result_2)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(test_y, result_2)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(test_y, result_2)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(test_y, result_1)
print('F1 score: %f' % f1)

# kappa
kappa = cohen_kappa_score(test_y, result_2)
print('Cohens kappa: %f' % kappa)
# ROC AUC
auc = roc_auc_score(test_y, result_2)
print('ROC AUC: %f' % auc)
# confusion matrix
matrix = confusion_matrix(test_y, result_2)
print(matrix)

Metric for Linear SVM
Accuracy: 0.983529
Precision: 0.923077
Recall: 0.765957
F1 score: 0.836735
Cohens kappa: 0.828614
ROC AUC: 0.881111
[[800   3]
 [ 11  36]]
