In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
import re
from nltk.stem import PorterStemmer

In [2]:
file_contents = open(r'E:\Mat_Work\machine-learning-ex6\ex6\emailSample1.txt','r').read()
vocabList = open(r'E:\Mat_Work\machine-learning-ex6\ex6\vocab.txt','r').read()

In [3]:
vocabList = vocabList.split('\n')[:-1]
vocabList_d = {}
i = 0
for ea in vocabList:
    value,key = ea.split('\t')[:]
    vocabList_d[key] = value

In [4]:
def processEmail(email_contents,vocabList_d):
    """
    Preprocesses the body of an email and returns a list of indices of the words contained in the email. 
    """
    # Lower case
    email_contents = email_contents.lower()
    
    # Handle numbers
    email_contents = re.sub("[0-9]+","number",email_contents)
    
    # Handle URLS
    email_contents = re.sub("[http|https]://[^\s]*","httpaddr",email_contents)
    
    # Handle Email Addresses
    email_contents = re.sub("[^\s]+@[^\s]+","emailaddr",email_contents)
    
    # Handle $ sign
    email_contents = re.sub("[$]+","dollar",email_contents)
    
    # Strip all special characters
    specialChar = ["<","[","^",">","+","?","!","'",".",",",":"]
    for char in specialChar:
        email_contents = email_contents.replace(char,"")
    email_contents = email_contents.replace("\n"," ")    
    
    # Stem the word
    ps = PorterStemmer()
    email_contents = [ps.stem(token) for token in email_contents.split(" ")]
    email_contents= " ".join(email_contents)
    
    # Process the email and return word_indices
    
    word_indices=[]
    
    for char in email_contents.split():
        if len(char) >1 and char in vocabList_d:
            word_indices.append(int(vocabList_d[char]))
    
    return word_indices

In [5]:
word_indices= processEmail(file_contents,vocabList_d)

In [6]:
def emailFeatures(word_indices,vocabList_d):
    n = len(vocabList_d)
    features = np.zeros((n,1))
    for i in word_indices:
        features[i] = 1
    return features

In [7]:
features = emailFeatures(word_indices,vocabList_d)
print("Length of feature vector: ",len(features))
print("Number of non-zero entries: ",np.sum(features))

Length of feature vector:  1899
Number of non-zero entries:  43.0


In [8]:
spam_mat = loadmat('E:\Mat_Work\machine-learning-ex6\ex6\spamTrain.mat')
X_train = spam_mat['X']
y_train = spam_mat['y']

In [15]:
from sklearn.svm import SVC
C = 0.1
spam_classifier = SVC(C=C,kernel='linear')
spam_classifier.fit(X_train,y_train.ravel())
print('Training accuracy:',(spam_classifier.score(X_train,y_train.ravel()))*100,'%')

Training accuracy: 99.825 %


In [17]:
spam_mat_test = loadmat('E:\Mat_Work\machine-learning-ex6\ex6\spamTest.mat')
X_test = spam_mat_test['Xtest']
y_test = spam_mat_test['ytest']
spam_classifier.predict(X_test)
print("Test Accuracy:",(spam_classifier.score(X_test,y_test.ravel()))*100,"%")

Test Accuracy: 98.9 %


In [20]:
weights = spam_classifier.coef_[0]
weights_col = np.hstack((np.arange(1,1900).reshape(1899,1),weights.reshape(1899,1)))
df = pd.DataFrame(weights_col)
df.sort_values(by=[1],ascending=False,inplace=True)
predictors = []
idx = []
for i in df[0][:15]:
    for keys, values in vocabList_d.items():
        if str(int(i)) == values:
            predictors.append(keys)
            idx.append(int(values))
print("Top predictors of spam:")
for _ in range(15):
    print(predictors[_],"\t\t",round(df[1][idx[_]-1],6))

Top predictors of spam:
our 		 0.500614
click 		 0.465916
remov 		 0.422869
guarante 		 0.383622
visit 		 0.36771
basenumb 		 0.345064
dollar 		 0.323632
will 		 0.269724
price 		 0.267298
pleas 		 0.261169
most 		 0.257298
nbsp 		 0.253941
lo 		 0.253467
ga 		 0.248297
hour 		 0.246404
