In [43]:
# load libraries
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import scipy as sci
import sklearn as sk
import re #regular expression for e-mail processing
from stemming.porter2 import stem
import nltk, nltk.stem.porter

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_seq_items', None)
 
%matplotlib inline

import seaborn as sns
sns.set_context('notebook')
sns.set_style('white')

In [44]:
# load data
def getVocabDict(reverse=False):
    vocab_dict = {}
    with open("vocab.txt") as f:
        for line in f:
            (val, key) = line.split()
            if not reverse:
                vocab_dict[key] = int(val)
            else:
                vocab_dict[int(val)] = key                
    return vocab_dict

In [45]:
# mapping the email
def preProcess( email ):
    # lower-casing
    email = email.lower()
    # HTML tags striping, HTML tags are with <>
    email = re.sub('<[^<>]+>', ' ', email);
    # Normalizing URLs:
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)
    # Normalizing Email adress
    email = re.sub('[^\s]+@[^\s]+','emailaddr', email)
    # Normalizing numbers
    email = re.sub('[0-9]+','number', email)
    # Normalizing dollars
    email = re.sub('[$]+','dollar', email)   
    return(email)

In [46]:
# tokenize
def tokenize( email ):
    # Word Stemming
    stemmer = nltk.stem.porter.PorterStemmer()    
    # split the email
    tokens = re.split('[^0-9a-z]', email)
    # iterate each token
    tokenlist = []
    for token in tokens:
        #Use the Porter stemmer to stem the word
        stemmed = stemmer.stem( token )
        #Store a list of all stemmed words
        if len(stemmed) > 0:
            tokenlist.append(stemmed)    
    return(tokenlist)

In [47]:
# vocabularize
def vocabularize( email , vocab_dict):
    return([vocab_dict[token] for token in email if token in vocab_dict])        

In [97]:
# process email
def processEmail( email ):
    # preprocessing
    email = preProcess(email)
    # tokenize
    email = tokenize(email)
    # mapping to vocabulary
    vocab_dict = getVocabDict(reverse=False)
    email = vocabularize(email, vocab_dict)
    # extract feature
    n_vocab = len(vocab_dict)
    feature = np.zeros((n_vocab,1))
    for idx in email:
        feature[idx-1] = 1
    return(feature.reshape(1,-1))
email = open( 'emailSample1.txt', 'r' ).read()
email = processEmail(email)

In [53]:
# load traning and test data
# Training set
datafile = 'spamTrain.mat'
mat = sci.io.loadmat( datafile )
X, y = mat['X'], mat['y']

# Test set
datafile = 'spamTest.mat'
mat = sci.io.loadmat( datafile )
Xtest, ytest = mat['Xtest'], mat['ytest']

In [60]:
# initialize a SVM
my_svm = sk.svm.SVC(C=0.1,kernel='linear')
# train the SVM
my_svm.fit(X,y.ravel())
# test the SVM
trained_score=my_svm.score(X,y.ravel())
tested_score=my_svm.score(Xtest,ytest.ravel())
print(trained_score*100.0)
print(tested_score*100.0)

99.825
98.9


In [87]:
# top spam words
vocab_dict_reverse = getVocabDict(reverse=True)
param_sorted=np.argsort(my_svm.coef_,axis=None)[::-1]
top_spam_words=[ vocab_dict_reverse[x] for x in param_sorted[:15] ]
bottom_spam_words=[ vocab_dict_reverse[x] for x in param_sorted[-15:] ]
print(top_spam_words)
print(bottom_spam_words)

['otherwis', 'clearli', 'remot', 'gt', 'visa', 'base', 'doesn', 'wife', 'previous', 'player', 'mortgag', 'natur', 'll', 'futur', 'hot']
['http', 'toll', 'xp', 'ratio', 'august', 'unsubscrib', 'useless', 'numberth', 'round', 'linux', 'datapow', 'wrong', 'urgent', 'that', 'spam']


In [99]:
# try own emails
print(my_svm.predict(processEmail(open('emailSample2.txt','r').read())))
print(my_svm.predict(processEmail(open('spamSample1.txt','r').read())))
print(my_svm.predict(processEmail(open('spamSample2.txt','r').read())))

[0]
[1]
[1]
