In [34]:
import numpy as np

In [35]:
# for SVM implementation
from sklearn import svm

In [36]:
# for processing words in the emails
import re # regular expression manipulation
from nltk.stem import PorterStemmer # for converting words into their 'stems'

In [37]:
# for loading the data
from scipy.io import loadmat

In [38]:
# part 1
# pre processing data

In [39]:
# read email text file
spam1Path = "D:\Programming\TestData\spamSample1.txt"

spam1Content = open(spam1Path, 'r').read()

In [40]:
spam1Content

'Do You Want To Make $1000 Or More Per Week?\n\n \n\nIf you are a motivated and qualified individual - I \nwill personally demonstrate to you a system that will \nmake you $1,000 per week or more! This is NOT mlm.\n\n \n\nCall our 24 hour pre-recorded number to get the \ndetails.  \n\n \n\n000-456-789\n\n \n\nI need people who want to make serious money.  Make \nthe call and get the facts. \n\nInvest 2 minutes in yourself now!\n\n \n\n000-456-789\n\n \n\nLooking forward to your call and I will introduce you \nto people like yourself who\nare currently making $10,000 plus per week!\n\n \n\n000-456-789\n\n\n\n3484lJGv6-241lEaN9080lRmS6-271WxHo7524qiyT5-438rjUv5615hQcf0-662eiDB9057dMtVl72\n\n'

In [41]:
# read vocab list
vocabPath = "D:\Programming\TestData\Vocab.txt"

vocabList = open(vocabPath).read()

In [42]:
vocabList
# note tabs \t and new lines \n

'1\taa\n2\tab\n3\tabil\n4\tabl\n5\tabout\n6\tabov\n7\tabsolut\n8\tabus\n9\tac\n10\taccept\n11\taccess\n12\taccord\n13\taccount\n14\tachiev\n15\tacquir\n16\tacross\n17\tact\n18\taction\n19\tactiv\n20\tactual\n21\tad\n22\tadam\n23\tadd\n24\taddit\n25\taddress\n26\tadministr\n27\tadult\n28\tadvanc\n29\tadvantag\n30\tadvertis\n31\tadvic\n32\tadvis\n33\tae\n34\taf\n35\taffect\n36\taffili\n37\tafford\n38\tafrica\n39\tafter\n40\tag\n41\tagain\n42\tagainst\n43\tagenc\n44\tagent\n45\tago\n46\tagre\n47\tagreement\n48\taid\n49\tair\n50\tal\n51\talb\n52\talign\n53\tall\n54\tallow\n55\talmost\n56\talon\n57\talong\n58\talreadi\n59\talsa\n60\talso\n61\taltern\n62\talthough\n63\talwai\n64\tam\n65\tamaz\n66\tamerica\n67\tamerican\n68\tamong\n69\tamount\n70\tamp\n71\tan\n72\tanalysi\n73\tanalyst\n74\tand\n75\tani\n76\tanim\n77\tannounc\n78\tannual\n79\tannuiti\n80\tanoth\n81\tanswer\n82\tanti\n83\tanumb\n84\tanybodi\n85\tanymor\n86\tanyon\n87\tanyth\n88\tanywai\n89\tanywher\n90\taol\n91\tap\n92\tapolog\

In [43]:
# convert vocab list into a dictionary

vocabList = vocabList.split('\n')[:-1]

vocabList_dict = {}

for row in vocabList:
    value, key = row.split('\t')[:]
    vocabList_dict[key] = value


In [44]:
vocabList_dict # list of words and corresponding indices

{'aa': '1',
 'ab': '2',
 'abil': '3',
 'abl': '4',
 'about': '5',
 'abov': '6',
 'absolut': '7',
 'abus': '8',
 'ac': '9',
 'accept': '10',
 'access': '11',
 'accord': '12',
 'account': '13',
 'achiev': '14',
 'acquir': '15',
 'across': '16',
 'act': '17',
 'action': '18',
 'activ': '19',
 'actual': '20',
 'ad': '21',
 'adam': '22',
 'add': '23',
 'addit': '24',
 'address': '25',
 'administr': '26',
 'adult': '27',
 'advanc': '28',
 'advantag': '29',
 'advertis': '30',
 'advic': '31',
 'advis': '32',
 'ae': '33',
 'af': '34',
 'affect': '35',
 'affili': '36',
 'afford': '37',
 'africa': '38',
 'after': '39',
 'ag': '40',
 'again': '41',
 'against': '42',
 'agenc': '43',
 'agent': '44',
 'ago': '45',
 'agre': '46',
 'agreement': '47',
 'aid': '48',
 'air': '49',
 'al': '50',
 'alb': '51',
 'align': '52',
 'all': '53',
 'allow': '54',
 'almost': '55',
 'alon': '56',
 'along': '57',
 'alreadi': '58',
 'alsa': '59',
 'also': '60',
 'altern': '61',
 'although': '62',
 'alwai': '63',
 'am': 

In [45]:
# pre-process emails: extract word indices from each email

def preProcessEmail(emailContent, vocabList_dict):
    # convert to all lower case
    emailContent = emailContent.lower()
    
    # strip html tags, substitute with space
    emailContent = re.sub("<[^<>]+>", " ", emailContent)
    
    # replace actual numbers with text 'number'
    emailContent = re.sub("[0-9]+", "number", emailContent)
    
    # replace URLs with 'httpaddr'
    emailContent = re.sub("(http|https)://[^\s]*", "httpaddr", emailContent)
    
    # replace email addresses with 'emailaddr'
    emailContent = re.sub("[^\s]+@[^\s]+", "emailaddr", emailContent)
    
    # replace $ with 'dollar'
    emailContent = re.sub("[$]+", "dollar", emailContent)    
  
    # replace non-alphanumeric characters (i.e. special characters) with spaces
    emailContent = re.sub("[^a-zA-Z0-9]", " ", emailContent)        
  
    # stem the words
    ps = PorterStemmer()
    emailContent = [ps.stem(token) for token in emailContent.split(" ")]
    # above converts emailContent into a list
    
    # process email and return word indices
    wordIndices = []
    
    for word in emailContent: # indexing into a list
        if word in vocabList_dict:
            wordIndices.append(int(vocabList_dict[word]))            
    
    return wordIndices

In [46]:
# create a features vector for each email from its word index
# the features vector is the size of the vocab list
# if a word in the email, and is present in the vocablist then it's index is marked as '1' in the features vector

def emailFeatures(wordIndices, vocabList_dict):
    
    """
    # if an email has the text:
    #
    # The quick brown fox jumped over the lazy dog.
    #
    # Then, the word_indices vector for this text might look like:
    #               
    # 60  100   33   44   10     53  60  58   5
    # 
    # and the features vector might look like this 
    #
    # [1 0 0 0 0... 1...0... 0 0 1....1] => same size as vocab list

    """
    n = len(vocabList_dict)

    featuresVec = np.zeros((n,1))

    for i in wordIndices:
        featuresVec[i] = 1

    return featuresVec

In [47]:
# extract word indices from spam1 email

wordIndices = preProcessEmail(spam1Content, vocabList_dict)

In [48]:
len(wordIndices), len(vocabList_dict)

(95, 1899)

In [49]:
# convert email1 into features vector using its word indices

spam1Features = emailFeatures(wordIndices, vocabList_dict)

In [50]:
print("length of spam1Features = ", len(spam1Features))
print("nr of non zero entries in spam1Features  = ", np.sum(spam1Features))

length of spam1Features =  1899
nr of non zero entries in spam1Features  =  48.0


In [51]:
spam1Features

array([[0.],
       [0.],
       [0.],
       ...,
       [1.],
       [0.],
       [0.]])

In [52]:
# part 2
# training classifiers using the given data in spamTrain.mat


In [53]:
# load training data
trainPath = "D:\Programming\TestData\spamTrain.mat"

trainData = loadmat(trainPath) # for loading data from matlab files

In [54]:
trainData.keys()

dict_keys(['__header__', '__version__', '__globals__', 'X', 'y'])

In [55]:
XTrain, yTrain = trainData['X'], trainData['y']

In [56]:
XTrain.shape, yTrain.shape
# XTrain has 4000 sample (email feature vectors), each sample/row of length 1899 (same as vocab list)

((4000, 1899), (4000, 1))

In [57]:
# support vector machine from sklearn.svm

#spamSVC = svm.SVC(C=0.1,kernel='linear')
spamSVC = svm.SVC(kernel='linear')
spamSVC.fit(XTrain, yTrain.ravel())

trainingScore = spamSVC.score(XTrain, yTrain.ravel())

print("Training Accuracy:", trainingScore * 100, "%")

Training Accuracy: 99.97500000000001 %


In [58]:
# slight detour

In [59]:
# optimal weights computed / assigned to the features

weights = spamSVC.coef_

weights.shape, weights

((1, 1899),
 array([[-1.44288629e-04,  2.06502671e-02,  2.86525927e-02, ...,
         -1.63829815e-01, -1.45504787e-02,  7.84855049e-02]]))

In [60]:
sortedIndices = np.argsort(weights)
sortedIndices
# weights in sorted order, giving their indices in sorted order
# these indices indicate the indices of the words in vocabList_dict which are given corresponding weights
# by the learning algo SVC

array([[1603, 1880, 1436, ..., 1847,  656, 1190]], dtype=int64)

In [61]:
# back from detour

In [62]:
# load test data
testPath = "D:\Programming\TestData\spamTest.mat"

testData = loadmat(testPath) # for loading data from matlab files

In [63]:
testData.keys()

dict_keys(['__header__', '__version__', '__globals__', 'Xtest', 'ytest'])

In [64]:
Xtest, ytest = testData['Xtest'], testData['ytest']

In [65]:
Xtest.shape, ytest.shape
# Xtest has 1000 sample (email feature vectors), each sample/row of length 1899 (same as vocab list)

((1000, 1899), (1000, 1))

In [66]:
yPred = spamSVC.predict(Xtest)

In [67]:
yPred.shape, yPred

((1000,),
 array([1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1,
        0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
        1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
        0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
        0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
        0, 1, 0, 1, 0, 0, 1,

In [68]:
testScore = spamSVC.score(Xtest, ytest)

In [69]:
print("Test Accuracy:", testScore * 100, "%")

Test Accuracy: 97.8 %


In [70]:
spamSVC.predict(spam1Features.T)
# 1 => spam

array([1], dtype=uint8)

In [71]:
# trial runs

In [72]:
# read email text file
email1Path = "D:\Programming\TestData\emailMinalSample1.txt"

email1Content = open(email1Path, 'r').read()

In [73]:
wordIndices = preProcessEmail(email1Content, vocabList_dict)

In [74]:
email1Features = emailFeatures(wordIndices, vocabList_dict)

In [75]:
len(wordIndices), len(email1Features), np.sum(email1Features) #nr of non zero entries

(93, 1899, 61.0)

In [76]:
spamSVC.predict(email1Features.T)
# 0 => not spam

array([0], dtype=uint8)

In [77]:
# read email text file
spam2Path = "D:\Programming\TestData\spamSample2.txt"

spam2Content = open(spam2Path, 'r').read()

In [78]:
wordIndices = preProcessEmail(spam2Content, vocabList_dict)

In [79]:
spam2Features = emailFeatures(wordIndices, vocabList_dict)

In [80]:
spamSVC.predict(spam2Features.T)

# 1 => spam

array([1], dtype=uint8)

In [81]:
# read email text file
email2Path = "D:\Programming\TestData\emailMinalSample2.txt"

email2Content = open(email2Path, 'r').read()

In [82]:
wordIndices = preProcessEmail(email2Content, vocabList_dict)

In [83]:
email2Features = emailFeatures(wordIndices, vocabList_dict)

In [85]:
spamSVC.predict(email2Features.T)

# 0 => not spam (this mail sample is really small; result is inaccurate)

array([1], dtype=uint8)

In [86]:
# read email text file
email3Path = "D:\Programming\TestData\emailSample2.txt"

email3Content = open(email3Path, 'r').read()

In [87]:
wordIndices = preProcessEmail(email3Content, vocabList_dict)

In [88]:
email3Features = emailFeatures(wordIndices, vocabList_dict)

In [89]:
spamSVC.predict(email3Features.T)

# 1 => spam 

array([0], dtype=uint8)