# The spam fillter

## 1. Pre processing

In [18]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk.stem.porter
from scipy.io import loadmat
from sklearn import svm


###  Output the email content 

In [2]:
!ls data

emailSample1.txt  ex6data1.mat	ex6data3.mat	 spamSample2.txt  spamTrain.mat
emailSample2.txt  ex6data2.mat	spamSample1.txt  spamTest.mat	  vocab.txt


In [3]:
!cat data/emailSample1.txt

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com



### strip 

In [13]:
def preProcess(email):
    email = email.lower()
    
    # Strip html tags (strings that look like <blah> where 'blah' does not
    # contain '<' or '>')... replace with a space
    email = re.sub('<[^<>]+>', ' ', email)
    
    #Any numbers get replaced with the string 'number'
    email = re.sub('[0-9]+', 'number', email)
    
    #Anything starting with http or https:// replaced with 'httpaddr'
    email = re.sub('(http|https)://[^\s]*', 'httpaddr', email)
    
    #Strings with "@" in the middle are considered emails --> 'emailaddr'
    email = re.sub('[^\s]+@[^\s]+', 'emailaddr', email)
    
    #The '$' sign gets replaced with 'dollar'
    email = re.sub('[$]+', 'dollar', email)
    
    return email

## 2. Email to tokens list 

In [21]:
def email2TokensList(raw_email):
    stemmer = nltk.stem.porter.PorterStemmer()
    email = preProcess(raw_email)
    tokens = re.split('[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]', 
                      email)
    token_list = []
    for token in tokens:
        #Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token);

        #Use the Porter stemmer to stem the word
        stemmed = stemmer.stem( token )
        
        #Throw out empty tokens
        if not len(token): continue
            
        #Store a list of all unique stemmed words
        token_list.append(stemmed)
        
    return token_list

## 3. Vocabulary list 

In [15]:
def getVocabDict(reverse=False):
    vocab_dict = {}
    with open('data/vocab.txt') as f:
        for line in f:
            val, key = line.split()
            if not reverse:
                vocab_dict[key] = int(val)
            else:
                vocab_dict[int(val)] = key
    return vocab_dict

In [16]:
def email2VocabIndices(raw_email, vocab_dict):
    token_list = email2TokensList(raw_email)
    indices_list = [vocab_dict[token] for token in token_list if token in vocab_dict]
    return indices_list

## 4. Extracting features from email 

In [23]:
def email2Vector(raw_email, vocab_dict):
    n = len(vocab_dict)
    result = np.zeros((n, 1))
    vocab_indices = email2VocabIndices(raw_email, vocab_dict)
    for i in vocab_indices:
        result[i] = 1
    return result

## 5. Process the email

In [25]:
vocab_dict = getVocabDict()
with open('data/emailSample1.txt', 'r') as e:
    raw_email = e.read()
    vec = email2Vector(raw_email, vocab_dict)
    print(f'Length of the features vector is {len(vec)}')
    print(f'Number of non-zero entries is {np.sum(vec == 1)}')

Length of the features vector is 1899
Number of non-zero entries is 45


## 6. Training SVM for spam Classification

In [26]:
mat = loadmat('data/spamTrain.mat')
X, y = mat['X'], mat['y']

mat = loadmat('data/spamTest.mat')
Xtest, ytest = mat['Xtest'], mat['ytest']

In [33]:
#m > n
X.shape, Xtest.shape 

((4000, 1899), (1000, 1899))

In [29]:
pos = np.array([X[i] for i in range(X.shape[0]) if y[i] == 1])
neg = np.array([X[i] for i in range(X.shape[0]) if y[i] == 0])
print('Total number of training emails = ', X.shape[0])
print('Number of training spam emails = ', pos.shape[0])
print('Number of training nonspam emails = ', neg.shape[0])

Total number of training emails =  4000
Number of training spam emails =  1277
Number of training nonspam emails =  2723


## 7. Linear SVM

In [42]:
linear_svm = svm.SVC(C=0.1, kernel='linear')

# Now we fit the SVM to our X matrix, given the labels y
linear_svm.fit( X, y.flatten() )

SVC(C=0.1, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [43]:
train_predictions = linear_svm.predict(X).reshape((y.shape[0],1))
train_acc = 100. * float(sum(train_predictions == y))/y.shape[0]
print('Training accuracy = %0.2f%%' % train_acc)

test_predictions = linear_svm.predict(Xtest).reshape((ytest.shape[0],1))
test_acc = 100. * float(sum(test_predictions == ytest))/ytest.shape[0]
print('Test set accuracy = %0.2f%%' % test_acc)

Training accuracy = 99.83%
Test set accuracy = 98.90%


## 8. Gaussion SVM

In [40]:
gaus_svm = svm.SVC(0.3, kernel='rbf', gamma = np.power(0.3, -2))
gaus_svm.fit( X, y.flatten())

KeyboardInterrupt: 

In [41]:
train_predictions = gaus_svm.predict(X).reshape((y.shape[0],1))
train_acc = 100. * float(sum(train_predictions == y))/y.shape[0]
print('Training accuracy = %0.2f%%' % train_acc)

test_predictions = gaus_svm.predict(Xtest).reshape((ytest.shape[0],1))
test_acc = 100. * float(sum(test_predictions == ytest))/ytest.shape[0]
print('Test set accuracy = %0.2f%%' % test_acc)

NotFittedError: This SVC instance is not fitted yet. Call 'fit' with appropriate arguments before using this method.

## 9. Top predictors for spam 

In [37]:
# Determine the words most likely to indicate an e-mail is a spam
# From the trained SVM we can get a list of the weight coefficients for each
# word (technically, each word index)

vocab_dict_flipped = getVocabDict(reverse=True)

#Sort indicies from most important to least-important (high to low weight)
sorted_indices = np.argsort( linear_svm.coef_, axis=None )[::-1]
print("The 15 most important words to classify a spam e-mail are:")
print([ vocab_dict_flipped[x] for x in sorted_indices[:15] ])
print("")
print("The 15 least important words to classify a spam e-mail are:")
print([ vocab_dict_flipped[x] for x in sorted_indices[-15:] ])
print("")

# Most common word (mostly to debug):
most_common_word = vocab_dict_flipped[sorted_indices[0]]
print('# of spam containing \"%s\" = %d/%d = %0.2f%%'% \
    (most_common_word, sum(pos[:,1190]),pos.shape[0],  \
     100.*float(sum(pos[:,1190]))/pos.shape[0]))
print('# of NON spam containing \"%s\" = %d/%d = %0.2f%%'% \
    (most_common_word, sum(neg[:,1190]),neg.shape[0],      \
     100.*float(sum(neg[:,1190]))/neg.shape[0]))

The 15 most important words to classify a spam e-mail are:
['otherwis', 'clearli', 'remot', 'gt', 'visa', 'base', 'doesn', 'wife', 'previous', 'player', 'mortgag', 'natur', 'll', 'futur', 'hot']

The 15 least important words to classify a spam e-mail are:
['http', 'toll', 'xp', 'ratio', 'august', 'unsubscrib', 'useless', 'numberth', 'round', 'linux', 'datapow', 'wrong', 'urgent', 'that', 'spam']

# of spam containing "otherwis" = 804/1277 = 62.96%
# of NON spam containing "otherwis" = 301/2723 = 11.05%


## 10. What about linear logisitic regression

In [38]:
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

In [44]:
print(metrics.classification_report(ytest, test_predictions))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99       692
           1       0.97      0.99      0.98       308

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000



In [45]:
logit = LogisticRegression()
logit.fit(X, y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [46]:
pred = logit.predict(Xtest)
print(metrics.classification_report(ytest, pred))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00       692
           1       0.99      0.99      0.99       308

    accuracy                           0.99      1000
   macro avg       0.99      0.99      0.99      1000
weighted avg       0.99      0.99      0.99      1000



### 逻辑回归和线性svm差别不大 