## Spam Filter using Apache SpamAssasin Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


spam_data_dir = './datasets/spam_data/'

def getDataDir(dirname):
    return spam_data_dir + dirname

In [2]:
import os
from pathlib import Path
from email.parser import BytesParser
from email.policy import default

def getDataAsEmails(target_dir):
    result = []
    parser = BytesParser(policy=default)
    for child in Path(target_dir).iterdir():
        if child.is_file():
            with open(os.fspath(child), 'rb') as f:
                emailMessage = parser.parse(fp=f)
                result.append(emailMessage)
            
    return np.array(result, dtype = object)
            
easy_ham = getDataAsEmails(getDataDir('easy_ham'))
easy_ham2 = getDataAsEmails(getDataDir('easy_ham_2'))

hard_ham = getDataAsEmails(getDataDir('hard_ham'))

spam = getDataAsEmails(getDataDir('spam'))
spam2 = getDataAsEmails(getDataDir('spam_2'))


In [3]:
ham = np.concatenate([easy_ham, easy_ham2, hard_ham])
spam = np.concatenate([spam, spam2])

labels = np.array([0] * len(ham) + [1] * len(spam), dtype=int)
data = np.concatenate([ham, spam])

Storing data as email.Message objects because this gives the most flexibility in the Pipeline for new messages. Also keep preprocessing of data outside the Pipeline to a minimum

In [4]:
print('Ham size:', len(ham))
print('Spam size:', len(spam))
print('Total:', len(data))

Ham size: 6951
Spam size: 2398
Total: 9349


## Split data into train set and test set

In [5]:
from sklearn.model_selection import StratifiedShuffleSplit

stratifiedSplit = StratifiedShuffleSplit(random_state=42, test_size=0.2)
for train_index, test_index in stratifiedSplit.split(data, labels):
    X_train, X_test = data[train_index], data[test_index] 
    y_train, y_test = labels[train_index], labels[test_index]

In [6]:
print('Training data size:', len(X_train))
print('Test data size:', len(X_test))

Training data size: 7479
Test data size: 1870


## Checking out data

In [7]:
from collections import Counter
ham_content_types = [mail.get_content_type() for mail in ham]
spam_content_types = [mail.get_content_type() for mail in spam]

ham_type_counter = Counter(ham_content_types)
spam_type_counter = Counter(spam_content_types)

print('Ham Content Types:')
print(ham_type_counter.most_common(10))
print('\nSpam Content Types:')
print(spam_type_counter.most_common(10))

Ham Content Types:
[('text/plain', 6371), ('text/html', 240), ('multipart/signed', 180), ('multipart/alternative', 110), ('multipart/mixed', 37), ('multipart/related', 8), ('multipart/report', 5)]

Spam Content Types:
[('text/plain', 1038), ('text/html', 953), ('multipart/alternative', 216), ('multipart/mixed', 142), ('multipart/related', 48), ('text/plain charset=us-ascii', 1)]


Ham contains a lot of text/plain messages, while spam contains equal amount text/html. I'm going to discard multipart data for simplicity for now. Maybe Beautiful Soup can be useful in data preprocessing for html email types

In [8]:
from bs4 import BeautifulSoup
from pprint import pprint
import re

def getEmailData(email):
    result = {}
    result["content_type"] = email.get_content_type()
    result["content"] = "N/A"
    
    if email.is_multipart():
        return result
    else:
        if result["content_type"] == 'text/plain':
            result["content"] = email.get_payload().strip()
            return result
        elif result["content_type"] == 'text/html':
            payload = email.get_payload()
            soup = BeautifulSoup(payload)
            if soup.a != None:
                soup.a.replaceWith('<p>').string = 'URL'
            content = soup.get_text().strip()
            result["content"] = re.sub(r'\\.+\s|\s+', " ", content)
            return result
    return result


Now moving on to encoding words and frequencies

In [9]:
import nltk
from nltk.stem import LancasterStemmer

tokens = nltk.word_tokenize(getEmailData(spam[23])['content'])
stemmer = LancasterStemmer()

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

ham_data = []
for ham_dat in ham:
    hamail = getEmailData(ham_dat)
    if hamail == None:
        continue
    tokens = nltk.word_tokenize(hamail['content'])
    hamail['tokens'] = ' '.join([stemmer.stem(token) for token in tokens])
    ham_data.append(hamail)

vectorizer = CountVectorizer()

In [11]:
X = vectorizer.fit_transform([ham['content'] for ham in ham_data])

In [12]:
X.toarray()

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [3, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

It's clear from above that all the values have been tokenized from the email content. Now, it's time to create a pipeline to make the above transformations

# The Pipeline

## Step 1 - A Transformer to transform email.Message objects

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin

class TransformEmailMessages(BaseEstimator, TransformerMixin):
    def fit(self, X, y):
        return self
    def transform(self, X):
        transformedX = []
        for datum in X:
            trDatum = getEmailData(datum)
            if trDatum != None:
                transformedX.append(pd.Series(trDatum))
                
        return pd.DataFrame(transformedX)

## Step 2 - Vectorize the email content and encode content-types

In [14]:
class TokenizeWords(BaseEstimator, TransformerMixin):
        
    def fit(self, X, y):
        self.XSize = len(X)
        return self
    def transform(self, X):
        resultX = []
        for datum in X:
            tokens = nltk.word_tokenize(datum)
            tokens = ' '.join([stemmer.stem(token) for token in tokens])
            
            resultX.append(tokens)
            
        return pd.DataFrame(resultX, dtype='string')

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder

FirstPipeline = Pipeline(steps=[
    ('transform', TransformEmailMessages()),
    ('encodeHeaders', ColumnTransformer([
#         ('encodeContentType', OrdinalEncoder(), ['content_type']),
        ('tokenize', TokenizeWords(), ['content'])
#         ('vectorize', CountVectorizer(), slice(1, -1))
    ], remainder='passthrough'))
    
])

In [16]:
X_tr = TransformEmailMessages().fit_transform(X_train, y_train)
vectorizer = CountVectorizer().fit(X_tr['content'])

X_tr['content_type'] = OrdinalEncoder().fit_transform(np.array(X_tr['content_type']).reshape(-1, 1))
tokens = TokenizeWords().fit_transform(X_tr['content'], y_train)
tokens[:3]

Unnamed: 0,0
0,thi is an autom respons to a mess you hav sent...
1,20020819 lockergnom tech spec < p > 08.19.2002...
2,


In [17]:
X_Content = vectorizer.fit_transform(tokens[0])

In [18]:
from sklearn.neighbors import KNeighborsClassifier
knnc = KNeighborsClassifier()
knnc.fit(X_Content, y_train)

KNeighborsClassifier()

In [19]:
from sklearn.model_selection import cross_val_score

cross_val_score(knnc, X_Content, y_train, cv=5)

array([0.90173797, 0.90574866, 0.90574866, 0.91176471, 0.91036789])

In [20]:
from sklearn.metrics import precision_score, recall_score, f1_score

knn_pred = knnc.predict(X_Content)

knnprecision = precision_score(y_train, knn_pred)
knnrecall = recall_score(y_train, knn_pred)

In [21]:
print('precision:', knnprecision)
print('recall: ', knnrecall)

precision: 0.8220831470719714
recall:  0.9588112617309698


In [22]:
f1_score(y_train, knn_pred)

0.8851985559566786

Wow, maybe data has overfit!

# Testing on test set

In [23]:
X_test_tr = TransformEmailMessages().fit_transform(X_test, y_test)

tokens = TokenizeWords().fit_transform(X_test_tr['content'], y_train)
X_test_Content = vectorizer.transform(tokens[0])

In [24]:
from sklearn.metrics import precision_score, recall_score, f1_score

knn_test_pred = knnc.predict(X_test_Content)

knnprecision = precision_score(y_test, knn_test_pred)
knnrecall = recall_score(y_test, knn_test_pred)

print('precision:', knnprecision)
print('recall: ', knnrecall)

f1_score(y_test, knn_test_pred)

precision: 0.7918871252204586
recall:  0.9354166666666667


0.8576886341929322

That's an average f1-score but good to know recall is over 93% but for classifying spam, precision should be higher as getting a few spam messages into the inbox won't be bad while getting important mails as spam could be disastrous! The data has overfit a little, maybe choosing a simpler classifier or adding more data (e.g. headers) can help