4. Build a spam classifier (a more challenging exercise):

    • Download examples of spam and ham from Apache SpamAssassin’s public
    datasets.

    • Unzip the datasets and familiarize yourself with the data format.

    • Split the datasets into a training set and a test set.
    
    • Write a data preparation pipeline to convert each email into a feature vector.
    Your preparation pipeline should transform an email into a (sparse) vector that
    indicates the presence or absence of each possible word. For example, if all
    emails only ever contain four words, “Hello,” “how,” “are,” “you,” then the email
    “Hello you Hello Hello you” would be converted into a vector [1, 0, 0, 1]
    (meaning [“Hello” is present, “how” is absent, “are” is absent, “you” is
    present]), or [3, 0, 0, 2] if you prefer to count the number of occurrences of
    each word.
    
    You may want to add hyperparameters to your preparation pipeline to control
    whether or not to strip off email headers, convert each email to lowercase,
    remove punctuation, replace all URLs with “URL,” replace all numbers with
    “NUMBER,” or even perform stemming (i.e., trim off word endings; there are
    Python libraries available to do this).
    Finally, try out several classifiers and see if you can build a great spam
    classifier, with both high recall and high precision.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import os
from pathlib import Path

# Dataset folders are looked up inside the current working directory:
# "easy_ham" holds the ham emails and "spam" holds the spam emails.
ham_path = os.path.join(os.getcwd(), "easy_ham")
spam_path = os.path.join(os.getcwd(), "spam")

In [3]:
def is_email_file(path):
    """Return True when *path* looks like a raw email file.

    A candidate must be a regular file whose name begins with a digit and
    whose latin-1 decoded contents contain "Subject:" or "From:" somewhere.
    Anything else (directories, index files, ...) yields False.
    """
    if not path.is_file():
        return False
    if not path.name[0].isdigit():
        return False

    contents = path.read_text(encoding="latin1")
    return any(marker in contents for marker in ("Subject:", "From:"))

def load_emails(folder):
    """Read every email file found directly inside *folder*.

    Args:
        folder: Path (str or Path) of the directory holding the raw emails.

    Returns:
        A list with the latin-1 decoded text of each entry accepted by
        ``is_email_file``. Entries that are rejected, or that cannot be
        read, are reported on stdout and left out of the result.
    """
    email_dir = Path(folder)
    emails = []

    # iterdir() lazily yields one Path per directory entry, in arbitrary order.
    for path in email_dir.iterdir():
        try:
            if not is_email_file(path):
                print(f"{path.name} is not an email file")
                continue
            text = path.read_text(encoding="latin1")
        except OSError:
            # Only I/O failures are skipped; latin-1 decoding cannot fail, and
            # KeyboardInterrupt / SystemExit must keep propagating.
            print(f"Skipping file {path.name}")
            continue

        emails.append(text)

    return emails

In [4]:
# Load the raw text of every email; load_emails prints the name of each
# directory entry it does not accept as an email file.
spam_emails = load_emails(spam_path)
ham_emails = load_emails(ham_path)

0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1 is not an email file


In [5]:
# Number of emails loaded per class: spam first, then ham.
print(len(spam_emails))
print(len(ham_emails))

500
2551


In [6]:
# Pair every email with its label (1 = spam, 0 = ham); spam rows come first.
data = [
    *((spam_text, 1) for spam_text in spam_emails),
    *((ham_text, 0) for ham_text in ham_emails),
]

len(data)

3051

# BETTER URL HANDLING NEEDED

In [7]:
from email.parser import Parser
from email import policy 
from bs4 import BeautifulSoup
import regex as re

# Preprocesser
class PreProcessing(BaseEstimator, TransformerMixin):
    """Clean raw email strings into plain text ready for tokenization.

    Every cleaning step is optional and controlled by a constructor flag:

    * ``strip_headers``  -- keep only the message body; when False the parsed
      headers are prepended to the body as ``Name: value`` lines.
    * ``lowercase``      -- lowercase the text.
    * ``replace_url``    -- replace ``http(s)://`` and ``www.`` URLs with the
      token ``URL``.
    * ``remove_punc``    -- delete every Unicode punctuation character.
    * ``remove_numbers`` -- replace standalone numbers with the token
      ``NUMBER``.

    ``transform`` takes and returns a list of ``(email_str, label)`` pairs;
    labels are passed through untouched.
    """

    def __init__(self, strip_headers=True, lowercase=True, remove_punc=True,
                 replace_url=True, remove_numbers=True):

        self.strip_headers = strip_headers
        self.lowercase = lowercase
        self.remove_punc = remove_punc
        self.replace_url = replace_url
        self.remove_numbers = remove_numbers

    def __html_to_text(self, html):
        """Return the visible text of an HTML document, tags removed."""
        soup = BeautifulSoup(html, "html.parser")
        return soup.get_text()

    def __merge_header_content(self, text_plain, email_msg_obj):
        """Prepend every header of the message, one per line, to the body."""
        header_lines = [
            f"{name}: {value}\n" for name, value in email_msg_obj.items()
        ]
        return "".join(header_lines) + text_plain

    def __decode_part(self, part):
        """Return the payload of a non-multipart message part as text.

        The transfer encoding (base64, quoted-printable, ...) is undone first,
        then the bytes are decoded with the part's declared charset. A missing
        or unknown charset falls back to latin-1; undecodable bytes are
        replaced rather than raising.
        """
        charset = part.get_content_charset() or "latin1"

        payload = part.get_payload(decode=True)
        if payload is None:
            return ""
        if not isinstance(payload, bytes):
            return payload or ""

        try:
            return payload.decode(charset, errors="replace")
        except LookupError:
            # The declared charset is not known to Python.
            return payload.decode("latin1", errors="replace")

    def __extract_email(self, email_str):
        """Parse *email_str* and pick the text that represents its body.

        Returns:
            A ``(text, EmailMessage)`` tuple. For a single-part message the
            text is its payload (converted from HTML when the content type is
            text/html). For a multipart message it is the first text/plain
            part; if there is none, the last text/html part converted to
            text; otherwise the empty string.
        """
        email_message = Parser(policy=policy.default).parsestr(email_str)

        if not email_message.is_multipart():
            text = self.__decode_part(email_message)
            if email_message.get_content_type() == "text/html":
                text = self.__html_to_text(text)
            return text, email_message

        html_fallback = ""

        # walk() visits the whole tree depth-first, so text nested inside e.g.
        # multipart/mixed -> multipart/alternative is found as well;
        # iter_parts() would only see the direct children.
        for part in email_message.walk():
            if part.is_multipart():
                # Containers carry no text of their own.
                continue

            content_type = part.get_content_type()
            if content_type == "text/plain":
                return self.__decode_part(part), email_message
            if content_type == "text/html":
                # Conversion is deferred: it is only needed when no
                # text/plain part shows up.
                html_fallback = self.__decode_part(part)

        text_html = self.__html_to_text(html_fallback) if html_fallback else ""
        return text_html, email_message

    def __remove_punc(self, email_str):
        # \p{P} matches any kind of punctuation character from any language
        return re.sub(r'\p{P}+', '', email_str)

    def __remove_numbers(self, email_str):
        # Despite the name, numbers are replaced by a placeholder token.
        return re.sub(r'\b\d+(?:\.\d+)?\b', 'NUMBER', email_str)

    # TODO: BETTER URL HANDLING NEEDED
    def __remove_url(self, email_str):
        # Despite the name, URLs are replaced by a placeholder token.
        return re.sub(r'https?://\S+|www\.\S+', 'URL', email_str)

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns self."""
        return self

    def transform(self, X, y=None):
        """Apply the enabled cleaning steps to every email.

        Args:
            X: iterable of ``(email_str, label)`` pairs.
            y: ignored.

        Returns:
            A list of ``(cleaned_text, label)`` pairs in the same order.
        """
        processed_data = []

        for email_str, label in X:
            text_plain, email_message = self.__extract_email(email_str)

            if self.strip_headers:
                email_str = text_plain
            else:
                email_str = self.__merge_header_content(text_plain, email_message)

            # Lowercasing runs first so the URL / NUMBER placeholders inserted
            # below stay uppercase.
            if self.lowercase:
                email_str = email_str.lower()

            # URLs must be replaced before punctuation is stripped: the URL
            # pattern relies on "://" and ".".
            if self.replace_url:
                email_str = self.__remove_url(email_str)

            if self.remove_punc:
                email_str = self.__remove_punc(email_str)

            # Runs after punctuation removal, so "1,000" has already become
            # "1000" and maps to a single NUMBER token.
            if self.remove_numbers:
                email_str = self.__remove_numbers(email_str)

            processed_data.append((email_str, label))

        return processed_data
    

# All constructor flags default to True, so every cleaning step is enabled.
preprocesser = PreProcessing()

In [8]:
# Replaces the raw (email_str, label) pairs with (cleaned_text, label) pairs.
data = preprocesser.transform(data)

In [9]:
import joblib

In [10]:
# joblib.dump(data, "processed_data.joblib")

In [11]:
# Rebinds `data` to the copy stored in "processed_data.joblib", discarding the
# in-memory result computed above.
# NOTE(review): joblib files are pickle-based -- only load files you created.
data = joblib.load("processed_data.joblib")

In [12]:
import nltk
# nltk.download("stopwords")
from nltk.corpus import stopwords

In [13]:
from nltk.stem import PorterStemmer

In [None]:
class WordCountVectorizer(BaseEstimator, TransformerMixin):
    """Turn cleaned email strings into a table of per-word counts.

    Args:
        stemming: when True, tokens (and the stop words they are compared
            against) are reduced with NLTK's PorterStemmer.
        vocab_size: number of most frequent tokens that get their own column;
            every other token is counted in the ``OTHER_ELEMENT`` column.

    ``fit`` learns the vocabulary and stores it in ``features_``;
    ``transform`` returns a DataFrame with one row per email.
    ``create_vocab`` and ``count_matrixation`` remain available for running
    the two steps by hand.
    """

    def __init__(self, stemming=True, vocab_size=5000):
        self.stop_words = set(stopwords.words("english"))
        self.stemming = stemming
        self.vocab_size = vocab_size

    def _tokenize(self, X):
        """Split every email of *X* into its list of kept tokens."""
        # PorterStemmer behaves deterministically
        stemmer = PorterStemmer()

        stop_words = self.stop_words
        if self.stemming:
            # Tokens are stemmed before the stop-word check, so the stop words
            # must be stemmed too. A set keeps each lookup O(1).
            stop_words = {stemmer.stem(word) for word in stop_words}

        X_tokens = []
        for email_str in X:
            email_str = email_str.replace("\n", " ").strip()
            email_str = self.__subs_datetime(email_str)

            # A token is a run of word characters or one single symbol.
            row_tokens = re.findall(r"\w+|[^\w\s]", email_str)
            if self.stemming:
                row_tokens = [
                    stemmer.stem(token, to_lowercase=False) for token in row_tokens
                ]

            checked = (
                self.validate_and_tokenize(token, stop_words) for token in row_tokens
            )
            X_tokens.append([token for token in checked if token is not None])

        return X_tokens

    def create_vocab(self, X, y=None):
        """Tokenize *X* and pick the ``vocab_size`` most frequent tokens.

        Args:
            X: iterable of email strings.
            y: ignored.

        Returns:
            ``(features, X_tokens)``: the vocabulary, most frequent token
            first, and the token list of every email of *X*.
        """
        X_tokens = self._tokenize(X)
        token_list = [token for row_tokens in X_tokens for token in row_tokens]
        if not token_list:
            return [], X_tokens

        # vocab_size determines the number of feature columns; tokens outside
        # this list are later counted as OTHER_ELEMENT.
        token_counts = pd.Series(token_list).value_counts(ascending=False)
        features = token_counts.head(self.vocab_size).index.to_list()

        return features, X_tokens

    def count_matrixation(self, X, features):
        """Count, for every email, how often each vocabulary token occurs.

        Args:
            X: token lists, one per email (``X_tokens`` from ``create_vocab``).
            features: vocabulary tokens; each gets its own column.

        Returns:
            An int64 DataFrame with one row per email (index ``email0``,
            ``email1``, ...) and one column per feature, followed by
            ``OTHER_ELEMENT`` holding the count of all remaining tokens.
        """
        from collections import Counter

        # dict.fromkeys drops duplicate names while keeping their order.
        columns = list(dict.fromkeys([*features, "OTHER_ELEMENT"]))
        column_of = {name: position for position, name in enumerate(columns)}
        other_column = column_of["OTHER_ELEMENT"]
        # Set membership keeps the per-token check O(1) instead of scanning
        # the feature list for every token.
        feature_set = set(features)

        rows = list(X)
        counts = np.zeros((len(rows), len(columns)), dtype=np.int64)

        for row_idx, email_tokens in enumerate(rows):
            for token, occurrences in Counter(email_tokens).items():
                if token in feature_set:
                    counts[row_idx, column_of[token]] += occurrences
                else:
                    counts[row_idx, other_column] += occurrences

        index = [f"email{idx}" for idx in range(len(rows))]
        return pd.DataFrame(counts, index=index, columns=columns)

    def validate_and_tokenize(self, token, stop_words):
        """Return the stripped token, or None when it should be dropped.

        A token is dropped when it is empty, 20 characters or longer, or
        when its lowercase form is in *stop_words*.
        """
        token = token.strip()
        if 0 < len(token) < 20 and token.lower() not in stop_words:
            return token
        return None

    def __subs_datetime(self, email_str):
        """Replace compact and ISO-like date/time stamps with ``DATETIME``."""
        datetime_regex = r'''
            \b
            (                                # group entire match
                \d{8}[Tt]?\d{6}              # 20021009T225035 or 20021009t205951
                (?:Z|[+-]?\d{4})?            # optional timezone like Z or +0800
                |
                \d{4}[-./]\d{2}[-./]\d{2}    # 2002-12-01, 2002/12/01, 2002.12.01
                (?:[Tt]\d{2}[:.]?\d{2}[:.]?\d{2})?  # optional time like T23:45:12
            )
            \b
        '''
        return re.sub(datetime_regex, 'DATETIME', email_str, flags=re.IGNORECASE | re.VERBOSE)

    def fit(self, X, y=None):
        """Learn the vocabulary from the email strings in *X*; returns self."""
        self.features_, _ = self.create_vocab(X, y)
        return self

    def transform(self, X, y=None):
        """Return the count table of *X* for the vocabulary learned by fit."""
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "features_")
        return self.count_matrixation(self._tokenize(X), self.features_)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit and transform in one pass, tokenizing *X* only once."""
        self.features_, X_tokens = self.create_vocab(X, y)
        return self.count_matrixation(X_tokens, self.features_)

# Uses the constructor defaults: stemming=True, vocab_size=5000.
vector_transformer = WordCountVectorizer()

In [15]:
# Split the (text, label) pairs into parallel tuples of texts and labels.
X, y = zip(*data)
# create_vocab returns the vocabulary and the token list of every email.
columns, X_tokens = vector_transformer.create_vocab(X, y)

In [16]:
len(columns)

26471

In [17]:
len(X_tokens)

3051

In [18]:
# One row per email; one column per vocabulary token plus OTHER_ELEMENT.
matrix = vector_transformer.count_matrixation(X_tokens, columns)

In [19]:
matrix

Unnamed: 0,save,NUMBER,life,insur,spend,quot,ensur,famili,financi,secur,...,trebl,radioact,reactor,spineless,poet,215pm,poetri,sperm,powergen,whitehal
email0,8,3,6,4,1,5,1,2,1,1,...,0,0,0,0,0,0,0,0,0,0
email1,0,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
email2,0,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
email3,0,15,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
email4,0,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
email3046,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
email3047,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
email3048,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
email3049,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# joblib.dump(matrix, "X_matrixed.joblib")

In [21]:
matrix.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3051 entries, email0 to email3050
Columns: 26471 entries, save to whitehal
dtypes: int64(26471)
memory usage: 616.2+ MB
