In [1]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

def load_spam_data():
    tarball_path_spam = Path("datasets/spam_data.tar.bz2")
    tarball_path_ham = Path("datasets/ham_data.tar.bz2")
    
    if not tarball_path_spam.is_file() and not tarball_path_ham.is_file():
        Path("datasets").mkdir(parents=True , exist_ok=True)
        
        spam_url = "https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2"
        ham_url = "https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2"
        
        urllib.request.urlretrieve(spam_url , tarball_path_spam)
        with tarfile.open(tarball_path_spam) as tb:
            tb.extractall(path="datasets" , filter="data")
            
        urllib.request.urlretrieve(ham_url , tarball_path_ham)
        with tarfile.open(tarball_path_ham) as tb:
            tb.extractall(path="datasets" , filter="data")
            
    return [Path("datasets/spam") , Path("datasets/easy_ham")]


In [2]:
spam_dir , ham_dir = load_spam_data()

In [3]:
# Keep only directory entries whose file name is longer than 20 characters,
# in sorted order (presumably this skips short helper files that are not
# e-mail messages -- TODO confirm against the extracted directories).
spam_filenames = sorted(path for path in spam_dir.iterdir() if len(path.name) > 20)
ham_filenames = sorted(path for path in ham_dir.iterdir() if len(path.name) > 20)

In [4]:
print(f"Num of spam: {len(spam_filenames)}")
print(f"Num of ham: {len(ham_filenames)}")

Num of spam: 501
Num of ham: 2551


Using Python's email lib to decode the mails

In [5]:
import email
import email.policy

def get_email(filename):
    """Parse one raw e-mail file into a message object.

    Args:
        filename: path (``str`` or path-like) of the raw message file; it is
            opened in binary mode.

    Returns:
        The message parsed with ``email.policy.default``.

    Raises:
        OSError: if the file cannot be opened.
    """
    # Imported explicitly: `import email` / `import email.policy` do not load
    # the `email.parser` submodule, so `email.parser.BytesParser` only resolved
    # when some unrelated import had already pulled it in.
    from email.parser import BytesParser

    with open(filename, "rb") as f:
        return BytesParser(policy=email.policy.default).parse(f)

Decode all email files

In [6]:
# Parse every message file; list order follows the filename lists.
spam_emails = list(map(get_email, spam_filenames))
ham_emails = list(map(get_email, ham_filenames))

In [7]:
#print(spam_emails[2])  # or: print(spam_emails[2].get_content().strip())

Split data into train and test set:

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split

# Ham messages come first, spam second, so X and y line up element by element.
# NOTE(review): assumes np.array keeps each message as a single object, i.e.
# X.shape == (number of mails,) -- confirm.
X = np.array(ham_emails + spam_emails, dtype=object)

# Label convention: ham => 0 | spam => 1
y = np.repeat([0, 1], [len(ham_emails), len(spam_emails)])

# Hold out 20% of the mails for the final evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

Get an insight on the structure of the mails:

In [9]:
print(ham_emails[10].get_payload()) #=> emails with more content (e.g. larger conversations, etc.)

[<email.message.EmailMessage object at 0x12303ae10>, <email.message.EmailMessage object at 0x122f3edd0>]


In [10]:
def get_email_structure(email):
    """Describe the MIME structure of a message as a string.

    Args:
        email: a message object, or a plain string (returned unchanged).

    Returns:
        str: the content type for a message whose payload is not a list
        (e.g. ``"text/plain"``), or ``"multipart(<part>, <part>, ...)"`` built
        recursively when the payload is a list of sub-parts.
    """
    if isinstance(email, str):
        return email

    parts = email.get_payload()
    if not isinstance(parts, list):
        # Leaf: report its own content type.
        return email.get_content_type()

    # Container: describe every sub-part recursively, in order.
    inner = ", ".join(map(get_email_structure, parts))
    return f"multipart({inner})"

In [11]:
get_email_structure(ham_emails[10])

'multipart(text/plain, application/pgp-signature)'

In [12]:
from collections import Counter

def structures_counter(emails):
    """Count how often each structure string occurs in ``emails``.

    Args:
        emails: iterable of messages accepted by ``get_email_structure``.

    Returns:
        collections.Counter: structure string -> number of occurrences.
    """
    return Counter(get_email_structure(message) for message in emails)

Compute the most common structures to get an idea on how typical ham and spam mails are constructed/structured:

In [13]:
structures_counter(ham_emails).most_common()

[('text/plain', 2453),
 ('multipart(text/plain, application/pgp-signature)', 72),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [14]:
structures_counter(spam_emails).most_common()

[('text/plain', 222),
 ('text/html', 181),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 19),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

First conclusion: the structure is useful information. For example, ham mails consist almost entirely of plain text and sometimes carry a PGP signature, while spam frequently contains HTML (roughly half of the spam mails above) and never carries a PGP signature.

## Preprocessing

1. Convert HTML Tags into plain text using BeautifulSoup

In [15]:
from bs4 import BeautifulSoup

def email_html_to_text(email):
    """Return the text of an e-mail, preferring plain text over HTML.

    All parts of the message are walked in order. The content of the first
    ``text/plain`` part is returned unchanged. If there is none, the last
    ``text/html`` part seen is converted to text with BeautifulSoup (tags
    removed, text fragments stripped and joined with single spaces).

    Args:
        email: message object providing ``walk()``.

    Returns:
        str | None: the extracted text, or ``None`` when the message has no
        ``text/plain`` part and no non-empty ``text/html`` part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Best-effort fallback for decoding problems (e.g. an unknown or
            # broken charset). `Exception` rather than a bare `except` so that
            # KeyboardInterrupt / SystemExit are not swallowed.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content

    if html:
        soup = BeautifulSoup(html, "html.parser")
        return soup.get_text(separator=" ", strip=True)
    return None

In [16]:
# Select the training mails labelled 1 (spam) whose structure is a single text/html part.
html_spam_emails = [mail for mail in X_train[y_train == 1] if get_email_structure(mail) == "text/html"]
# Pick one of them as the sample used for the demo below.
sample_html_spam = html_spam_emails[7]
# Show only the first 400 characters of the raw HTML.
print(sample_html_spam.get_content().strip()[:400], "...")

<html>
<head>
</head>
<center>
<h1>
<b><font face="Arial Black"><font color="#0000FF"><font size=+2>&nbsp;
Free Personal and Business Grants</font></font></font></b></h1></center>

<p>&nbsp;
<center><table BORDER=0 CELLSPACING=0 CELLPADDING=10 WIDTH="419" BGCOLOR="#0000FF" >
<tr>
<td WIDTH="397" BGCOLOR="#FFFF00">
<center>
<h2>
<font face="Arial Narrow">" Qualify for <u>at least</u> $25,000 in fre ...


In [17]:
print(email_html_to_text(sample_html_spam)[:100], "...")

Free Personal and Business Grants " Qualify for at least $25,000 in free
grants money - Guaranteed!  ...


Works well.

We will remove URLs and replace them with the word 'URL', using urlextract lib. Here: Prepare the lib.

In [18]:
#%pip install -q -U urlextract #installing urlextract if not available

import urlextract

# Shared extractor instance, used later by the e-mail transformer.
url_extractor = urlextract.URLExtract()

# Quick check: a bare domain and a full URL should both be found.
some_text = "Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"
print(url_extractor.find_urls(some_text))

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


We will reduce words to their stem/root form (stemming), so that different inflections of the same word are counted together.
We use the Natural Language Toolkit (NLTK) for this task. Here: just prepare the stemmer:

In [19]:
import nltk

stemmer = nltk.PorterStemmer()


Now, let's put everything together and create a transformer that transforms emails into word counters.

In [20]:
import re
from sklearn.base import BaseEstimator , TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn e-mail messages into per-mail word counts.

    ``transform`` extracts the text of every mail with ``email_html_to_text``
    and applies, in this order and each only if enabled: lower-casing, URL
    replacement, number replacement, punctuation removal, word counting and
    stemming. It relies on the module-level ``url_extractor`` and ``stemmer``
    objects; a step is skipped when its object is ``None``.

    Parameters:
        strip_headers: stored on the instance; not read by ``transform``.
        lower_case: convert the text to lower case.
        remove_punctuation: replace every run of non-word characters by one space.
        replace_urls: replace every detected URL by ``" URL "``.
        replace_numbers: replace every number by ``"NUMBER"``.
        stemming: merge the counts of words that share the same stem.
    """

    def __init__(self, strip_headers=True, lower_case=True,
                 remove_punctuation=True, replace_urls=True,
                 replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def _normalize_text(self, text):
        """Apply the enabled text clean-up steps and return the new text."""
        if self.lower_case:
            text = text.lower()

        if self.replace_urls and url_extractor is not None:
            # Longest URLs first, so a URL that is a substring of a longer one
            # does not break the longer match.
            found_urls = sorted(set(url_extractor.find_urls(text)), key=len, reverse=True)
            for found in found_urls:
                text = text.replace(found, " URL ")

        if self.replace_numbers:
            text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', "NUMBER", text)

        if self.remove_punctuation:
            text = re.sub(r'\W+', ' ', text, flags=re.M)

        return text

    def _count_words(self, text):
        """Count whitespace-separated words, merging stems if stemming is enabled."""
        # Splitting on whitespace is not sufficient for languages that do not
        # separate words by spaces (e.g. Chinese).
        counts = Counter(text.split())
        if not (self.stemming and stemmer is not None):
            return counts

        stemmed_counts = Counter()
        for token, occurrences in counts.items():
            stemmed_counts[stemmer.stem(token)] += occurrences
        return stemmed_counts

    def transform(self, X, y=None):
        """Return an array holding one ``Counter`` of word counts per mail in ``X``."""
        counters = [
            self._count_words(self._normalize_text(email_html_to_text(message) or ""))
            for message in X
        ]
        return np.array(counters)

Test our brand-new transformer on a few emails:

In [21]:
# Smoke test: run the transformer on the first three training mails only.
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
#X_few_wordcounts #remove comment for printout

Now we need to transform the word counts into vectors. We build another transformer for this task. Its fit() method will build an ordered list of the most common words and its transform() method will use this list to convert the word counts into vectors. The output will be a sparse matrix.

In [22]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert per-mail word counts into a sparse count matrix.

    ``fit`` learns a vocabulary of at most ``vocabulary_size`` words, stored in
    ``vocabulary_`` as ``word -> column index`` with indices starting at 1.
    ``transform`` writes each word's count into its column; words that are not
    in the vocabulary all go to column 0.

    Parameters:
        vocabulary_size: maximum number of words kept in the vocabulary.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Build ``vocabulary_`` from the word counts in ``X`` and return ``self``."""
        capped_totals = Counter()
        for counts in X:
            for token, occurrences in counts.items():
                # A single mail contributes at most 10 per word to the ranking.
                capped_totals[token] += min(occurrences, 10)

        ranked = capped_totals.most_common()[:self.vocabulary_size]
        # Column 0 is reserved for out-of-vocabulary words, so ranks start at 1.
        self.vocabulary_ = {
            token: position
            for position, (token, _) in enumerate(ranked, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix of shape ``(len(X), vocabulary_size + 1)``."""
        entries = [
            (row, self.vocabulary_.get(token, 0), occurrences)
            for row, counts in enumerate(X)
            for token, occurrences in counts.items()
        ]
        row_idx = [row for row, _, _ in entries]
        col_idx = [col for _, col, _ in entries]
        values = [value for _, _, value in entries]

        # Several entries can target the same cell (all out-of-vocabulary words
        # of a mail share column 0); the output shown further down suggests
        # csr_matrix adds them up -- see the SciPy reference to confirm.
        return csr_matrix(
            (values, (row_idx, col_idx)),
            shape=(len(X), self.vocabulary_size + 1),
        )

Also test this transformer on some word counts:

In [23]:
# A tiny vocabulary (10 words) keeps the resulting matrix small enough to read.
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
# Convert to a dense array for display only.
X_few_vectors.toarray()

array([[ 51,   2,   0,   2,   4,   1,   1,   3,   2,   2,   2],
       [116,   8,   7,   5,   3,   5,   3,   2,   3,   1,   2],
       [ 46,   3,   5,   0,   0,   1,   1,   0,   0,   2,   0]])

How to read this sparse matrix: The second row stands for the second mail. The 116 means this mail contains 116 words that are not part of the vocabulary. The 8 next to it means that the first word in the vocabulary is present 8 times in that particular email. The 7 next to it means the second word is present 7 times, and so on. The vocabulary can be seen via the `vocabulary_` attribute:

In [24]:
vocab_transformer.vocabulary_

{'i': 1,
 'number': 2,
 'the': 3,
 'it': 4,
 'to': 5,
 'that': 6,
 't': 7,
 'look': 8,
 'at': 9,
 'a': 10}

So coming back to the sparse matrix from above: The second mail contains 116 words that are not in the vocabulary. The word 'i' is present 8 times, the word 'number' 7 times, and so on.

## Train Classifier

We are now ready to train the spam classifier. 

In [25]:
from sklearn.pipeline import Pipeline

# Chain both steps: mails -> word counters -> sparse count vectors
# (both transformers use their default parameters here).
preprocess_pipeline = Pipeline([
    ("email_to_wordcount" , EmailToWordCounterTransformer()) ,
    ("wordcount_to_vector" , WordCounterToVectorTransformer())
])

# The vocabulary is learned from the training mails only.
X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# 3-fold cross-validation on the training set; the cell displays the mean of the fold scores.
log_clf = LogisticRegression(max_iter=1000 , random_state=42)
score = cross_val_score(log_clf , X_train_transformed , y_train , cv=3)
score.mean()

0.9877094471190412

98.8% is not bad for a start.

## Measure precision and recall on the test set

In [28]:
from sklearn.metrics import precision_score , recall_score

# Reuse the pipeline fitted on the training data; only transform() is called on the test mails.
X_test_transformed = preprocess_pipeline.transform(X_test) #do not fit on test data!

# Train a fresh classifier on the whole training set.
log_clf = LogisticRegression(max_iter=1000 , random_state=42)
log_clf.fit(X_train_transformed , y_train)

# Predict the held-out test mails (label convention from above: spam => 1).
y_pred = log_clf.predict(X_test_transformed)

print(f"Precision: {precision_score(y_test , y_pred): .2%}")
print(f"Recall: {recall_score(y_test , y_pred): .2%}")

Precision:  91.67%
Recall:  95.65%
