# Spam Classifier

The puporse of this project is to detect an Email as Spam or not.

In [128]:
# First, let's fetch the data:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20050311_spam_2.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (('easy_ham.tar.bz2',ham_url),('spam_2.tar.bz2',spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=spam_path)
        tar_bz2_file.close()

In [129]:
fetch_spam_data()

In [130]:
# Next, let's load all the emails:

HAM_DIR = os.path.join(SPAM_PATH, "easy_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam_2")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]


In [131]:
len(ham_filenames)

2500

In [132]:
len(spam_filenames)

1396

In [133]:
# Let's use Python's email module to parse these emails (this handles headers, encoding, and so on):
import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = "spam_2" if is_spam else "easy_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [134]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [135]:
# Let's look at one example of ham email, to get a feel of what the data looks like:

print(ham_emails[1])


Return-Path: <Steve_Burt@cursor-system.com>
Delivered-To: zzzz@localhost.netnoteinc.com
Received: from localhost (localhost [127.0.0.1])
	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id BE12E43C34
	for <zzzz@localhost>; Thu, 22 Aug 2002 07:46:38 -0400 (EDT)
Received: from phobos [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:46:38 +0100 (IST)
Received: from n20.grp.scd.yahoo.com (n20.grp.scd.yahoo.com    [66.218.66.76])
 by dogma.slashnull.org (8.11.6/8.11.6) with SMTP id    g7MBkTZ05087 for
 <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 12:46:29 +0100
X-Egroups-Return: =?utf-8?q?sentto-2242572-52726-1030016790-zzzz=3Dspamassas?=
 =?utf-8?q?sin=2Etaint=2Eorg=40returns=2Egroups=2Eyahoo=2Ecom?=
Received: from [66.218.67.196] by n20.grp.scd.yahoo.com with NNFMP;
    22 Aug 2002 11:46:30 -0000
X-Sender: steve.burt@cursor-system.com
X-Apparently-To: zzzzteana@yahoogroups.com
Received: (EGP: mail-8_1_0_1); 22 Aug 2002 11:4

In [136]:
# Let's look at one example of spam email, to get a feel of what the data looks like:

print(spam_emails[3])

Return-Path: merchantsworld2001@juno.com
Delivery-Date: Thu May 16 11:03:55 2002
Received: from mandark.labs.netnoteinc.com ([213.105.180.140]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g4GA3qe29480 for
    <jm@jmason.org>; Thu, 16 May 2002 11:03:52 +0100
Received: from webnote.net (mail.webnote.net [193.120.211.219]) by
    mandark.labs.netnoteinc.com (8.11.2/8.11.2) with ESMTP id g4GA3oD28650 for
    <jm@netnoteinc.com>; Thu, 16 May 2002 11:03:51 +0100
Received: from webcust2.hightowertech.com (webcust2.hightowertech.com
    [216.41.166.100]) by webnote.net (8.9.3/8.9.3) with ESMTP id BAA11067 for
    <jm@netnoteinc.com>; Thu, 16 May 2002 01:58:00 +0100
Received: from html ([199.35.236.73]) by webcust2.hightowertech.com  with
    Microsoft SMTPSVC(5.5.1877.197.19); Wed, 15 May 2002 13:50:57 -0700
From: jordan23@mailexcite.com
To: ranmoore@swbell.net
Subject: New Improved Fat Burners, Now With TV Fat Absorbers! Time:7:20:54 AM
Date: Thu, 31 Jul 1980 07:20:54
MIME-Versio

In [137]:
# Some emails are actually multipart, with images and attachments (which can have their own attachments). 
# Let's look at the various types of structures of emails we have:

def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [138]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures


In [139]:
# let's see the structure of Ham Emails
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [140]:
# let's see the structure of Spam Emails
structures_counter(spam_emails).most_common()

[('text/plain', 597),
 ('text/html', 589),
 ('multipart(text/plain, text/html)', 114),
 ('multipart(text/html)', 29),
 ('multipart(text/plain)', 25),
 ('multipart(multipart(text/html))', 18),
 ('multipart(multipart(text/plain, text/html))', 5),
 ('multipart(text/plain, application/octet-stream, text/plain)', 3),
 ('multipart(text/html, text/plain)', 2),
 ('multipart(text/html, image/jpeg)', 2),
 ('multipart(multipart(text/plain), application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/jpeg)',
  1),
 ('multipart(multipart(text/plain, text/html), image/jpeg, image/jpeg, image/jpeg, image/jpeg, image/gif)',
  1),
 ('text/plain charset=us-ascii', 1),
 ('multipart(multipart(text/html), image/gif)', 1),
 ('multipart(multipart(text/plain, text/html), application/octet-stream, application/octet-stream, applic

In [141]:
# Now let's take a look at the email headers:
for header, value in spam_emails[0].items():
    print(header,":",value)

Return-Path : <ilug-admin@linux.ie>
Delivered-To : yyyy@localhost.netnoteinc.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 9E1F5441DD	for <jm@localhost>; Tue,  6 Aug 2002 06:48:09 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for jm@localhost (single-drop); Tue, 06 Aug 2002 11:48:09 +0100 (IST)
Received : from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72LqWv13294 for    <jm-ilug@jmason.org>; Fri, 2 Aug 2002 22:52:32 +0100
Received : from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org    (8.9.3/8.9.3) with ESMTP id WAA31224; Fri, 2 Aug 2002 22:50:17 +0100
Received : from bettyjagessar.com (w142.z064000057.nyc-ny.dsl.cnc.net    [64.0.57.142]) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id WAA31201 for    <ilug@linux.ie>; Fri, 2 Aug 2002 22:50:11 +0100
Received : from 64.0.57.142 [202.63.165.34] by bettyjagessa

In [142]:
spam_emails[0]["Subject"]

'[ILUG] STOP THE MLM INSANITY'

In [143]:
from bs4 import BeautifulSoup, NavigableString, Tag

# Function to convert html emails content to plain text

def html_to_plain_text(html):
    "Creates a formatted text email message as a string from a rendered html template (page)"
    soup = BeautifulSoup(html, 'html.parser')
    # Ignore anything in head
    body, text = soup.body, []
    if body:
        for element in body.descendants:
            # We use type and not isinstance since comments, cdata, etc are subclasses that we don't want
            if type(element) == NavigableString:
                parent_tags = (t for t in element.parents if type(t) == Tag)
                hidden = False
                for parent_tag in parent_tags:
                    # Ignore any text inside a non-displayed tag
                    # We also behave is if scripting is enabled (noscript is ignored)
                    # The list of non-displayed tags and attributes from the W3C specs:
                    if (parent_tag.name in ('area', 'base', 'basefont', 'datalist', 'head', 'link',
                                            'meta', 'noembed', 'noframes', 'param', 'rp', 'script',
                                            'source', 'style', 'template', 'track', 'title', 'noscript') or
                        parent_tag.has_attr('hidden') or
                        (parent_tag.name == 'input' and parent_tag.get('type') == 'hidden')):
                        hidden = True
                        break
                if hidden:
                    continue

                # remove any multiple and leading/trailing whitespace
                string = ' '.join(element.string.split())
                if string:
                    if element.parent.name == 'a':
                        a_tag = element.parent
                        # replace link text with the link
                        string = a_tag.get('href')
                        # concatenate with any non-empty immediately previous string
                        if (type(a_tag.previous_sibling) == NavigableString and
                                a_tag.previous_sibling.string.strip() and string):
                            text[-1] = text[-1] + ' ' + string
                            continue
                    elif element.previous_sibling and element.previous_sibling.name == 'a' and string:
                        if text:
                            text[-1] = text[-1] + ' ' + string
                        continue
                    elif element.parent.name == 'p' and string:
                        # Add extra paragraph formatting newline
                        string = '\n' + string
                    if string:
                        text += [string]
                    else:
                        text +=[""]
                        
        doc = '\n'.join(text)
        return doc
    else:
        return ''

In [144]:
# Let's see how it works for some HTML spam emails:

html_spam_emails = [email for email in spam_emails
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content())



<html><body>

<center>
<font face="arial"><b>Talk on Tele  with locals in your area who want to meet for real encounters. 
 No pre recorded bull this is the real deal.
<p>

US residents: the 9<!--dads tools-->00-370-54<!--starter-->65 or  8<!--ender-->88-400-1<!--end-->919. - 99<!--hi hi -->
 cents / min
<p>

For CA callers try our special California  line, California is so popular we had to create a seperate system just for them
<p>
: 1-<!--moms-->9<!--low-->00-505-7575.
<p>
must be 18<!--none-->+ be careful when making sexual dates and meetings. Cali 9<!--polic cars-->00# is $1.99 per min

</html>

211075433222



In [145]:
# And this is the resulting plain text:

print(html_to_plain_text(sample_html_spam.get_content()))

Talk on Tele with locals in your area who want to meet for real encounters. No pre recorded bull this is the real deal.

US residents: the 9

00-370-54

65 or 8

88-400-1

919. - 99

cents / min

For CA callers try our special California line, California is so popular we had to create a seperate system just for them

: 1-

9

00-505-7575.

must be 18

+ be careful when making sexual dates and meetings. Cali 9

00# is $1.99 per min


In [146]:
# function to read the contents of an email. Here we are considering only text or html content.

def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [147]:
print(email_to_text(sample_html_spam))

Talk on Tele with locals in your area who want to meet for real encounters. No pre recorded bull this is the real deal.

US residents: the 9

00-370-54

65 or 8

88-400-1

919. - 99

cents / min

For CA callers try our special California line, California is so popular we had to create a seperate system just for them

: 1-

9

00-505-7575.

must be 18

+ be careful when making sexual dates and meetings. Cali 9

00# is $1.99 per min


### Train Test Split
Now its time to split our data into a training set and a testing set!

In [148]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails, dtype=object)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<b> We will create a transformer that will convert emails to word counters. Note that we will split sentences into words using Python's split() method, which uses whitespaces for word boundaries.</b>

In [149]:
from sklearn.base import BaseEstimator, TransformerMixin
import urlextract
import re
import nltk

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
        self.url_extractor = urlextract.URLExtract()
        self.stemmer = nltk.PorterStemmer()
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls is not None:
                urls = list(set(self.url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and self.stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = self.stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [150]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'number': 8, 'to': 4, 'a': 4, 'the': 4, 'url': 2, 'ubi': 2, 'china': 2, 'for': 2, 'local': 2, 'date': 1, 'numbertnumb': 1, 'had': 1, 'alway': 1, 'want': 1, 'make': 1, 'pc': 1, 'game': 1, 'market': 1, 'but': 1, 'factor': 1, 'kept': 1, 'idea': 1, 'on': 1, 'hold': 1, 'in': 1, 'januari': 1, 'right': 1, 'incent': 1, 'motiv': 1, 'tri': 1, 'project': 1, 'final': 1, 'arriv': 1, 'licens': 1, 'music': 1, 'up': 1, 'popular': 1, 'anim': 1, 'properti': 1}),
       Counter({'number': 6, 'to': 5, 'you': 5, 'the': 4, 'of': 4, 'receiv': 4, 'thi': 3, 'not': 3, 'email': 3, 'our': 3, 'offer': 3, 'plan': 2, 'at': 2, 'and': 2, 'enrol': 2, 'is': 2, 'in': 2, 'are': 2, 'by': 2, 'sign': 1, 'up': 1, 'for': 1, 'full': 1, 'access': 1, 'medic': 1, 'llc': 1, 'applic': 1, 'must': 1, 'be': 1, 'least': 1, 'pay': 1, 'a': 1, 'one': 1, 'time': 1, 'fee': 1, 'regardless': 1, 'depend': 1, 'non': 1, 'insur': 1, 'healthcar': 1, 'avail': 1, 'washington': 1, 'sent': 1, 'unsolicit': 1, 'it': 1, 'becaus': 1, 'reque

Now we have the word counts, and we need to convert them to vectors. For this, we will build another transformer whose fit() method will build the vocabulary (an ordered list of the most common words) and whose transform() method will use the vocabulary to convert word counts to vectors. The output is a sparse matrix.


In [151]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [152]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors


<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 30 stored elements in Compressed Sparse Row format>

In [153]:
X_few_vectors.toarray()

array([[ 35,   8,   4,   4,   0,   4,   2,   2,   0,   1,   0],
       [107,   6,   5,   4,   4,   1,   1,   1,   1,   2,   5],
       [230,  20,   9,   9,  10,   6,   6,   5,   7,   4,   2]])

What does this matrix mean? Well, the 107 in the second row, first column, means that the second email contains 107 words that are not part of the vocabulary. The 6 next to it means that the first word in the vocabulary is present 6 times in this email. The 5 next to it means that the second word is present 5 times, and so on. We can look at the vocabulary to know which words we are talking about. The first word is "number", the second word is "a", etc.



In [154]:
vocab_transformer.vocabulary_

{'number': 1,
 'to': 2,
 'the': 3,
 'of': 4,
 'a': 5,
 'for': 6,
 'url': 7,
 'that': 8,
 'in': 9,
 'you': 10}

We are now ready to train our first spam classifier! Let's transform the whole dataset:



In [155]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

### Training the LogisticRegression model

In [156]:
from sklearn.linear_model import LogisticRegression


log_clf = LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
log_clf.fit(X_train_transformed,y_train)



### Predictions and Evaluation of LogisticRegression
Create predictions from the test set and create a classification report and a confusion matrix.

In [157]:
X_test_transformed = preprocess_pipeline.transform(X_test)
predictions = log_clf.predict(X_test_transformed)

In [158]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       496
           1       0.96      0.99      0.97       284

    accuracy                           0.98       780
   macro avg       0.98      0.98      0.98       780
weighted avg       0.98      0.98      0.98       780



In [159]:
print(confusion_matrix(y_test,predictions))

[[484  12]
 [  3 281]]


In [160]:
from sklearn.metrics import precision_score, recall_score

print("Precision: {:.2f}%".format(100 * precision_score(y_test, predictions)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, predictions)))

Precision: 95.90%
Recall: 98.94%


### Training the Random Forest model

In [161]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=300)

rfc.fit(X_train_transformed,y_train)

### Predictions and Evaluation
Let's predict off the y_test values and evaluate our model.

In [162]:
rfc_predictions = rfc.predict(X_test_transformed)

In [163]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test,rfc_predictions))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       496
           1       0.99      0.96      0.98       284

    accuracy                           0.98       780
   macro avg       0.99      0.98      0.98       780
weighted avg       0.98      0.98      0.98       780



In [164]:
print(confusion_matrix(y_test,rfc_predictions))

[[494   2]
 [ 10 274]]


In [165]:
from sklearn.metrics import precision_score, recall_score

print("Precision: {:.2f}%".format(100 * precision_score(y_test, rfc_predictions)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, rfc_predictions)))

Precision: 99.28%
Recall: 96.48%


In [196]:
# Function to read an email.
import email
from email import policy
from email.parser import BytesParser
import glob
import os

def read_eml_files(path='./datasets/spam/'):
    eml_files = glob.glob(path + '*.eml') # get all .eml files in a list
    email_contents =[]
    for eml_file in eml_files:
        print(eml_file)
        with open(os.path.join(eml_file), "rb") as f:
            email_contents.append(email.parser.BytesParser(policy=email.policy.default).parse(f))
    return email_contents

In [197]:
test_emails = read_eml_files()

./datasets/spam/report.eml
./datasets/spam/Assured great returns and cashback await you🤑.eml


In [198]:
test_emails_transformed = preprocess_pipeline.transform(test_emails)
predictions_emails = rfc.predict(test_emails_transformed)

In [199]:
print(predictions_emails)

[0 1]
