In [47]:
# This notebook is based on the solution to the spam classifier problem in
# Aurelien Geron's book, Hands-On ML. Much of the code here matches the code he provides exactly.

In [45]:
import numpy as np
import pandas as pd

In [1]:
import tarfile
import os
from six.moves import urllib

# Base location of the public SpamAssassin corpus archives.
BASE_URL = "http://spamassassin.apache.org/old/publiccorpus/"

# Local folders that the ham and spam archives are downloaded to and unpacked in.
HAM_FOLDER = "ham"
SPAM_FOLDER = "spam"

# Archive file names, appended to BASE_URL to form the download URLs.
HAM_URL_SUFFIX = "20021010_easy_ham.tar.bz2"
SPAM_URL_SUFFIX = "20021010_spam.tar.bz2"

def fetch_spam_data(destination_folder, destination_file, source_file, source_base_url = BASE_URL):
    """Download a corpus archive (if not already present) and unpack it.

    Args:
        destination_folder: Local directory that receives both the archive and
            its extracted contents; created if it does not exist.
        destination_file: File name to store the archive under inside
            ``destination_folder``.
        source_file: File name of the archive on the server; appended to
            ``source_base_url`` to form the download URL.
        source_base_url: URL prefix the archive is fetched from.

    The download is skipped when ``destination_folder/destination_file``
    already exists; extraction runs on every call.
    """
    os.makedirs(destination_folder, exist_ok=True)
    path = os.path.join(destination_folder, destination_file)
    if not os.path.isfile(path):
        # The remote name is `source_file`; `destination_file` is only the local name.
        urllib.request.urlretrieve(source_base_url + source_file, path)
    print("path:", path)
    # The context manager closes the archive even if extraction fails.
    with tarfile.open(path) as tar_data_file:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape the destination folder or create special files.
            tar_data_file.extractall(destination_folder, filter="data")
        else:
            # Older Python without extraction filters: fall back to plain extraction.
            tar_data_file.extractall(destination_folder)

In [2]:
# Fetch and unpack both corpora; each archive keeps its server-side name locally.
for folder, archive_name in ((HAM_FOLDER, HAM_URL_SUFFIX), (SPAM_FOLDER, SPAM_URL_SUFFIX)):
    fetch_spam_data(folder, archive_name, archive_name)

path: ham/20021010_easy_ham.tar.bz2
path: spam/20021010_spam.tar.bz2


In [28]:
# Directories holding the individual message files; the email-loading cell
# reads from these same locations ("ham/easy_ham" and "spam/spam").
# Pass the components separately so os.path.join supplies the separator.
ACTUAL_HAM_DIR = os.path.join(HAM_FOLDER, "easy_ham")
ACTUAL_SPAM_DIR = os.path.join(SPAM_FOLDER, "spam")
# Keep only entries whose names are longer than 20 characters, in sorted order.
# NOTE(review): the trailing [1:] additionally discards the first entry that
# passed the length filter. If that entry is a genuine message, one email per
# class is silently dropped -- confirm against the archive contents.
ham_filenames = [filename for filename in sorted(os.listdir(ACTUAL_HAM_DIR)) if len(filename) > 20][1:]
spam_filenames = [filename for filename in sorted(os.listdir(ACTUAL_SPAM_DIR)) if len(filename) > 20][1:]

In [29]:
len(ham_filenames)

2550

In [30]:
len(spam_filenames)

500

In [31]:
import email
# `email.parser` must be imported explicitly: neither `import email` nor
# `import email.policy` is guaranteed to load it, so without this line
# `email.parser.BytesParser` only resolves if some other module happened to
# import the submodule first.
import email.parser
import email.policy

def load_email(data_directory, data_file):
    """Parse one raw email file into a message object.

    Args:
        data_directory: Directory containing the message file.
        data_file: File name of the message inside ``data_directory``.

    Returns:
        The message parsed by ``email.parser.BytesParser`` using
        ``email.policy.default``.
    """
    # Read as bytes and let the parser handle decoding.
    with open(os.path.join(data_directory, data_file), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [32]:
# Parse every listed file, reading from the same directories the file names
# were listed from rather than repeating the paths as string literals.
ham_emails = [load_email(ACTUAL_HAM_DIR, filename) for filename in ham_filenames]
spam_emails = [load_email(ACTUAL_SPAM_DIR, filename) for filename in spam_filenames]

In [33]:
print(ham_emails[1].get_content().strip())

Man Threatens Explosion In Moscow 

Thursday August 22, 2002 1:40 PM
MOSCOW (AP) - Security officers on Thursday seized an unidentified man who
said he was armed with explosives and threatened to blow up his truck in
front of Russia's Federal Security Services headquarters in Moscow, NTV
television reported.
The officers seized an automatic rifle the man was carrying, then the man
got out of the truck and was taken into custody, NTV said. No other details
were immediately available.
The man had demanded talks with high government officials, the Interfax and
ITAR-Tass news agencies said. Ekho Moskvy radio reported that he wanted to
talk with Russian President Vladimir Putin.
Police and security forces rushed to the Security Service building, within
blocks of the Kremlin, Red Square and the Bolshoi Ballet, and surrounded the
man, who claimed to have one and a half tons of explosives, the news
agencies said. Negotiations continued for about one and a half hours outside
the building, ITAR-

In [34]:
print(spam_emails[6].get_content().strip())

Help wanted.  We are a 14 year old fortune 500 company, that is
growing at a tremendous rate.  We are looking for individuals who
want to work from home.

This is an opportunity to make an excellent income.  No experience
is required.  We will train you.

So if you are looking to be employed from home with a career that has
vast opportunities, then go:

http://www.basetel.com/wealthnow

We are looking for energetic and self motivated people.  If that is you
than click on the link and fill out the form, and one of our
employement specialist will contact you.

To be removed from our link simple go to:

http://www.basetel.com/remove.html


4139vOLW7-758DoDY1425FRhM1-764SMFc8513fCsLl40


In [35]:
def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A plain string is returned unchanged. A message whose payload is a list
    of parts is rendered as ``multipart(<part>, <part>, ...)``, recursing
    into each part. Any other message is described by its content type.
    """
    if isinstance(email, str):
        return email
    parts = email.get_payload()
    if not isinstance(parts, list):
        # Payload is not a list of sub-parts: report the declared content type.
        return email.get_content_type()
    described_parts = ", ".join(map(get_email_structure, parts))
    return "multipart(" + described_parts + ")"

In [36]:
from collections import Counter

def structures_counter(emails):
    """Count how many of the given messages share each structure string.

    Returns a ``Counter`` keyed by the string that ``get_email_structure``
    produces for each message.
    """
    return Counter(get_email_structure(message) for message in emails)

In [37]:
structures_counter(ham_emails).most_common()

[('text/plain', 2452),
 ('multipart(text/plain, application/pgp-signature)', 72),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [38]:
structures_counter(spam_emails).most_common()

[('text/plain', 221),
 ('text/html', 181),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 19),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [42]:
for header, value in spam_emails[0].items():
    print(header,":",value)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.example.com
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.example.com (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@example.com>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : quoted-printable


In [43]:
spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

In [48]:
from sklearn.model_selection import train_test_split

# Build a 1-D object array with one parsed message per element. Filling a
# pre-allocated array element by element stops NumPy from treating each
# message as a nested sequence, which `np.array(list_of_messages)` does and
# which fails on recent NumPy versions when the messages are ragged.
all_emails = ham_emails + spam_emails
X = np.empty(len(all_emails), dtype=object)
for index, message in enumerate(all_emails):
    X[index] = message
# Labels: 0 for ham, 1 for spam, in the same order as X.
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

In [49]:
import re
from html import unescape

def html_to_plain_text(html):
    """Convert an HTML document to rough plain text using regular expressions.

    Steps, in order: remove each ``<head>...</head>`` section, replace every
    opening ``<a ...>`` tag with the word ``HYPERLINK``, strip all remaining
    tags, collapse runs of blank lines into a single newline, and finally
    decode HTML entities.

    Args:
        html: The HTML source as a string.

    Returns:
        The extracted text as a string.
    """
    # Non-greedy body match: with a greedy `.*` a document containing more than
    # one `</head>` would lose everything up to the last one, body text included.
    text = re.sub(r'<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    # Raw strings keep `\s` a regex escape rather than an invalid string escape.
    text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [None]:
# From the training-set spam (label 1), keep the messages whose structure is
# exactly "text/html", then pick one of them as a worked example.
html_spam_emails = [
    message
    for message in X_train[y_train == 1]
    if get_email_structure(message) == "text/html"
]
sample_html_spam = html_spam_emails[7]