In [95]:
import os
import tarfile
import urllib
import urllib.request
from collections import Counter
from email import policy
from email.parser import BytesParser

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [6]:
# Root of the public corpus; both archive URLs are built from it.
BASE_URL = 'http://spamassassin.apache.org/old/publiccorpus/'
HAM_URL = '{}{}'.format(BASE_URL, '20030228_easy_ham.tar.bz2')
SPAM_URL = '{}{}'.format(BASE_URL, '20030228_spam.tar.bz2')

# Local directory used as the download/extraction target in this notebook.
BASE_DIR = 'datasets/spam'

In [4]:
HAM_URL

'http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2'

In [5]:
SPAM_URL

'http://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2'

In [46]:
def fetch_spam_data(download_url, file_dir):
    """Download a tar archive, extract it into ``file_dir`` and delete the archive.

    Parameters
    ----------
    download_url : str
        URL of the archive; its basename is used as the temporary file name.
    file_dir : str
        Directory to extract into. Created (with parents) if it is missing.

    Raises
    ------
    ValueError
        If a member of the archive would be written outside ``file_dir``.
    """
    # exist_ok avoids the check-then-create race of isdir() + makedirs().
    os.makedirs(file_dir, exist_ok=True)
    filename = os.path.basename(download_url)
    file_path = os.path.join(file_dir, filename)
    print("downloading from:", download_url)
    urllib.request.urlretrieve(download_url, file_path)
    try:
        print('extracting to:', file_dir)
        root = os.path.realpath(file_dir)
        with tarfile.open(file_path) as tar:
            # The archive comes from the network: refuse member names such as
            # '../x' or '/abs/x' that would land outside the target directory.
            # (Only member names are checked here, not link targets.)
            for member in tar.getmembers():
                target = os.path.realpath(os.path.join(root, member.name))
                if os.path.commonpath([root, target]) != root:
                    raise ValueError(
                        'unsafe path in archive: {!r}'.format(member.name))
            tar.extractall(file_dir)
    finally:
        # Remove the downloaded archive even when extraction fails, so a bad
        # download is not left behind in the data directory.
        print('deleting:', file_path)
        os.remove(file_path)
    
def load_email(filename, file_dir):
    """Parse a single email file into a message object.

    Parameters
    ----------
    filename : str
        Name of the file inside ``file_dir``.
    file_dir : str
        Directory that contains the file.

    Returns
    -------
    The message produced by ``BytesParser`` using ``policy.default``.
    """
    email_path = os.path.join(file_dir, filename)
    parser = BytesParser(policy=policy.default)
    with open(email_path, 'rb') as raw:
        message = parser.parse(raw)
    return message

In [26]:
# Download and unpack the ham archive under BASE_DIR (needs network access;
# the archive is fetched again every time this cell runs).
fetch_spam_data(HAM_URL, BASE_DIR)

downloading from: http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2
extracting to: datasets/spam
deleting: datasets/spam\20030228_easy_ham.tar.bz2


In [27]:
# Download and unpack the spam archive under BASE_DIR (needs network access;
# the archive is fetched again every time this cell runs).
fetch_spam_data(SPAM_URL, BASE_DIR)

downloading from: http://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2
extracting to: datasets/spam
deleting: datasets/spam\20030228_spam.tar.bz2


In [29]:
# Sub-folders of BASE_DIR that the file listings below read from.
spam_dir, ham_dir = (os.path.join(BASE_DIR, sub) for sub in ('spam', 'easy_ham'))
spam_dir, ham_dir

('datasets/spam\\spam', 'datasets/spam\\easy_ham')

In [47]:
# Sorted file names of each corpus folder, leaving out any entry whose name
# starts with 'cmd'.
spam_files = sorted(name for name in os.listdir(spam_dir) if not name.startswith('cmd'))
ham_files = sorted(name for name in os.listdir(ham_dir) if not name.startswith('cmd'))

In [49]:
len(spam_files), len(ham_files)

(500, 2500)

In [50]:
# Parse every spam file into a message object, in listing order.
spam_docs = [load_email(name, spam_dir) for name in spam_files]

In [53]:
# Parse every ham file into a message object, in listing order.
ham_docs = [load_email(name, ham_dir) for name in ham_files]

In [52]:
spam_docs[0].get_content()

'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n<HTML><HEAD>\n<META content="text/html; charset=windows-1252" http-equiv=Content-Type>\n<META content="MSHTML 5.00.2314.1000" name=GENERATOR></HEAD>\n<BODY><!-- Inserted by Calypso -->\n<TABLE border=0 cellPadding=0 cellSpacing=2 id=_CalyPrintHeader_ rules=none \nstyle="COLOR: black; DISPLAY: none" width="100%">\n  <TBODY>\n  <TR>\n    <TD colSpan=3>\n      <HR color=black noShade SIZE=1>\n    </TD></TR></TD></TR>\n  <TR>\n    <TD colSpan=3>\n      <HR color=black noShade SIZE=1>\n    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso --><FONT \ncolor=#000000 face=VERDANA,ARIAL,HELVETICA size=-2><BR></FONT></TD></TR></TABLE><!-- End Calypso --><FONT color=#ff0000 \nface="Copperplate Gothic Bold" size=5 PTSIZE="10">\n<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=#ff0000 \nface="Copperplate Gothic Bold" size=5 PTSIZE="10">\n<CENTER>Why Spend More Than You Have To?\n<CENTER><FONT co

In [54]:
ham_docs[0].get_content()

'    Date:        Wed, 21 Aug 2002 10:54:46 -0500\n    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>\n    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>\n\n\n  | I can\'t reproduce this error.\n\nFor me it is very repeatable... (like every time, without fail).\n\nThis is the debug log of the pick happening ...\n\n18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}\n18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury\n18:19:04 Ftoc_PickMsgs {{1 hit}}\n18:19:04 Marking 1 hits\n18:19:04 tkerror: syntax error in expression "int ...\n\nNote, if I run the pick command by hand ...\n\ndelta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury\n1 hit\n\nThat\'s where the "1 hit" comes from (obviously).  The version of nmh I\'m\nusing is ...\n\ndelta$ pick -version\npick -- nmh-1.0.4 [compiled on fuchsia.c

In [55]:
ham_docs[0].get_content_type()

'text/plain'

In [58]:
ham_docs[0].get_payload()

'    Date:        Wed, 21 Aug 2002 10:54:46 -0500\n    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>\n    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>\n\n\n  | I can\'t reproduce this error.\n\nFor me it is very repeatable... (like every time, without fail).\n\nThis is the debug log of the pick happening ...\n\n18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}\n18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury\n18:19:04 Ftoc_PickMsgs {{1 hit}}\n18:19:04 Marking 1 hits\n18:19:04 tkerror: syntax error in expression "int ...\n\nNote, if I run the pick command by hand ...\n\ndelta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury\n1 hit\n\nThat\'s where the "1 hit" comes from (obviously).  The version of nmh I\'m\nusing is ...\n\ndelta$ pick -version\npick -- nmh-1.0.4 [compiled on fuchsia.c

In [73]:
# Look at a random ham message. The bound comes from the list itself so that
# every message can be picked (a fixed 500 only ever reached the first 500).
i = np.random.randint(len(ham_docs))
ham_docs[i].get_payload()

'Mr Fork writes:\n  Jim Whitehead writes:\n> > For toddlers, pressing play must cause the music to start immediately,\n> > within half a second, for the toddler to get the causality and not press\n> the\n> > button multiple times.\n> Or some sound indicating that the music will start real soon now.\n\nA tonal countdown would be nice.\n\n> > What would the ideal toddler CD player be like? It would immediately start\n> > playing a CD after it was loaded.\n> It\'d be an MP3 player with solid state storage... instant on.\n\nHmm. Seems like every CD player should include the\ncapability to rip, encode, and cache the last few CDs \ninserted. \n\nPlayback would then never need to face seek delays... \nafter the initial ripping, the only use of the laser \npickup would be recognizing which CD is inserted --\nwhich might be doable faster than a seek-and-start-at-\nfirst-track operation.\n\nYou could also take the CD out while it is "playing".\n\nHmm. If the CD is still in the cache, maybe you d

In [74]:
{mail.get_content_type() for mail in spam_docs}

{'multipart/alternative',
 'multipart/mixed',
 'multipart/related',
 'text/html',
 'text/plain'}

In [78]:
# Spam messages whose payload is a list of sub-messages rather than a single
# string; later cells index into this list to inspect individual examples.
a = [mail for mail in spam_docs if isinstance(mail.get_payload(), list)]

In [81]:
a[0].get_payload()[0].get_payload()

'URGENT PRIVATE & EXTREMELY CONFIDENTIAL\n\n\n\nDear =2C\n\nWith profound interest and in utmost confidence=2C I am\nsoliciting your immediate assistance or co-operation\nas to enable us round up an opportunity within my\ncapability as a result of the death of one of our\ncontractor =28Beneficiary=29=2E You should not be surprised\nas to how I got your contact=2C you were highly\nrecommended to me with the believe that you are\ncompetent=2C reliable=2C Trustworthy and confident=2E\n\nI am  Dr=2E Bello Ahmed=2C Chief Auditor=2C Special Project\nand Foreign Contract Regularization and Disbursement=2C\nin the Office of the Auditor General of the Federation\nof Federal Republic of Nigeria=2E We work in hand with\nthe Senate Committee on Foreign Contract Payment=2E Our\nduty is to ensure that all contractors are paid their\ncontract sum in due time=2E\n\nThis last payment quarter=2C a total of 30 contractors\nwere short listed for payment and about 25 of them\nhave been paid remaining about

In [82]:
{type(mail) for mail in ham_docs}

{email.message.EmailMessage}

In [87]:
def get_email_structure(email):
    """Describe the nesting of an email's parts as a string.

    A plain string is returned unchanged. A message whose payload is a list is
    rendered as ``multipart(<part>, <part>, ...)``, recursing into each
    element. Any other message is described by its content type.
    """
    if isinstance(email, str):
        return email
    parts = email.get_payload()
    if not isinstance(parts, list):
        return email.get_content_type()
    described = [get_email_structure(part) for part in parts]
    return "multipart({})".format(', '.join(described))
    
def structures_counters(emails):
    """Count how many of ``emails`` share each structure string.

    Returns a ``Counter`` keyed by the result of ``get_email_structure``.
    """
    tally = Counter()
    for message in emails:
        tally[get_email_structure(message)] += 1
    return tally

In [84]:
{get_email_structure(email) for email in spam_docs}

{'multipart(multipart(text/html))',
 'multipart(multipart(text/html), application/octet-stream, image/jpeg)',
 'multipart(multipart(text/plain, text/html), image/gif)',
 'multipart(text/html)',
 'multipart(text/html, application/octet-stream)',
 'multipart(text/html, text/plain)',
 'multipart(text/plain)',
 'multipart(text/plain, application/octet-stream)',
 'multipart(text/plain, image/jpeg)',
 'multipart(text/plain, text/html)',
 'multipart/alternative',
 'text/html',
 'text/plain'}

In [91]:
structures_counters(ham_docs).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, multipart(text/plain))', 1)]

In [90]:
structures_counters(spam_docs).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart/alternative', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1)]

In [92]:
# Pick one of the list-payload spam messages; its headers are printed below.
email = a[10]
email

<email.message.EmailMessage at 0x2916330c0b8>

In [94]:
# Print every header of the selected message as "name ::: value".
for k,v in email.items():
    print("{} ::: {}".format(k, v))

Return-Path ::: <mando@insiq.us>
Delivered-To ::: zzzz@localhost.spamassassin.taint.org
Received ::: from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 1B07847CCA	for <zzzz@localhost>; Mon, 26 Aug 2002 10:42:12 -0400 (EDT)
Received ::: from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Mon, 26 Aug 2002 15:42:12 +0100 (IST)
Received ::: from mail1.insuranceiq.com (host66.insuranceiq.com    [65.217.159.66] (may be forged)) by dogma.slashnull.org (8.11.6/8.11.6)    with ESMTP id g7PNukZ31378 for <zzzz@jmason.org>; Mon, 26 Aug 2002 00:56:47    +0100
Received ::: from mail pickup service by mail1.insuranceiq.com with Microsoft    SMTPSVC; Sun, 25 Aug 2002 19:57:34 -0400
Subject ::: More cash for the business you already write - IMMEDIATELY
To ::: zzzz@spamassassin.taint.org
Date ::: Sun, 25 Aug 2002 19:57:34 -0400
From ::: IQ - M&O Marketing <mando@insiq.us>
Message-Id ::: <3c317101c24c93$2e9d9

In [96]:
# Ham first, then spam; labels are 0 for ham and 1 for spam, in the same order.
# dtype=object is given explicitly so each message is stored as one opaque
# element instead of relying on numpy's dtype/shape inference for a list of
# non-numeric objects.
X = np.asarray(ham_docs + spam_docs, dtype=object)
y = np.asarray([0]*len(ham_docs) + [1]*len(spam_docs))

X.shape, y.shape

((3000,), (3000,))

In [97]:
# Hold out 20% of the messages for testing, with a fixed seed.
# NOTE(review): no `stratify=` is passed, so the ham/spam ratio can differ
# between the two splits. Confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

In [98]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((2400,), (2400,), (600,), (600,))

In [100]:
# Class proportions in the training labels. Series.value_counts is used
# because the top-level pd.value_counts function is deprecated.
pd.Series(y_train).value_counts(normalize=True)

0    0.83125
1    0.16875
dtype: float64

In [101]:
# Class proportions in the test labels. Series.value_counts is used
# because the top-level pd.value_counts function is deprecated.
pd.Series(y_test).value_counts(normalize=True)

0    0.841667
1    0.158333
dtype: float64

In [106]:
list(a[8].walk())

[<email.message.EmailMessage at 0x2916373b240>,
 <email.message.EmailMessage at 0x29162453278>]

In [109]:
# Decoded body of the first spam message, used as sample input for the parser.
html_doc = spam_docs[0].get_content()

In [110]:
from bs4 import BeautifulSoup

In [116]:
# Parse the sample body with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(html_doc, 'html.parser')

In [144]:
def email_to_text(email):
    """Return the text of the first text/plain or text/html part of ``email``.

    Parts are visited in ``email.walk()`` order and the first one whose
    content type is ``text/plain`` or ``text/html`` wins. HTML is reduced to
    its text with BeautifulSoup. The result is stripped and its newlines are
    replaced by spaces.

    Returns
    -------
    str or None
        The extracted text, or ``None`` when the message has no text part.
    """
    for part in email.walk():
        content_type = part.get_content_type()
        if content_type not in {'text/plain', 'text/html'}:
            continue
        try:
            content = part.get_content()
        except Exception:
            # Decoding can fail (e.g. an unknown charset); fall back to the
            # raw payload. Catching Exception rather than using a bare except
            # lets KeyboardInterrupt and SystemExit propagate.
            content = str(part.get_payload())

        if content_type == 'text/html':
            content = BeautifulSoup(content, 'html.parser').get_text()
        return content.strip().replace('\n', ' ')
    # No text/plain or text/html part was found.
    return None

In [145]:
email_to_text(a[8])

'ATTN: President,  From: Mrs.Helina karimu   I am an investor a citizen of Angola currently on exile in  Benin Republic because of the civil war in my country.I  wish to invest in a country with political stability,  reliable, dependable infrastructure and security of life  and property.   I was given your contact address by a foriegner who was on  a working visit in Cotonou. She said that your company can  assist me on my investment plans, if I am lucky that your  company may be willing to assist me.It may interest you to  know that I am having $30.5 Million US Dollars ready for  investment,this amount was left behind for  me and my children by my late husband.  I am willing to invest in a company with potentials for  growth and stability including your company if your bye-  laws allows for foreign investors or any other good and  profitable business that you may suggest.I will be very  happy if this enquiry receive urgent attention.  You should mail your acceptance by sending to me y

In [146]:
email_to_text(spam_docs[0])

"Save up to 70% on Life Insurance. Why Spend More Than You Have To?  Life Quote Savings           Ensuring your        family's financial security is very important. Life Quote Savings makes        buying life insurance simple and affordable. We Provide FREE Access to The        Very Best Companies and The Lowest Rates.      Life Quote Savings is FAST, EASY and              SAVES you money! Let us help you get started with the best values in              the country on new coverage. You can SAVE hundreds or even thousands              of dollars by requesting a FREE quote from Lifequote Savings. Our              service will take you less than 5 minutes to complete. Shop and              compare. SAVE up to 70% on all types of Life insurance!     Click Here For Your              Free Quote!  Protecting your family is the best investment you'll ever            make!         If you are in receipt of this email        in error and/or wish to be removed from our list, PLEASE CLICK HERE AND

In [148]:
# nltk is treated as optional: when it cannot be imported, `stemmer` is set to
# None instead of failing the cell.
try:
    import nltk
    stemmer = nltk.stem.PorterStemmer()
    # Quick demonstration: print the stem produced for a few related words.
    for w in ['compute', 'computer', 'computation', 'computes', 'computed']:
        print("{} => {}".format(w, stemmer.stem(w)))
except ImportError:
    stemmer = None

compute => comput
computer => comput
computation => comput
computes => comput
computed => comput
