Spam Detector Using Machine Learning

In [6]:
import tarfile
from urllib import request
from pathlib import Path

def fetch_spam_data():
    """Download and unpack the SpamAssassin ham and spam archives if needed.

    Archives are saved to and extracted under ``datasets/spam`` relative to
    the current working directory. A corpus is skipped when its target
    directory (``easy_ham`` or ``spam``) already exists, so repeated calls
    do not download again.

    Returns:
        list[Path]: ``[datasets/spam/easy_ham, datasets/spam/spam]``.

    Raises:
        Whatever ``urllib.request.urlretrieve`` or ``tarfile`` raise on a
        failed download or a bad archive; nothing is caught here.
    """
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / "datasets" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (("easy_ham", "ham", ham_url),
                                    ("spam", "spam", spam_url)):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            request.urlretrieve(url, path)
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(path) as tar_bz2_file:
                # The archive comes from the network: use the safe "data"
                # extraction filter when this Python version provides it.
                if hasattr(tarfile, "data_filter"):
                    tar_bz2_file.extractall(path=spam_path, filter="data")
                else:
                    tar_bz2_file.extractall(path=spam_path)
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [7]:
ham_dir, spam_dir = fetch_spam_data()

Downloading datasets\spam\ham.tar.bz2
Downloading datasets\spam\spam.tar.bz2


In [8]:
# Keep only entries whose file name is longer than 20 characters, in sorted order.
ham_filename = sorted(entry for entry in ham_dir.iterdir() if len(entry.name) > 20)
spam_filename = sorted(entry for entry in spam_dir.iterdir() if len(entry.name) > 20)

In [9]:
# Number of ham files kept by the name-length filter.
len(ham_filename)

2500

In [10]:
# Number of spam files kept by the name-length filter.
len(spam_filename)

500

In [12]:
import email
import email.policy

def load_email(filepath):
    """Parse one e-mail file into a message object.

    Args:
        filepath: Path of the raw message file; it is opened in binary mode.

    Returns:
        The message parsed by ``BytesParser`` with ``email.policy.default``.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Import the parser explicitly: ``import email`` and ``import email.policy``
    # do not guarantee that the ``email.parser`` submodule is loaded.
    from email.parser import BytesParser

    with open(filepath, 'rb') as file:
        return BytesParser(policy=email.policy.default).parse(file)

In [13]:
# Parse every listed file into a message object, preserving list order.
ham_email = list(map(load_email, ham_filename))
spam_email = list(map(load_email, spam_filename))

In [14]:
# Show the content of the second ham message, with surrounding whitespace removed.
print(ham_email[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [36]:
# Show the content of the spam message at index 63, with surrounding whitespace removed.
print(spam_email[63].get_content().strip())

Market Internet Access 

No Investment Needed 

Premium Internet Access for only $14.95 per month or less! 

Earn $1 per Subscriber per month

Go To:

http://new.isp.50megs.com/
3442BvLB9-565fAFx0200Lbck9-698onqh7l33


In [18]:
def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    Args:
        email: A message object, or a plain string (returned unchanged).

    Returns:
        str: The content type when the payload is not a list, otherwise
        ``"multipart(...)"`` wrapping the comma-separated structures of the
        sub-parts, computed recursively.
    """
    # Strings are passed straight through.
    if isinstance(email, str):
        return email

    parts = email.get_payload()
    # A payload that is not a list has no sub-parts: report the content type.
    if not isinstance(parts, list):
        return email.get_content_type()

    described_parts = [get_email_structure(part) for part in parts]
    return "multipart(" + ", ".join(described_parts) + ")"

In [25]:
from collections import Counter

def structure_counter(emails):
    """Count how many messages share each structure string.

    Args:
        emails: Iterable of messages accepted by ``get_email_structure``.

    Returns:
        collections.Counter: Maps each structure string to its number of
        occurrences in ``emails``.
    """
    return Counter(get_email_structure(message) for message in emails)

In [26]:
# Structures found in the ham messages, most frequent first.
structure_counter(ham_email).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [27]:
# Structures found in the spam messages, most frequent first.
structure_counter(spam_email).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [32]:
# Print every header name and value of the spam message at index 120.
for header, value in spam_email[120].items():
    print(header, ':', value)

Return-Path : <cmolano@hotmail.com>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 5F24F43F99	for <zzzz@localhost>; Tue, 27 Aug 2002 08:12:45 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Tue, 27 Aug 2002 13:12:45 +0100 (IST)
Received : from jim.hradac.com (hradac.com [66.136.141.249]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7RC2cZ06577 for    <webmaster@efi.ie>; Tue, 27 Aug 2002 13:03:51 +0100
Received : from masqueradingv.bb ([200.251.234.66]) by jim.hradac.com with    Microsoft SMTPSVC(5.0.2195.4905); Sun, 25 Aug 2002 12:46:46 -0500
Message-Id : <000062bb38f3$000039c3$00007cde@mammothweather.com>
To : wjicwpw@jbyv.fi, petter.sundblad@mbox300.swipnet.se, sherriej@toile.qc.ca, rocky@midwest.com.cn
Cc : joey@xenios.qldnet.com.au, tommylee@chello.at, webmaster@efi.ie, f5mag@hol.f

In [30]:
# Look up the Subject header of the same message directly.
spam_email[120]['Subject']

"DON'T LET A COMPUTER VIRUS RUIN YOUR DAY!          12879"

In [37]:
import numpy as np
from sklearn.model_selection import train_test_split

# Ham messages come first, then spam; messages are stored as Python objects.
X = np.array([*ham_email, *spam_email], dtype=object)
# Labels follow the same order: 0 for ham, 1 for spam.
y = np.array([0 for _ in ham_email] + [1 for _ in spam_email])

# Hold out 20% of the messages for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)