Spam Detector Using Machine Learning

In [31]:
import tarfile
from urllib import request
from pathlib import Path


def fetch_spam_data():
    """Download and unpack the SpamAssassin ham/spam archives if needed.

    Each archive is fetched and extracted under ``datasets/spam`` (relative
    to the current working directory) only when its target directory does
    not already exist, so repeated calls do not download again.

    Returns:
        list[Path]: the ``easy_ham`` and ``spam`` directories, in that order.
    """
    spam_root = "http://spamassassin.apache.org/old/publiccorpus/"
    ham_url = spam_root + "20030228_easy_ham.tar.bz2"
    spam_url = spam_root + "20030228_spam.tar.bz2"

    spam_path = Path() / "datasets" / "spam"
    spam_path.mkdir(parents=True, exist_ok=True)
    for dir_name, tar_name, url in (
        ("easy_ham", "ham", ham_url),
        ("spam", "spam", spam_url),
    ):
        if not (spam_path / dir_name).is_dir():
            path = (spam_path / tar_name).with_suffix(".tar.bz2")
            print("Downloading", path)
            request.urlretrieve(url, path)
            # The context manager closes the archive even if extraction fails.
            with tarfile.open(path) as tar_bz2_file:
                tar_bz2_file.extractall(path=spam_path)
    return [spam_path / dir_name for dir_name in ("easy_ham", "spam")]

In [32]:
# fetch_spam_data() returns the ham directory first, then the spam directory.
ham_dir, spam_dir = fetch_spam_data()

In [33]:
# Keep only entries whose file name is longer than 20 characters, in sorted
# order (presumably this skips non-message helper files -- TODO confirm).
ham_filename = sorted(f for f in ham_dir.iterdir() if len(f.name) > 20)
spam_filename = sorted(f for f in spam_dir.iterdir() if len(f.name) > 20)

In [34]:
# Number of ham files that passed the name-length filter.
len(ham_filename)

2500

In [35]:
# Number of spam files that passed the name-length filter.
len(spam_filename)

500

In [36]:
import email
import email.policy


def load_email(filepath):
    """Parse the file at ``filepath`` into an email message object.

    The file is read in binary mode and parsed with
    ``email.policy.default``.

    Args:
        filepath: path of the raw email file (anything ``open`` accepts).

    Returns:
        The parsed message produced by ``BytesParser.parse``.
    """
    # Import the submodule explicitly: ``import email`` / ``import
    # email.policy`` do not guarantee that ``email.parser`` is loaded.
    from email.parser import BytesParser

    with open(filepath, "rb") as file:
        return BytesParser(policy=email.policy.default).parse(file)

In [37]:
# Parse every selected file into a message object, keeping the file order.
ham_email = list(map(load_email, ham_filename))
spam_email = list(map(load_email, spam_filename))

In [38]:
# Show the body of one ham message, with surrounding whitespace removed.
print(ham_email[1].get_content().strip())

Martin A posted:
Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the
 limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the
 Mount Athos monastic community, was ideal for the patriotic sculpture. 
 
 As well as Alexander's granite features, 240 ft high and 170 ft wide, a
 museum, a restored amphitheatre and car park for admiring crowds are
planned
---------------------
So is this mountain limestone or granite?
If it's limestone, it'll weather pretty fast.

------------------------ Yahoo! Groups Sponsor ---------------------~-->
4 DVDs Free +s&p Join Now
http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
---------------------------------------------------------------------~->

To unsubscribe from this group, send an email to:
forteana-unsubscribe@egroups.com

 

Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/


In [39]:
# Show the body of one spam message, with surrounding whitespace removed.
print(spam_email[63].get_content().strip())

Market Internet Access 

No Investment Needed 

Premium Internet Access for only $14.95 per month or less! 

Earn $1 per Subscriber per month

Go To:

http://new.isp.50megs.com/
3442BvLB9-565fAFx0200Lbck9-698onqh7l33


In [40]:
def get_email_structure(email):
    """Describe the MIME layout of a message as a string.

    A plain ``str`` argument is returned unchanged. A message whose payload
    is a list of sub-messages is rendered as ``multipart(<part>, <part>, ...)``
    by recursing into each part; any other message is described by its
    content type.
    """
    if isinstance(email, str):
        return email
    parts = email.get_payload()
    if not isinstance(parts, list):
        return email.get_content_type()
    inner = ", ".join(map(get_email_structure, parts))
    return f"multipart({inner})"

In [41]:
from collections import Counter


def structure_counter(emails):
    """Count how often each structure string occurs among ``emails``.

    Returns a ``Counter`` keyed by the value of ``get_email_structure`` for
    each message.
    """
    return Counter(get_email_structure(message) for message in emails)

In [42]:
# Structures found in ham messages, most frequent first.
structure_counter(ham_email).most_common()

[('text/plain', 2408),
 ('multipart(text/plain, application/pgp-signature)', 66),
 ('multipart(text/plain, text/html)', 8),
 ('multipart(text/plain, text/plain)', 4),
 ('multipart(text/plain)', 3),
 ('multipart(text/plain, application/octet-stream)', 2),
 ('multipart(text/plain, text/enriched)', 1),
 ('multipart(text/plain, application/ms-tnef, text/plain)', 1),
 ('multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)',
  1),
 ('multipart(text/plain, video/mng)', 1),
 ('multipart(text/plain, multipart(text/plain))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)',
  1),
 ('multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))',
  1),
 ('multipart(text/plain, application/x-java-applet)', 1)]

In [43]:
# Structures found in spam messages, most frequent first.
structure_counter(spam_email).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [44]:
# Print every header name/value pair of one spam message.
for header, value in spam_email[120].items():
    print(header, ":", value)

Return-Path : <cmolano@hotmail.com>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 5F24F43F99	for <zzzz@localhost>; Tue, 27 Aug 2002 08:12:45 -0400 (EDT)
Received : from phobos [127.0.0.1]	by localhost with IMAP (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Tue, 27 Aug 2002 13:12:45 +0100 (IST)
Received : from jim.hradac.com (hradac.com [66.136.141.249]) by    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7RC2cZ06577 for    <webmaster@efi.ie>; Tue, 27 Aug 2002 13:03:51 +0100
Received : from masqueradingv.bb ([200.251.234.66]) by jim.hradac.com with    Microsoft SMTPSVC(5.0.2195.4905); Sun, 25 Aug 2002 12:46:46 -0500
Message-Id : <000062bb38f3$000039c3$00007cde@mammothweather.com>
To : wjicwpw@jbyv.fi, petter.sundblad@mbox300.swipnet.se, sherriej@toile.qc.ca, rocky@midwest.com.cn
Cc : joey@xenios.qldnet.com.au, tommylee@chello.at, webmaster@efi.ie, f5mag@hol.f

In [45]:
# Look up a single header (the subject line) of the same spam message.
spam_email[120]["Subject"]

"DON'T LET A COMPUTER VIRUS RUIN YOUR DAY!          12879"

In [46]:
import numpy as np
from sklearn.model_selection import train_test_split

# Combine ham and spam messages into one object array; ham is labelled 0
# and spam is labelled 1, in the same order as the messages.
X = np.array(ham_email + spam_email, dtype=object)
y = np.array([0] * len(ham_email) + [1] * len(spam_email))

# Hold out 20% of the messages for testing; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [47]:
import re
from html import unescape


def html_to_plain_text(html):
    """Convert an HTML string to rough plain text using regular expressions.

    Steps, in order:
      1. drop the whole ``<head>...</head>`` section,
      2. replace every opening ``<a ...>`` tag with the word ``HYPERLINK``,
      3. remove all remaining tags,
      4. collapse runs of blank or whitespace-only lines into one newline,
      5. unescape HTML entities such as ``&amp;``.

    Args:
        html: the HTML source as a ``str``.

    Returns:
        The converted text as a ``str``.
    """
    # Raw strings keep regex escapes such as ``\s`` from being read as
    # (invalid) Python string escapes.
    text = re.sub(r"<head.*?>.*?</head>", "", html, flags=re.M | re.S | re.I)
    text = re.sub(r"<a\s.*?>", "HYPERLINK ", text, flags=re.M | re.S | re.I)
    text = re.sub(r"<.*?>", "", text, flags=re.M | re.S)
    text = re.sub(r"(\s*\n)+", "\n", text, flags=re.M | re.S)
    return unescape(text)

In [48]:
# Training spam messages whose structure is a single text/html part.
html_spam_emails = [
    email
    for email in X_train[y_train == 1]
    if get_email_structure(email) == "text/html"
]

# The sample is taken at index 1, so at least two matching emails are
# required; checking only for a non-empty list would raise IndexError when
# exactly one HTML spam email exists.
if len(html_spam_emails) > 1:
    sample_html_spam = html_spam_emails[1]
    print(sample_html_spam.get_content().strip()[:1000], "...")
else:
    print("No HTML spam emails found.")

<html>
<head>

</head>
<body bgcolor=#FFFFFF text=#000000 >
<table width=100% height=100% border=0 cellspacing=0 cellpadding=0 >
  <tr>
    <td valign=middle > 
      <div align=center> 
        <table width=550 border=0 cellspacing=0 cellpadding=0>
          <tr> 
            <td width=22 height=50 bgcolor=D5D5D5>&nbsp;</td>
            <td width=213 height=50 bgcolor=D5D5D5>&nbsp;<font size="2" face="Verdana, Arial, Helvetica, sans-serif"><strong>&nbsp;&nbsp;The 
              Best Mortage  Rates</strong></font></td>
            <td colspan=2 height=50 bgcolor=979797>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<font face=Verdana, Arial, Helvetica, sans-serif color=#FFFFFF><b>Simple, 
              Easy and FREE</b></font></td>
            <td width=22 height=50 bgcolor=D5D5D5>&nbsp; </td>
          </tr>
          <tr> 
            <td width=22 height=25 bgcolor=D5D5D5>&nbsp;</td>
            <td width=213 height=25 bgcolor=979797>&nbsp;</td>
            <td colspan=2 height=25 bgcolor=A6A6A6>&nbs

In [49]:
# Same sample after HTML-to-text conversion (first 1000 characters).
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")


             
               The
              Best Mortage  Rates
                 Simple,
              Easy and FREE
             
             
             
             
             
             
                        Have
                          HUNDREDS of lenders compete for your loan!
                        Refinancing
                        New Home Loans
                          Debt Consolidation
Second Mortgage
                        Home Equity
                              HYPERLINK Click Here To
                            JUMP-START
                            your Plans for
                            the Future!!!
                      Dear
                        Homeowner,
                        Interest
                          Rates are at their lowest point in 40 years! We help
                          you find the best rate for your situation by matching
                          your needs with hundreds of lenders!
                            HY

In [50]:
def email_to_text(email):
    """Extract the text of a message, preferring plain text over HTML.

    The message parts are walked in order. The content of the first
    ``text/plain`` part is returned as soon as it is found. If there is no
    plain-text part, the last ``text/html`` part seen is converted with
    ``html_to_plain_text`` and returned.

    Args:
        email: a message object providing ``walk()``.

    Returns:
        The extracted text, or ``None`` when the message has no non-empty
        ``text/plain`` or ``text/html`` content to return.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Best-effort fallback to the raw payload when the content
            # cannot be retrieved. ``Exception`` (not a bare ``except``) so
            # KeyboardInterrupt and SystemExit still propagate.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None

In [51]:
# First 100 characters of the text extracted from the sample HTML spam.
print(email_to_text(sample_html_spam)[:100], "...")


             
               The
              Best Mortage  Rates
                 Simple,
        ...


In [52]:
import nltk

# Module-level stemmer; EmailToWordCounterTransformer.transform reads it.
stemmer = nltk.PorterStemmer()
# Print the stem produced for a handful of sample words.
for word in (
    "Computations",
    "Computational",
    "Computing",
    "Computed",
    "Compute",
    "Complusive",
):
    print(f"{word} -> {stemmer.stem(word)}")

Computations -> comput
Computational -> comput
Computing -> comput
Computed -> comput
Compute -> comput
Complusive -> complus


In [53]:
import urlextract

# Module-level URL extractor; EmailToWordCounterTransformer.transform reads it.
url_extract = urlextract.URLExtract()
# Quick check on a sentence that contains one URL.
text = "Detecting written text and https://youtube.com/saintmsginsan"
print(url_extract.find_urls(text))

['https://youtube.com/saintmsginsan']


In [56]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn each email into a ``Counter`` of its (optionally normalised) words.

    Parameters
    ----------
    strip_header : bool, default=True
        Stored on the instance; ``transform`` does not read it.
    lower_case : bool, default=True
        Lower-case the text before any other processing.
    remove_punctuation : bool, default=True
        Replace every run of non-word characters with a single space.
    replace_urls : bool, default=True
        Replace each URL found by the module-level ``url_extract`` with the
        token ``URL``.
    replace_numbers : bool, default=True
        Replace numbers with the token ``NUMBER``.
    stemming : bool, default=True
        Merge the counts of words that share a stem, using the module-level
        ``stemmer``.

    The text of each email is obtained through ``email_to_text``.
    """

    def __init__(self, strip_header=True, lower_case=True,
                 remove_punctuation=True, replace_urls=True,
                 replace_numbers=True, stemming=True):
        self.strip_header = strip_header
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return an array holding one word ``Counter`` per email in ``X``."""
        counters = []
        for message in X:
            # Messages without extractable text are treated as empty.
            body = email_to_text(message) or ""
            if self.lower_case:
                body = body.lower()
            if self.replace_urls and url_extract is not None:
                # Longest URLs are replaced first.
                found_urls = sorted(
                    set(url_extract.find_urls(body)), key=len, reverse=True
                )
                for found in found_urls:
                    body = body.replace(found, "URL")
            if self.replace_numbers:
                body = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', body)
            if self.remove_punctuation:
                body = re.sub(r'\W+', ' ', body, flags=re.M)
            counts = Counter(body.split())
            if self.stemming and stemmer is not None:
                stemmed = Counter()
                for token, occurrences in counts.items():
                    stemmed[stemmer.stem(token)] += occurrences
                counts = stemmed
            counters.append(counts)
        return np.array(counters)

In [58]:
# Try the word-counting transformer on the first three training emails.
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'chuck': 1, 'murcko': 1, 'wrote': 1, 'stuff': 1, 'yawn': 1, 'r': 1}),
       Counter({'the': 11, 'of': 9, 'and': 8, 'all': 3, 'christian': 3, 'to': 3, 'by': 3, 'jefferson': 2, 'i': 2, 'have': 2, 'superstit': 2, 'one': 2, 'on': 2, 'been': 2, 'ha': 2, 'half': 2, 'rogueri': 2, 'teach': 2, 'jesu': 2, 'some': 1, 'interest': 1, 'quot': 1, 'url': 1, 'thoma': 1, 'examin': 1, 'known': 1, 'word': 1, 'do': 1, 'not': 1, 'find': 1, 'in': 1, 'our': 1, 'particular': 1, 'redeem': 1, 'featur': 1, 'they': 1, 'are': 1, 'alik': 1, 'found': 1, 'fabl': 1, 'mytholog': 1, 'million': 1, 'innoc': 1, 'men': 1, 'women': 1, 'children': 1, 'sinc': 1, 'introduct': 1, 'burnt': 1, 'tortur': 1, 'fine': 1, 'imprison': 1, 'what': 1, 'effect': 1, 'thi': 1, 'coercion': 1, 'make': 1, 'world': 1, 'fool': 1, 'other': 1, 'hypocrit': 1, 'support': 1, 'error': 1, 'over': 1, 'earth': 1, 'six': 1, 'histor': 1, 'american': 1, 'john': 1, 'e': 1, 'remsburg': 1, 'letter': 1, 'william': 1, 'short': 1, 'again': 1, 'becom

In [60]:
from scipy.sparse import csr_matrix


class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Convert word ``Counter`` objects into a sparse count matrix.

    Parameters
    ----------
    vocabulary_size : int, default=1000
        Maximum number of words kept in the vocabulary.

    Attributes
    ----------
    vocabulary_ : dict
        Set by ``fit``; maps each kept word to a column index starting at 1.
        Column 0 is used for every word that is not in the vocabulary.
    """

    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size

    def fit(self, X, y=None):
        """Build the vocabulary from the most common words across ``X``."""
        capped_totals = Counter()
        for counts in X:
            # A word contributes at most 10 per document to the ranking.
            capped_totals.update(
                {token: min(n, 10) for token, n in counts.items()}
            )
        top_words = capped_totals.most_common()[: self.vocabulary_size]
        self.vocabulary_ = {
            token: position
            for position, (token, _) in enumerate(top_words, start=1)
        }
        return self

    def transform(self, X, y=None):
        """Return a CSR matrix with ``len(X)`` rows and ``vocabulary_size + 1`` columns."""
        row_idx, col_idx, values = [], [], []
        for i, counts in enumerate(X):
            for token, n in counts.items():
                row_idx.append(i)
                # Unknown words all land in column 0.
                col_idx.append(self.vocabulary_.get(token, 0))
                values.append(n)
        return csr_matrix(
            (values, (row_idx, col_idx)),
            shape=(len(X), self.vocabulary_size + 1),
        )
                

In [61]:
# Vectorize the three sample word counters with a 10-word vocabulary.
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 20 stored elements and shape (3, 11)>