In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import pickle
import joblib
import copy
import os
from collections import Counter
import urllib.request
import tarfile
import email.parser
import email.policy
from bs4 import BeautifulSoup
from html import unescape
from sklearn import preprocessing, model_selection, base, metrics, linear_model, pipeline, ensemble, svm, multiclass, neighbors
from scipy.sparse import csr_matrix

In [3]:
DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join('data')


def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=spam_path)
        tar_bz2_file.close()


fetch_spam_data()

In [4]:
ham_files = [file for file in os.listdir(os.path.join(SPAM_PATH, 'easy_ham'))]
spam_files = [file for file in os.listdir(os.path.join(SPAM_PATH, 'spam'))]

In [5]:
len(ham_files), len(spam_files)

(2501, 501)

In [6]:
def load_email(filename, is_spam=False):
    dir = os.path.join(SPAM_PATH, 'easy_ham') if not is_spam else os.path.join(SPAM_PATH, 'spam')
    with open(os.path.join(dir, filename), 'rb') as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [7]:
ham_emails = [load_email(file, is_spam=False) for file in ham_files]
spam_emails = [load_email(file, is_spam=True) for file in spam_files]

In [8]:
len(spam_emails), len(ham_emails)

(501, 2501)

In [9]:
print(ham_emails[0].get_content().strip())

Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 

In [10]:
type(ham_emails[0])

email.message.EmailMessage

In [11]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [12]:
structures_counter(ham_emails)

Counter({'text/plain': 2409,
         'multipart(text/plain, application/pgp-signature)': 66,
         'multipart(text/plain, text/html)': 8,
         'multipart(text/plain, text/plain)': 4,
         'multipart(text/plain)': 3,
         'multipart(text/plain, application/octet-stream)': 2,
         'multipart(text/plain, text/enriched)': 1,
         'multipart(text/plain, application/ms-tnef, text/plain)': 1,
         'multipart(multipart(text/plain, text/plain, text/plain), application/pgp-signature)': 1,
         'multipart(text/plain, video/mng)': 1,
         'multipart(text/plain, multipart(text/plain))': 1,
         'multipart(text/plain, application/x-pkcs7-signature)': 1,
         'multipart(text/plain, multipart(text/plain, text/plain), text/rfc822-headers)': 1,
         'multipart(text/plain, multipart(text/plain, text/plain), multipart(multipart(text/plain, application/x-pkcs7-signature)))': 1,
         'multipart(text/plain, application/x-java-applet)': 1})

In [13]:
structures_counter(spam_emails)

Counter({'text/plain': 219,
         'text/html': 183,
         'multipart(text/plain, text/html)': 45,
         'multipart(text/html)': 20,
         'multipart(text/plain)': 19,
         'multipart(multipart(text/html))': 5,
         'multipart(text/plain, image/jpeg)': 3,
         'multipart(text/html, application/octet-stream)': 2,
         'multipart(text/plain, application/octet-stream)': 1,
         'multipart(text/html, text/plain)': 1,
         'multipart(multipart(text/html), application/octet-stream, image/jpeg)': 1,
         'multipart(multipart(text/plain, text/html), image/gif)': 1,
         'multipart/alternative': 1})

In [34]:
# creating dataset
X = np.array(ham_emails+spam_emails, dtype=object)
y = np.array([0 for i in range(len(ham_emails))]+[1 for i in range(len(spam_emails))])

In [15]:
# shuffling the set
np.random.seed(42)
full = np.c_[X, y]
np.random.shuffle(full)

In [16]:
X, y = full[:, 0], full[:, 1]

In [18]:
joblib.dump(X, 'data/X.pkl')
joblib.dump(y, 'data/y.pkl')

['data/y.pkl']

In [19]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y ,test_size=0.2, random_state=42)

In [None]:
ord = preprocessing.OrdinalEncoder()
y_train = ord.fit_transform(y_train.reshape(-1,1))
y_test = ord.fit_transform(y_test.reshape(-1,1))

In [20]:
html_type_emails = [em for em in X_train if em.get_content_type()=='text/html']
html_type_emails[0].get_content()

'<HTML><HEAD>\n<META http-equiv=Content-Type content="text/html; charset=iso-8859-1">\n</HEAD><BODY><CENTER>\n<A href="http://reallymarket.com/user0201/index.asp?Afft=QM99"> \n<IMG src="http://61.129.68.17/mortad2.gif" border=0></A>\n<BR><BR><FONT face=Arial,Helvetica color=#000000 size=1>\nCopyright 2002 - All rights reserved<BR><BR>If you would no longer like us \nto contact you or feel that you have<BR>received this email in error, \nplease <A href="http://reallymarket.com/light/watch.asp">click here \nto unsubscribe</A>.</FONT></CENTER></BODY></HTML>\n\n\n\n\n'

In [21]:
soup = BeautifulSoup(html_type_emails[0].get_content(), 'html.parser')
print(soup.prettify())

<html>
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
 </head>
 <body>
  <center>
   <a href="http://reallymarket.com/user0201/index.asp?Afft=QM99">
    <img border="0" src="http://61.129.68.17/mortad2.gif"/>
   </a>
   <br/>
   <br/>
   <font color="#000000" face="Arial,Helvetica" size="1">
    Copyright 2002 - All rights reserved
    <br/>
    <br/>
    If you would no longer like us 
to contact you or feel that you have
    <br/>
    received this email in error, 
please
    <a href="http://reallymarket.com/light/watch.asp">
     click here 
to unsubscribe
    </a>
    .
   </font>
  </center>
 </body>
</html>


In [22]:
soup.find_all('a')[0]['href']

'http://reallymarket.com/user0201/index.asp?Afft=QM99'

In [23]:
def get_text(e_mail):
    for part in e_mail.walk():
        ctype = part.get_content_type()
        if not ctype in ['text/plain', 'text/html']:
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == 'text/plain':
            return content
        else:
            soup = BeautifulSoup(content, 'html.parser')
            content = '\n'.join([i for i in filter(bool, soup.get_text().split('\n'))])
        return content

In [28]:
class WordExtractor(base.BaseEstimator, base.TransformerMixin):
    def __init__(self):
        pass
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = get_text(email) or ''
            word_counter = Counter(text.split())
            X_transformed.append(word_counter)
        return np.array(X_transformed)

In [51]:
class WordToVec(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, vocab_size=1000):
        self.vocab_size = vocab_size
        self.vocab = None
    
    def fit(self, X, y=None):
        total_counter = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_counter[word] += min(count, 10)
        most_freq = total_counter.most_common()[:self.vocab_size]
        self.vocab = {word: index+1 for index, (word, count) in enumerate(most_freq)}
        return self
    
    def transform(self, X, y=None):
        rows=[]
        cols=[]
        data=[]
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                cols.append(self.vocab.get(word, 0))
                rows.append(row)
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocab_size+1))

In [56]:
transf_pipe = pipeline.Pipeline([('wordex', WordExtractor()),
                                 ('wordtovec', WordToVec(1000))])

X_train_prep = transf_pipe.fit_transform(X_train)

In [58]:
X_train_prep.toarray()

array([[ 76,   8,   5, ...,   0,   0,   0],
       [ 17,   1,   1, ...,   0,   0,   0],
       [ 28,   5,   3, ...,   0,   0,   0],
       ...,
       [243,  19,   2, ...,   0,   0,   0],
       [ 25,   6,   0, ...,   0,   0,   0],
       [169,  11,  10, ...,   0,   0,   0]], dtype=int32)

In [63]:
transf_pipe.named_steps['wordtovec'].vocab

{'the': 1,
 'to': 2,
 'a': 3,
 'and': 4,
 'of': 5,
 '>': 6,
 'is': 7,
 'in': 8,
 'I': 9,
 'for': 10,
 'that': 11,
 'you': 12,
 'it': 13,
 'on': 14,
 'with': 15,
 'this': 16,
 'be': 17,
 'have': 18,
 'are': 19,
 'from': 20,
 'or': 21,
 'as': 22,
 'not': 23,
 'your': 24,
 'The': 25,
 'at': 26,
 '-': 27,
 'an': 28,
 'but': 29,
 'can': 30,
 'by': 31,
 'was': 32,
 'my': 33,
 'will': 34,
 'all': 35,
 '--': 36,
 'has': 37,
 'if': 38,
 'about': 39,
 'they': 40,
 'more': 41,
 'This': 42,
 'do': 43,
 'just': 44,
 'get': 45,
 'out': 46,
 'would': 47,
 'we': 48,
 'one': 49,
 'so': 50,
 'use': 51,
 'like': 52,
 'which': 53,
 'who': 54,
 'list': 55,
 'only': 56,
 'their': 57,
 'our': 58,
 'up': 59,
 'some': 60,
 'If': 61,
 'any': 62,
 'been': 63,
 'what': 64,
 'email': 65,
 'than': 66,
 'You': 67,
 'no': 68,
 'people': 69,
 'other': 70,
 'mailing': 71,
 "don't": 72,
 'there': 73,
 'when': 74,
 '2002': 75,
 'new': 76,
 'me': 77,
 'wrote:': 78,
 'It': 79,
 'Date:': 80,
 "I'm": 81,
 'make': 82,
 'into'

In [85]:
log_clf = linear_model.LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42)
score = model_selection.cross_val_score(log_clf, X_train_prep, y_train.ravel(), cv=3, verbose=3, scoring='precision')
score.mean()

[CV] END ................................ score: (test=0.985) total time=   1.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV] END ................................ score: (test=0.979) total time=   1.3s
[CV] END ................................ score: (test=0.983) total time=   0.8s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9820895755305868

In [86]:
log_clf.fit(X_train_prep, y_train.ravel())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [88]:
X_test_prep = transf_pipe.transform(X_test)

y_pred = log_clf.predict(X_test_prep)

print("Precision: {:.2f}%".format(100 * metrics.precision_score(y_test.ravel(), y_pred.ravel())))
print("Recall: {:.2f}%".format(100 * metrics.recall_score(y_test.ravel(), y_pred.ravel())))

Precision: 50.00%
Recall: 36.56%


-_- will get back to this later 