In [82]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20021010_hard_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20021010_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", HAM_URL), ("spam.tar.bz2", SPAM_URL)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=SPAM_PATH)
        tar_bz2_file.close()

In [83]:
fetch_spam_data()

In [84]:
HAM_DIR = os.path.join(SPAM_PATH, "hard_ham")
SPAM_DIR = os.path.join(SPAM_PATH, "spam")
ham_filenames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 20]
spam_filenames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 20]

In [85]:
len(ham_filenames)

250

In [86]:
len(spam_filenames)

500

In [89]:
import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    directory = "spam" if is_spam else "hard_ham"
    with open(os.path.join(spam_path, directory, filename), "rb") as f:
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

In [90]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [92]:
print(ham_emails[1].get_content().strip())

<html>
<head>
	<title>Shopper Newsletter: Alerts</title>

</head>
<body bgcolor="#EEEEEE" alink="#0000FF" vlink="#0000FF">
<center>
<A href="top"></A><!-- header -->
	<table border=0 cellpadding=0 cellspacing=0 width=612 bgcolor="#ffffff">
	<tr valign=top>
		<td width=442 colspan=5>
			<img src="http://www.cnet.com/i/sh/nl/442_shopperALERT.gif" width="442" height="60" border="0" alt="CNET Shopper Newsletter Alerts"><br>
				<table width=442 cellpadding=0 cellspacing=0 border=0 bgcolor="#CCCCCC">
					<tr>
						<td bgcolor="#999999" colspan=1>
							<img src="http://home.cnet.com/b.gif" width="1" height="1" border="0"><br>
						</td>
						<td bgcolor="#666666" colspan=1>
							<img src="http://home.cnet.com/b.gif" width="1" height="1" border="0"><br>
						</td>
						<td bgcolor="#666666" colspan=2>
							<img src="http://home.cnet.com/b.gif" width="1" height="1" border="0"><br>
						</td>
					</tr>
					<tr>
						<td bgcolor="#999999" width=1 rowspan=2>
							<img src="http:

In [93]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    payload = email.get_payload()
    if isinstance(payload, list):
        return "multipart({})".format(", ".join([
            get_email_structure(sub_email)
            for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [94]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [95]:
structures_counter(ham_emails).most_common()

[('text/html', 120),
 ('text/plain', 86),
 ('multipart(text/plain, text/html)', 38),
 ('multipart(text/html)', 2),
 ('multipart(text/plain, image/png, image/png)', 1),
 ('multipart(multipart(text/plain, text/html))', 1),
 ('multipart(text/plain, application/x-pkcs7-signature)', 1),
 ('multipart(text/plain, image/bmp)', 1)]

In [96]:
structures_counter(spam_emails).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart(text/plain, text/html)', 45),
 ('multipart(text/html)', 20),
 ('multipart(text/plain)', 19),
 ('multipart(multipart(text/html))', 5),
 ('multipart(text/plain, image/jpeg)', 3),
 ('multipart(text/html, application/octet-stream)', 2),
 ('multipart(text/plain, application/octet-stream)', 1),
 ('multipart(text/html, text/plain)', 1),
 ('multipart(multipart(text/html), application/octet-stream, image/jpeg)', 1),
 ('multipart(multipart(text/plain, text/html), image/gif)', 1),
 ('multipart/alternative', 1)]

In [97]:
spam_emails[0]["Subject"]

'Life Insurance - Why Pay More?'

In [98]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(ham_emails + spam_emails)
y = np.array([0] * len(ham_emails) + [1] * len(spam_emails))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [99]:
import re
from html import unescape

def html_to_plain_text(html):
    text = re.sub('<head.*?>.*?</head>', '', html, flags=re.M | re.S | re.I)
    text = re.sub('<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
    text = re.sub('<.*?>', '', text, flags=re.M | re.S)
    text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
    return unescape(text)

In [100]:
html_spam_emails = [email for email in X_train[y_train==1]
                    if get_email_structure(email) == "text/html"]
sample_html_spam = html_spam_emails[7]
print(sample_html_spam.get_content().strip()[:1000], "...")

Me and my friends have this brand new idea, a Live Webcam <a href="http://%31%30%31%31%30%31%31%31%30%31%31%31%31%30%31%30%31%30%31%30%31%30%31%31%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%31%30%31%30%31%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%30%31%31%30%30%30%31%30%31%31%30%31%31%31%30%31%30%31%30%31%30%31%30%31%30%30%30%31%31%30%31%30%31%30%31%31%30%31%30%31%30%31%30%31%31%30%31@%34%2E%34%37%2E%39%36%2E%31%34%31/msga.html"> Click Here <a/>
<br>
<br>
<br>
<font size="1">This is NOT SPAM - You have received this e-mail because 
at one time or another you entered the weekly draw at one of
our portals or FFA sites. We comply with all proposed and current laws 
on commercial e-mail under (Bill s. 1618 TITLE III passed by the 105th 
Congress).
 If you have received this e-mail in error, we apologize for the 
inconvenience and ask that you remove

In [101]:
print(html_to_plain_text(sample_html_spam.get_content())[:1000], "...")

Me and my friends have this brand new idea, a Live Webcam  HYPERLINK  Click Here
This is NOT SPAM - You have received this e-mail because
at one time or another you entered the weekly draw at one of
our portals or FFA sites. We comply with all proposed and current laws
on commercial e-mail under (Bill s. 1618 TITLE III passed by the 105th
Congress).
 If you have received this e-mail in error, we apologize for the
inconvenience and ask that you remove yourself.
Click   HYPERLINK Here to Unsubscribe
fysibvcgjyuwinmyvbpjtaebsymyukbrkn
 ...


In [102]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [103]:
print(email_to_text(sample_html_spam)[:100], "...")

Me and my friends have this brand new idea, a Live Webcam  HYPERLINK  Click Here
This is NOT SPAM -  ...


In [104]:
try:
    import nltk

    stemmer = nltk.PorterStemmer()
    for word in ("Computations", "Computation", "Computing", "Computed", "Compute", "Compulsive"):
        print(word, "=>", stemmer.stem(word))
except ImportError:
    print("Error: stemming requires the NLTK module.")
    stemmer = None

Computations => comput
Computation => comput
Computing => comput
Computed => comput
Compute => comput
Compulsive => compuls


In [105]:
try:
    import urlextract # may require an Internet connection to download root domain names
    
    url_extractor = urlextract.URLExtract()
    print(url_extractor.find_urls("Will it detect github.com and https://youtu.be/7Pq-S557XQU?t=3m32s"))
except ImportError:
    print("Error: replacing URLs requires the urlextract module.")
    url_extractor = None

['github.com', 'https://youtu.be/7Pq-S557XQU?t=3m32s']


In [106]:
from sklearn.base import BaseEstimator, TransformerMixin

class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [107]:
X_few = X_train[:3]
X_few_wordcounts = EmailToWordCounterTransformer().fit_transform(X_few)
X_few_wordcounts

array([Counter({'of': 30, 'to': 28, 'the': 28, 'and': 20, 'thi': 17, 'your': 14, 'in': 12, 'we': 12, 'a': 11, 'you': 11, 'number': 9, 'kabila': 9, 'congo': 8, 'countri': 8, 'is': 8, 'my': 8, 'us': 8, 'i': 7, 'presid': 7, 'assist': 6, 'out': 6, 'fund': 6, 'url': 5, 'by': 5, 'laurent': 5, 'our': 5, 'as': 5, 'will': 5, 'with': 5, 'from': 4, 'col': 4, 'democrat': 4, 'republ': 4, 'code': 4, 'inform': 4, 'that': 4, 'money': 4, 'secur': 4, 'for': 4, 'be': 4, 'who': 4, 'move': 4, 'are': 4, 'michael': 3, 'bundu': 3, 'no': 3, 'intl': 3, 'access': 3, 'email': 3, 'immedi': 3, 'pleas': 3, 'me': 3, 'need': 3, 'one': 3, 'hi': 3, 'were': 3, 'late': 3, 'then': 3, 'other': 3, 'here': 3, 'can': 3, 'have': 3, 'contact': 3, 'confidenti': 3, 'not': 3, 'through': 3, 'work': 3, 'dial': 3, 'tel': 2, 'so': 2, 'trust': 2, 'foreign': 2, 'partner': 2, 'may': 2, 'but': 2, 'indulg': 2, 'view': 2, 'it': 2, 'name': 2, 'aid': 2, 'bless': 2, 'militari': 2, 'rebel': 2, 'some': 2, 'state': 2, 'group': 2, 'privat': 2, 'whe

In [108]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [109]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(X_few_wordcounts)
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.int64'>'
	with 32 stored elements in Compressed Sparse Row format>

In [110]:
X_few_vectors.toarray()

array([[ 547,   28,    9,   28,   30,   12,   11,   20,   14,    5,   17],
       [ 188,   15,   15,    6,    3,    3,    2,    1,    0,    5,    2],
       [1103,   42,   24,   45,   18,   19,   28,   17,   13,   10,    8]],
      dtype=int64)

In [111]:
vocab_transformer.vocabulary_

{'the': 1,
 'number': 2,
 'to': 3,
 'of': 4,
 'in': 5,
 'a': 6,
 'and': 7,
 'your': 8,
 'url': 9,
 'thi': 10}

In [119]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", EmailToWordCounterTransformer()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

## Logictic regression

In [120]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
score = cross_val_score(log_clf, X_train_transformed, y_train, cv=3, verbose=3)
score.mean()

[CV]  ................................................................
[CV] ..................................... , score=0.94, total=   0.1s
[CV]  ................................................................
[CV] ..................................... , score=0.94, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.955, total=   0.1s


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.2s finished


0.945

In [115]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

log_clf = LogisticRegression(solver="lbfgs", random_state=42)
log_clf.fit(X_train_transformed, y_train)

y_pred = log_clf.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100s * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 92.38%
Recall: 98.98%




## XGBoost

In [127]:
import xgboost as xgb
model_xgb = xgb.XGBClassifier(n_estimators=100000, max_depth=8, learning_rate=0.1)

In [128]:
model_xgb.fit(X_train_transformed, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=8, min_child_weight=1, missing=None, n_estimators=100000,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [130]:
from sklearn.metrics import precision_score, recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

y_pred = model_xgb.predict(X_test_transformed)

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 93.27%
Recall: 98.98%


## Neural network

In [134]:
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [135]:
# fix random seed for reproducibility
seed = 7
numpy.random.seed(seed)

In [143]:
def create_baseline():
    # create model
    model = Sequential()
    model.add(Dense(100, input_dim=1001, kernel_initializer='normal', activation='relu'))
    model.add(Dense(100, kernel_initializer='normal', activation='relu'))
    model.add(Dense(100, kernel_initializer='normal', activation='relu'))
    model.add(Dense(100, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
estimator = KerasClassifier(build_fn=create_baseline, epochs=100, batch_size=5, verbose=0)
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
results = cross_val_score(estimator, X_train_transformed, y_train, cv=kfold)
print("Results: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))