## Fetching the Data

In [1]:
import os
import tarfile
import urllib.request

root_download = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = root_download + "20030228_easy_ham.tar.bz2"
SPAM_URL = root_download + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=spam_path)
        tar_bz2_file.close()

In [2]:
fetch_spam_data()

In [3]:
easy_ham_dir = os.path.join(SPAM_PATH, 'easy_ham')
spam_dir = os.path.join(SPAM_PATH, 'spam')

In [4]:
import numpy as np

In [5]:
#deleting the last element - 'cmds'.
ham_names = np.array(sorted(os.listdir(easy_ham_dir)))[:-1]
spam_names = np.array(sorted(os.listdir(spam_dir)))[:-1]

In [6]:
import email
import email.policy

In [7]:
def get_email(category, filename):
  directory = easy_ham_dir if category == 'ham' else spam_dir
  with open(os.path.join(directory, filename), 'rb') as f:
    return email.parser.BytesParser(policy = email.policy.default).parse(f)

In [8]:
hams = [get_email('ham', name) for name in ham_names]
spams = [get_email('spam', name) for name in spam_names] 

## Discovering the types of emails

In [9]:
from collections import Counter

In [10]:
TYPES = [email.get_content_type() for email in (hams)]
c = Counter(TYPES)
c.items()

dict_items([('text/plain', 2408), ('multipart/signed', 68), ('multipart/alternative', 9), ('multipart/mixed', 10), ('multipart/related', 3), ('multipart/report', 2)])

In [11]:
TYPES = [email.get_content_type() for email in (spams)]
c = Counter(TYPES)
c.items()

dict_items([('text/html', 183), ('text/plain', 218), ('multipart/mixed', 43), ('multipart/alternative', 47), ('multipart/related', 9)])

## Preparing training and testing data

In [109]:
X = np.array(hams + spams, dtype = 'object')
y = np.array([1] * len(hams) + [0] * len(spams))

In [110]:
from sklearn.model_selection import train_test_split

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## Preprocessing

In [15]:
from bs4 import BeautifulSoup
from collections import Counter
import re
import nltk
import urlextract

In [27]:
def html_to_plain_text_test(html):
    soup = BeautifulSoup(html, 'lxml')
    for a_tag in soup.find_all('a'):
        a_tag.replace_with(" HYPERLINK ")
    return soup.get_text(separator = " ", strip = True)

def email_to_text_test(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text_test(html)
    
def refine_test(email_text):
    url_extractor = urlextract.URLExtract()
    urls = url_extractor.find_urls(email_text)
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    email_text = re.sub(email_pattern, "email_address", email_text)
    for url in urls:
        email_text = re.sub(re.escape(url), 'url', email_text)
    email_text = re.sub(r'\d+', '', email_text)
    email_text = re.sub(r'[^\w\s]', '', email_text)
    return email_text

def preprocess_test(email_parsed):
    email_text = email_to_text_test(email_parsed)
    email_text = refine_test(email_text)
    cnt = Counter()
    stemmer = nltk.PorterStemmer()
    for word in email_text.split():
        cnt[stemmer.stem(word.lower())] += 1
    return cnt

## Natural Language Processing

In [104]:
class PCounter(BaseEstimator, TransformerMixin):
    
    def __init__(self, s = 0):
        self.s = s
        
    def fit(self, X, y=None):
        return self

    def html_to_plain_text(self, html):
        soup = BeautifulSoup(html, 'lxml')
        for a_tag in soup.find_all('a'):
            a_tag.replace_with(" HYPERLINK ")
        return soup.get_text(separator=" ", strip=True)

    def email_to_text(self, email):
        html = None
        for part in email.walk():
            ctype = part.get_content_type()
            try:
                content = part.get_content()
            except:  # in case of encoding issues
                content = str(part.get_payload())
            if ctype != "text/html":
                return content
            else:
                html = content
        if html:
            return self.html_to_plain_text(html)
        

    def refine(self, email_text):
        url_extractor = urlextract.URLExtract()
        urls = url_extractor.find_urls(email_text)#HERE
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        email_text = re.sub(email_pattern, "email_address", email_text)
        for url in urls:
            email_text = re.sub(re.escape(url), 'url', email_text)
        email_text = re.sub(r'\d+', '', email_text)
        email_text = re.sub(r'[^\w\s]', '', email_text)
        return email_text

    def preprocess(self, email_parsed):
        email_text = self.email_to_text(email_parsed)
        email_text = self.refine(email_text)
        cnt = Counter()
        stemmer = nltk.PorterStemmer()
        for word in email_text.split():
            cnt[stemmer.stem(word.lower())] += 1
        return cnt

    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            self.s += 1
            X_transformed.append(self.preprocess(email))
        return np.array(X_transformed)
        

In [105]:
pc = PCounter()
pc.transform(X_train[[802]])

array([Counter({'div': 48, 'aligncent': 24, 'a': 15, 'receiv': 12, 'email': 12, 'you': 12, 'of': 12, 'our': 12, 'to': 10, 'or': 10, 'the': 10, 'nbsp': 10, 'thi': 9, 'p': 8, 'one': 8, 'strongfont': 8, 'faceari': 8, 'cartridg': 8, 'are': 6, 'offer': 6, 'in': 6, 'border': 6, 'srcurl': 6, 'width': 6, 'hrefurl': 6, 'at': 6, 'multipart': 4, 'html': 4, 'head': 4, 'bodi': 4, 'aligncenterspan': 4, 'becaus': 4, 'special': 4, 'from': 4, 'market': 4, 'partner': 4, 'if': 4, 'have': 4, 'do': 4, 'not': 4, 'pleas': 4, 'with': 4, 'on': 4, 'price': 4, 'as': 4, 'font': 4, 'img': 4, 'height': 4, 'web': 4, 'site': 4, 'for': 4, 'all': 4, 'best': 4, 'list': 4, 'boundari': 3, 'contenttyp': 2, 'charsetiso': 2, 'contenttransferencod': 2, 'bit': 2, 'stylefontsizeptfontfamilyarialcolorblacky': 2, 'optedin': 2, 'optind': 2, 'through': 2, 'feel': 2, 'error': 2, 'wish': 2, 'addit': 2, 'repli': 2, 'word': 2, 'quotremovequot': 2, 'subject': 2, 'line': 2, 'follow': 2, 'unsubscrib': 2, 'instruct': 2, 'belowop': 2, 'op':

In [106]:
from scipy.sparse import csr_matrix

class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    def fit(self, X, y=None):
        total_count = Counter()
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for row, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))

In [107]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
X_few_vectors = vocab_transformer.fit_transform(pc.transform(X_train[0:3]))
X_few_vectors

<3x11 sparse matrix of type '<class 'numpy.intc'>'
	with 27 stored elements in Compressed Sparse Row format>

In [108]:
X_few_vectors.toarray()

array([[ 77,   3,   4,   2,   2,   1,   1,   3,   1,   1,   1],
       [128,   8,   5,   3,   5,   5,   5,   1,   3,   1,   3],
       [ 12,   1,   0,   3,   0,   0,   0,   1,   0,   2,   0]],
      dtype=int32)

In [112]:
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ("email_to_wordcount", PCounter()),
    ("wordcount_to_vector", WordCounterToVectorTransformer()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [114]:
X_train_transformed.toarray()

array([[26,  1,  3, ...,  0,  0,  0],
       [24,  4,  2, ...,  0,  0,  0],
       [86, 10, 10, ...,  0,  0,  0],
       ...,
       [11,  3,  0, ...,  0,  0,  0],
       [ 6,  2,  0, ...,  0,  0,  0],
       [ 4,  0,  4, ...,  0,  0,  0]], dtype=int32)

In [116]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [118]:
reg = LogisticRegression(solver="lbfgs", max_iter=1000)
score = cross_val_score(reg, X_train_transformed, y_train, cv=3, verbose=3)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.973) total time=   0.4s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] END ................................ score: (test=0.959) total time=   0.3s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


[CV] END ................................ score: (test=0.969) total time=   0.3s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.3s finished


In [119]:
score.mean()

0.9666666666666667

In [120]:
reg = LogisticRegression(solver="lbfgs", max_iter=1000)
reg.fit(X_train_transformed, y_train)
y_pred = reg.predict(preprocess_pipeline.transform(X_test))

In [122]:
from sklearn.metrics import precision_score, recall_score

print("Precision: {:.2f}%".format(100 * precision_score(y_test, y_pred)))
print("Recall: {:.2f}%".format(100 * recall_score(y_test, y_pred)))

Precision: 96.34%
Recall: 99.21%
