## Fetching the Data

In [1]:
import os
import tarfile
import urllib.request

root_download = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = root_download + "20030228_easy_ham.tar.bz2"
SPAM_URL = root_download + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(ham_url=HAM_URL, spam_url=SPAM_URL, spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2", ham_url), ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        tar_bz2_file = tarfile.open(path)
        tar_bz2_file.extractall(path=spam_path)
        tar_bz2_file.close()

In [2]:
fetch_spam_data()

In [3]:
easy_ham_dir = os.path.join(SPAM_PATH, 'easy_ham')
spam_dir = os.path.join(SPAM_PATH, 'spam')

In [4]:
import numpy as np

In [5]:
#deleting the last element - 'cmds'.
ham_names = np.array(sorted(os.listdir(easy_ham_dir)))[:-1]
spam_names = np.array(sorted(os.listdir(spam_dir)))[:-1]

In [6]:
import email
import email.policy

In [7]:
def get_email(category, filename):
  directory = easy_ham_dir if category == 'ham' else spam_dir
  with open(os.path.join(directory, filename), 'rb') as f:
    return email.parser.BytesParser(policy = email.policy.default).parse(f)

In [8]:
hams = [get_email('ham', name) for name in ham_names]
spams = [get_email('spam', name) for name in spam_names] 

## Discovering the types of emails

In [9]:
from collections import Counter

In [10]:
TYPES = [email.get_content_type() for email in (hams)]
c = Counter(TYPES)
c.items()

dict_items([('text/plain', 2408), ('multipart/signed', 68), ('multipart/alternative', 9), ('multipart/mixed', 10), ('multipart/related', 3), ('multipart/report', 2)])

In [11]:
TYPES = [email.get_content_type() for email in (spams)]
c = Counter(TYPES)
c.items()

dict_items([('text/html', 183), ('text/plain', 218), ('multipart/mixed', 43), ('multipart/alternative', 47), ('multipart/related', 9)])

## Preparing training and testing data

In [12]:
X = np.array(hams + spams, dtype = 'object')
y = np.array([1] * len(hams) + [0] * len(spams))

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

## Preprocessing

In [15]:
from bs4 import BeautifulSoup
from collections import Counter
import re
import nltk
import urlextract

In [27]:
def html_to_plain_text_test(html):
    soup = BeautifulSoup(html, 'lxml')
    for a_tag in soup.find_all('a'):
        a_tag.replace_with(" HYPERLINK ")
    return soup.get_text(separator = " ", strip = True)

def email_to_text_test(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text_test(html)
    
def refine_test(email_text):
    url_extractor = urlextract.URLExtract()
    urls = url_extractor.find_urls(email_text)
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    email_text = re.sub(email_pattern, "email_address", email_text)
    for url in urls:
        email_text = re.sub(re.escape(url), 'url', email_text)
    email_text = re.sub(r'\d+', '', email_text)
    email_text = re.sub(r'[^\w\s]', '', email_text)
    return email_text

def preprocess_test(email_parsed):
    email_text = email_to_text_test(email_parsed)
    email_text = refine_test(email_text)
    cnt = Counter()
    stemmer = nltk.PorterStemmer()
    for word in email_text.split():
        cnt[stemmer.stem(word.lower())] += 1
    return cnt

## Natural Language Processing

In [30]:
class PCounter(BaseEstimator, TransformerMixin, s):
    def __init__(self):
        pass
        self.s = 0
    def fit(self, X, y=None):
        return self

    def html_to_plain_text(self, html):
        soup = BeautifulSoup(html, 'lxml')
        for a_tag in soup.find_all('a'):
            a_tag.replace_with(" HYPERLINK ")
        return soup.get_text(separator=" ", strip=True)

    def email_to_text(self, email):
        html = None
        for part in email.walk():
            ctype = part.get_content_type()
            if not ctype in ("text/plain", "text/html"):
                continue
            try:
                content = part.get_content()
            except:  # in case of encoding issues
                content = str(part.get_payload())
            if ctype == "text/plain":
                return content
            else:
                html = content
        if html:
            return self.html_to_plain_text(html)

    def refine(self, email_text):
        url_extractor = urlextract.URLExtract()
        urls = url_extractor.find_urls(email_text)#HERE
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        email_text = re.sub(email_pattern, "email_address", email_text)
        for url in urls:
            email_text = re.sub(re.escape(url), 'url', email_text)
        email_text = re.sub(r'\d+', '', email_text)
        email_text = re.sub(r'[^\w\s]', '', email_text)
        return email_text

    def preprocess(self, email_parsed):
        email_text = self.email_to_text(email_parsed)
        email_text = self.refine(email_text)
        cnt = Counter()
        stemmer = nltk.PorterStemmer()
        for word in email_text.split():
            cnt[stemmer.stem(word.lower())] += 1
        return cnt

    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            s += 1
            X_transformed.append(self.preprocess(email))
        return np.array(X_transformed)
        

In [41]:
pc = PCounter()
X_preprocessed = pc.transform(X_train)

TypeError: expected string or bytes-like object

In [42]:
def search(i, j , array):
    l = i +  int((j - i)/2)
    print('debug', i, l, j)
    if(j - i < 2):
        return i
    
    efh = False
    esh = False
    
    try:
        pc.transform(array[i: l])
    except:
        efh = True
    try:
        pc.transform(array[l+1: j])
    except:
        esh = True
        
    if(efh):
        search(i, l,array)
    else:
        search(l+1, j, array)

In [43]:
error_index_instance = search(0, 2399, X_train)

debug 0 1199 2399
debug 0 599 1199
debug 600 899 1199
debug 600 749 899
debug 750 824 899
debug 750 787 824
debug 788 806 824
debug 788 797 806
debug 798 802 806
debug 803 804 806
debug 805 805 806


In [45]:
pc = PCounter()
X_preprocessed = pc.transform(X_train[[805]])