# Objective Transfomer

In [1]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import urlextract
import re
import nltk

stemmer = nltk.PorterStemmer()
url_extractor = urlextract.URLExtract()
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(set(url_extractor.find_urls(text)))
                urls.sort(key=lambda url: len(url), reverse=True)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', text)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

# email to text function

In [2]:
from bs4 import BeautifulSoup
def html_to_plain_text(html):
    soup = BeautifulSoup(html, 'lxml')
    for a_tag in soup.find_all('a'):
        a_tag.replace_with(" HYPERLINK ")
    return soup.get_text(separator = " ", strip = True)

def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if not ctype in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except: # in case of encoding issues
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

# Importing an email as a file

In [3]:
import os

In [4]:
email_path = os.path.join('datasets', 'spam', 'easy_ham', '00001.7c53336b37003a9286aba55d2945844c')

In [5]:
with open(email_path) as email_file:
    email_opened = email_file.read()

# Importing an email using email parser

In [7]:
import email
import email.policy

In [8]:
with open(email_path, 'rb') as email_file:
    email_parsed = email.parser.BytesParser(policy = email.policy.default).parse(email_file)

# Convert from email to word counter

In [33]:
from collections import Counter
import re
import urlextract

In [None]:
email_text = email_to_text(email_parsed)
url_extractor = urlextract.URLExtract()
urls = url_extractor.find_urls(email_text)
email_addresses = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', "email_address", email_text)
for url in urls:
    email_text = re.sub(re.escape(url), 'url', email_text)
for email_address in email_addresses:
    email_text = re.sub(re.escape(email_address), 'email_address', email_text)
email_text = re.sub(r'\d+', '', email_text)
email_text = re.sub(r'[^\w\s]', '', email_text)

In [None]:
print(email_text)

In [47]:
cnt = Counter()
for word in email_text.split():
    cnt[stemmer.stem(word.lower())] += 1

In [48]:
cnt

Counter({'date': 1,
         'wed': 1,
         'aug': 1,
         'from': 4,
         'chri': 1,
         'garrigu': 1,
         'cwgdatedfaddeepeddycom': 1,
         'messageid': 1,
         'tmdadeepeddyvirciocom': 1,
         'i': 4,
         'cant': 1,
         'reproduc': 1,
         'thi': 3,
         'error': 2,
         'for': 1,
         'me': 1,
         'it': 1,
         'is': 4,
         'veri': 1,
         'repeat': 1,
         'like': 1,
         'everi': 1,
         'time': 1,
         'without': 1,
         'fail': 1,
         'the': 15,
         'debug': 1,
         'log': 1,
         'of': 5,
         'pick': 9,
         'happen': 1,
         'pick_it': 1,
         'exec': 2,
         'inbox': 3,
         'list': 5,
         'lbrace': 6,
         'subject': 3,
         'ftp': 3,
         'rbrace': 6,
         'sequenc': 4,
         'mercuri': 3,
         'ftoc_pickmsg': 1,
         'hit': 4,
         'mark': 1,
         'tkerror': 1,
         'syntax': 1,
         'i

In [49]:
array = [email_parsed]
email_to_cnt = EmailToWordCounterTransformer()
cnt_objective = email_to_cnt.fit_transform(array)

In [50]:
len(cnt)

116

In [29]:
len(cnt_objective[0])

125

In [30]:
cnt_objective[0]

Counter({'date': 2,
         'wed': 1,
         'number': 39,
         'aug': 1,
         'from': 4,
         'chri': 1,
         'garrigu': 1,
         'cwg': 1,
         'numberfanumberd': 1,
         'deepeddi': 2,
         'com': 3,
         'messag': 1,
         'id': 1,
         'tmda': 1,
         'vircio': 1,
         'i': 5,
         'can': 1,
         't': 2,
         'reproduc': 1,
         'thi': 3,
         'error': 2,
         'for': 1,
         'me': 1,
         'it': 1,
         'is': 4,
         'veri': 1,
         'repeat': 1,
         'like': 1,
         'everi': 1,
         'time': 1,
         'without': 1,
         'fail': 1,
         'the': 15,
         'debug': 1,
         'log': 1,
         'of': 5,
         'pick': 9,
         'happen': 1,
         'pick_it': 1,
         'exec': 2,
         'inbox': 3,
         'list': 5,
         'lbrace': 6,
         'subject': 3,
         'ftp': 3,
         'rbrace': 6,
         'sequenc': 4,
         'mercuri': 3,
         