# Objective Transformer

In [17]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import urlextract
import re
import nltk
from bs4 import BeautifulSoup
from collections import Counter
# Shared Porter stemmer instance; EmailToWordCounterTransformer uses it to
# reduce each counted word to its stem.
stemmer = nltk.PorterStemmer()

def html_to_plain_text(self, html=None):
    """Convert HTML markup to plain text, replacing links with a marker.

    This is a module-level function, yet its signature historically carried
    a leading ``self`` parameter. That made the natural one-argument call
    ``html_to_plain_text(markup)`` (as used by ``email_to_text``) fail with
    ``TypeError``. Both call shapes are now accepted:

    * ``html_to_plain_text(markup)`` -- the markup arrives in ``self``.
    * ``html_to_plain_text(placeholder, markup)`` -- legacy two-argument
      form; the first argument is ignored.

    Args:
        self: The HTML markup when called with a single argument; otherwise
            an ignored placeholder.
        html: The HTML markup when called with two arguments.

    Returns:
        The document text with every ``<a>`` element replaced by the token
        ``HYPERLINK``, extracted with ``get_text(separator=" ", strip=True)``.
    """
    if html is None:
        # Single-argument call: the markup was bound to the first parameter.
        html = self
    soup = BeautifulSoup(html, 'lxml')
    for a_tag in soup.find_all('a'):
        a_tag.replace_with(" HYPERLINK ")
    return soup.get_text(separator=" ", strip=True)

def email_to_text(email):
    """Extract the text body of a parsed email message.

    Walks the message parts in order. The first ``text/plain`` part found is
    returned immediately. If none is found, the content of the last
    ``text/html`` part seen is converted with ``html_to_plain_text``.

    Args:
        email: A parsed message object exposing ``walk()``; each part must
            provide ``get_content_type()`` and ``get_payload()``, and
            preferably ``get_content()``.

    Returns:
        The body text as a string, or ``None`` when the message has no
        non-empty ``text/plain`` or ``text/html`` part.
    """
    html = None
    for part in email.walk():
        ctype = part.get_content_type()
        if ctype not in ("text/plain", "text/html"):
            continue
        try:
            content = part.get_content()
        except Exception:
            # Best-effort fallback (e.g. unknown charset, or a part without
            # get_content()). Catching Exception instead of using a bare
            # except keeps KeyboardInterrupt/SystemExit propagating.
            content = str(part.get_payload())
        if ctype == "text/plain":
            return content
        html = content
    if html:
        return html_to_plain_text(html)
    return None
    
# Shared URL extractor; EmailToWordCounterTransformer uses it to locate URLs
# in the email text before replacing them.
url_extractor = urlextract.URLExtract()
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):
    """Turn parsed emails into one word-count ``Counter`` per message.

    Each boolean option switches one normalisation step on or off:
    lower-casing, URL replacement (token `` URL ``), number replacement
    (token ``NUMBER``), punctuation removal and stemming. ``strip_headers``
    is stored as a parameter but is not read by the methods of this class.
    """

    def __init__(self, strip_headers=True, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.strip_headers = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def _count_words(self, message):
        """Normalise one message body and return its word counts."""
        body = email_to_text(message) or ""
        if self.lower_case:
            body = body.lower()
        if self.replace_urls and url_extractor is not None:
            distinct_urls = set(url_extractor.find_urls(body))
            # Longest URLs first, so a URL that is a prefix of another one
            # does not get replaced inside the longer one.
            for found in sorted(distinct_urls, key=len, reverse=True):
                body = body.replace(found, " URL ")
        if self.replace_numbers:
            body = re.sub(r'\d+(?:\.\d*)?(?:[eE][+-]?\d+)?', 'NUMBER', body)
        if self.remove_punctuation:
            body = re.sub(r'\W+', ' ', body, flags=re.M)
        counts = Counter(body.split())
        if not (self.stemming and stemmer is not None):
            return counts
        # Merge the counts of words that share the same stem.
        merged = Counter()
        for token, occurrences in counts.items():
            merged[stemmer.stem(token)] += occurrences
        return merged

    def transform(self, X, y=None):
        """Return a NumPy array holding one ``Counter`` per email in ``X``."""
        return np.array([self._count_words(message) for message in X])

# email preprocessor

In [18]:
class PCounter(BaseEstimator, TransformerMixin):
    """Transformer that turns parsed emails into stemmed word counters.

    For each email the text body is extracted, email addresses and URLs are
    replaced by placeholder tokens, digits and punctuation are removed, and
    the remaining words are lower-cased, stemmed and counted.

    Args:
        s: Starting value of the running count of emails passed through
            ``transform``. NOTE: ``transform`` increments this attribute, so
            it is mutable state rather than a fixed hyperparameter.
    """

    # URL extractor shared by all instances. It is created on first use so
    # that it is not rebuilt for every email.
    _url_extractor = None

    def __init__(self, s = 0):
        self.s = s

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    @classmethod
    def _get_url_extractor(cls):
        """Return the shared ``URLExtract`` instance, creating it if needed."""
        if cls._url_extractor is None:
            cls._url_extractor = urlextract.URLExtract()
        return cls._url_extractor

    def html_to_plain_text(self, html):
        """Convert HTML markup to text, replacing ``<a>`` tags by HYPERLINK."""
        soup = BeautifulSoup(html, 'lxml')
        for a_tag in soup.find_all('a'):
            a_tag.replace_with(" HYPERLINK ")
        return soup.get_text(separator=" ", strip=True)

    def email_to_text(self, email):
        """Extract the text body of a parsed email message.

        Returns the first ``text/plain`` part found while walking the
        message. Otherwise the last ``text/html`` part is converted to plain
        text. Returns ``None`` when neither kind of part has content.
        """
        html = None
        for part in email.walk():
            ctype = part.get_content_type()
            # Only leaf text parts are considered. A multipart container has
            # no text of its own, and its sub-parts are visited by walk().
            if ctype not in ("text/plain", "text/html"):
                continue
            try:
                content = part.get_content()
            except Exception:
                # Best-effort fallback, e.g. in case of encoding issues.
                # Unlike a bare except, this lets KeyboardInterrupt and
                # SystemExit propagate.
                content = str(part.get_payload())
            if ctype == "text/plain":
                return content
            html = content
        if html:
            return self.html_to_plain_text(html)
        return None

    def refine(self, email_text):
        """Replace addresses and URLs by tokens; drop digits and punctuation.

        Args:
            email_text: The raw body text; ``None`` or ``""`` yields ``""``.

        Returns:
            The cleaned text, with email addresses replaced by
            ``email_address`` and URLs by ``url``.
        """
        if not email_text:
            return ""
        # URLs are detected on the untouched text, before any substitution.
        urls = set(self._get_url_extractor().find_urls(email_text))
        email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
        email_text = re.sub(email_pattern, "email_address", email_text)
        # Longest URLs first: otherwise a URL that is a prefix of a longer
        # one would be replaced inside it and leave the tail behind.
        for url in sorted(urls, key=len, reverse=True):
            email_text = email_text.replace(url, 'url')
        email_text = re.sub(r'\d+', '', email_text)
        email_text = re.sub(r'[^\w\s]', '', email_text)
        return email_text

    def preprocess(self, email_parsed):
        """Return a ``Counter`` of lower-cased, stemmed words for one email."""
        # email_to_text returns None for messages without a text part; treat
        # that as an empty body instead of crashing in refine().
        email_text = self.refine(self.email_to_text(email_parsed) or "")
        stemmer = nltk.PorterStemmer()
        return Counter(stemmer.stem(word.lower()) for word in email_text.split())

    def transform(self, X, y=None):
        """Return a NumPy array holding one ``Counter`` per email in ``X``.

        Increments ``self.s`` once for every email processed.
        """
        X_transformed = []
        for message in X:
            self.s += 1
            X_transformed.append(self.preprocess(message))
        return np.array(X_transformed)
        

# Importing an email as a file

In [19]:
import os

In [20]:
# Relative path of one sample message under datasets/spam/easy_ham.
email_path = os.path.join(
    'datasets',
    'spam',
    'easy_ham',
    '00001.7c53336b37003a9286aba55d2945844c',
)

In [21]:
# Read the raw message as text. The encoding is given explicitly so the
# result does not depend on the platform's locale default, and undecodable
# bytes are replaced instead of raising UnicodeDecodeError.
with open(email_path, encoding="utf-8", errors="replace") as email_file:
    email_opened = email_file.read()

# Importing an email using email parser

In [22]:
import email
import email.policy

In [23]:
# `import email` and `import email.policy` do not themselves load the
# `email.parser` submodule. Import it explicitly so this cell does not
# depend on another library having imported it first.
import email.parser

# Parse the message from bytes with the modern policy, so the parts are
# EmailMessage objects.
with open(email_path, 'rb') as email_file:
    email_parsed = email.parser.BytesParser(policy=email.policy.default).parse(email_file)

# Convert from email to word counter

In [11]:
# Run the single parsed message through PCounter. transform() returns one
# counter per input email, so unpack the only element.
pc = PCounter()
(cnt,) = pc.transform([email_parsed])