In [1]:
import os
import tarfile
import urllib
import re
import bz2
import pandas as pd
import numpy as np
import unicodedata

In [2]:
DOWNLOAD_ROOT = "https://spamassassin.apache.org"
SPAM_PATH = os.path.join("datasets", "spam")
SPAM_URL = DOWNLOAD_ROOT + "/old/publiccorpus/"

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    os.makedirs(spam_path, exist_ok=True)
    urlpath = urllib.request.urlopen('https://spamassassin.apache.org/old/publiccorpus/')
    string = urlpath.read().decode('utf-8')
    pattern = re.compile('href=".*\.bz2"')
    filelist = [f[6:-1] for f in pattern.findall(string)]
    for f in filelist:
        bz2path = os.path.join(spam_path, f)
        urllib.request.urlretrieve(spam_url+f, bz2path)
        with bz2.BZ2File(bz2path) as bz2_file:
            data = bz2_file.read()
        tarpath = bz2path[:-4]
        open(tarpath, 'wb').write(data)
        with tarfile.open(tarpath) as tar_file:
            tar_file.extractall(path=spam_path)
        os.remove(bz2path)
        os.remove(tarpath)

In [3]:
contents = []
cats = []

for cat in os.listdir(SPAM_PATH):
    cat_path = os.path.join(SPAM_PATH, cat)
    cat_files = os.listdir(cat_path)
    cat_contents = [open(os.path.join(cat_path, f), encoding='ansi').read() for f in cat_files if f != "cmds"]
    contents += cat_contents
    cats += [cat] * len(cat_contents)
    
emails = pd.DataFrame(data={'Content': contents, 'Cat': cats})

In [4]:
emails.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9349 entries, 0 to 9348
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Content  9349 non-null   object
 1   Cat      9349 non-null   object
dtypes: object(2)
memory usage: 146.2+ KB


In [5]:
emails['Cat'].value_counts()

easy_ham      5051
easy_ham_2    1400
spam_2        1397
spam          1001
hard_ham       500
Name: Cat, dtype: int64

In [6]:
emails["Cat"] = emails.apply(lambda row: row["Cat"][:-2] if (row["Cat"][-2] == '_' and row["Cat"][-1].isdigit()) else row["Cat"], axis=1)

In [7]:
emails['Cat'].value_counts()/len(emails)

easy_ham    0.690020
spam        0.256498
hard_ham    0.053482
Name: Cat, dtype: float64

In [8]:
emails['Label'] = emails['Cat'].str.contains('spam').astype(int)

In [9]:
emails['Label'].value_counts()

0    6951
1    2398
Name: Label, dtype: int64

In [10]:
emails

Unnamed: 0,Content,Cat,Label
0,From exmh-workers-admin@redhat.com Thu Aug 22...,easy_ham,0
1,From Steve_Burt@cursor-system.com Thu Aug 22 ...,easy_ham,0
2,From timc@2ubh.com Thu Aug 22 13:52:59 2002\n...,easy_ham,0
3,From irregulars-admin@tb.tf Thu Aug 22 14:23:...,easy_ham,0
4,From Stewart.Smith@ee.ed.ac.uk Thu Aug 22 14:...,easy_ham,0
...,...,...,...
9344,From Professional_Career_Development_Institute...,spam,1
9345,From tba@insiq.us Wed Dec 4 11:46:34 2002\nR...,spam,1
9346,Return-Path: <raye@yahoo.lv>\nReceived: from u...,spam,1
9347,From cweqx@dialix.oz.au Tue Aug 6 11:03:54 2...,spam,1


In [11]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(emails, emails["Cat"]):
    strat_train_set = emails.loc[train_index]
    strat_test_set = emails.loc[test_index]

In [12]:
strat_train_set['Cat'].value_counts()/len(strat_train_set)

easy_ham    0.690066
spam        0.256451
hard_ham    0.053483
Name: Cat, dtype: float64

In [13]:
strat_test_set['Cat'].value_counts()/len(strat_test_set)

easy_ham    0.689840
spam        0.256684
hard_ham    0.053476
Name: Cat, dtype: float64

In [14]:
X_train, y_train, X_test, y_test = strat_train_set.drop(columns=["Cat", "Label"]), strat_train_set["Label"].copy(), strat_test_set.drop(columns=["Cat", "Label"]), strat_test_set["Label"].copy()

In [15]:
emails

Unnamed: 0,Content,Cat,Label
0,From exmh-workers-admin@redhat.com Thu Aug 22...,easy_ham,0
1,From Steve_Burt@cursor-system.com Thu Aug 22 ...,easy_ham,0
2,From timc@2ubh.com Thu Aug 22 13:52:59 2002\n...,easy_ham,0
3,From irregulars-admin@tb.tf Thu Aug 22 14:23:...,easy_ham,0
4,From Stewart.Smith@ee.ed.ac.uk Thu Aug 22 14:...,easy_ham,0
...,...,...,...
9344,From Professional_Career_Development_Institute...,spam,1
9345,From tba@insiq.us Wed Dec 4 11:46:34 2002\nR...,spam,1
9346,Return-Path: <raye@yahoo.lv>\nReceived: from u...,spam,1
9347,From cweqx@dialix.oz.au Tue Aug 6 11:03:54 2...,spam,1


In [16]:
from sklearn.base import BaseEstimator, TransformerMixin
import email
from email_reply_parser import EmailReplyParser

# column index
content_ix = 0

class MyEmailParser(BaseEstimator, TransformerMixin):
    def __init__(self, strip_headers=True, check_html=True): # no *args or **kargs
        self.strip_headers = strip_headers
        self.check_html = check_html
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        X_og = X.copy()
        
        if self.strip_headers:
            new_contents = []
            for email_content in X_og[:, content_ix]:
                new = ''
                message = email.message_from_string(email_content)
                if message.is_multipart():
                    for payload in message.get_payload():
                        if payload.get_content_type() == "text/plain":
                            new += payload.get_payload()     
                else:
                    new = message.get_payload()
                new_contents += [new]
            X[:, content_ix] = np.array(new_contents)
        
        if self.check_html:
            have_html = np.empty((X.shape[0], 1))
            for email_content in X_og[:, content_ix]:
                has_html = False
                if message.is_multipart():
                    for payload in message.get_payload():
                        if payload.get_content_type() == "text/html":
                            has_html = True
                else:
                    has_html = message.get_content_type() == "text/html"
                np.append(have_html, has_html)
                np.c_[X, have_html.astype(int)]
            
        return X

email_parser = MyEmailParser()
parsed_emails = email_parser.transform(emails[emails.Content.str.contains('text/html')].values[0:3, :])

In [18]:
class MyTextParser(BaseEstimator, TransformerMixin):
    def __init__(self, to_lower=True, url_subs=True, money_subs=True, number_subs=True, punctuation_subs=True):
        self.to_lower = to_lower
        self.url_subs = url_subs
        self.money_subs = money_subs
        self.number_subs = number_subs
        self.punctuation_subs = punctuation_subs
    def fit(self, X, y=None):
        return self  # nothing else to do
    def transform(self, X):
        X_og = X.copy()
            
        if self.url_subs:
            url_regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
            X[:, content_ix] = np.array([re.sub(url_regex, " url_marker ", text) for text in X[:, content_ix]])
        
        if self.money_subs:
            currency_regex = f"(USD)|(EUR)|(GBP)|[{''.join(chr(i) for i in range(0xffff) if unicodedata.category(chr(i)) == 'Sc')}]"
            X[:, content_ix] = np.array([re.sub(currency_regex, " currency_marker ", text) for text in X[:, content_ix]])
        
        if self.number_subs:
            number_regex = "(\d*\.?\d+|\d{1,3}(?:,\d{3})*(?:\.\d+)?)" # "(?:^|\s)(\d*\.?\d+|\d{1,3}(?:,\d{3})*(?:\.\d+)?)(?!\S)"
            X[:, content_ix] = np.array([re.sub(number_regex, " number_marker ", text) for text in X[:, content_ix]])
        
        if self.punctuation_subs:
            X[:, content_ix] = np.array([
                re.sub("\?", " interrogation_marker ", re.sub("\!", " exclamation_marker ", text)) 
                for text in X[:, content_ix]
            ])
            
        if self.to_lower:
            X[:, content_ix] = np.array([text.lower() for text in X[:, content_ix]])
            
        return X
    
text_parser = MyTextParser()
text_parsed_emails = text_parser.transform(parsed_emails)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
analyzer = CountVectorizer().build_analyzer()
def stemmed_words(doc):
    return (stemmer.stem(w) for w in analyzer(doc[0]))

vectorizer = CountVectorizer(stop_words='english', analyzer=stemmed_words, max_features=50000)


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

full_pipeline = Pipeline([
        ('email', MyEmailParser()),
        ('text', MyTextParser()),
        ('vectorize', vectorizer)
    ])

X_train_prepared = full_pipeline.fit_transform(X_train.values) #[:1000, :])

In [23]:
from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_prepared, y_train)
log_reg.score(X_train_prepared, y_train)

0.9985292151357134

In [24]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(log_reg, X_train_prepared, y_train,
                         scoring="accuracy", cv=5)

In [25]:
X_test_prepared = full_pipeline.transform(X_test.values) #[:1000, :])

In [26]:
log_reg.score(X_test_prepared, y_test)

0.9855614973262032

In [37]:
from sklearn.model_selection import cross_val_predict

y_test_pred = log_reg.predict(X_test_prepared)

In [27]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

In [38]:
precision_score(y_test, y_test_pred)

0.968944099378882

In [39]:
recall_score(y_test, y_test_pred)

0.975

In [40]:
f1_score(y_test, y_test_pred)

0.9719626168224299

In [65]:
pd.DataFrame({"Label":y_test_pred}).Label.value_counts()

0    1387
1     483
Name: Label, dtype: int64

In [41]:
X_test

Unnamed: 0,Content
4592,url: url_marker \ndate: not supplied\n\ntechn...
9057,"<hr>\n<html>\n<div bgcolor= number_marker d""#f..."
6203,"once upon a time, brian wrote :\n\n> i was..."
1625,i'm getting these messages and i'm not sure wh...
7775,<html><head><title>::free mortgage quote::</ti...
...,...
9129,do you want to make money from home interrogat...
2243,url: url_marker \ndate: number_marker - numb...
854,russell turpin wrote:\n>invite her for an afte...
6578,<html>\n<head>\n <title>selling secure ...


# Considerações

contar a ocorrencia de cada palavra (binario ou frequencia?)

selecionar mais frequentes

tirar stopwords

pontuação provavelmente contar (!, ?)

tirar header do email (separar texto na primeira quebra de linha dupla (acho)) ou nao tirar?? pode avaliar assunto e quem enviou

substituir urls por URL, numeros por NUMBER, $ por MONEY talvez

stemming

tratar html </>

replies? \t |

caracteres estranhos ?? chines etc