In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os
import pandas as pd

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

In [2]:
import os
import tarfile
from six.moves import urllib

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20021010_easy_ham.tar.bz2"
HARD_HAM_URL = DOWNLOAD_ROOT + "20021010_hard_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(spam_url=SPAM_URL, spam_path=SPAM_PATH):
    """Download (when missing) and unpack the three corpus archives into ``spam_path``.

    :param spam_url: URL of the spam archive; the easy-ham and hard-ham
        archives always come from ``HAM_URL`` and ``HARD_HAM_URL``
    :param spam_path: directory that receives both the downloaded archives
        and their extracted contents; created if it does not exist
    """
    os.makedirs(spam_path, exist_ok=True)
    archives = (
        ("easy_ham.tar.bz2", HAM_URL),
        ("spam.tar.bz2", spam_url),
        ("hard_ham.tar.bz2", HARD_HAM_URL),
    )
    for filename, url in archives:
        path = os.path.join(spam_path, filename)
        # An archive that is already on disk is reused, not downloaded again.
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        # The context manager closes the archive even when extraction fails.
        with tarfile.open(path) as tar_bz2_file:
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; only use this with archives from a trusted source.
            tar_bz2_file.extractall(path=spam_path)

In [3]:
fetch_spam_data()

In [4]:
# One extraction directory per e-mail category, all located under SPAM_PATH.
HAM_DIR, HARD_HAM_DIR, SPAM_DIR = [
    os.path.join(SPAM_PATH, category)
    for category in ("easy_ham", "hard_ham", "spam")
]

# Sorted directory listings, keeping only names longer than 20 characters.
# NOTE(review): presumably the length filter skips non-message helper files
# shipped inside the archives - confirm against the extracted directories.
ham_filenames, hard_ham_filenames, spam_filenames = [
    [entry for entry in sorted(os.listdir(directory)) if len(entry) > 20]
    for directory in (HAM_DIR, HARD_HAM_DIR, SPAM_DIR)
]

In [5]:
len(ham_filenames), len(hard_ham_filenames), len(spam_filenames)

(2551, 250, 500)

In [6]:
import email
import email.policy

def load_email(is_spam, filename, spam_path=SPAM_PATH):
    """Read and parse a single message file from the corpus.

    :param is_spam: when true the file is looked up in the ``spam``
        sub-directory, otherwise in ``easy_ham``
    :param filename: name of the message file inside that sub-directory
    :param spam_path: root directory holding the extracted corpus
    :return: the message parsed from the file's bytes with ``email.policy.default``
    """
    subfolder = "easy_ham"
    if is_spam:
        subfolder = "spam"
    message_path = os.path.join(spam_path, subfolder, filename)
    parser = email.parser.BytesParser(policy=email.policy.default)
    with open(message_path, "rb") as f:
        return parser.parse(f)

In [7]:
ham_emails = [load_email(is_spam=False, filename=name) for name in ham_filenames]
spam_emails = [load_email(is_spam=True, filename=name) for name in spam_filenames]

In [8]:
def email_to_text(email):
    """Return the content of the first ``text/plain`` or ``text/html`` part.

    The message is walked in order; parts with any other content type are
    skipped.

    :param email: a message object exposing ``walk()``
    :return: the content of the first textual part, or ``None`` when the
        message has no ``text/plain`` / ``text/html`` part at all
    """
    for part in email.walk():
        if part.get_content_type() not in ('text/plain', 'text/html'):
            continue
        try:
            return part.get_content()
        except Exception:
            # Decoding failed: fall back to the raw payload. Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            return str(part.get_payload())
    return None

In [9]:
from textblob import TextBlob

def _is_english(message):
    """Tell whether TextBlob detects the message text as English ('en').

    A message without any text is checked using the placeholder 'bonjour'.
    NOTE(review): presumably the placeholder is detected as non-English, so
    text-less messages get dropped - confirm.
    """
    text = email_to_text(message) or 'bonjour'
    return TextBlob(text).detect_language() == 'en'

ham_emails = [message for message in ham_emails if _is_english(message)]
spam_emails = [message for message in spam_emails if _is_english(message)]

In [10]:
len(ham_emails), len(spam_emails)

(2540, 480)

In [11]:
def get_email_structure(email):
    """Describe a message as a comma-separated list of its parts' content types.

    :param email: a message object, or a plain string
    :return: ``'text/plain'`` for a string; the message's own content type
        when its payload is not a list; otherwise the structures of all
        sub-parts (computed recursively) joined with ``", "``
    """
    if isinstance(email, str):
        return 'text/plain'
    parts = email.get_payload()
    if not isinstance(parts, list):
        return email.get_content_type()
    return ", ".join(map(get_email_structure, parts))

In [12]:
from collections import Counter
from itertools import chain

structures = [get_email_structure(email) for email in ham_emails]
# Count every individual content type across all ham messages.
# chain.from_iterable flattens in linear time; sum(lists, []) copies the
# growing list on every addition.
Counter(chain.from_iterable(structure.split(", ") for structure in structures)).most_common()

[('text/plain', 2553),
 ('application/pgp-signature', 73),
 ('text/html', 8),
 ('application/octet-stream', 2),
 ('application/x-pkcs7-signature', 2),
 ('text/enriched', 1),
 ('application/ms-tnef', 1),
 ('video/mng', 1),
 ('text/rfc822-headers', 1),
 ('application/x-java-applet', 1)]

In [13]:
# Rebuild the full list (ham first, then spam) from scratch so that running
# this cell again does not append the spam structures a second time.
structures = [get_email_structure(email) for email in ham_emails + spam_emails]

In [14]:
# Text body of every message, ham first and then spam; a message without a
# textual part contributes an empty string.
contents = [email_to_text(message) or '' for message in ham_emails + spam_emails]

In [15]:
# Subject header of every message, in the same ham-then-spam order.
subjects = [message['Subject'] for message in ham_emails + spam_emails]

In [16]:
import numpy as np
from sklearn.model_selection import train_test_split

# One row per message; target is 0 for ham and 1 for spam. Rows repeating an
# earlier (subject, content) pair are dropped.
data = pd.DataFrame({
    "subject": subjects,
    "content": contents,
    "structure": structures,
    "target": np.array([0] * len(ham_emails) + [1] * len(spam_emails)),
}).drop_duplicates(['subject', 'content'])
data.head()

Unnamed: 0,subject,content,structure,target
0,Re: New Sequences Window,"Date: Wed, 21 Aug 2002 10:54:46 -05...",text/plain,0
1,[zzzzteana] RE: Alexander,"Martin A posted:\nTassos Papadopoulos, the Gre...",text/plain,0
2,[zzzzteana] Moscow bomber,Man Threatens Explosion In Moscow \n\nThursday...,text/plain,0
3,[IRR] Klez: The Virus That Won't Die,Klez: The Virus That Won't Die\n \nAlready the...,text/plain,0
4,Re: Insert signature,"On Wed Aug 21 2002 at 15:46, Ulises Ponce wrot...",text/plain,0


In [17]:
!pip install urlextract

Collecting urlextract
  Downloading https://files.pythonhosted.org/packages/47/13/d8c5970ba73b0266cb13c6883f9e7cf37b044e52255208ceb32b0d09594a/urlextract-0.10-py3-none-any.whl
Collecting uritools (from urlextract)
  Downloading https://files.pythonhosted.org/packages/8c/5d/ef3cd3c40b4b97f0cb50cee8e4c5a8a4abc30953e1c7ce7e0d25cb2534c3/uritools-2.2.0-py2.py3-none-any.whl
Installing collected packages: uritools, urlextract
Successfully installed uritools-2.2.0 urlextract-0.10
[33mYou are using pip version 19.0.3, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [24]:
import re
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from html import unescape
import urlextract 
from sklearn.base import TransformerMixin, BaseEstimator
from nltk.stem import WordNetLemmatizer 



class TextProcessor:
    """
    Text pre-processing helper used throughout the project.

    Holds the English stop words, a WordNet lemmatizer together with a
    per-instance cache of already lemmatized words, and a URL extractor.
    """

    def __init__(self):
        # A set makes the per-token membership test in process() constant time.
        self.stopwords = set(stopwords.words('english'))

        self.lm = WordNetLemmatizer()
        # cache: word -> lemma, so each distinct word is lemmatized only once
        self.lemmetized = dict()

        self.url_extractor = urlextract.URLExtract()

    def process(self, text, allow_stopwords = False, use_stemmer = True) :
        """
        Normalize the specified text into a space-separated string of tokens.

        The text is lower-cased; dotted-quad numbers and URLs become ``URL``;
        ``<head>`` sections are removed, opening ``<a ...>`` tags become
        ``HYPERLINK`` and all remaining tags are stripped; HTML entities are
        unescaped; runs of non-word characters collapse to one space and
        digit runs become ``NUMBER``. The resulting tokens are then filtered
        and optionally lemmatized.

        :param text: text to process; ``None`` is treated as an empty string
        :param allow_stopwords: when False only alphabetic tokens that are not
            stop words are kept; when True every token is kept
        :param use_stemmer: when True each kept token is replaced by its
            WordNet lemma (the name is historical, no stemmer is involved)
        :return: processed text
        """
        if text is None:
            # e.g. a value that is missing altogether
            text = ''

        text = text.lower()
        # dotted quads (IP addresses) are treated like URLs
        text = re.sub(r'[0-9]+(?:\.[0-9]+){3}', ' URL ', text)
        # Longest URLs are replaced first, so a shorter URL contained in a
        # longer one cannot split it.
        urls = sorted(set(self.url_extractor.find_urls(text)), key=len, reverse=True)
        for url in urls:
            text = text.replace(url, " URL ")

        text = re.sub(r'<head.*?>.*?</head>', '', text, flags=re.M | re.S | re.I)
        text = re.sub(r'<a\s.*?>', ' HYPERLINK ', text, flags=re.M | re.S | re.I)
        text = re.sub(r'<.*?>', '', text, flags=re.M | re.S)
        text = re.sub(r'(\s*\n)+', '\n', text, flags=re.M | re.S)
        text = unescape(text)
        text = re.sub(r'\W+', ' ', text, flags=re.M)
        text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', 'NUMBER', text)

        ret = []
        for word in text.split():
            # NOTE(review): with allow_stopwords=True the isalpha() check is
            # bypassed as well, so non-alphabetic tokens are kept too. Left
            # unchanged because existing outputs depend on it - confirm this
            # is intended.
            if (word.isalpha() and word not in self.stopwords) or allow_stopwords:
                if use_stemmer:
                    if word not in self.lemmetized:
                        self.lemmetized[word] = self.lm.lemmatize(word)
                    # use the cached lemma of the word
                    ret.append(self.lemmetized[word])
                else:
                    ret.append(word)
        return ' '.join(ret)

In [25]:
data.head()

Unnamed: 0,subject,content,structure,target
0,Re: New Sequences Window,"Date: Wed, 21 Aug 2002 10:54:46 -05...",text/plain,0
1,[zzzzteana] RE: Alexander,"Martin A posted:\nTassos Papadopoulos, the Gre...",text/plain,0
2,[zzzzteana] Moscow bomber,Man Threatens Explosion In Moscow \n\nThursday...,text/plain,0
3,[IRR] Klez: The Virus That Won't Die,Klez: The Virus That Won't Die\n \nAlready the...,text/plain,0
4,Re: Insert signature,"On Wed Aug 21 2002 at 15:46, Ulises Ponce wrot...",text/plain,0


In [26]:
tp = TextProcessor()
# Work on a copy of `data`: message bodies lose their stop words, subjects
# keep them; both are lemmatized. Extra keyword arguments given to
# Series.apply are forwarded to tp.process.
data_processed = data.assign(
    content=data.content.apply(tp.process, allow_stopwords=False, use_stemmer=True),
    subject=data.subject.apply(tp.process, allow_stopwords=True, use_stemmer=True),
)
# 'whole' is the processed subject followed by the processed body.
data_processed['whole'] = data_processed.subject + ' ' + data_processed.content
data_processed.head()

Unnamed: 0,subject,content,structure,target,whole
0,re new sequence window,date wed NUMBER aug NUMBER NUMBER NUMBER NUMBE...,text/plain,0,re new sequence window date wed NUMBER aug NUM...
1,zzzzteana re alexander,martin posted tasso papadopoulos greek sculpto...,text/plain,0,zzzzteana re alexander martin posted tasso pap...
2,zzzzteana moscow bomber,man threatens explosion moscow thursday august...,text/plain,0,zzzzteana moscow bomber man threatens explosio...
3,irr klez the virus that won t die,klez virus die already prolific virus ever kle...,text/plain,0,irr klez the virus that won t die klez virus d...
4,re insert signature,wed aug NUMBER NUMBER NUMBER NUMBER ulises pon...,text/plain,0,re insert signature wed aug NUMBER NUMBER NUMB...


In [28]:
from scipy.sparse import csr_matrix


class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):
    """Bag-of-words counts over the most frequent words of a text column.

    :param vocabulary_size: maximum number of words kept in the vocabulary
    :param column: name of the column of ``X`` holding space-separated tokens
    """
    def __init__(self, vocabulary_size=1000, column = 'whole'):
        self.vocabulary_size = vocabulary_size
        self.column = column

    def fit(self, X, y=None):
        """Learn the vocabulary from ``X[self.column]``.

        A word contributes at most 10 to its total per document. The
        ``vocabulary_size`` words with the highest totals get the indices
        1, 2, ... in ``vocabulary_``; index 0 is reserved for unknown words.
        """
        total_count = Counter()
        for text in X[self.column].values:
            for word, count in Counter(text.split()).items():
                total_count[word] += min(count, 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.most_common_ = most_common
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}
        return self

    def transform(self, X, y=None):
        """Return a dense DataFrame of per-document word counts.

        The first column, ``word_UNK``, sums the counts of all words that are
        not in the vocabulary; it is followed by one ``word_<w>`` column per
        vocabulary word, in index order.
        """
        rows = []
        cols = []
        values = []
        for row, text in enumerate(X[self.column].values):
            for word, count in Counter(text.split()).items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word, 0))
                values.append(count)
        # The width follows the vocabulary actually learned: it can be smaller
        # than vocabulary_size + 1 when the fitted corpus has fewer distinct
        # words, and it must match the number of column labels below.
        n_columns = len(self.vocabulary_) + 1
        counts = csr_matrix((values, (rows, cols)), shape=(len(X), n_columns))
        return pd.DataFrame(columns=['word_UNK'] + ['word_'+column for column in self.vocabulary_],
                            data=counts.toarray())

In [29]:
from scipy.sparse import csr_matrix
from sklearn.base import TransformerMixin, BaseEstimator

class StructureTransformer(BaseEstimator, TransformerMixin):
    """Indicator features for the content types listed in a structure column.

    :param column: name of the column of ``X`` holding ", "-separated
        content types
    """
    def __init__(self, column = 'structure'):
        self.column = column

    def fit(self, X, y=None):
        """Record every distinct content type, in order of first appearance."""
        first_seen = {}
        for entry in X[self.column]:
            for structure in entry.split(', '):
                first_seen.setdefault(structure, None)
        self.structures = list(first_seen)
        return self

    def transform(self, X, y=None):
        """Return a (len(X), number of known types) array holding 1 where the
        row lists the type and 0 elsewhere."""
        out = np.zeros((len(X), len(self.structures)))
        parts_per_row = [entry.split(', ') for entry in X[self.column]]
        for position, structure in enumerate(self.structures):
            out[:, position] = [1 if structure in parts else 0 for parts in parts_per_row]
        return out

In [30]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models.ldamulticore import LdaMulticore

class LdaTransformer(BaseEstimator, TransformerMixin):
    """Topic-model features computed from a column of space-separated tokens.

    :param dim: number of topics requested from the LDA model
    :param column: name of the column of ``X`` holding the text
    """
    def __init__(self, dim = 2, column = 'whole'):
        self.dim = dim
        self.column = column
    def fit(self, X, y=None):     
        """Fit the dictionary, the TF-IDF model and the LDA model on ``X[self.column]``."""
        lda_tokens = X[self.column].apply(lambda x: x.split())
        # create Dictionary and train it on text corpus
        self.lda_dic = Dictionary(lda_tokens)
        # prune the dictionary (see gensim's filter_extremes for the exact
        # meaning of no_below / no_above / keep_n)
        self.lda_dic.filter_extremes(no_below=10, no_above=0.6, keep_n=8000)
        lda_corpus = [self.lda_dic.doc2bow(doc) for doc in lda_tokens]
        # create TfidfModel and train it on text corpus
        self.lda_tfidf = TfidfModel(lda_corpus)
        lda_corpus = self.lda_tfidf[lda_corpus]
        # create LDA Model and train it on the TF-IDF weighted corpus;
        # random_state is fixed, the worker count is hard-coded to 4
        self.lda_model = LdaMulticore(
            lda_corpus, num_topics=self.dim, id2word=self.lda_dic, workers=4,
            passes=20, chunksize=1000, random_state=0
        )
        return self
    
    def transform(self, X, y=None):
        """Return a dense ``(len(X), lda_emb_len)`` array of LDA inference values."""
        # Output width = number of entries the model returns for an empty document.
        # NOTE(review): presumably this equals the number of topics (self.dim);
        # confirm it still holds for large dim, where the model may filter out
        # low-probability topics.
        lda_emb_len = len(self.lda_model[[]])
        lda_corpus = [self.lda_dic.doc2bow(doc) for doc in X[self.column].apply(lambda x: x.split())]
        lda_corpus = self.lda_tfidf[lda_corpus]
        # first element of the inference result
        # NOTE(review): presumably the per-document topic weights - confirm
        lda_que_embs = self.lda_model.inference(lda_corpus)[0]
        # copy the first lda_emb_len columns into a fresh dense array
        out = np.zeros((len(X), lda_emb_len))
        for i in range(lda_emb_len):
            out[:, i] = lda_que_embs[:, i]
        return out



In [64]:
from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models.ldamulticore import LdaMulticore
from sklearn.feature_extraction.text import TfidfVectorizer

class TfIdfTransformer(BaseEstimator, TransformerMixin):
    """TF-IDF features over character n-grams (1 to 3, ``char_wb`` analyzer).

    :param column: name of the column of ``X`` holding the text
    """
    def __init__(self, column = 'whole'):
        self.column = column
        self.model = TfidfVectorizer(lowercase = False, max_df=0.6, min_df=0.1, analyzer='char_wb', ngram_range=(1,3))

    def fit(self, X, y=None):
        """Fit the wrapped vectorizer on ``X[self.column]``."""
        self.model = self.model.fit(X[self.column])
        return self

    def transform(self, X, y=None):
        """Return the vectorizer's output for ``X[self.column]``."""
        # Vectorize once only: a second identical call would repeat the work
        # and throw its result away.
        return self.model.transform(X[self.column])

In [70]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Feature extraction: word counts from 'whole' plus content-type indicators
# from 'structure'.
preprocess_pipeline = ColumnTransformer(transformers=[
    ("wordcount_to_vector", WordCounterToVectorTransformer(), ['whole']),
    ("structure_transformer", StructureTransformer(), ['structure']),
    # alternative text features, currently switched off:
    #("tfidf", TfIdfTransformer(), ['whole']),
    #("lda_transformer", LdaTransformer(), ['whole']),
])
model = LogisticRegression(random_state=42, solver="liblinear")

# Features first, classifier second.
full_pipeline = Pipeline(steps=[
    ('preprocessor', preprocess_pipeline),
    ('model', model),
])

In [71]:
# Shuffle the rows. Positions are drawn from NumPy's global random state,
# which is seeded at the top of the notebook.
random_permutation = np.random.permutation(len(data_processed))
data_processed = (
    data_processed
    .reset_index(drop=True)
    .iloc[random_permutation]
    .reset_index(drop=True)
)
data_processed.head()

Unnamed: 0,subject,content,structure,target,whole
0,re electric car an edsel,ah car seen discovery channel url via lurker U...,text/plain,0,re electric car an edsel ah car seen discovery...
1,english well for you,hallo found email id directoric russian man li...,text/plain,1,english well for you hallo found email id dire...
2,adv lowest life insurance rate available moode,lowest rate available term life insurance take...,text/plain,1,adv lowest life insurance rate available moode...
3,sadev bug NUMBER rpm build put wrong path in t...,URL URL changed removed added cc spamassassin ...,text/plain,0,sadev bug NUMBER rpm build put wrong path in t...
4,re ilug interesting article on free software l...,david neary said francophones among article su...,text/plain,0,re ilug interesting article on free software l...


In [72]:
# Plain arrays of the features and labels; they are passed to the fold
# splitter, while the folds themselves are sliced from data_processed.
X = data_processed.drop(columns='target').values
y = data_processed['target'].values

In [74]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
kfold = StratifiedKFold(n_splits = 10)
feature_columns = ['whole', 'structure']

# Fit the full pipeline on each training fold and record the accuracy on the
# matching held-out fold.
scores = []
for i, (train_index, test_index) in enumerate(kfold.split(X, y)):
    X_train = data_processed.loc[train_index, feature_columns]
    X_test = data_processed.loc[test_index, feature_columns]
    y_train = data_processed.loc[train_index, ['target']]
    y_test = data_processed.loc[test_index, ['target']]
    full_pipeline.fit(X_train, y_train.values.ravel())
    predictions = full_pipeline.predict(X_test)
    fold_accuracy = accuracy_score(y_test.values.ravel(), predictions)
    scores.append(fold_accuracy)
    print(i, '==============>', fold_accuracy)


    



In [75]:
np.mean(scores), np.std(scores)

(0.9916217586435815, 0.004467691398642306)