In [21]:
random_seed = 1337

In [1]:
import os
import email
import mailbox

def list_all_files(exclude_dirs, path):
    file_paths = []
    for root, dirs, files in os.walk(path):
        # update list of directories to exclude dirs we don't want
        dirs[:] = [directory for directory in dirs if directory not in exclude_dirs]
        for filename in files:
            file_paths.append(os.path.join(root, filename))
    return file_paths

def load_enron(exclude_dirs, path, filename):
    email_file_paths = list_all_files(exclude_dirs, path)

    mbox_filename = os.path.join(path, filename + '.mbox')
    enron_mailbox = mailbox.mbox(mbox_filename)
    enron_mailbox.lock()
    
    for email_file in email_file_paths:
        with open(email_file) as f:
            email_extracted = email.message_from_file(f)
        enron_mailbox.add(email_extracted)
        enron_mailbox.flush()

    enron_mailbox.unlock()
    enron_mailbox.close()

In [2]:
exclude_dirs = ['discussion_threads', 'notes_inbox', 'all_documents', '_sent_mail']
load_enron(exclude_dirs, 'lstm_datasets\enron_ham_dataset', 'enron_extracted')

In [26]:
from preprocessing_utilss import mbox_file_to_pd

In [4]:
df_ham_raw = mbox_file_to_pd('lstm_datasets\enron_ham_dataset\\enron_extracted.mbox', only_file=True)

In [7]:
df_ham_raw

Unnamed: 0,filename,email_body,file_key
0,enron_extracted.mbox,,0
1,enron_extracted.mbox,loan servicing-jessica weeber 800-393-5626 jwe...,1
2,enron_extracted.mbox,exit mccollough off 410\n,2
3,enron_extracted.mbox,"If you cannot read this email, please click he...",3
4,enron_extracted.mbox,"\n[IMAGE] \t\t[IMAGE] \t\n\t\tDear PHILLIP, ...",4
...,...,...,...
527571,enron_extracted.mbox,Some of my position is with the Alberta Term b...,527571
527572,enron_extracted.mbox,2\n\n -----Original Message-----\nFrom: \tDouc...,527572
527573,enron_extracted.mbox,Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...,527573
527574,enron_extracted.mbox,i think the YMCA has a class that is for peopl...,527574


In [9]:
df_ham_raw = df_ham_raw[~df_ham_raw['email_body'].isna()]
df_ham_raw.reset_index(inplace=True, drop=True)

In [16]:
df_ham_raw[df_ham_raw['email_body'].duplicated()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 312470 entries, 364 to 527574
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   filename    312470 non-null  object
 1   email_body  312470 non-null  object
 2   file_key    312470 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 9.5+ MB


Removing duplicates, as we will random select a certain amount of e-mails, so we are reducing the chance to sample many exact e-mails in sample.

In [17]:
df_ham_raw = df_ham_raw[~df_ham_raw['email_body'].duplicated()]
df_ham_raw.reset_index(inplace=True, drop=True)

In [18]:
df_ham_raw

Unnamed: 0,filename,email_body,file_key
0,enron_extracted.mbox,loan servicing-jessica weeber 800-393-5626 jwe...,1
1,enron_extracted.mbox,exit mccollough off 410\n,2
2,enron_extracted.mbox,"If you cannot read this email, please click he...",3
3,enron_extracted.mbox,"\n[IMAGE] \t\t[IMAGE] \t\n\t\tDear PHILLIP, ...",4
4,enron_extracted.mbox,"\nEarlier this week, Enron Global Technology a...",5
...,...,...,...
215100,enron_extracted.mbox,2\n\n -----Original Message-----\nFrom: \tDouc...,263784
215101,enron_extracted.mbox,Analyst\t\t\t\t\tRank\n\nStephane Brodeur\t\t\...,263785
215102,enron_extracted.mbox,i think the YMCA has a class that is for peopl...,263786
215103,enron_extracted.mbox,I will have 4 books:\n\nCAND-MGMT-BAS for all ...,263787


In [19]:
df_ham_raw[df_ham_raw['email_body'] == '']

Unnamed: 0,filename,email_body,file_key


In [20]:
df_ham_raw[df_ham_raw['email_body'] == 'decoding_error']

Unnamed: 0,filename,email_body,file_key


Get random sample, so that rest of preprocessing will be faster.

In [22]:
df_ham_sample = df_ham_raw.sample(n=100000, random_state=random_seed)

In [24]:
df_ham_sample.reset_index(inplace=True, drop=True)

In [25]:
df_ham_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   filename    100000 non-null  object
 1   email_body  100000 non-null  object
 2   file_key    100000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 2.3+ MB


Perform preprocessing:
* extract message from HTML
* lowercasing all characters 
* removing nonalphanumeric characters, special characters, punctuation marks, whitespaces caused by removal, stopwords, HTML elements.
* replacing URLs with fixed string
* replacing e-mail with fixed string
  
Perform tokenization:
* tokenization based on white spaces

In [27]:
from preprocessing_utils import preprocess_body

In [28]:
df_ham_sample['preprocessed_body'] = df_ham_sample['email_body'].apply(preprocess_body)



In [29]:
df_ham_sample[df_ham_sample['preprocessed_body'] == 'to_manual_extraction']

Unnamed: 0,filename,email_body,file_key,preprocessed_body


In [3]:
df_ham_sample[df_ham_sample['preprocessed_body'] == ''].head(20)

Unnamed: 0,filename,email_body,file_key,preprocessed_body
4948,enron_extracted.mbox,<<Hanson Scheduling Model_3_31_1.xls>> <<Hanso...,55172,
7753,enron_extracted.mbox,\n\n\n <<STO Investor Memo 92501.pdf>> \n <<ST...,38341,
8948,enron_extracted.mbox,\n \n\n<Embedded Microsoft Word Document>\n,192105,
9508,enron_extracted.mbox,\n \n\n \n\n\n\n<Embedded >\n,237020,
10327,enron_extracted.mbox,\n\n<Embedded Picture (Metafile)>\n,18806,
10783,enron_extracted.mbox,<<Hanson Scheduling Model_5_26_27.xls>> <<Hans...,54925,
11595,enron_extracted.mbox,\n<http://www.enron.com/corp/pressroom/>\n \n \n,68443,
11874,enron_extracted.mbox,<<Hanson Scheduling Model_4_5_6.xls>> <<Hanson...,55147,
14486,enron_extracted.mbox,\n\n \n\n \n\n<Embedded Picture (Device Indepe...,179120,
18099,enron_extracted.mbox,<<MVC-004F.JPG>> <<MVC-005F.JPG>> <<MVC-006...,169856,


In [2]:
df_ham_sample = df_ham_sample[df_ham_sample['preprocessed_body'] != '']
df_ham_sample.reset_index(drop=True, inplace=True)

In [5]:
from joblib import dump
dump(df_ham_sample, 'backup_dumps\df_ham_sample')

['backup_dumps\\df_ham_sample']

In [6]:
from joblib import load
df_ham_sample = load('backup_dumps\df_ham_sample')

In [19]:
df_ham_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99956 entries, 0 to 99955
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   filename           99956 non-null  object
 1   email_body         99956 non-null  object
 2   file_key           99956 non-null  int64 
 3   preprocessed_body  99956 non-null  object
dtypes: int64(1), object(3)
memory usage: 3.1+ MB


In [7]:
import spacy
nlp = spacy.load('en_core_web_sm')
nlp.disable_pipes(["tagger", "parser", "ner"])
def tokenize(body):
    doc = nlp(body)
    tokens = [token.text for token in doc if not token.is_stop]
    return tokens

Split dataset in chunks to make it faster and more memory friendly.

In [8]:
def split_text(text, chunk_size=1000000):
    text_len = len(text)
    chunks = []

    for i in range(0, text_len, chunk_size):
        chunk = text[i:i + chunk_size]
        chunks.append(chunk)

    return chunks

def tokenize_and_remove_stopwords(text, nlp):
    tokens = []
    text_chunks = split_text(text)
    try:
        for doc in nlp.pipe(text_chunks):
            chunk_tokens = [token.text for token in doc if not token.is_stop]
            tokens.extend(chunk_tokens)
    except MemoryError as merr:
        print(merr)
        print(text)
        return 'memoryerror'
        
    return tokens

In [8]:
import pandas as pd
chunk_size = 10000
data_chunks = [df_ham_sample.iloc[i:i + chunk_size] for i in range(0, len(df_ham_sample), chunk_size)]

for i, chunk in enumerate(data_chunks):
    df_ham_sample[i]['tokenized_body'] = df_ham_sample['preprocessed_body'].apply(lambda x: tokenize_and_remove_stopwords(x, nlp))

df_ham_sample_tokenized = pd.concat(data_chunks, ignore_index=True)



In [20]:
df_ham_sample_last= df_ham_sample[80001:]

In [21]:
df_ham_sample_last['tokenized_body'] = df_ham_sample_last['preprocessed_body'].apply(lambda x: tokenize_and_remove_stopwords(x, nlp))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ham_sample_last['tokenized_body'] = df_ham_sample_last['preprocessed_body'].apply(lambda x: tokenize_and_remove_stopwords(x, nlp))


In [11]:
from joblib import dump, load

In [22]:
dump(df_ham_sample_last, 'backup_dumps\df_ham_sample_last')

['backup_dumps\\df_ham_sample_last']

In [3]:
test2 = load('backup_dumps\df_ham_sample_fourth_1k_3_crash')

In [4]:
test2 = test2[test2['tokenized_body'] != 'memoryerror']
test2.reset_index(inplace=True, drop=True)

In [5]:
dump(test2, 'backup_dumps\df_ham_sample_fourth_1k_3')

['backup_dumps\\df_ham_sample_fourth_1k_3']