# Enron Dataset Preprocessing : Parsing the raw dataset

In this notebook, we parse the raw dataset into a single CSV file.

## Step 1: Import libraries

In [1]:
import email
import os
import re
import sys

import dateutil
import dateutil.parser
import pandas as pd

## Step 2: Download and extract the dataset

The Enron dataset should be downloaded from `https://www.cs.cmu.edu/~enron/enron_mail_20150507.tar.gz` and extracted in the current directory.

The following variable defines the location of the raw dataset.

In [None]:
# Root folder of the extracted archive; each of its sub-directories is
# treated as one individual's mailbox by the parsing cells below.
MAIL_DIR = 'maildir/'

## Step 3: Parse the dataset

We first define a couple of helper functions to extract the emails found in the mail folders of all individuals in the dataset.

In [None]:
def recursive_listdir(path):
    """Return the paths of all files located anywhere below ``path``.

    A leading ``~`` in ``path`` is expanded before walking. Each entry of the
    returned list is the directory reported by ``os.walk`` joined with the
    file name; directories themselves are not listed.
    """
    root = os.path.expanduser(path)
    file_paths = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for filename in filenames:
            file_paths.append(os.path.join(dirpath, filename))
    return file_paths

def get_sender_emails(user):
    """Generator that iterates over all the emails stored in a user's folder.

    Every file found (recursively) under ``MAIL_DIR/<user>`` is parsed as an
    email message, whichever sub-folder it lives in. OS specific
    ``.DS_Store`` files are skipped.

    Parameters
    ----------
    user : str
        Name of the user's sub-folder inside ``MAIL_DIR``.

    Yields
    ------
    dict
        One record per email file with the keys:

        * ``user``: the ``user`` argument,
        * ``date``: raw value of the ``Date`` header,
        * ``timestamp``: POSIX timestamp parsed from ``date``, or ``None``
          when the message has no ``Date`` header,
        * ``file``: path of the file relative to ``MAIL_DIR``,
        * one key per header listed in ``header_keys`` holding its raw value
          (``None`` when the header is absent).

    Raises
    ------
    ValueError
        If a ``Date`` header is present but cannot be parsed by ``dateutil``.
    """
    header_keys = ['From', 'To', 'Cc', 'Bcc', 'X-From', 'X-To', 'X-cc', 'X-bcc', 'X-Origin']
    # recursive_listdir expands '~' in the paths it returns, so the root used
    # to relativize them must be expanded the same way.
    mail_root = os.path.expanduser(MAIL_DIR)
    # Check all sub-folders in the user folder
    for fpath in recursive_listdir(os.path.join(MAIL_DIR, user)):
        # Ignore os specific file
        if fpath.endswith('.DS_Store'):
            continue
        # Read email file (binary mode: the parser handles the decoding)
        with open(fpath, 'rb') as f:
            msg = email.message_from_binary_file(f)
        # Parse date; a file without a Date header must not abort the whole run
        date = msg['Date']
        if date is None:
            t = None
        else:
            t = dateutil.parser.parse(date).timestamp()
        # Store data in a dict
        mail_dict = {
            'user': user,
            'date': date,
            'timestamp': t,
            # relpath instead of a regex substitution: MAIL_DIR is a path, not
            # a pattern, and only the leading root must be stripped.
            'file': os.path.relpath(fpath, mail_root),
        }
        for key in header_keys:
            mail_dict[key] = msg.get(key)

        yield mail_dict

Get all emails from all individuals.

In [None]:
# Every sub-directory of MAIL_DIR is one individual; keep them in sorted order
SENDER_LIST = sorted(
    entry
    for entry in os.listdir(MAIL_DIR)
    if os.path.isdir(os.path.join(MAIL_DIR, entry))
)

# Accumulate the records of all individuals in a single flat list
data = []
n_senders = len(SENDER_LIST)
for i, sender in enumerate(SENDER_LIST):
    # '\r' rewrites the same output line so the progress does not flood the cell
    print('{:d}/{:d} - Process user: {:<20s}'.format(i+1, n_senders, sender), end='\r', flush=True)
    data.extend(get_sender_emails(sender))
# Move past the progress line once the loop is done
print()

Display a random sample of the parsed emails

In [None]:
# One row per parsed email file; the columns come from the record keys
df = pd.DataFrame(data)
df.sample(n=5)

In [None]:
# (number of rows, number of columns): one row per record yielded by the parser
df.shape

Save dataframe to `csv`

In [None]:
# Write the whole frame to the current directory as UTF-8 text.
# NOTE(review): `index` is left at its default, so the row index is presumably
# written as an extra first column; confirm that downstream readers expect it.
df.to_csv('enron_dataset_raw.csv', encoding='utf-8')