#Welcome to the Email Data Cleaning Notebook for EmailRank

##This notebook takes the raw data and performs clean-up and mapping for use in the modeling

### Steps here:

1. Import data from the directory structure
1. Regex code to strip out items we do not want / some leakage removal code
1. Save the final data for use in the model code
###Note that this workbook takes significant time to run as it opens all 500k emails and does text processing

In [2]:
import pandas as pd
import json
from os import listdir
import os
import re
import pickle
from multiprocessing import Pool

In [3]:
filepath = './FlaskApp/data/maildir/'

In [4]:
positions = pd.read_csv('employee.csv', header=None)
positions.head(5)

email1 = positions[[3,8]]
email1.columns = ['email', 'position']
email2 = positions[[4,8]]
email2.columns = ['email', 'position']
email3 = positions[[5,8]]
email3.columns = ['email', 'position']
email4 = positions[[6,8]]
email4.columns = ['email', 'position']
full_email_list = pd.concat([email1, email2, email3, email4], axis=0)
full_email_list.reset_index()
pass

In [8]:
## Main loop for processing
# for a given user, read in the emails, strip out various metrics, 

def readFiles(filepath, user, master):
    ## Input: system path to directory of emails, username, master database
       
    print 'ReadFiles called on user', user
    directories = listdir(filepath + user)
    
    ## Create list of emails
    walkthrough = os.walk(filepath+'/'+user)
    userEmails = master
    for i in walkthrough:
        for j in i[2]:
            email = i[0]+'/'+j

            email_id, parsedText = stripFiles(email)
            
            if not pd.isnull(parsedText['email_from_position']) and not pd.isnull(parsedText['email_to_position']):
                userEmails = userEmails.append({'FROM':parsedText['email_from_position'],
                               'TO':parsedText['email_to_position'],
                               'TEXT':parsedText['email_text']},
                              ignore_index=True)

    return userEmails

def getPosition(email):
    ## Gets the email position mapping (used in a pandas map on the emails)
    ## Input: Email address
    ## Output: position associated with email address (if found)
    
    return full_email_list.position[full_email_list.email == email].values[0]
    
    
def stripFiles(email):
    ## Performs regex to strip out the email text and other items from the raw files
    ## Input: system path to email in the directory structure
    ## Output: Email ID and Dictionary of email components
    
    
    with open(email) as f:
        text = f.read()
    text = text.decode(errors='ignore').encode('utf-8', errors='ignore')
    
    email_id = re.sub('\.', '',
                      re.sub('\.Java.*', '', 
                             re.sub('Message-ID: <', '', 
                                    re.findall('Message-ID:.*\n',text)[0]), flags=re.DOTALL))
    
    email_from = re.findall('.*enron\.com', 
                            re.sub('From: ', '', 
                                   re.findall('From:.*\n',text)[0]))
    
    email_to = re.findall('.*enron\.com',
                          re.sub('\t','',
                                 re.sub('To: ', '',
                                        re.findall('To:.*:',text, flags=re.DOTALL)[0])))
    
    ccs = re.findall('Cc:.*\n',text)
    if ccs:
        email_ccs = re.findall('.*enron\.com', re.sub('Cc: ', '', ccs[0]))
    else:
        email_ccs = []
        
    email_subject = [re.sub('Subject: ', '',
                            re.findall('Subject:.*\n',text)[0])]

    email_text = re.sub('---------------.*','',
                    re.sub('From:.*','',
                    re.sub('X-FileName:.*', '',
                         re.findall('X-FileName:.*',text, flags=re.DOTALL)[0])))
    
    try:
        email_from_position = getPosition(email_from[0])
    except IndexError:
        email_from_position = None
    try:
        email_to_position = getPosition(email_to[0])
    except IndexError:
        email_to_position = None

    
    output = {'email_id':email_id,
              'email_from':email_from,
              'email_from_position':email_from_position,
              'email_to':email_to,
              'email_to_position':email_to_position,
              'email_text':email_text,
              'email_ccs':email_ccs,
              'email_subject':email_subject,
              #'full_text':text
             }
    
    return email_id, output


## Code used to process the email text

master = pd.DataFrame(columns=['FROM', 'TO', 'TEXT'])

users = listdir(filepath)
users = users

## set up multi threading
# p = Pool(8)

# def threadedFunction(user):
#     return readFiles(filepath, user)

# master = p.map(threadedFunction, users)

for user in users:
    master = readFiles(filepath, user, master)


## Save as pickle for the model processing workbook
with open('FlaskApp/data/full_data.pkl', 'w+') as f:
    pickle.dump(master, f)

ReadFiles called on user thomas-p
ReadFiles called on user watson-k
ReadFiles called on user mcconnell-m
ReadFiles called on user harris-s
ReadFiles called on user campbell-l
ReadFiles called on user lay-k
ReadFiles called on user hendrickson-s
ReadFiles called on user taylor-m
ReadFiles called on user steffes-j
ReadFiles called on user giron-d
ReadFiles called on user parks-j
ReadFiles called on user semperger-c
ReadFiles called on user swerzbin-m
ReadFiles called on user nemec-g
ReadFiles called on user saibi-e
ReadFiles called on user arora-h
ReadFiles called on user guzman-m
ReadFiles called on user martin-t
ReadFiles called on user tycholiz-b
ReadFiles called on user ermis-f
ReadFiles called on user fischer-m
ReadFiles called on user gang-l
ReadFiles called on user dorland-c
ReadFiles called on user stepenovitch-j
ReadFiles called on user kuykendall-t
ReadFiles called on user whalley-g
ReadFiles called on user lenhart-m
ReadFiles called on user merriss-s
ReadFiles called on user s

In [65]:
full_email_list.position.unique()

array([nan, 'Employee', 'CEO', 'Director', 'Trader', 'President',
       'Vice President', 'Manager', 'Managing Director', 'In House Lawyer',
       'Manager)'], dtype=object)