1) Extract the messages from the enron index that include a Keny Lay email address in them in a message header. How many email messages are these?

In [1]:
import pandas as pd
import json
import sys
import re
import pprint
pp = pprint.PrettyPrinter(indent=4)

# Start with a JSON file dowloaded from SSCC ES server, via the sample code
# provided by instructor.  That code not shown here.  Open the file and load 
# contents to a dict object
mail_list = []
with open('enron_data.json') as input_file:
    j_dict =json.load(input_file)

# init var to track key errors, not all mail has "to" and a few are missing headers
key_error_count = 0

# iterate over the messages, extracting the relevant parts and saving those to 
# a new list for further processing
for msg in j_dict:
#    pprint.pprint(msg)
    try:
        msg_ID = msg['headers']['Message-ID']  
        msg_date = msg['headers']['Date']
        msg_from = msg['headers']['From']
        msg_to_tmp = msg['headers']['To'].split()
        for msg_to in msg_to_tmp:
            mail_list.append({'msg_ID':msg_ID,'address_from':msg_from,
                              'address_to':msg_to.strip(','),'date':msg_date})
    except KeyError as e:
        key_error_count +=1
    except:
        print "Unexpected error:", sys.exc_info()[0]    

df_headers = pd.DataFrame(mail_list)

# Convert the NaN to blanks so RegEx is happy 
df_headers = df_headers.fillna('')

# create 2 empty dataframes, loop over the extracted data, and split mails 
# into the 'to' and 'from' Ken Lay buckets.  RegEx looks for all permutations
# of starts with 'k' and ends with 'lay@'
j = 0
k = 0
df_from_kl = pd.DataFrame(columns=['msg_ID', 'send_alias', 'to_person','date'])
df_to_kl = pd.DataFrame(columns=['msg_ID', 'to_alias', 'from_person'])
for i in range(0, len(df_headers)):
    if re.match(r"^k+\S{0,}lay@", df_headers['address_from'][i]):
        df_from_kl.loc[j] = [df_headers['msg_ID'][i], df_headers['address_from'][i], 
                             df_headers['address_to'][i],df_headers['date'][i]]
        j+=1
    if re.match(r"^k+\S{0,}lay@", df_headers['address_to'][i]):  
        df_to_kl.loc[k] = [df_headers['msg_ID'][i], df_headers['address_to'][i], 
                           df_headers['address_from'][i]]
        k+=1

# clean the data before counting
df_from_kl.drop_duplicates() 
df_to_kl.drop_duplicates()     
print "Total number of mails to/from Ken Lay =", len(df_to_kl)+len(df_from_kl),'\n'

Total number of mails to/from Ken Lay = 7198 



2) How many different Ken Lay email addresses are there in these messages? Provide a count of how many times each one occurs in the messages.

In [9]:
sent_alias_count = df_from_kl.groupby('send_alias').size()
print "Aliases used to send mail \n", sent_alias_count,'\n'

to_alias_count = df_to_kl.groupby('to_alias').size()
print "Aliases used to get mail \n", to_alias_count,'\n'

Aliases used to send mail 
send_alias
ken.lay@enron.com           1
kenneth.lay@enron.com    4081
dtype: int64 

Aliases used to get mail 
to_alias
k.l.lay@enron.com             1
k.lay@enron.com               1
k_lay@enron.com               2
ken.lay@enron.com             2
ken_lay@enron.com             2
ken_lay@enron.net             1
kenlay@enron.com              1
kenneth.l.lay@enron.com       2
kenneth.lay@enron.com      2110
kenneth_lay@enron.com        23
kenneth_lay@enron.net         3
kennethlay@enron.com          1
klay@enron.com              967
dtype: int64 



3) Determine how many of the messages are "To:" Ken Lay, and are "From:" Ken Lay. Provde a count for each of these.

In [3]:
to_kl_count = len(df_to_kl)
from_kl_count= len(df_from_kl)
print "Number of mails to Ken Lay = ", to_kl_count, '\n'
print "Number of mails from Ken Lay = ", from_kl_count,'\n'

Number of mails to Ken Lay =  3116 

Number of mails from Ken Lay =  4082 



4) Who did Lay send the most emails to? How many did he send to this recipient? Who did he receive the most from? How many did he receive from this sender?

In [5]:
#Group and sort to get the counts by person for sending and recieving
max_mails = df_from_kl.groupby('to_person').size()
max_mails = max_mails.sort_values(ascending=False)
print "most frequent recipient of mail FROM Ken Lay", max_mails.iloc[0:1], '\n'

max_mails = df_to_kl.groupby('from_person').size()
max_mails = max_mails.sort_values(ascending=False)
print "most frequent sender of mail TO Ken Lay", max_mails.iloc[0:1],'\n'

most frequent recipient of mail FROM Ken Lay to_person
l..wells@enron.com    28
dtype: int64 

most frequent sender of mail TO Ken Lay from_person
leonardo.pacheco@enron.com    187
dtype: int64 



5) Did the volume of emails sent by Lay increase or decrease after Enron filed for bankruptcy? How many did he send before the filing? How many, after?

In [10]:
# Extract the date column, and then delete the timezone adjustment which isn't
# recognized by python 2.7, parse and compare
df_date = pd.DataFrame(df_from_kl['date'])
df_date.to_csv('test_date.csv', header=True, sep=',') 
from dateutil.parser import parse
from datetime import datetime
# using 12/2/2001 at 12:00:00 as the filing date
before = 0 
after = 0
for i in df_date['date']:
    try:
        tmp = parse(i[0:25])
        if tmp <= datetime(2001,12,1,11,15,59):
            before +=1
        else:
            after +=1
    except:
        print "Unexpected error:", sys.exc_info()[0]   

print "KLay sent %i mails before the filing, and %i mails after \n"% (before,after)

KLay sent 4080 mails before the filing, and 2 mails after 



6) How many of the email messages in 4., above, mention Arthur Andersen, Enron's accounting firm?

In [7]:
# get the series of all IDs from step 4, the 'to' & 'from" Ken Lay & make list
# then iterate and get the body of message
id_list = (df_from_kl['msg_ID'].append(df_to_kl['msg_ID'])).tolist()
aa_count = 0
for msg in j_dict: 
    try:
        tmp = str(msg['headers']['Message-ID'])
        if tmp in id_list:
            body = str(msg['body'])
            if "Arthur Andersen" in body:
                aa_count +=1
    except KeyError:
        key_error_count +=1
print "Found %i mail(s) from or to Ken Lay mentioning Arthur Andersen" %aa_count

Found 5 mail(s) from or to Ken Lay mentioning Arthur Andersen
