In [124]:
import pickle
import IPython.display as disp
import pandas as pd
import numpy as np

# Exploratory Data Analysis: Enron Dataset

## Introduction

### Dataset Contents

#### final_project_dataset.pkl
Financial data from the included `enron61712insiderpay.pdf` have been combined into a dictionary in the included final_project_dataset.pkl file. In the dictionary, the key is the person's name, and the value is another dictionary, which contains the names of all the features and their values for that person. The features in the data fall into three major types, namely financial features, email features and POI labels.

**financial features**:
```python
# (all units are in US dollars)
[
 'salary',
 'deferral_payments',
 'total_payments',
 'loan_advances',
 'bonus',
 'restricted_stock_deferred',
 'deferred_income',
 'total_stock_value',
 'expenses',
 'exercised_stock_options',
 'other',
 'long_term_incentive',
 'restricted_stock',
 'director_fees'
]
```

**email features**:
```python
# (units are generally the number of email messages; the notable exception is 'email_address', which is a text string)
[
 'to_messages',
 'email_address',
 'from_poi_to_this_person',
 'from_messages',
 'from_this_person_to_poi',
 'poi', # POI Label (boolean, represented as integer).
 'shared_receipt_with_poi'
] 
```

#### ../maildir directory
The maildir directory contains the untar'ed contents of `enron_mail_20150507.tar`, which was downloaded from https://www.cs.cmu.edu/~./enron/. The contents consist of emails from about 150 employees, mostly senior management of Enron, organized into directories. An element in the list returned by any of the *email features* above references an email in this directory.

#### poi_names.txt
This file contains a list of all the people who were persons of interest (POIs) in the Enron scandal. A POI is defined as someone who:
* was indicted
* settled without admitting guilt
* testified in exchange for immunity


## Summary of Dataset

In [140]:
# Load the combined financial/email dataset: a dict mapping each person's name
# to a dict of feature name -> value.
# NOTE: pickle.load can execute arbitrary code, so only load this file from a
# trusted source.
# Pickle data is binary, so open with "rb"; the context manager closes the
# handle as soon as loading finishes instead of leaking it.
with open("./final_project_dataset.pkl", "rb") as dataset_file:
    enron_data = pickle.load(dataset_file)

**What an entry in the dataset looks like:**

In [17]:
# Peek at a single (name, features) entry of the dataset.
# next(iter(...items())) runs on both Python 2 and Python 3, whereas the
# original iteritems()/.next() calls exist only on Python 2.
next(iter(enron_data.items()))

('METTS MARK',
 {'bonus': 600000,
  'deferral_payments': 'NaN',
  'deferred_income': 'NaN',
  'director_fees': 'NaN',
  'email_address': 'mark.metts@enron.com',
  'exercised_stock_options': 'NaN',
  'expenses': 94299,
  'from_messages': 29,
  'from_poi_to_this_person': 38,
  'from_this_person_to_poi': 1,
  'loan_advances': 'NaN',
  'long_term_incentive': 'NaN',
  'other': 1740,
  'poi': False,
  'restricted_stock': 585062,
  'restricted_stock_deferred': 'NaN',
  'salary': 365788,
  'shared_receipt_with_poi': 702,
  'to_messages': 807,
  'total_payments': 1061827,
  'total_stock_value': 585062})

**Number of employees in the dataset:**

In [20]:
# Each top-level key is one person's name, so the dictionary's length is the
# number of people in the dataset.
len(enron_data)

146

**Number of features in the dataset:**

In [23]:
# Every person maps to a dict of features, so the size of any one inner dict
# gives the feature count. next(iter(...values())) takes the first inner dict
# without the Python 2-only itervalues()/.next() API, so this runs on both
# Python 2 and Python 3.
len(next(iter(enron_data.values())))

21

### Features
To summarize the features in the dataset, convert the dictionary into a pandas DataFrame:

In [142]:
# A pandas DataFrame gives a clearer picture of the features than the raw
# nested dictionary does.
#
# Before relying on that conversion, verify it is lossless: loading the dict
# with people as rows, transposing, and exporting back to a dict must
# reproduce the original dictionary exactly.
assert enron_data == (
    pd.DataFrame.from_dict(enron_data, orient="index").T.to_dict()
)

In [146]:
# Replace the "NaN" string placeholders with real np.nan values.
#
# Build brand-new inner dicts rather than mutating in place: dict(enron_data)
# only copies the outer level, so editing its inner dicts would silently
# rewrite the raw `enron_data` as well (and make the round-trip sanity check
# fail when re-run, because nan != nan).
enron_data_mod = {
    name: {
        feature: (np.nan if value == "NaN" else value)
        for feature, value in features.items()
    }
    for name, features in enron_data.items()
}

# Build the frame from the cleaned copy (one row per person, one column per
# feature), not from the raw dictionary.
enron_df = pd.DataFrame.from_dict(enron_data_mod, orient="index")

#### Financial Features

In [145]:
# Summary statistics for the first seven financial features.
enron_df[[
    'salary', 'deferral_payments', 'total_payments', 'loan_advances',
    'bonus', 'restricted_stock_deferred', 'deferred_income',
]].describe()

Unnamed: 0,salary,deferral_payments,total_payments,loan_advances,bonus,restricted_stock_deferred,deferred_income
count,95.0,39.0,125.0,4.0,82.0,18.0,49.0
mean,562194.294737,1642674.153846,5081526.0,41962500.0,2374234.609756,166410.555556,-1140475.142857
std,2716369.154553,5161929.973575,29061720.0,47083208.7019,10713327.969046,4201494.314703,4025406.378506
min,477.0,-102500.0,148.0,400000.0,70000.0,-7576788.0,-27992891.0
25%,211816.0,81573.0,394475.0,1600000.0,431250.0,-389621.75,-694862.0
50%,259996.0,227449.0,1101393.0,41762500.0,769375.0,-146975.0,-159792.0
75%,312117.0,1002671.5,2093263.0,82125000.0,1200000.0,-75009.75,-38346.0
max,26704229.0,32083396.0,309886600.0,83925000.0,97343619.0,15456290.0,-833.0


In [148]:
# Summary statistics for the remaining seven financial features.
enron_df[[
    'total_stock_value', 'expenses', 'exercised_stock_options', 'other',
    'long_term_incentive', 'restricted_stock', 'director_fees',
]].describe()

Unnamed: 0,total_stock_value,expenses,exercised_stock_options,other,long_term_incentive,restricted_stock,director_fees
count,126.0,95.0,102.0,93.0,66.0,110.0,17.0
mean,6773957.0,108728.915789,5987054.0,919064.967742,1470361.454545,2321741.0,166804.882353
std,38957770.0,533534.814109,31062010.0,4589252.907638,5942759.315498,12518280.0,319891.409747
min,-44093.0,148.0,3285.0,2.0,69223.0,-2604490.0,3285.0
25%,494510.2,22614.0,527886.2,1215.0,281250.0,254018.0,98784.0
50%,1102872.0,46950.0,1310814.0,52382.0,442035.0,451740.0,108579.0
75%,2949847.0,79952.5,2547724.0,362096.0,938672.0,1002370.0,113784.0
max,434509500.0,5235198.0,311764000.0,42667589.0,48521928.0,130322300.0,1398517.0


#### Email Features

In [150]:
# Summary statistics for the email features together with the POI label.
enron_df[[
    'to_messages', 'email_address', 'from_poi_to_this_person',
    'from_messages', 'from_this_person_to_poi',
    'poi',  # POI Label (boolean, represented as integer).
    'shared_receipt_with_poi',
]].describe()

Unnamed: 0,to_messages,from_poi_to_this_person,from_messages,from_this_person_to_poi,poi,shared_receipt_with_poi
count,86.0,86.0,86.0,86.0,146,86.0
mean,2073.860465,64.895349,608.790698,41.232558,0.1232877,1176.465116
std,2582.700981,86.979244,1841.033949,100.073111,0.3298989,1178.317641
min,57.0,0.0,12.0,0.0,False,2.0
25%,541.25,10.0,22.75,1.0,0,249.75
50%,1211.0,35.0,41.0,8.0,0,740.5
75%,2634.75,72.25,145.5,24.75,0,1888.25
max,15149.0,528.0,14368.0,609.0,True,5521.0


#### ../maildir contents

In [156]:
# List the top-level directories of the untarred mail dump.
ls ../maildir

[34mallen-p[m[m/         [34mdasovich-j[m[m/      [34mgiron-d[m[m/         [34mkeavey-p[m[m/        [34mmcconnell-m[m[m/     [34mquigley-d[m[m/       [34mshackleton-s[m[m/    [34mtholt-j[m[m/
[34marnold-j[m[m/        [34mdavis-d[m[m/         [34mgriffith-j[m[m/      [34mkeiser-k[m[m/        [34mmckay-b[m[m/         [34mrapp-b[m[m/          [34mshankman-j[m[m/      [34mthomas-p[m[m/
[34marora-h[m[m/         [34mdean-c[m[m/          [34mgrigsby-m[m[m/       [34mking-j[m[m/          [34mmckay-j[m[m/         [34mreitmeyer-j[m[m/     [34mshapiro-r[m[m/       [34mtownsend-j[m[m/
[34mbadeer-r[m[m/        [34mdelainey-d[m[m/      [34mguzman-m[m[m/        [34mkitchen-l[m[m/       [34mmclaughlin-e[m[m/    [34mrichey-c[m[m/        [34mshively-h[m[m/       [34mtycholiz-b[m[m/
[34mbailey-s[m[m/        [34mderrick-j[m[m/       [34mhaedicke-m[m[m/      [34mkuykendall-t[m[m/    [34mmerriss-

##### Inbox Contents

In [164]:
# List the folders inside one of those directories (buy-r).
ls ../maildir/buy-r/

[34m_sent_mail[m[m/         [34mconnect_deletes[m[m/    [34mdiscussion_threads[m[m/ [34mnotes_inbox[m[m/        [34msent_items[m[m/
[34mall_documents[m[m/      [34mcontacts[m[m/           [34mhelsinki[m[m/           [34mpersonal[m[m/           [34mtasks[m[m/
[34mcalendar[m[m/           [34mdeleted_items[m[m/      [34minbox[m[m/              [34msent[m[m/               [34mto_do[m[m/


In [169]:
# Long listing of the files stored in buy-r's inbox folder.
ls -l ../maildir/buy-r/inbox

total 11896
-rw-r--r--@ 1 wtruong  staff    1760 Feb  3  2004 1.
-rw-r--r--@ 1 wtruong  staff    1880 Feb  3  2004 10.
-rw-r--r--@ 1 wtruong  staff    2812 Feb  3  2004 100.
-rw-r--r--@ 1 wtruong  staff    3658 Feb  3  2004 1000.
-rw-r--r--@ 1 wtruong  staff    3096 Feb  3  2004 1001.
-rw-r--r--@ 1 wtruong  staff    1865 Feb  3  2004 1002.
-rw-r--r--@ 1 wtruong  staff    3490 Feb  3  2004 1003.
-rw-r--r--@ 1 wtruong  staff     967 Feb  3  2004 1004.
-rw-r--r--@ 1 wtruong  staff    2105 Feb  3  2004 1005.
-rw-r--r--@ 1 wtruong  staff    3645 Feb  3  2004 1006.
-rw-r--r--@ 1 wtruong  staff    1420 Feb  3  2004 1008.
-rw-r--r--@ 1 wtruong  staff     618 Feb  3  2004 1009.
-rw-r--r--@ 1 wtruong  staff     793 Feb  3  2004 101.
-rw-r--r--@ 1 wtruong  staff    4063 Feb  3  2004 1010.
-rw-r--r--@ 1 wtruong  staff    4263 Feb  3  2004 1011.
-rw-r--r--@ 1 wtruong  staff    1821 Feb  3  2004 1012.
-rw-r--r--@ 1 wtruong  staff    6026 Feb  3  2004 1013.
-rw-r--r--@ 1 wtruong  st

Each of these files is an email.

In [174]:
# Show every feature for the third row of the frame (positional index 2).
enron_df.iloc[2]

salary                                              477
to_messages                                         566
deferral_payments                                   NaN
total_payments                                   916197
exercised_stock_options                         4046157
bonus                                               NaN
restricted_stock                                1757552
shared_receipt_with_poi                             465
restricted_stock_deferred                       -560222
total_stock_value                               5243487
expenses                                          56301
loan_advances                                       NaN
from_messages                                        29
other                                            864523
from_this_person_to_poi                               0
poi                                               False
director_fees                                       NaN
deferred_income                                 

In [36]:
def css_styling():
    """Load the notebook's custom stylesheet and wrap it for display.

    Reads ``./styles/custom0.css`` (resolved against the current working
    directory) and passes its text to ``disp.HTML``.

    Returns:
        The ``disp.HTML`` object built from the file's contents.

    Raises:
        IOError / OSError: if the stylesheet cannot be opened or read.
    """
    # The context manager closes the file as soon as its contents are read,
    # rather than leaving the handle open until garbage collection.
    with open("./styles/custom0.css", "r") as css_file:
        styles = css_file.read()
    return disp.HTML(styles)
# Evaluate the stylesheet HTML as the cell's last expression so it is displayed.
css_styling()