In [3]:
%%html
<style>
table {margin-left: 0 !important;}
</style>

# Identify Persons of Interest in the Enron Corpus Dataset

## Introduction


## Data Exploration

### Dataset Description

In [4]:
# import packages
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import os
import pandas as pd
import pickle
import re
from sklearn.preprocessing import MinMaxScaler

In [5]:
def convert_dict_to_df(dictionary, features, remove_NaN=True,
                        remove_all_zeroes=True, remove_any_zeroes=False,
                        sort_keys=False):
    """
    Convert a nested data dictionary to a pandas data frame of features.

    Args:
        dictionary: Dictionary mapping each person's name to another
            dictionary that maps feature names to values.
        features: List of feature names. First feature passed needs to be 'poi'.
        remove_NaN: True converts all "NaN" strings to 0, False converts them
            to np.nan.
        remove_all_zeroes: True omits data points whose features (other than
            'poi' and 'name') are all 0.
        remove_any_zeroes: True omits data points where any feature (other
            than 'poi' and 'name') is 0. Only evaluated when
            remove_all_zeroes is False.
        sort_keys: True sorts the dictionary keys in alphabetical order before
            adding the data points to the data frame.

    Returns:
        A pandas data frame with one row per retained data point and the
        columns 'name' followed by the specified features. Returns None (after
        printing a message) if a requested feature is missing for a data point.

    Raises:
        AssertionError: If the first entry of features is not 'poi'.
    """

    # check that first feature passed is 'poi'
    assert (features[0] == 'poi'), "The first feature needs to be 'poi'!"

    columns = ['name'] + list(features)

    # features that decide whether a data point is kept; the label and the
    # name are excluded from the zero checks
    criteria = [f for f in features if f not in ('poi', 'name')]

    # sort keys alphabetically if sort_keys is set to True
    if sort_keys:
        names = sorted(dictionary.keys())
    else:
        names = list(dictionary.keys())

    # Rows are collected in a list and turned into a data frame once at the
    # end. Growing a data frame row by row is quadratic, and DataFrame.append
    # no longer exists in pandas >= 2.0.
    records = []

    for name in names:
        person = dictionary[name]
        record = {'name': name}  # first entry of data point is the name of the person

        for feature in features:
            # a missing feature aborts the conversion with a message
            try:
                val = person[feature]
            except KeyError:
                print("error: key ", feature, " not present")
                return None

            # "NaN" strings become 0 or a real NaN, depending on remove_NaN
            if val == "NaN":
                val = 0 if remove_NaN else np.nan

            record[feature] = val

        values = [record[f] for f in criteria]

        if remove_all_zeroes:
            # keep the data point if at least one value is non-zero.
            # NOTE: np.nan != 0 is True, so with remove_NaN=False a data point
            # that consists only of missing values is still kept.
            keep = any(v != 0 and v != "NaN" for v in values)
        elif remove_any_zeroes:
            # drop the data point as soon as a single value is zero
            keep = not (0 in values or "NaN" in values)
        else:
            # all data points are added
            keep = True

        if keep:
            records.append(record)

    return pd.DataFrame(records, columns=columns)

In [6]:
# load dictionary containing the dataset
# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# "enron_dataset.pkl" if it comes from a trusted source.
with open("enron_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# feature list: the 'poi' label first (required by convert_dict_to_df),
# followed by the remaining feature names
features = ['poi', 'bonus', 'deferral_payments', 'deferred_income', 
            'director_fees', 'exercised_stock_options', 'expenses', 
            'loan_advances', 'long_term_incentive', 'other', 'restricted_stock', 
            'restricted_stock_deferred', 'salary', 'total_payments', 
            'total_stock_value', 'email_address', 'from_messages', 
            'from_poi_to_this_person', 'from_this_person_to_poi', 
            'shared_receipt_with_poi', 'to_messages']

# convert specified features to data frame; remove_NaN=False keeps missing
# values as NaN so that they can be counted later
data_df = convert_dict_to_df(data_dict, features, remove_NaN=False, 
                        remove_all_zeroes=True, remove_any_zeroes=False, 
                        sort_keys=True)

# (rows, columns) of the resulting data frame
print(data_df.shape)

(146, 22)


In [7]:
# number of data points labelled as POI (sum over the 'poi' column)
data_df['poi'].sum()

18

The underlying dataset for this project is a combination of Enron email and financial data. The data is present in the `enron_dataset.pkl` file where it is stored in a dictionary structure. Each key-value pair in the dictionary corresponds to one person. The dictionary key is the person's name, and the value is another dictionary, which contains the names of all the features and their values for that person. 

The data contains three major feature categories: POI labels, financial features and email features. The 21 feature names and their types are listed in the table below.

| Feature                   | Type        |
|-------------------------- | ----------- |
| poi                       | categorical |
| bonus                     | numerical   |
| deferral_payments         | numerical   |
| deferred_income           | numerical   |
| director_fees             | numerical   |
| exercised_stock_options   | numerical   |
| expenses                  | numerical   |
| loan_advances             | numerical   |
| long_term_incentive       | numerical   |
| other                     | numerical   |
| restricted_stock          | numerical   |
| restricted_stock_deferred | numerical   |
| salary                    | numerical   |
| total_payments            | numerical   |
| total_stock_value         | numerical   |
| email_address             | text        |
| from_messages             | numerical   |
| from_poi_to_this_person   | numerical   |
| from_this_person_to_poi   | numerical   |
| shared_receipt_with_poi   | numerical   |
| to_messages               | numerical   |

The dataset contains information about 146 different data points. Of those, 18 are marked as POI while 128 are not.

Missing values...

In [8]:
# count the missing (NaN) values in each column of the data frame
print(data_df.isnull().sum())

name                           0
poi                            0
bonus                         64
deferral_payments            107
deferred_income               97
director_fees                129
exercised_stock_options       44
expenses                      51
loan_advances                142
long_term_incentive           80
other                         53
restricted_stock              36
restricted_stock_deferred    128
salary                        51
total_payments                21
total_stock_value             20
email_address                 35
from_messages                 60
from_poi_to_this_person       60
from_this_person_to_poi       60
shared_receipt_with_poi       60
to_messages                   60
dtype: int64


The dataset contains many features with missing values. It is built on the financial data, and only 86 data points could be linked to the email dataset (60 data points have no email data). For the features 'deferral_payments', 'director_fees', 'loan_advances' and 'restricted_stock_deferred', only very few values are available.

### Outliers

A conspicuous outlier was found in the financial data. This data point had the largest values for all financial features and was identified as the 'TOTAL' row of the source spreadsheet. This row was dropped and not considered further.

In [9]:
# keep every row whose name is not 'TOTAL' (the spreadsheet's summary row)
data_df = data_df.loc[data_df['name'].ne('TOTAL')]

## Feature Selection

### Feature Engineering


from_poi_deleted

In [10]:
def get_email_from_filename(filename, start, end):
    """
    Extract the text enclosed by two regular-expression fragments.

    Args:
        filename: String to search, e.g. a file name.
        start: Regular-expression fragment that precedes the wanted text.
        end: Regular-expression fragment that follows the wanted text.

    Returns:
        The (greedily matched) text between start and end, or an empty string
        if the pattern does not occur in filename.
    """
    pattern = "{}(.*){}".format(start, end)
    match = re.search(pattern, filename)

    if match is None:
        return ""

    return match.group(1)

def get_sender_from_fileline(line):
    """
    Extract the mailbox directory name from a maildir path.

    Example:
        "enron_mail_20110402/maildir/allen-p/deleted_items/127." -> "allen-p"

    Args:
        line: Text containing a path below "enron_mail_20110402/maildir/".

    Returns:
        The first path component after "maildir/", or an empty string if the
        line does not contain such a path.
    """
    # [^/]+ stops at the first slash; a greedy (.*) would also swallow the
    # folder name and return e.g. "allen-p/deleted_items"
    m = re.search(r"enron_mail_20110402/maildir/([^/]+)/", line)
    if m is None:
        return ""

    return m.group(1)

# name and number of sent messages for every data point labelled as POI
data_df.loc[data_df['poi'] == True, ['name', 'from_messages']]

Unnamed: 0,name,from_messages
7,BELDEN TIMOTHY N,484.0
15,BOWEN JR RAYMOND M,27.0
20,CALGER CHRISTOPHER F,144.0
22,CAUSEY RICHARD A,49.0
26,COLWELL WESLEY,40.0
31,DELAINEY DAVID W,3069.0
43,FASTOW ANDREW S,
54,GLISAN JR BEN F,16.0
59,HANNON KEVIN P,32.0
65,HIRKO JOSEPH,


In [11]:
dir_path = "./emails_by_address/"

# mailbox directory names of the POIs of interest; "delainey-d" follows the
# spelling of 'DELAINEY DAVID W' in the data set
# NOTE(review): confirm against the actual maildir folder names
poi_list = ["delainey-d", "skilling-j"]

# one counter per recipient address; intended to hold the number of emails
# from a POI mailbox that are stored in a "deleted_items" folder
from_poi_deleted_dict = {}

for filename in os.listdir(dir_path):
    # only check files with emails to that address
    if filename.startswith("to_"):
        # extract email from filename (raw strings keep the regex escapes intact)
        email = get_email_from_filename(filename, r"to_", r"\.txt")

        from_poi_deleted_dict[email] = 0

        with open(os.path.join(dir_path, filename), 'r') as f:
            for line in f:
                sender = get_sender_from_fileline(line)
                # TODO: the counting below is still disabled because it needs
                # a helper get_folder_from_fileline(), which is not visible
                # in this notebook. Intended logic:
                #
                #     folder = get_folder_from_fileline(line)
                #     if sender in poi_list and folder == "deleted_items":
                #         from_poi_deleted_dict[email] += 1
            
    

In [12]:
# show the data point whose email address is 'andrew.fastow@enron.com'
data_df[data_df['email_address'] == 'andrew.fastow@enron.com']

Unnamed: 0,name,poi,bonus,deferral_payments,deferred_income,director_fees,exercised_stock_options,expenses,loan_advances,long_term_incentive,...,restricted_stock_deferred,salary,total_payments,total_stock_value,email_address,from_messages,from_poi_to_this_person,from_this_person_to_poi,shared_receipt_with_poi,to_messages
43,FASTOW ANDREW S,True,1300000.0,,-1386055.0,,,55921.0,,1736055.0,...,,440698.0,2424083.0,1794412.0,andrew.fastow@enron.com,,,,,


## Algorithm Selection


## Validation and Evaluation


## Conclusion