# Identify Persons of Interest in the Enron Corpus Dataset

## Introduction


## Data Exploration

### Dataset Description

In [3]:
# import packages
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler

In [4]:
def convert_dict_to_df(dictionary, features, remove_NaN=True, 
                        remove_all_zeroes=True, remove_any_zeroes=False, 
                        sort_keys=False):
    """
    Convert dictionary to a pandas data frame of features.
    
    Args:
        dictionary: Dictionary containing the feature names as keys and the 
            corresponding values.
        features: List of feature names. First feature passed needs to be 'poi'.
        remove_NaN: True converts all "NaN" strings to 0.
        remove_all_zeroes: True omits all 0 data points.
        remove_any_zeroes: True omits single 0 data points.
        sort_keys: True sorts the dictionary keys in alphabetical order before
            adding the data points to the data frame.

    Returns:
        Function returns a pandas data frame with each row representing a data 
        point with the specified features in its columns.
    """

    # check that first feature passed is 'poi'
    assert (features[0] == 'poi'), "The first feature needs to be 'poi'!"

    # data frame to store the data points as individual rows
    df = pd.DataFrame(columns=['name'] + features)

    # sort keys alphabetically if sort_keys is set to True
    if sort_keys:
        keys = sorted(dictionary.keys())
    else:
        keys = dictionary.keys()

    # loop trough the data dictionary 
    for key in keys:
        
        val_dict = {'name': key} # first entry of data point is the name of the person

        for feature in features:
            # check if specified feature exists, throw a warning if not and 
            # stop the function
            try:
                val = dictionary[key][feature]
            except KeyError:
                print("error: key ", feature, " not present")
                return

            val = dictionary[key][feature]

            # set 'NaN' strings to np.NaN values
            if val == "NaN" and not remove_NaN:
                val = np.NaN
            # set NaN values to 0 if remove_NaN is set to True
            elif val == "NaN" and remove_NaN:
                val = 0

            val_dict[feature] = val

        # do not add all zero data points if remove_all_zeroes is set to True
        if remove_all_zeroes:
            append = False
            for _, val in val_dict.items(): 
                if val != 0 and val != "NaN":
                    append = True
                    break
        
        # don not add single zero data points if remove_any_zeroes is set to 
        # True
        elif remove_any_zeroes:
            append = True
            if 0 in val_list[1:] or "NaN" in val_list[1:]: # exclude 'poi' from criteria????
                append = False
        
        # all data points are added 
        else:
            append = True

        # append data point if it is flagged for addition
        if append:
            df = df.append(val_dict, ignore_index=True)
        
        # convert the 'poi' column from boolean to numerical
        #df['poi'] = df['poi'].astype(int)
        
    return df

In [9]:
# load dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

# feature list
features = ['poi', 'bonus', 'deferral_payments', 'deferred_income', 
            'director_fees', 'exercised_stock_options', 'expenses', 
            'loan_advances', 'long_term_incentive', 'other', 'restricted_stock', 
            'restricted_stock_deferred', 'salary', 'total_payments', 
            'total_stock_value', 'email_address', 'from_messages', 
            'from_poi_to_this_person', 'from_this_person_to_poi', 
            'shared_receipt_with_poi', 'to_messages']

# convert specified features to data frame
data_df = convert_dict_to_df(data_dict, features, remove_NaN=True, 
                        remove_all_zeroes=True, remove_any_zeroes=False, 
                        sort_keys=True)

print(data_df.shape)
print(data_df.columns.values)

(146, 22)
['name' 'poi' 'bonus' 'deferral_payments' 'deferred_income' 'director_fees'
 'exercised_stock_options' 'expenses' 'loan_advances' 'long_term_incentive'
 'other' 'restricted_stock' 'restricted_stock_deferred' 'salary'
 'total_payments' 'total_stock_value' 'email_address' 'from_messages'
 'from_poi_to_this_person' 'from_this_person_to_poi'
 'shared_receipt_with_poi' 'to_messages']


| Feature                   | Type        |
|-------------------------- | ----------- |
| poi                       | categorical |
| bonus                     | numerical   |
| deferral_payments         | numerical   |  ab  |
| deferred_income           | numerical       |  ab  |
| director_fees             | numerical       |  ab  |
| exercised_stock_options   | numerical       |  ab  |
| expenses                  | numerical       |  ab  |
| loan_advances             | numerical       |  ab  |
| long_term_incentive       | numerical       |  ab  |
| other                     | numerical       |  ab  |
| restricted_stock          | numerical       |  ab  |
| restricted_stock_deferred | numerical       |  ab  |
| salary                    | numerical       |  ab  |
| total_payments            | numerical       |  ab  |
| total_stock_value         | numerical       |  ab  |
| email_address             | text       |  ab  |
| from_messages             | numerical       |  ab  |
| from_poi_to_this_person   | numerical       |  ab  |
| from_this_person_to_poi   | numerical       |  ab  |
| shared_receipt_with_poi   | numerical       |  ab  |
| to_messages               | numerical       |  ab  |


### Outliers



In [None]:
# drop 'TOTAL' row
data_df = data_df[data_df['name'] != 'TOTAL']

## Feature Selection and Engineering


## Algorithm Selection


## Validation and Evaluation


## Conclusion