# Person of Interest Identification - Enron
## Udacity - Introduction to Machine Learning
### Wesley Scoggin November 2017
**Purpose:** To Identify persons of interest (POI) utilizing machine learning techniques to evaluate email data published as part of the fraud investigation of Enron business practices in the early 2000's


In [5]:
# import IPython.core.display as di
# di.display_html('<script>jQuery(function() {if (jQuery("body.notebook_app").length == 0) { jQuery(".input_area").toggle(); jQuery(".prompt").toggle();}});</script>', raw=True)
# di.display_html('''<button onclick="jQuery('.input_area').toggle(); jQuery('.prompt').toggle();">Toggle code</button>''', raw=True)
import sys
sys.path.append("../tools/")
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tester import dump_classifier_and_data, test_classifier
from feature_format import featureFormat, targetFeatureSplit

### Available Data
In this financial information dataset of Enron employees there are 146 entries with 21 features each, including POI. The email address feature is neither indicates a value that can be scored usefully or categorical and will be removed in feature selection.

In [43]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

available_features=[]

for people, k in data_dict.iteritems():
    for key, value in k.iteritems():
        if key not in available_features:
            available_features.append(key)
print 'Number of employees in dataset:', len(data_dict)
print 'Number of Features in dataset:',len(available_features)\
,'\ncomprised of:\n', available_features

Number of employees in dataset: 146
Number of Features in dataset: 21 
comprised of:
['salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'email_address', 'from_poi_to_this_person']


In [44]:
#Identify and Remove outliers

#find employees with nan for all data
all_nan = []
for person, features in data_dict.items():
    notNaN = False
    for feature, val in features.items():
        if feature != 'poi':
            if val != 'NaN':
                notNaN = True
                break
    if not notNaN:
        all_nan.append(person)
print 'Users with all ''NaN'' data', all_nan

Users with all NaN data ['LOCKHART EUGENE E']


### Outliers
As detailed above, Eugene Lockhart, seems to have 'Nan' as values for each feature type. Further visual inspection below of the keys of each feature list indicates that there are two other employees in this list that do not seem to have names that are consistent with the format: LASTNAME FIRSTNAME MI and will be removed from the dataset.

In [45]:
#Examine people in the data_dict to find names that do not resemble employees:
people = sorted(list(data_dict.keys()))
c1 = pd.Series(people[0:30])
c2 = pd.Series(people[30:60])
c3 = pd.Series(people[60:90])
c4 = pd.Series(people[90:120])
c5 = pd.Series(people[120:])
columns = pd.DataFrame({' ':c1, '  ':c2, '   ':c3, '    ':c4, '     ':c5})
#format nan values for printing list columnwise to display
columns = columns.replace(np.nan, '', regex=True)
#highlight suspicious entries that do not resemble employee names 
#or employees with all NaN data
def highlight_vals(val, color='green'):
    if val in ['TOTAL','THE TRAVEL AGENCY IN THE PARK','LOCKHART EUGENE E']:
        return 'background-color: %s' % color
    else:
        return ''
columns.style.applymap(highlight_vals, subset=[' ', '  ', '   ','    ', '     '])

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5
0,ALLEN PHILLIP K,DEFFNER JOSEPH M,HAUG DAVID L,MCDONALD REBECCA,SHERRICK JEFFREY B
1,BADUM JAMES P,DELAINEY DAVID W,HAYES ROBERT E,MCMAHON JEFFREY,SHERRIFF JOHN R
2,BANNANTINE JAMES M,DERRICK JR. JAMES V,HAYSLETT RODERICK J,MENDELSOHN JOHN,SKILLING JEFFREY K
3,BAXTER JOHN C,DETMERING TIMOTHY J,HERMANN ROBERT J,METTS MARK,STABLER FRANK
4,BAY FRANKLIN R,DIETRICH JANET R,HICKERSON GARY J,MEYER JEROME J,SULLIVAN-SHAKLOVITZ COLLEEN
5,BAZELIDES PHILIP J,DIMICHELE RICHARD G,HIRKO JOSEPH,MEYER ROCKFORD G,SUNDE MARTIN
6,BECK SALLY W,DODSON KEITH,HORTON STANLEY C,MORAN MICHAEL P,TAYLOR MITCHELL S
7,BELDEN TIMOTHY N,DONAHUE JR JEFFREY M,HUGHES JAMES A,MORDAUNT KRISTINA M,THE TRAVEL AGENCY IN THE PARK
8,BELFER ROBERT,DUNCAN JOHN H,HUMPHREY GENE E,MULLER MARK S,THORN TERENCE H
9,BERBERIAN DAVID,DURAN WILLIAM D,IZZO LAWRENCE L,MURRAY JULIA H,TILNEY ELIZABETH A


In [46]:
#Remove people identified above
for person in ['TOTAL','THE TRAVEL AGENCY IN THE PARK','LOCKHART EUGENE E']:
    data_dict.pop(person)


In [94]:
def count_unique_and_null(dictionary):
    feature_summary = []
    for feature in [f for f in available_features if f!='poi']:
        entry_unique= []
        nan_count = 0
        for entry in dictionary:
            if dictionary[entry][feature] == 'NaN':
                nan_count += 1
            elif dictionary[entry][feature] not in entry_unique and dictionary[entry][feature] !='NaN' :
                entry_unique.append(entry)
        feature_summary.append({'feature':feature
                                ,'unique count':len(entry_unique)
                                ,'nans': nan_count})
    return feature_summary

df = pd.DataFrame(count_unique_and_null(data_dict))
df.sort_values(by = 'unique count', ascending = False)

# df = pd.DataFrame(data_dict).dropna().transpose()
# df = pd.DataFrame(df.describe()).transpose()
# df.sort_values(by = 'unique', ascending = False)

Unnamed: 0,feature,nans,unique count
9,total_stock_value,18,125
3,total_payments,20,123
18,email_address,32,111
6,restricted_stock,34,109
4,exercised_stock_options,42,101
0,salary,49,94
10,expenses,49,94
13,other,52,91
7,shared_receipt_with_poi,57,86
1,to_messages,57,86


In [None]:
#explore data
features_list = [ 'poi'
                 ,'salary'
                 ,'to_messages'
                 ,'deferral_payments'
                 ,'total_payments'
                 ,'exercised_stock_options'
                 ,'bonus', 'restricted_stock'
                 ,'shared_receipt_with_poi'
                 ,'restricted_stock_deferred'
                 ,'total_stock_value'
                 ,'expenses'
                 ,'loan_advances'
                 ,'from_messages'
                 ,'other'
                 ,'from_this_person_to_poi'
                 ,'poi', 'director_fees'
                 ,'deferred_income'
                 ,'long_term_incentive'
                 ,'from_poi_to_this_person']# You will need to use more features

