In [2]:
import os
import sys
mod_path = os.path.abspath(os.path.join('..'))
if mod_path not in sys.path:
    sys.path.append(mod_path)

In [84]:
from pprint import pprint as pp
import pandas as pd
import numpy as np
import pickle
from pathlib import Path
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest

In [4]:
from tools.feature_format import featureFormat, targetFeatureSplit
from final_project.tester import dump_classifier_and_data

In [5]:
pwd_ = %pwd
pwd_ = Path(pwd_)
pwd_.joinpath('tools/feature_format.py')

WindowsPath('E:/Users/Trenton J. McKinney/PycharmProjects/ud120-projects/tools/feature_format.py')

# Task 1: Select Features
#### features_list is a list of strings, each of which is a feature name.
#### The first feature must be "poi".

In [6]:
financial_features = ['salary', 'deferral_payments', 'total_payments', 'loan_advances', 'bonus', 'restricted_stock_deferred',
                      'deferred_income', 'total_stock_value', 'expenses', 'exercised_stock_options', 'other',
                      'long_term_incentive', 'restricted_stock', 'director_fees']

email_features = ['to_messages', 'email_address', 'from_poi_to_this_person', 'from_messages', 'from_this_person_to_poi', 'shared_receipt_with_poi']

poi_label = ['poi']

In [7]:
feature_list = []
feature_list.extend(poi_label)
feature_list.extend(financial_features)
feature_list.extend(email_features)
feature_list

['poi',
 'salary',
 'deferral_payments',
 'total_payments',
 'loan_advances',
 'bonus',
 'restricted_stock_deferred',
 'deferred_income',
 'total_stock_value',
 'expenses',
 'exercised_stock_options',
 'other',
 'long_term_incentive',
 'restricted_stock',
 'director_fees',
 'to_messages',
 'email_address',
 'from_poi_to_this_person',
 'from_messages',
 'from_this_person_to_poi',
 'shared_receipt_with_poi']

In [86]:
len(feature_list)

21

## Load Dataset

In [48]:
# Load the dictionary containing the dataset
with open('final_project/final_project_dataset_unix.pkl', 'rb') as data_file:
    data_dict = pickle.load(data_file)

## Explore Dataset

In [49]:
df = pd.DataFrame.from_dict(data_dict, orient='index')
df = df.replace('NaN', np.nan)  # can't use the isnull method with 'NaN'
pd.set_option('max_rows', 150)
pd.set_option('max_columns', 21)
df

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,loan_advances,bonus,email_address,restricted_stock_deferred,deferred_income,total_stock_value,expenses,from_poi_to_this_person,exercised_stock_options,from_messages,other,from_this_person_to_poi,poi,long_term_incentive,shared_receipt_with_poi,restricted_stock,director_fees
ALLEN PHILLIP K,201955.0,2902.0,2869717.0,4484442.0,,4175000.0,phillip.allen@enron.com,-126027.0,-3081055.0,1729541.0,13868.0,47.0,1729541.0,2195.0,152.0,65.0,False,304805.0,1407.0,126027.0,
BADUM JAMES P,,,178980.0,182466.0,,,,,,257817.0,3486.0,,257817.0,,,,False,,,,
BANNANTINE JAMES M,477.0,566.0,,916197.0,,,james.bannantine@enron.com,-560222.0,-5104.0,5243487.0,56301.0,39.0,4046157.0,29.0,864523.0,0.0,False,,465.0,1757552.0,
BAXTER JOHN C,267102.0,,1295738.0,5634343.0,,1200000.0,,,-1386055.0,10623258.0,11200.0,,6680544.0,,2660303.0,,False,1586055.0,,3942714.0,
BAY FRANKLIN R,239671.0,,260455.0,827696.0,,400000.0,frank.bay@enron.com,-82782.0,-201641.0,63014.0,129142.0,,,,69.0,,False,,,145796.0,
BAZELIDES PHILIP J,80818.0,,684694.0,860136.0,,,,,,1599641.0,,,1599641.0,,874.0,,False,93750.0,,,
BECK SALLY W,231330.0,7315.0,,969068.0,,700000.0,sally.beck@enron.com,,,126027.0,37172.0,144.0,,4343.0,566.0,386.0,False,,2639.0,126027.0,
BELDEN TIMOTHY N,213999.0,7991.0,2144013.0,5501630.0,,5249999.0,tim.belden@enron.com,,-2334434.0,1110705.0,17355.0,228.0,953136.0,484.0,210698.0,108.0,True,,5521.0,157569.0,
BELFER ROBERT,,,-102500.0,102500.0,,,,44093.0,,-44093.0,,,3285.0,,,,False,,,,3285.0
BERBERIAN DAVID,216582.0,,,228474.0,,,david.berberian@enron.com,,,2493616.0,11892.0,,1624396.0,,,,False,,,869220.0,


In [59]:
describe_ = df.describe(percentiles=[])
describe_

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,loan_advances,bonus,restricted_stock_deferred,deferred_income,total_stock_value,expenses,from_poi_to_this_person,exercised_stock_options,from_messages,other,from_this_person_to_poi,long_term_incentive,shared_receipt_with_poi,restricted_stock,director_fees
count,95.0,86.0,39.0,125.0,4.0,82.0,18.0,49.0,126.0,95.0,86.0,102.0,86.0,93.0,86.0,66.0,86.0,110.0,17.0
mean,562194.3,2073.860465,1642674.0,5081526.0,41962500.0,2374235.0,166410.6,-1140475.0,6773957.0,108728.9,64.895349,5987054.0,608.790698,919065.0,41.232558,1470361.0,1176.465116,2321741.0,166804.9
std,2716369.0,2582.700981,5161930.0,29061720.0,47083210.0,10713330.0,4201494.0,4025406.0,38957770.0,533534.8,86.979244,31062010.0,1841.033949,4589253.0,100.073111,5942759.0,1178.317641,12518280.0,319891.4
min,477.0,57.0,-102500.0,148.0,400000.0,70000.0,-7576788.0,-27992890.0,-44093.0,148.0,0.0,3285.0,12.0,2.0,0.0,69223.0,2.0,-2604490.0,3285.0
50%,259996.0,1211.0,227449.0,1101393.0,41762500.0,769375.0,-146975.0,-159792.0,1102872.0,46950.0,35.0,1310814.0,41.0,52382.0,8.0,442035.0,740.5,451740.0,108579.0
max,26704230.0,15149.0,32083400.0,309886600.0,83925000.0,97343620.0,15456290.0,-833.0,434509500.0,5235198.0,528.0,311764000.0,14368.0,42667590.0,609.0,48521930.0,5521.0,130322300.0,1398517.0


In [60]:
df.isnull().sum()

salary                        51
to_messages                   60
deferral_payments            107
total_payments                21
loan_advances                142
bonus                         64
email_address                 35
restricted_stock_deferred    128
deferred_income               97
total_stock_value             20
expenses                      51
from_poi_to_this_person       60
exercised_stock_options       44
from_messages                 60
other                         53
from_this_person_to_poi       60
poi                            0
long_term_incentive           80
shared_receipt_with_poi       60
restricted_stock              36
director_fees                129
dtype: int64

In [61]:
df['poi'].value_counts()

False    128
True      18
Name: poi, dtype: int64

In [109]:
headers = list(df)
headers.sort()
headers

['bonus',
 'deferral_payments',
 'deferred_income',
 'director_fees',
 'email_address',
 'exercised_stock_options',
 'expenses',
 'from_messages',
 'from_poi_to_this_person',
 'from_this_person_to_poi',
 'loan_advances',
 'long_term_incentive',
 'other',
 'poi',
 'restricted_stock',
 'restricted_stock_deferred',
 'salary',
 'shared_receipt_with_poi',
 'to_messages',
 'total_payments',
 'total_stock_value']

In [89]:
len(headers)

21

In [63]:
list(df.index.values)

['ALLEN PHILLIP K',
 'BADUM JAMES P',
 'BANNANTINE JAMES M',
 'BAXTER JOHN C',
 'BAY FRANKLIN R',
 'BAZELIDES PHILIP J',
 'BECK SALLY W',
 'BELDEN TIMOTHY N',
 'BELFER ROBERT',
 'BERBERIAN DAVID',
 'BERGSIEKER RICHARD P',
 'BHATNAGAR SANJAY',
 'BIBI PHILIPPE A',
 'BLACHMAN JEREMY M',
 'BLAKE JR. NORMAN P',
 'BOWEN JR RAYMOND M',
 'BROWN MICHAEL',
 'BUCHANAN HAROLD G',
 'BUTTS ROBERT H',
 'BUY RICHARD B',
 'CALGER CHRISTOPHER F',
 'CARTER REBECCA C',
 'CAUSEY RICHARD A',
 'CHAN RONNIE',
 'CHRISTODOULOU DIOMEDES',
 'CLINE KENNETH W',
 'COLWELL WESLEY',
 'CORDES WILLIAM R',
 'COX DAVID',
 'CUMBERLAND MICHAEL S',
 'DEFFNER JOSEPH M',
 'DELAINEY DAVID W',
 'DERRICK JR. JAMES V',
 'DETMERING TIMOTHY J',
 'DIETRICH JANET R',
 'DIMICHELE RICHARD G',
 'DODSON KEITH',
 'DONAHUE JR JEFFREY M',
 'DUNCAN JOHN H',
 'DURAN WILLIAM D',
 'ECHOLS JOHN B',
 'ELLIOTT STEVEN',
 'FALLON JAMES B',
 'FASTOW ANDREW S',
 'FITZGERALD JAY L',
 'FOWLER PEGGY',
 'FOY JOE',
 'FREVERT MARK A',
 'FUGH JOHN L',
 'GAHN 

In [64]:
df.loc['LOCKHART EUGENE E']

salary                         NaN
to_messages                    NaN
deferral_payments              NaN
total_payments                 NaN
loan_advances                  NaN
bonus                          NaN
email_address                  NaN
restricted_stock_deferred      NaN
deferred_income                NaN
total_stock_value              NaN
expenses                       NaN
from_poi_to_this_person        NaN
exercised_stock_options        NaN
from_messages                  NaN
other                          NaN
from_this_person_to_poi        NaN
poi                          False
long_term_incentive            NaN
shared_receipt_with_poi        NaN
restricted_stock               NaN
director_fees                  NaN
Name: LOCKHART EUGENE E, dtype: object

In [65]:
df.loc['THE TRAVEL AGENCY IN THE PARK']

salary                          NaN
to_messages                     NaN
deferral_payments               NaN
total_payments               362096
loan_advances                   NaN
bonus                           NaN
email_address                   NaN
restricted_stock_deferred       NaN
deferred_income                 NaN
total_stock_value               NaN
expenses                        NaN
from_poi_to_this_person         NaN
exercised_stock_options         NaN
from_messages                   NaN
other                        362096
from_this_person_to_poi         NaN
poi                           False
long_term_incentive             NaN
shared_receipt_with_poi         NaN
restricted_stock                NaN
director_fees                   NaN
Name: THE TRAVEL AGENCY IN THE PARK, dtype: object

In [66]:
df.loc['TOTAL']

salary                       2.67042e+07
to_messages                          NaN
deferral_payments            3.20834e+07
total_payments               3.09887e+08
loan_advances                 8.3925e+07
bonus                        9.73436e+07
email_address                        NaN
restricted_stock_deferred   -7.57679e+06
deferred_income             -2.79929e+07
total_stock_value             4.3451e+08
expenses                      5.2352e+06
from_poi_to_this_person              NaN
exercised_stock_options      3.11764e+08
from_messages                        NaN
other                        4.26676e+07
from_this_person_to_poi              NaN
poi                                False
long_term_incentive          4.85219e+07
shared_receipt_with_poi              NaN
restricted_stock             1.30322e+08
director_fees                1.39852e+06
Name: TOTAL, dtype: object

# Task 2: Remove Outliers

In [67]:
outliers = ['THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E', 'TOTAL']
for value in outliers:
    del data_dict[value]

In [68]:
len(data_dict)

143

# Task 3: Create New Features

In [77]:
# In order to perform calculations
df = df.replace(np.nan, 0.0)
df['poi'] = df['poi'].replace([False, True], [0.0, 1.0])

In [79]:
df.head(8)

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,loan_advances,bonus,email_address,restricted_stock_deferred,deferred_income,total_stock_value,expenses,from_poi_to_this_person,exercised_stock_options,from_messages,other,from_this_person_to_poi,poi,long_term_incentive,shared_receipt_with_poi,restricted_stock,director_fees
ALLEN PHILLIP K,201955.0,2902.0,2869717.0,4484442.0,0.0,4175000.0,phillip.allen@enron.com,-126027.0,-3081055.0,1729541.0,13868.0,47.0,1729541.0,2195.0,152.0,65.0,0.0,304805.0,1407.0,126027.0,0.0
BADUM JAMES P,0.0,0.0,178980.0,182466.0,0.0,0.0,0,0.0,0.0,257817.0,3486.0,0.0,257817.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BANNANTINE JAMES M,477.0,566.0,0.0,916197.0,0.0,0.0,james.bannantine@enron.com,-560222.0,-5104.0,5243487.0,56301.0,39.0,4046157.0,29.0,864523.0,0.0,0.0,0.0,465.0,1757552.0,0.0
BAXTER JOHN C,267102.0,0.0,1295738.0,5634343.0,0.0,1200000.0,0,0.0,-1386055.0,10623258.0,11200.0,0.0,6680544.0,0.0,2660303.0,0.0,0.0,1586055.0,0.0,3942714.0,0.0
BAY FRANKLIN R,239671.0,0.0,260455.0,827696.0,0.0,400000.0,frank.bay@enron.com,-82782.0,-201641.0,63014.0,129142.0,0.0,0.0,0.0,69.0,0.0,0.0,0.0,0.0,145796.0,0.0
BAZELIDES PHILIP J,80818.0,0.0,684694.0,860136.0,0.0,0.0,0,0.0,0.0,1599641.0,0.0,0.0,1599641.0,0.0,874.0,0.0,0.0,93750.0,0.0,0.0,0.0
BECK SALLY W,231330.0,7315.0,0.0,969068.0,0.0,700000.0,sally.beck@enron.com,0.0,0.0,126027.0,37172.0,144.0,0.0,4343.0,566.0,386.0,0.0,0.0,2639.0,126027.0,0.0
BELDEN TIMOTHY N,213999.0,7991.0,2144013.0,5501630.0,0.0,5249999.0,tim.belden@enron.com,0.0,-2334434.0,1110705.0,17355.0,228.0,953136.0,484.0,210698.0,108.0,1.0,0.0,5521.0,157569.0,0.0


In [80]:
df.values

array([[201955.0, 2902.0, 2869717.0, ..., 1407.0, 126027.0, 0.0],
       [0.0, 0.0, 178980.0, ..., 0.0, 0.0, 0.0],
       [477.0, 566.0, 0.0, ..., 465.0, 1757552.0, 0.0],
       ...,
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0],
       [158403.0, 0.0, 0.0, ..., 0.0, 3576206.0, 0.0],
       [0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0]], dtype=object)

In [110]:
headers.remove('poi')
headers.remove('email_address')

In [121]:
k_best = SelectKBest(k='all')
k_best.fit(df[financial_features].values, df['poi'].values)
score = k_best.scores_
score_chart = list(zip(financial_features, score))
score_chart_df = pd.DataFrame(score_chart, columns=['Feature', 'Score'])

In [122]:
score_chart_df

Unnamed: 0,Feature,Score
0,salary,5.7e-05
1,deferral_payments,0.235515
2,total_payments,0.357679
3,loan_advances,2.545253
4,bonus,0.081305
5,restricted_stock_deferred,0.004141
6,deferred_income,0.225532
7,total_stock_value,0.171227
8,expenses,0.012878
9,exercised_stock_options,0.234424


In [82]:
resources = pd.read_csv('resources.csv')

In [83]:
resources

Unnamed: 0,title,author,url
0,An Introduction to Feature Selection,Jason Brownlee,https://machinelearningmastery.com/an-introduc...
1,A Feature Selection Tool for Machine Learning ...,William Koehrsen,https://towardsdatascience.com/a-feature-selec...
2,Machine Learning with Scikit-Learn - 42 - Auto...,Cristi Vlad,https://www.youtube.com/watch?v=RXFnwCRb-is
3,scikit-learn v0.19.2 Section 1.13 Feature Sele...,,http://scikit-learn.org/stable/modules/feature...
