In [1]:
import warnings
warnings.filterwarnings('ignore')
import sys
import pickle
sys.path.append("../tools/")
import pandas as pd
from scipy import stats
import numpy as np

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data



In [2]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary', 'total_payments', 'bonus', 'total_stock_value', 'long_term_incentive', 'share_receipt_with_poi', 'from_this_person_to_poit', 'from_poi_to_this_person'] # You will need to use more features

### Features to use with Pandas
feats = ['salary', 'total_payments', 'bonus', 'total_stock_value', 'long_term_incentive', 'share_receipt_with_poi', 'from_this_person_to_poit', 'from_poi_to_this_person']

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Create a dataframe from the dictionary using pandas
df = pd.DataFrame.from_dict(data_dict, orient='index').reset_index()

In [3]:
### Convert all columns besides email_address and poi to numbers
cols = ['salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive']
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [4]:
### Some basic data exploration
poi = df[df.poi == True]
non_poi = df[df.poi == False]
print "Total number people in dataset: %.0f\nNumber of POI: %.0f\nNumber of Non POI: %.0f\nNumber of features in data: %.0f" % (len(df), 
      len(poi), len(non_poi), len(df.columns))

Total number people in dataset: 146
Number of POI: 18
Number of Non POI: 128
Number of features in data: 22


In [5]:
### Total NaN values for each column 
print "Totals NAs for each number column:\n", df.isnull().sum(), "\nTotal NAs for email address: %.0f" % (len(df[df['email_address'] == 'NaN']))

Totals NAs for each number column:
index                          0
salary                        51
to_messages                   60
deferral_payments            107
total_payments                21
exercised_stock_options       44
bonus                         64
restricted_stock              36
shared_receipt_with_poi       60
restricted_stock_deferred    128
total_stock_value             20
expenses                      51
loan_advances                142
from_messages                 60
other                         53
from_this_person_to_poi       60
poi                            0
director_fees                129
deferred_income               97
long_term_incentive           80
email_address                  0
from_poi_to_this_person        0
dtype: int64 
Total NAs for email address: 35


In [6]:
### For features, we'll use salary, total_payments, bonus, total_stock_value, long_term_incentive,
### share_receipt_with_poi, from_this_person_to_poit, and from_poi_to_this_person

In [None]:
### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = df.to_dict(orient='index')

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [7]:
top_five = df.sort_values(['salary'], ascending=[False])
top_five.head()

Unnamed: 0,index,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,...,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
130,TOTAL,26704229.0,,32083396.0,309886585.0,311764000.0,97343619.0,130322299.0,,-7576788.0,...,83925000.0,,42667589.0,,False,1398517.0,-27992891.0,48521928.0,,
122,SKILLING JEFFREY K,1111258.0,3627.0,,8682716.0,19250000.0,5600000.0,6843672.0,2042.0,,...,,108.0,22122.0,30.0,True,,,1920000.0,jeff.skilling@enron.com,88.0
79,LAY KENNETH L,1072321.0,4273.0,202911.0,103559793.0,34348384.0,7000000.0,14761694.0,2411.0,,...,81525000.0,36.0,10359729.0,16.0,True,,-300000.0,3600000.0,kenneth.lay@enron.com,123.0
47,FREVERT MARK A,1060932.0,3275.0,6426990.0,17252530.0,10433518.0,2000000.0,4188667.0,2979.0,,...,2000000.0,21.0,7427621.0,6.0,False,,-3367011.0,1617011.0,mark.frevert@enron.com,242.0
105,PICKERING MARK R,655037.0,898.0,,1386690.0,28798.0,300000.0,,728.0,,...,400000.0,67.0,,0.0,False,,,,mark.pickering@enron.com,7.0


In [10]:
df = df[df['index'] != 'TOTAL']

In [11]:
top_five = df.sort_values(['salary'], ascending=[False])
top_five.head()

Unnamed: 0,index,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,...,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
122,SKILLING JEFFREY K,1111258.0,3627.0,,8682716.0,19250000.0,5600000.0,6843672.0,2042.0,,...,,108.0,22122.0,30.0,True,,,1920000.0,jeff.skilling@enron.com,88
79,LAY KENNETH L,1072321.0,4273.0,202911.0,103559793.0,34348384.0,7000000.0,14761694.0,2411.0,,...,81525000.0,36.0,10359729.0,16.0,True,,-300000.0,3600000.0,kenneth.lay@enron.com,123
47,FREVERT MARK A,1060932.0,3275.0,6426990.0,17252530.0,10433518.0,2000000.0,4188667.0,2979.0,,...,2000000.0,21.0,7427621.0,6.0,False,,-3367011.0,1617011.0,mark.frevert@enron.com,242
105,PICKERING MARK R,655037.0,898.0,,1386690.0,28798.0,300000.0,,728.0,,...,400000.0,67.0,,0.0,False,,,,mark.pickering@enron.com,7
139,WHALLEY LAWRENCE G,510364.0,6019.0,,4677574.0,3282960.0,3000000.0,2796177.0,3920.0,,...,,556.0,301026.0,24.0,False,,,808346.0,greg.whalley@enron.com,186


In [None]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

In [None]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)