In [1]:
import warnings
warnings.filterwarnings('ignore')
import sys
import pickle
sys.path.append("../tools/")
import pandas as pd
from scipy import stats
import numpy as np

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data



In [2]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
features_list = ['poi','salary'] # You will need to use more features
### Features selection will be done after some removal of errant values, na values filled, and features creation
### has occurred

### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

### Create a dataframe from the dictionary using pandas
df = pd.DataFrame.from_dict(data_dict, orient='index').reset_index()

In [3]:
### Convert all columns besides email_address and poi to numbers
cols = [c for c in df.columns if c not in ['index', 'poi', 'email_address']]
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=1)

In [4]:
### Some basic data exploration
poi = df[df.poi == True]
non_poi = df[df.poi == False]
print "Total number people in dataset: %.0f\nNumber of POI: %.0f\nNumber of Non POI: %.0f\nNumber of features in data: %.0f" % (len(df), 
      len(poi), len(non_poi), len(df.columns))

Total number people in dataset: 146
Number of POI: 18
Number of Non POI: 128
Number of features in data: 22


In [5]:
### Total NaN values for each column 
print "Totals NAs for each number column:\n", df.isnull().sum(), "\nTotal NAs for email address: %.0f" % (len(df[df['email_address'] == 'NaN']))

Totals NAs for each number column:
index                          0
salary                        51
to_messages                   60
deferral_payments            107
total_payments                21
exercised_stock_options       44
bonus                         64
restricted_stock              36
shared_receipt_with_poi       60
restricted_stock_deferred    128
total_stock_value             20
expenses                      51
loan_advances                142
from_messages                 60
other                         53
from_this_person_to_poi       60
poi                            0
director_fees                129
deferred_income               97
long_term_incentive           80
email_address                  0
from_poi_to_this_person       60
dtype: int64 
Total NAs for email address: 35


In [6]:
### Task 2: Remove outliers
### Upon inspecting the dataset, the TOTAL column was added. This is removed below.
df = df[df['index'] != 'TOTAL']

In [7]:
### Now we'll fill NaN amounts with 0
df = df.fillna(0)

In [8]:
### Task 3: Create new feature(s)
### New features using all emails including poi with this person
df['total_poi_emails'] = df['shared_receipt_with_poi'] + df['from_this_person_to_poi'] + df['from_poi_to_this_person']

In [9]:
print df.dtypes

index                         object
salary                       float64
to_messages                  float64
deferral_payments            float64
total_payments               float64
exercised_stock_options      float64
bonus                        float64
restricted_stock             float64
shared_receipt_with_poi      float64
restricted_stock_deferred    float64
total_stock_value            float64
expenses                     float64
loan_advances                float64
from_messages                float64
other                        float64
from_this_person_to_poi      float64
poi                             bool
director_fees                float64
deferred_income              float64
long_term_incentive          float64
email_address                 object
from_poi_to_this_person      float64
total_poi_emails             float64
dtype: object


In [10]:
df.head()

Unnamed: 0,index,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,...,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person,total_poi_emails
0,ALLEN PHILLIP K,201955.0,2902.0,2869717.0,4484442.0,1729541.0,4175000.0,126027.0,1407.0,-126027.0,...,2195.0,152.0,65.0,False,0.0,-3081055.0,304805.0,phillip.allen@enron.com,47.0,1519.0
1,BADUM JAMES P,0.0,0.0,178980.0,182466.0,257817.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,False,0.0,0.0,0.0,,0.0,0.0
2,BANNANTINE JAMES M,477.0,566.0,0.0,916197.0,4046157.0,0.0,1757552.0,465.0,-560222.0,...,29.0,864523.0,0.0,False,0.0,-5104.0,0.0,james.bannantine@enron.com,39.0,504.0
3,BAXTER JOHN C,267102.0,0.0,1295738.0,5634343.0,6680544.0,1200000.0,3942714.0,0.0,0.0,...,0.0,2660303.0,0.0,False,0.0,-1386055.0,1586055.0,,0.0,0.0
4,BAY FRANKLIN R,239671.0,0.0,260455.0,827696.0,0.0,400000.0,145796.0,0.0,-82782.0,...,0.0,69.0,0.0,False,0.0,-201641.0,0.0,frank.bay@enron.com,0.0,0.0


In [11]:
### Temporary features for analysis
temp_features = [c for c in df.columns if c not in ['index', 'poi', 'email_address']]

In [12]:
### First we'll scale the features. We will probably be trying out SVM so, while not necessary for all classifers
### it won't adversely impact, for example, Random Forests, while SVM will be negatively impacted by not feature
### scaling
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
df[temp_features] = scaler.fit_transform(df[temp_features])

In [13]:
### Using Recursive Feature Elimination to rank features
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

estimator = RandomForestClassifier()
selector = RFECV(estimator, step=1, cv=5)
selector = selector.fit(df[temp_features], df['poi'])
print selector.ranking_
print selector.support_

[ 1  4  7  1  1  1  1  1  9  1  1  8  3  1  1 10  5  6  2  1]
[ True False False  True  True  True  True  True False  True  True False
 False  True  True False False False False  True]


In [14]:
### Take the features selected by RFECV
temp_features = [a for a, t in zip(temp_features, selector.support_) if t]

In [15]:
### Take the top half of the features selected with RFECV
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2

selector = SelectPercentile(chi2, percentile=50).fit(df[temp_features], df['poi'])

In [17]:
### Take the features selected by SelectKBest
temp_features = [a for a, t in zip(temp_features, selector.get_support()) if t]

In [18]:
### Task 4: Try a varity of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# Importing a variety of classifiers
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.naive_bayes import GaussianNB

In [20]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(df[temp_features], df['poi'], test_size=0.3, random_state=42)

from sklearn.grid_search import GridSearchCV
from sklearn.metrics import precision_score, recall_score

naive = GaussianNB()

naive = naive.fit(features_train, labels_train)

pred = naive.predict(features_test)

print precision_score(labels_test, pred)
print recall_score(labels_test, pred)

0.4
0.5


In [21]:
### Set the parameters by cross-validation
### This is from http://scikit-learn.org/0.15/auto_examples/grid_search_digits.html
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
    clf.fit(features_train, labels_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = labels_test, clf.predict(features_test)
    print(classification_report(y_true, y_pred))
    print()
    

# Tuning hyper-parameters for precision
()
Best parameters set found on development set:
()
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
()
Grid scores on development set:
()
0.000 (+/-0.000) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.000 (+/-0.000) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.000 (+/-0.000) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.000 (+/-0.000) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.000 (+/-0.000) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.000 (+/-0.000) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.000 (+/-0.000) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.000 (+/-0.000) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
0.000 (+/-0.000) for {'kernel': 'linear', 'C': 1}
0.000 (+/-0.000) for {'kernel': 'linear', 'C': 10}
0.198 (+/-0.200) for {

In [22]:
# Set the parameters by cross-validation
tuned_parameters = [{'n_estimators': [10, 100, 1000], 'min_samples_split': [5, 10, 20]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(RandomForestClassifier(), tuned_parameters, cv=5, scoring=score)
    clf.fit(features_train, labels_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    for params, mean_score, scores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, scores.std() / 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = labels_test, clf.predict(features_test)
    print(classification_report(y_true, y_pred))
    print()
    

# Tuning hyper-parameters for precision
()
Best parameters set found on development set:
()
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=5, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
()
Grid scores on development set:
()
0.267 (+/-0.194) for {'min_samples_split': 5, 'n_estimators': 10}
0.302 (+/-0.200) for {'min_samples_split': 5, 'n_estimators': 100}
0.267 (+/-0.194) for {'min_samples_split': 5, 'n_estimators': 1000}
0.267 (+/-0.194) for {'min_samples_split': 10, 'n_estimators': 10}
0.000 (+/-0.000) for {'min_samples_split': 10, 'n_estimators': 100}
0.000 (+/-0.000) for {'min_samples_split': 10, 'n_estimators': 1000}
0.000 (+/-0.000) for {'min_samples_split': 20, 'n_estimators': 10}
0.198 (+/-0.20

In [None]:
### Store to my_dataset for easy export below.
my_dataset = df.to_dict(orient='index')

### Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)