In [1]:
import pickle
import sys
import warnings

# Silence library warnings before the third-party imports below are executed.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.ensemble import RandomForestClassifier

# Extend the module search path so the helper modules imported below can be found.
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data



In [303]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
### This value is only a placeholder: the list is rebuilt further down, after errant
### rows have been removed, NaN values filled and the new feature created.
features_list = ['poi', 'salary']

### Load the dictionary containing the dataset.
### Pickle data is a byte stream, so the file is opened in binary mode ('rb');
### text mode is not portable across platforms.
### NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

### Create a dataframe from the dictionary using pandas: one row per dictionary key,
### with the key itself moved into the 'index' column by reset_index().
df = pd.DataFrame.from_dict(data_dict, orient='index').reset_index()

In [304]:
### Coerce every column except the row label, the target and the e-mail address
### to numbers. Values that cannot be parsed as a number become NaN (errors='coerce').
non_numeric = ('index', 'poi', 'email_address')
cols = [name for name in df.columns if name not in non_numeric]
numeric_part = df[cols].apply(pd.to_numeric, axis=1, errors='coerce')
df[cols] = numeric_part

In [305]:
### Some basic data exploration: size of each class and number of columns.
### The last figure is the raw column count, so it includes 'index' and 'poi'.
poi_mask = df['poi'] == True
poi = df[poi_mask]
non_poi = df[df['poi'] == False]
summary = "Total number people in dataset: %.0f\nNumber of POI: %.0f\nNumber of Non POI: %.0f\nNumber of features in data: %.0f"
print(summary % (len(df), len(poi), len(non_poi), len(df.columns)))

Total number people in dataset: 146
Number of POI: 18
Number of Non POI: 128
Number of features in data: 22


In [306]:
### Total NaN values for each column 
print "Totals NAs for each number column:\n", df.isnull().sum(), "\nTotal NAs for email address: %.0f" % (len(df[df['email_address'] == 'NaN']))

Totals NAs for each number column:
index                          0
salary                        51
to_messages                   60
deferral_payments            107
total_payments                21
exercised_stock_options       44
bonus                         64
restricted_stock              36
shared_receipt_with_poi       60
restricted_stock_deferred    128
total_stock_value             20
expenses                      51
loan_advances                142
from_messages                 60
other                         53
from_this_person_to_poi       60
poi                            0
director_fees                129
deferred_income               97
long_term_incentive           80
email_address                  0
from_poi_to_this_person       60
dtype: int64 
Total NAs for email address: 35


In [307]:
### Task 2: Remove outliers
### 'TOTAL' is a row (not a column) that was added to the data, so it is dropped.
### Two further entries are removed together with it.
### NOTE(review): the reason for dropping those two is not recorded here -- presumably
### one is not an individual and the other carries no usable data; confirm.
rows_to_drop = ['TOTAL', 'THE TRAVEL AGENCY IN THE PARK', 'LOCKHART EUGENE E']
df = df[~df['index'].isin(rows_to_drop)]

In [308]:
### Replace every remaining NaN with 0.
### (email_address is not affected: its missing entries are the string 'NaN'.)
df = df.fillna(value=0)

In [309]:
### Task 3: Create new feature(s)
### expense_percent = expenses relative to salary (a plain ratio, not multiplied by 100).
### The +1 in the denominator keeps the division defined when salary is 0, which is
### the case for every person whose missing salary was filled with 0 above.
salary_plus_one = df['salary'] + 1
df['expense_percent'] = df['expenses'] / salary_plus_one

In [310]:
### Working list of candidate features: every column except the row label,
### the target and the e-mail address.
excluded_columns = ('index', 'poi', 'email_address')
temp_features = [column for column in df.columns if column not in excluded_columns]

In [311]:
### Rescale every candidate feature with MinMaxScaler. An SVM is likely to be tried and
### it suffers without feature scaling, while classifiers that do not need scaling
### (Random Forests, for example) are not adversely affected by it.
### NOTE(review): the scaler is fitted on the complete dataframe, i.e. before any of the
### train/test splits below, so held-out rows influence the scaling.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(df[temp_features])
df[temp_features] = scaler.transform(df[temp_features])

In [312]:
### Score every candidate feature against the poi label with the chi-squared test.
### percentile=100 keeps all features, so this cell reports a score for each feature
### rather than discarding any.
from sklearn.feature_selection import SelectPercentile, chi2

selector = SelectPercentile(chi2, percentile=100)
selector.fit(df[temp_features], df['poi'])

feat_scores = []
for name, score, kept in zip(temp_features, selector.scores_, selector.get_support()):
    if kept:
        feat_scores.append((name, score))

for name, score in feat_scores:
    print("%s %s" % (name, score))

salary 3.05278674479
to_messages 0.436397768802
deferral_payments 0.0606966069314
total_payments 2.78477883965
exercised_stock_options 6.84550933503
bonus 5.12075413709
restricted_stock 0.589535349487
shared_receipt_with_poi 2.43221986514
restricted_stock_deferred 0.00350676503321
total_stock_value 5.47661009929
expenses 1.48610336666
loan_advances 6.68878173834
from_messages 0.0687385421513
other 1.7159505308
from_this_person_to_poi 1.0008076418
director_fees 1.50113085359
deferred_income 0.340099218406
long_term_incentive 2.53848503308
from_poi_to_this_person 1.37005929223
expense_percent 0.00123674763195


In [313]:
### Keep only the features retained by the SelectPercentile selector fitted above.
keep_mask = selector.get_support()
temp_features = [name for name, keep in zip(temp_features, keep_mask) if keep]

In [314]:
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

# Provided to give you a starting point. Try a variety of classifiers.
# Importing a variety of classifiers
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score
# GridSearchCV is taken from sklearn.model_selection, the same module that provides
# StratifiedShuffleSplit; the older sklearn.grid_search module is deprecated.
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit

# Three stratified shuffle splits with 30% of the rows held out in each; the fixed
# random_state makes the splits repeatable.
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.3, random_state=42)

In [315]:
### Hard-coded short list of features.
### NOTE(review): no later cell in this notebook reads this variable; the evaluation
### and export cells use temp_features instead.
final_features = [
    'bonus',
    'exercised_stock_options',
    'total_stock_value',
]

In [316]:
### Per-classifier containers: one list of precision values and one of recall values,
### filled with one entry per cross-validation split.
metrics_test = dict(
    (model_name, {"precision": [], "recall": []})
    for model_name in ("SVM", "Forest", "Naive")
)

In [317]:
### Evaluate each candidate classifier on the same stratified splits and collect its
### per-split precision and recall in metrics_test.
### RandomForestClassifier is imported in the notebook's first (imports) cell.


def evaluate_on_splits(label, make_model, metrics_key):
    """Fit a fresh model on every split produced by ``sss`` and record its scores.

    label       -- text printed before the scores of each split
    make_model  -- zero-argument callable returning a new, unfitted classifier
    metrics_key -- key of ``metrics_test`` under which the results are stored

    The entry ``metrics_test[metrics_key]`` is replaced rather than extended, so
    running the cell a second time does not accumulate duplicate values.
    Returns the model fitted on the last split.
    """
    precisions, recalls = [], []
    model = None
    for train_index, test_index in sss.split(df[temp_features], df['poi']):
        X_train, X_test = df[temp_features].iloc[train_index], df[temp_features].iloc[test_index]
        y_train, y_test = df['poi'].iloc[train_index], df['poi'].iloc[test_index]

        print(label)
        model = make_model().fit(X_train, y_train)
        pred = model.predict(X_test)
        # Score once and reuse the values for both the printout and the bookkeeping.
        precision = precision_score(y_test, pred)
        recall = recall_score(y_test, pred)
        print(precision)
        print(recall)
        precisions.append(precision)
        recalls.append(recall)
    metrics_test[metrics_key] = {"precision": precisions, "recall": recalls}
    return model


svm = evaluate_on_splits("SVM", lambda: SVC(kernel='linear', C=10), 'SVM')

# random_state is fixed so repeated runs build the same forest and report the same scores.
forest = evaluate_on_splits(
    "Random Forest",
    lambda: RandomForestClassifier(min_samples_split=5, n_estimators=100, random_state=42),
    'Forest')

# The Naive Bayes model is the one bound to the name clf by this cell.
clf = evaluate_on_splits("Naive Bayes", GaussianNB, 'Naive')

SVM
0.0
0.0
SVM
0.0
0.0
SVM
0.5
0.2
Random Forest
0.0
0.0
Random Forest
0.5
0.2
Random Forest
1.0
0.2
Naive Bayes
0.185185185185
1.0
Naive Bayes
0.107142857143
0.6
Naive Bayes
0.129032258065
0.8


In [318]:
### Average the per-split recall and precision of each classifier.
for model_name in ('Naive', 'SVM', 'Forest'):
    mean_recall = np.mean(metrics_test[model_name]['recall'])
    mean_precision = np.mean(metrics_test[model_name]['precision'])
    print('%s: recall = %f, precision = %f' % (model_name, mean_recall, mean_precision))


Naive: recall = 0.800000, precision = 0.140453
SVM: recall = 0.066667, precision = 0.166667
Forest: recall = 0.133333, precision = 0.500000


In [None]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# train_test_split and GridSearchCV come from sklearn.model_selection (the module this
# notebook already uses for StratifiedShuffleSplit) rather than from the deprecated
# sklearn.cross_validation / sklearn.grid_search modules.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import precision_score, recall_score

# Single hold-out split (30% test) used by the tuning cells that follow.
features_train, features_test, labels_train, labels_test = \
    train_test_split(df[temp_features], df['poi'], test_size=0.3, random_state=42)

# Baseline on the hold-out split: Gaussian Naive Bayes with default settings.
naive = GaussianNB()
naive = naive.fit(features_train, labels_train)

pred = naive.predict(features_test)

print(precision_score(labels_test, pred))
print(recall_score(labels_test, pred))

In [None]:
### Tune an SVM by cross-validated grid search, once for precision and once for recall.
### This is from http://scikit-learn.org/0.15/auto_examples/grid_search_digits.html
# Imported in this cell so the search object is always the sklearn.model_selection
# implementation, which exposes cv_results_ (grid_scores_ is deprecated).
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    # print("") gives a blank line on both Python 2 and 3; a bare print() shows "()"
    # under the Python 2 print statement used elsewhere in this notebook.
    print("")

    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, scoring=score)
    clf.fit(features_train, labels_train)

    print("Best parameters set found on development set:")
    print("")
    print(clf.best_estimator_)
    print("")
    print("Grid scores on development set:")
    print("")
    cv_results = clf.cv_results_
    # Loop names are distinct from `scores` so the list of metrics is not rebound.
    for params, mean_score, std_score in zip(cv_results['params'],
                                             cv_results['mean_test_score'],
                                             cv_results['std_test_score']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, std_score / 2, params))
    print("")

    print("Detailed classification report:")
    print("")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print("")
    y_true, y_pred = labels_test, clf.predict(features_test)
    print(classification_report(y_true, y_pred))
    print("")
    

In [None]:
# Tune a random forest by cross-validated grid search, once for precision and once
# for recall. RandomForestClassifier is imported in the notebook's first (imports) cell.
# GridSearchCV is imported here so the search object is always the
# sklearn.model_selection implementation, which exposes cv_results_
# (grid_scores_ is deprecated).
from sklearn.model_selection import GridSearchCV

tuned_parameters = [{'n_estimators': [10, 100, 1000], 'min_samples_split': [5, 10, 20]}]

scores = ['precision', 'recall']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    # print("") gives a blank line on both Python 2 and 3; a bare print() shows "()"
    # under the Python 2 print statement used elsewhere in this notebook.
    print("")

    # random_state is fixed so the search yields the same forests on every run.
    clf = GridSearchCV(RandomForestClassifier(random_state=42), tuned_parameters, cv=5, scoring=score)
    clf.fit(features_train, labels_train)

    print("Best parameters set found on development set:")
    print("")
    print(clf.best_estimator_)
    print("")
    print("Grid scores on development set:")
    print("")
    cv_results = clf.cv_results_
    # Loop names are distinct from `scores` so the list of metrics is not rebound.
    for params, mean_score, std_score in zip(cv_results['params'],
                                             cv_results['mean_test_score'],
                                             cv_results['std_test_score']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean_score, std_score / 2, params))
    print("")

    print("Detailed classification report:")
    print("")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print("")
    y_true, y_pred = labels_test, clf.predict(features_test)
    print(classification_report(y_true, y_pred))
    print("")
    

In [None]:
### Build the feature list for the data dump: the 'poi' label first,
### followed by the selected features.
features_list = ['poi']
features_list.extend(temp_features)

In [None]:
### Move the 'index' column (the original dictionary keys) back into the row index.
### Running this cell twice fails, because the column no longer exists afterwards.
df = df.set_index(keys=['index'])

In [None]:
### Store to my_dataset for easy export below: one inner dict of column values per row label.
### The values are the rescaled ones, since df was scaled in place earlier.
my_dataset = df.to_dict('index')

In [None]:
### Extract features and labels from dataset for local testing
### featureFormat and targetFeatureSplit are the helpers imported from feature_format.
### NOTE(review): presumably the first entry of features_list ('poi') ends up as the
### label and the remaining entries as features -- confirm against feature_format.py.
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
### NOTE(review): when the notebook is run top to bottom, the last assignment to clf is
### the random-forest GridSearchCV object from the recall pass of the tuning cell, so
### that search object (not the Naive Bayes model) is what gets exported here --
### confirm this is the intended classifier.

dump_classifier_and_data(clf, my_dataset, features_list)