In [23]:
import pickle
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier

import sys
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

# Inline Graphics for IPython NB's
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [24]:

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

# The label column ('poi') comes first; the remaining entries are the
# candidate predictor columns.
features_list = [
    'poi',
    'bonus',
    'deferred_income',
    'exercised_stock_options',
    'expenses',
    'loan_advances',
    'long_term_incentive',
    'other',
    'restricted_stock',
    'restricted_stock_deferred',
    'salary',
    'total_payments',
    'total_stock_value',
    'deferral_payments',
    'director_fees',
    'shared_receipt_with_poi',
]

In [25]:
### Load the (pickle) dictionary containing the dataset
# Pickle streams are byte streams, so the file is opened in binary mode
# ("rb"); text mode can corrupt the stream through newline translation.
# NOTE: unpickling can execute arbitrary code -- only load a dataset file
# that comes from a trusted source.
with open("final_project_dataset.pkl", "rb") as data_file:
    data_dict = pickle.load(data_file)

In [26]:
# Convert the Python Dictionary to a Pandas DataFrame.
# The frame built from data_dict is transposed, so every top-level key of
# data_dict becomes one row (presumably one person per key -- TODO confirm).
raw_dataframe = pd.DataFrame(data_dict)
# Not the last expression of the cell, so this preview is never displayed.
raw_dataframe.head(5)
dataset = raw_dataframe.T
# NOTE(review): convert_objects is a legacy pandas API that later releases
# deprecate; its coercion rules decide which columns end up numeric here.
mydataset = dataset.convert_objects(convert_dates=True, convert_numeric=True, convert_timedeltas=True, copy=True)

# Turn the e-mail address into a 0/1 indicator: 0 where the cell holds the
# literal string 'NaN', 1 for anything else.
mydataset['email_address'] = mydataset['email_address'].apply(lambda x: 0 if x == 'NaN' else 1)
# Every remaining missing value is replaced by 0.
mydataset = mydataset.fillna(0)

# Absolute value of every cell: the sign of any negative amount is discarded.
mydataset = mydataset.abs()
# Remove two rows by POSITION: 130 first, then 127 (whose position is not
# shifted by the first removal because it comes earlier in the frame).
# NOTE(review): presumably these are non-person / aggregate entries; the
# positions depend on the row order, so dropping by label would be safer
# -- TODO confirm which labels are meant.
mydataset.drop(mydataset.index[130], inplace=True)
mydataset.drop(mydataset.index[127], inplace=True)
# Store the label column as integers.
mydataset['poi'] = mydataset['poi'].astype(int)

# Not the last expression of the cell, so this preview is never displayed.
mydataset.head(5)
# Second pass of the legacy type inference after the edits above.
mydataset = mydataset.convert_objects(convert_dates=True, convert_numeric=True, convert_timedeltas=True, copy=True)
# Show the resulting column dtypes as a sanity check.
print(mydataset.dtypes)


bonus                        float64
deferral_payments            float64
deferred_income              float64
director_fees                float64
email_address                  int64
exercised_stock_options      float64
expenses                     float64
from_messages                float64
from_poi_to_this_person      float64
from_this_person_to_poi      float64
loan_advances                float64
long_term_incentive          float64
other                        float64
poi                            int64
restricted_stock             float64
restricted_stock_deferred    float64
salary                       float64
shared_receipt_with_poi      float64
to_messages                  float64
total_payments               float64
total_stock_value            float64
dtype: object




In [27]:
# Independent (deep) snapshot of the cleaned frame, used later as the base
# that the extra POI rows are added to.
temp_mydataset = mydataset.copy()
# Independent (deep) copy of only the rows whose 'poi' column equals 1.
MyPoiDF = mydataset[mydataset['poi'].eq(1)].copy()


In [28]:
# Offset of the first integer label handed to the POI copy. Initialise it
# ONLY ONCE per oversampling run.
# NOTE(review): the offset is reset to 0 in this same cell, so re-running
# the cell assigns the same labels again -- advance it before re-labelling
# a further copy if more than one round of oversampling is wanted.
start_range = 0

# Number of rows in the POI-only frame.
end_range = MyPoiDF.shape[0]

# Replace the index of the POI copy with consecutive integers, so these rows
# carry labels that differ from the labels used in the base frame.
MyPoiDF.index = range(start_range, start_range + end_range)


In [29]:
# Stack the re-labelled POI copy underneath the full snapshot: every POI row
# is now present twice (once under its original label, once under an
# integer label).
# NOTE(review): the duplication happens before any train/test split, so a
# later cross-validation can see the same POI on both sides of a split.
over_sampled_df = pd.concat([temp_mydataset, MyPoiDF])


In [30]:
# Baseline model: a support-vector classifier with an RBF kernel; every
# other hyper-parameter keeps its library default.
# NOTE(review): no feature scaling is applied in this notebook before the
# estimator is fitted -- confirm this is intended for an RBF kernel.
clf = SVC(kernel="rbf")



In [31]:
# Transpose back so that each row label becomes a column again, then turn
# the frame into a nested dict of the form {row label: {column name: value}}.
reverted_df = over_sampled_df.transpose()
my_dataset = reverted_dict = reverted_df.to_dict()
# Hand the estimator, the dataset dict and the feature list to the dump helper.
dump_classifier_and_data(clf, my_dataset, features_list)


In [32]:
#!/usr/bin/pickle

""" a basic script for importing student's POI identifier,
    and checking the results that they get from it 
 
    requires that the algorithm, dataset, and features list
    be written to my_classifier.pkl, my_dataset.pkl, and
    my_feature_list.pkl, respectively

    that process should happen at the end of poi_id.py
"""

import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit



# Template for the metrics line: five positional floats (accuracy,
# precision, recall, F1, F2), each rendered with `display_precision`
# decimal places and separated by tabs.
PERF_FORMAT_STRING = (
    "\tAccuracy: {:>0.{display_precision}f}"
    "\tPrecision: {:>0.{display_precision}f}"
    "\tRecall: {:>0.{display_precision}f}"
    "\tF1: {:>0.{display_precision}f}"
    "\tF2: {:>0.{display_precision}f}"
)
# Template for the raw counts line: five positional integers, each padded
# to at least four characters and separated by tabs.
RESULTS_FORMAT_STRING = (
    "\tTotal predictions: {:4d}"
    "\tTrue positives: {:4d}"
    "\tFalse positives: {:4d}"
    "\tFalse negatives: {:4d}"
    "\tTrue negatives: {:4d}"
)



def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Fit and score `clf` over repeated stratified shuffle splits and print
    the aggregated metrics.

    Parameters:
        clf: estimator exposing fit(features, labels) and predict(features);
            it is re-fitted on every split.
        dataset: passed unchanged to featureFormat (presumably the
            {name: {feature: value}} dict -- confirm against the helper).
        feature_list: passed unchanged to featureFormat; targetFeatureSplit
            then separates the result into labels and features.
        folds: second positional argument of StratifiedShuffleSplit, i.e.
            how many splits are evaluated.

    Returns:
        None. The classifier, the metrics line and the counts line are
        printed; when a metric hits a zero denominator an explanatory
        message is printed instead.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    # Fixed random_state so the same splits are produced on every run.
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train   = [labels[ii] for ii in train_idx]
        features_test  = [features[jj] for jj in test_idx]
        labels_test    = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                # Leaves only the per-prediction loop: the rest of this
                # split is skipped, the following splits still run.
                break
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
        print(clf)
        print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
        print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
        print("")
    except ZeroDivisionError:
        # Only a zero denominator is reported here; any other failure is a
        # genuine error and is allowed to propagate instead of being
        # mislabelled as a divide by zero.
        print("Got a divide by zero when trying out: %s" % (clf,))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")

        
        
# File names (relative to the current working directory) shared by the dump
# and load helpers below.
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"



def dump_classifier_and_data(clf, dataset, feature_list):
    """Pickle the classifier, the dataset and the feature list, each to its
    own file in the current working directory.

    Parameters:
        clf: object written to CLF_PICKLE_FILENAME.
        dataset: object written to DATASET_PICKLE_FILENAME.
        feature_list: object written to FEATURE_LIST_FILENAME.

    The files are opened in binary mode because a pickle stream is a byte
    stream; text mode breaks under Python 3 and can corrupt the stream
    through newline translation.
    """
    with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)



def load_classifier_and_data():
    """Read back the three pickles written by dump_classifier_and_data.

    Returns:
        A (clf, dataset, feature_list) tuple.

    NOTE: unpickling can execute arbitrary code -- only load files that were
    produced by this project.
    """
    with open(CLF_PICKLE_FILENAME, "rb") as clf_infile:
        clf = pickle.load(clf_infile)
    with open(DATASET_PICKLE_FILENAME, "rb") as dataset_infile:
        dataset = pickle.load(dataset_infile)
    with open(FEATURE_LIST_FILENAME, "rb") as featurelist_infile:
        feature_list = pickle.load(featurelist_infile)
    return clf, dataset, feature_list


In [33]:
from tester import test_classifier
# Feature list used for the evaluation run: the label column 'poi' first,
# then the predictor columns.
features_list = [
    'poi',
    'bonus',
    'deferral_payments',
    'deferred_income',
    'director_fees',
    'exercised_stock_options',
    'expenses',
    'loan_advances',
    'long_term_incentive',
    'other',
    'restricted_stock',
    'restricted_stock_deferred',
    'salary',
    'shared_receipt_with_poi',
    'total_payments',
    'total_stock_value',
]
# Score the classifier on the oversampled dataset; the metrics are printed.
test_classifier(clf, my_dataset, features_list)


precision,recall =  1.0 0.9155
	Accuracy: 0.98012	Precision: 1.00000	Recall: 0.91550	F1: 0.95589	F2: 0.93124
	Total predictions: 17000	True positives: 3662	False positives:    0	False negatives:  338	True negatives: 13000



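Precision and recall are both above 0.9 with 1× oversampling (every POI row duplicated once). Caveat: the duplication is done before the cross-validation split, so a copy of a test-set POI can also appear in the training set; these scores are therefore likely optimistic.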