In [23]:
import pickle
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from sklearn.preprocessing import StandardScaler
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier

import sys
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

# Inline Graphics for IPython NB's
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [24]:

### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".

# Label column ('poi') first, then the candidate predictor columns.
features_list = [
    'poi',
    'bonus',
    'deferred_income',
    'exercised_stock_options',
    'expenses',
    'loan_advances',
    'long_term_incentive',
    'other',
    'restricted_stock',
    'restricted_stock_deferred',
    'salary',
    'total_payments',
    'total_stock_value',
    'deferral_payments',
    'director_fees',
    'shared_receipt_with_poi',
]

In [25]:
### Load the (pickle) dictionary containing the dataset
# The path is relative to the notebook's working directory.
# NOTE(review): the file is opened in text mode ("r"); whether binary mode
# would also work depends on how the pickle was written — confirm before changing.
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [26]:
# Convert the Python Dictionary to a Pandas DataFrame:
# (assumes data_dict maps each record name to a dict of its fields, so the
# record names become the columns here — TODO confirm against the pickle).
raw_dataframe = pd.DataFrame(data_dict)
# NOTE(review): not the last expression of the cell, so this preview is not displayed.
raw_dataframe.head(5)
# Transpose so that records are rows and fields are columns.
dataset = raw_dataframe.T
# Ask pandas to coerce object-typed columns to numeric/date/timedelta dtypes.
# NOTE(review): convert_objects is deprecated in later pandas releases — confirm the version in use.
mydataset = dataset.convert_objects(convert_dates=True, convert_numeric=True, convert_timedeltas=True, copy=True)

# Replace the e-mail address by a presence flag: 0 for the literal string 'NaN', 1 otherwise.
mydataset['email_address'] = mydataset['email_address'].apply(lambda x: 0 if x == 'NaN' else 1)
# Every remaining missing value becomes 0.
mydataset = mydataset.fillna(0)

# Keep magnitudes only: the sign of any negative entry is discarded.
mydataset = mydataset.abs()
# Remove two rows by position. The higher position is dropped first, so
# position 127 still designates the same row as before the first drop.
# NOTE(review): positions 130 and 127 are magic numbers that depend on row
# order; presumably they are non-person/aggregate entries — verify, and
# prefer dropping by label.
mydataset.drop(mydataset.index[130], inplace=True)
mydataset.drop(mydataset.index[127], inplace=True)
# Store the label as an integer column (the dtype listing below shows int64).
mydataset['poi'] = mydataset['poi'].astype(int)

# NOTE(review): not the last expression of the cell, so this preview is not displayed.
mydataset.head(5)
# Second coercion pass after the clean-up steps above, then report the resulting dtypes.
mydataset = mydataset.convert_objects(convert_dates=True, convert_numeric=True, convert_timedeltas=True, copy=True)
print(mydataset.dtypes)


bonus                        float64
deferral_payments            float64
deferred_income              float64
director_fees                float64
email_address                  int64
exercised_stock_options      float64
expenses                     float64
from_messages                float64
from_poi_to_this_person      float64
from_this_person_to_poi      float64
loan_advances                float64
long_term_incentive          float64
other                        float64
poi                            int64
restricted_stock             float64
restricted_stock_deferred    float64
salary                       float64
shared_receipt_with_poi      float64
to_messages                  float64
total_payments               float64
total_stock_value            float64
dtype: object




In [27]:
# Independent snapshot of the cleaned frame (copy() is deep by default).
temp_mydataset = mydataset.copy()
# Independent copy of only the rows whose 'poi' flag equals 1.
MyPoiDF = mydataset[mydataset['poi'].eq(1)].copy()


In [28]:
# Do this ONLY ONCE per oversampling run to initialize.
# start_range is the first integer label that will be assigned below.
start_range = 0
# CHANGE the INDEX of the DF to a numeric range,
# this can be done multiple times in sequence to add to the oversampling.

# Calculate Proper Range Values for this individual DF
# (end_range is the number of rows in MyPoiDF, not an absolute end label).
end_range = len(MyPoiDF)

# RESET the Temp POI DF (for unique indexes)
# Rows are relabelled start_range .. start_range + end_range - 1, replacing
# the previous index labels.
MyPoiDF.index=range(start_range, end_range + start_range)
# Display the re-indexed frame as the cell output.
MyPoiDF

Unnamed: 0,bonus,deferral_payments,deferred_income,director_fees,email_address,exercised_stock_options,expenses,from_messages,from_poi_to_this_person,from_this_person_to_poi,...,long_term_incentive,other,poi,restricted_stock,restricted_stock_deferred,salary,shared_receipt_with_poi,to_messages,total_payments,total_stock_value
0,5249999.0,2144013.0,2334434.0,0.0,1,953136.0,17355.0,484.0,228.0,108.0,...,0.0,210698.0,1,157569.0,0.0,213999.0,5521.0,7991.0,5501630.0,1110705.0
1,1350000.0,0.0,833.0,0.0,1,0.0,65907.0,27.0,140.0,15.0,...,974293.0,1621.0,1,252055.0,0.0,278601.0,1593.0,1858.0,2669589.0,252055.0
2,1250000.0,0.0,262500.0,0.0,1,0.0,35818.0,144.0,199.0,25.0,...,375304.0,486.0,1,126027.0,0.0,240189.0,2188.0,2598.0,1639297.0,126027.0
3,1000000.0,0.0,235000.0,0.0,1,0.0,30674.0,49.0,58.0,12.0,...,350000.0,307895.0,1,2502063.0,0.0,415189.0,1585.0,1892.0,1868758.0,2502063.0
4,1200000.0,27610.0,144062.0,0.0,1,0.0,16514.0,40.0,240.0,11.0,...,0.0,101740.0,1,698242.0,0.0,288542.0,1132.0,1758.0,1490344.0,698242.0
5,3000000.0,0.0,0.0,0.0,1,2291113.0,86174.0,3069.0,66.0,609.0,...,1294981.0,1661.0,1,1323148.0,0.0,365163.0,2097.0,3093.0,4747979.0,3614261.0
6,1300000.0,0.0,1386055.0,0.0,1,0.0,55921.0,0.0,0.0,0.0,...,1736055.0,277464.0,1,1794412.0,0.0,440698.0,0.0,0.0,2424083.0,1794412.0
7,600000.0,0.0,0.0,0.0,1,384728.0,125978.0,16.0,52.0,6.0,...,71023.0,200308.0,1,393818.0,0.0,274975.0,874.0,873.0,1272284.0,778546.0
8,1500000.0,0.0,3117011.0,0.0,1,5538001.0,34039.0,32.0,32.0,21.0,...,1617011.0,11350.0,1,853064.0,0.0,243293.0,1035.0,1045.0,288682.0,6391065.0
9,0.0,10259.0,0.0,0.0,1,30766064.0,77978.0,0.0,0.0,0.0,...,0.0,2856.0,1,0.0,0.0,0.0,0.0,0.0,91093.0,30766064.0


In [29]:
# NOTE(review): this is an alias, not a copy — over_sampled_df and mydataset
# refer to the same DataFrame object. The re-indexed POI rows in MyPoiDF are
# not appended here, so no oversampling has actually been applied at this point.
over_sampled_df = mydataset

In [30]:
# Simple Model
# Baseline: SVC with an RBF kernel; every other hyper-parameter is left at
# the library default.
# NOTE(review): no feature scaling is applied in the cells shown here, and
# the feature magnitudes differ by several orders of magnitude; this is
# possibly why the evaluation further down reports a divide by zero — confirm.
clf = SVC(kernel='rbf')



In [31]:
# Flip the frame back so each record is a column, then turn it into a
# nested dict keyed by record label ({label: {field: value}}).
reverted_df = over_sampled_df.transpose()
reverted_dict = my_dataset = reverted_df.to_dict()
# Persist the classifier, dataset and feature list.
dump_classifier_and_data(clf, my_dataset, features_list)


In [32]:
#!/usr/bin/pickle

""" a basic script for importing student's POI identifier,
    and checking the results that they get from it 
 
    requires that the algorithm, dataset, and features list
    be written to my_classifier.pkl, my_dataset.pkl, and
    my_feature_list.pkl, respectively

    that process should happen at the end of poi_id.py
"""

import pickle
import sys
from sklearn.cross_validation import StratifiedShuffleSplit
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit



# Template for the metrics line: five positional floats (accuracy, precision,
# recall, F1, F2), all rendered with `display_precision` decimal places.
PERF_FORMAT_STRING = (
    "\tAccuracy: {:>0.{display_precision}f}"
    "\tPrecision: {:>0.{display_precision}f}"
    "\tRecall: {:>0.{display_precision}f}"
    "\tF1: {:>0.{display_precision}f}"
    "\tF2: {:>0.{display_precision}f}"
)
# Template for the raw confusion-matrix counts: five positional integers,
# each right-aligned in a field of width 4.
RESULTS_FORMAT_STRING = (
    "\tTotal predictions: {:4d}"
    "\tTrue positives: {:4d}"
    "\tFalse positives: {:4d}"
    "\tFalse negatives: {:4d}"
    "\tTrue negatives: {:4d}"
)



def test_classifier(clf, dataset, feature_list, folds = 1000):
    """Cross-validate ``clf`` and print aggregate classification metrics.

    The dataset is converted with ``featureFormat`` / ``targetFeatureSplit``
    and split ``folds`` times by ``StratifiedShuffleSplit`` (fixed
    ``random_state = 42``). The classifier is refit on every training split
    and its test-split predictions are tallied into one confusion matrix,
    from which accuracy, precision, recall, F1 and F2 are computed.

    Args:
        clf: estimator exposing ``fit(features, labels)`` and
            ``predict(features)``; predictions are expected to be 0 or 1.
        dataset: passed unchanged to ``featureFormat``.
        feature_list: passed unchanged to ``featureFormat``.
        folds: number of shuffle-split iterations.

    Returns:
        None. The classifier and its metrics are printed; if a metric's
        denominator is zero, a diagnostic message is printed instead.
    """
    data = featureFormat(dataset, feature_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    cv = StratifiedShuffleSplit(labels, folds, random_state = 42)
    true_negatives = 0
    false_negatives = 0
    true_positives = 0
    false_positives = 0
    for train_idx, test_idx in cv:
        features_train = [features[ii] for ii in train_idx]
        labels_train = [labels[ii] for ii in train_idx]
        features_test = [features[jj] for jj in test_idx]
        labels_test = [labels[jj] for jj in test_idx]

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        for prediction, truth in zip(predictions, labels_test):
            if prediction == 0 and truth == 0:
                true_negatives += 1
            elif prediction == 0 and truth == 1:
                false_negatives += 1
            elif prediction == 1 and truth == 0:
                false_positives += 1
            elif prediction == 1 and truth == 1:
                true_positives += 1
            else:
                # A non-binary label stops the tally for this split only;
                # the remaining splits are still evaluated.
                print("Warning: Found a predicted label not == 0 or 1.")
                print("All predictions should take value 0 or 1.")
                print("Evaluating performance for processed predictions:")
                break

    # Only the arithmetic is guarded, and only against a zero denominator:
    # any other failure (e.g. while formatting or printing) now surfaces
    # instead of being reported as a divide by zero.
    try:
        total_predictions = true_negatives + false_negatives + false_positives + true_positives
        accuracy = 1.0*(true_positives + true_negatives)/total_predictions
        precision = 1.0*true_positives/(true_positives+false_positives)
        recall = 1.0*true_positives/(true_positives+false_negatives)
        f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
        f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
    except ZeroDivisionError:
        print("Got a divide by zero when trying out: {}".format(clf))
        print("Precision or recall may be undefined due to a lack of true positive predicitons.")
        return

    print(clf)
    print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
    print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
    print("")

        
        
# File names, relative to the working directory, used for the pickled
# classifier, dataset and feature list.
CLF_PICKLE_FILENAME = "my_classifier.pkl"
DATASET_PICKLE_FILENAME = "my_dataset.pkl"
FEATURE_LIST_FILENAME = "my_feature_list.pkl"



def dump_classifier_and_data(clf, dataset, feature_list):
    """Pickle the classifier, dataset and feature list to their fixed file names.

    The files are opened in binary mode so the pickle stream reaches disk
    unmodified. Text mode lets the platform rewrite newlines inside the
    stream and is rejected by ``pickle.dump`` on Python 3.

    Args:
        clf: object written to ``CLF_PICKLE_FILENAME``.
        dataset: object written to ``DATASET_PICKLE_FILENAME``.
        feature_list: object written to ``FEATURE_LIST_FILENAME``.

    Returns:
        None. Existing files with these names are overwritten.
    """
    with open(CLF_PICKLE_FILENAME, "wb") as clf_outfile:
        pickle.dump(clf, clf_outfile)
    with open(DATASET_PICKLE_FILENAME, "wb") as dataset_outfile:
        pickle.dump(dataset, dataset_outfile)
    with open(FEATURE_LIST_FILENAME, "wb") as featurelist_outfile:
        pickle.dump(feature_list, featurelist_outfile)

        
        
def load_classifier_and_data():
    """Read back the pickled classifier, dataset and feature list.

    Returns:
        A ``(clf, dataset, feature_list)`` tuple, unpickled from
        ``CLF_PICKLE_FILENAME``, ``DATASET_PICKLE_FILENAME`` and
        ``FEATURE_LIST_FILENAME`` in that order.
    """
    restored = []
    for path in (CLF_PICKLE_FILENAME, DATASET_PICKLE_FILENAME, FEATURE_LIST_FILENAME):
        with open(path, "r") as source:
            restored.append(pickle.load(source))
    clf, dataset, feature_list = restored
    return clf, dataset, feature_list


In [33]:
from tester import test_classifier
# Label column first; the predictors follow in alphabetical order.
features_list = [
    'poi',
    'bonus',
    'deferral_payments',
    'deferred_income',
    'director_fees',
    'exercised_stock_options',
    'expenses',
    'loan_advances',
    'long_term_incentive',
    'other',
    'restricted_stock',
    'restricted_stock_deferred',
    'salary',
    'shared_receipt_with_poi',
    'total_payments',
    'total_stock_value',
]
# Run the cross-validated evaluation and print its report.
test_classifier(clf, my_dataset, features_list)


Got a divide by zero when trying out: SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Precision or recall may be undefined due to a lack of true positive predicitons.


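Here no 'unseen' value was predicted: the evaluation hit a division by zero, so precision and recall could not be computed for this classifier.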