In [47]:
import sys
import pickle
sys.path.append("../tools/")

import pandas as pd
import numpy as np

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

import matplotlib.pyplot as plt
%matplotlib inline
from pprint import pprint

from sklearn.preprocessing import Imputer
                                                
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.decomposition import PCA

from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn import cross_validation

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from tester import test_classifier

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_fscore_support

import scipy.stats as st


In [2]:
data = "final_project_dataset.pkl"


In [59]:
## Data load functions

def import_data(data):
    '''This are the things I will do to import the data everytime, 
    regardless of what variables I make.'''
    with open(data, "r") as data_file:
        data_dict = pickle.load(data_file)
    df = pd.DataFrame(data_dict)
    df = df.transpose()
    df = df.drop('email_address', axis=1)
    df = df.astype(float)
    df = df.drop('TOTAL')
    df = df.drop("THE TRAVEL AGENCY IN THE PARK")
    df = df.drop("loan_advances", axis=1)
    return df
def import_data1(data):
    '''This are the things I will do to import the data everytime, 
    regardless of what variables I make.'''
    with open(data, "r") as data_file:
        data_dict1 = pickle.load(data_file)
    return data_dict1

def get_df_features_labels_features_list(df):
    '''This is where the features and labels are extracted to use as arguments
    for sklearn\'s StratifiedShuffleSplit function AND for the model that I submit
    to the grader. That is why it returns four things. It is also where I add some 
    new variables.'''
    
    #df = df.drop('email_address', axis=1)
    
    df = df.astype(float)
    
    #add columns
    df['pct_from_poi'] = df['from_poi_to_this_person']/(df['from_messages'] + 1)
    df['pct_to_poi'] = df['from_this_person_to_poi']/(df['from_messages'] + 1)
    df['to_from'] = df['pct_from_poi']*df['pct_from_poi']
    
    #drop columns
    #df = df.drop("loan_advances", axis=1)
    #df = df.drop('restricted_stock_deferred', axis=1)
    #df = df.drop('director_fees', axis=1)
    #df = df.drop('restricted_stock', axis=1)
    #df = df.drop('deferral_payments', axis=1)
    #df = df.drop('deferred_income', axis=1)
   
    # drop rows based on meaning

    #df = df.drop('TOTAL')
    #df = df.drop("THE TRAVEL AGENCY IN THE PARK")
    #df = df.drop("LOCKHART EUGENE E")



    #drop rows
    for i in df.index:
        if df.ix[i].count() < 3:
            df = df.drop(i, axis=0)
    
    features_list = list(df.columns)
    features_list.remove('poi')
    # get features of udacity_grader
    features = df[features_list]
    labels = df['poi']
    # put poi back in for udacity grader
    features_list.insert(0,'poi')

    
    return df, features, labels, features_list

## precision recall functiom 
def precision_recall(labels,predictions):
    ind_true_pos = [i for i in range(0,len(labels)) if (predictions[i]==1) & (labels[i]==1)]
    ind_false_pos = [i for i in range(0,len(labels)) if ((predictions[i]==1) & (labels[i]==0))]
    ind_false_neg = [i for i in range(0,len(labels)) if ((predictions[i]==0) & (labels[i]==1))]
    ind_true_neg = [i for i in range(0,len(labels)) if ((predictions[i]==0) & (labels[i]==0))]
    precision = 0
    recall = 0
    true_pos = len(ind_true_pos)
    false_pos = len(ind_false_pos)
    true_neg = len(ind_true_neg)
    false_neg = len(ind_false_neg)
    
    ind_labels = [i for i in range(0,len(labels)) if labels[i]==1]
    
    if len(ind_labels) !=0:
        if float( len(ind_true_pos) + len(ind_false_pos))!=0:
            precision = float(len(ind_true_pos))/float( len(ind_true_pos) + len(ind_false_pos))
        if float( len(ind_true_pos) + len(ind_false_neg))!=0:
            recall = float(len(ind_true_pos))/float( len(ind_true_pos) + len(ind_false_neg))
        
        
        return precision, recall,true_pos,true_neg,false_pos,false_neg
    else:
        return -1,-1,0,0,0,0

def custom_scorer(labels, predictions):
    precision,recall = precision_recall(labels,predictions)
    min_score = min(precision, recall)
    return min_score


In [4]:
# GradientBoostingClassifier 
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer
from sklearn.feature_selection import SelectKBest

# Get custom scorer
score = make_scorer(custom_scorer, greater_is_better=True)

# get the df
df = import_data(data)

#df.fillna(inplace=True, value=0)

# Get data, here with the features unrealated to poi dropped AND Tanya's features added.
df, features, labels, features_list = get_df_features_labels_features_list(df)
df1 = df.transpose()
df1 = df1.to_dict()
# Get the test-train split
# features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels)

# Build pipeline
Pipeline = Pipeline([
        ('imp', Imputer(missing_values='NaN')),
        ('std', MinMaxScaler()),
        ('selection', SelectKBest()),
        ('pca', PCA()),
        ('clf', GradientBoostingClassifier(random_state=0))
    ])

# Build Grid
# pre-processing
k = [k for k in range(16,17)]
c = [x for x in range(3,4)]

# estimator parameters
e = [100]
r = [0.1]
d = [d for d in range(6, 7)]
l = ["exponential"]

param_grid = {'selection__k': k,
              'pca__n_components': c,
              'imp__strategy': ['median'],
              'clf__n_estimators': e,
              'clf__learning_rate': r,
              'clf__max_depth': d,
              'clf__loss': l
             }

# set model parameters to grid search object
gridCV_object = GridSearchCV(estimator = Pipeline, 
                             param_grid = param_grid,
                             scoring = score,
                             cv = StratifiedShuffleSplit(labels, test_size=0.1,  n_iter=1000, random_state = 42))

# train the model
gridCV_object.fit(features, labels)

print gridCV_object.best_params_
print gridCV_object.scorer_

{'clf__n_estimators': 100, 'imp__strategy': 'median', 'pca__n_components': 3, 'clf__max_depth': 6, 'clf__learning_rate': 0.1, 'clf__loss': 'exponential', 'selection__k': 16}
make_scorer(custom_scorer)


In [77]:
## making classifier

from tester import test_classifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest


clf = Pipeline([
        ('imp', Imputer(missing_values='NaN')),
        ('scaler', MinMaxScaler()),
        ('selection', SelectKBest(k=16)),
        ('pca', PCA(n_components=3)),
        ('clf', GradientBoostingClassifier(n_estimators=100,loss="exponential", max_depth=6, learning_rate=0.1))
    ])



In [78]:
## below is code to check precision and recall. This part is similar to what the grader uses.

features_array = np.array(features)
labels_array = np.array(labels)
i_count = 0
precision_all = []
recall_all = []
true_pos_all = 0
true_neg_all = 0
false_pos_all = 0
false_neg_all = 0
    
cv = StratifiedShuffleSplit(labels, test_size=0.1,  n_iter=100, random_state = 42)
# get_CI_mean_PrecisionRecall(features_array, labels,gridCV_object,cv)
for train_index, test_index in cv:
        X_train, X_test = features_array[train_index], features_array[test_index]
        y_train, y_test = labels_array[train_index], labels_array[test_index]
        clf = clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        precision,recall,true_pos,true_neg,false_pos,false_neg = precision_recall(y_test,predictions)
    
        if precision!=-1:
            i_count +=1
            #print i_count, "Precision:", round(precision, 4), " , Recall:", round(recall, 4)
            precision_all.append(precision)
            recall_all.append(recall)
            true_pos_all = true_pos_all+true_pos
            true_neg_all = true_neg_all+true_neg
            false_pos_all = false_pos_all+false_pos
            false_neg_all = false_neg_all+false_neg


            
            
precision_all = np.array(precision_all)
recall_all = np.array(recall_all)
mean_precision = round(np.mean(precision_all), 4)
mean_recall = round(np.mean(recall_all), 4)
std_precision = round(np.std(precision_all), 4)
std_recall = round(np.std(recall_all), 4)
                           
print "Mean Precision:", mean_precision, ", Mean Recall:", mean_recall
print "STD Precision:", round(np.std(precision_all), 4), ", STD Recall:", round(np.std(recall_all), 4)
CI_recall = st.t.interval(0.95, len(recall_all)-1, loc=np.mean(recall_all), scale=st.sem(recall_all))
CI_precision= st.t.interval(0.95, len(precision_all)-1, loc=np.mean(precision_all), scale=st.sem(precision_all))
print "CI Precision:", CI_precision
print "CI_recall:", CI_recall

print "Overall calculation."
print ""
print "Total predictions",true_pos_all+false_pos_all+true_neg_all+false_neg_all
print "True-positive: ",true_pos_all
print "False-positive: ",false_pos_all
print "True-negative: ",true_neg_all
print "False-negative: ",false_neg_all

print ""
print "Accuracy: ",float(true_pos_all+true_neg_all)/float(true_pos_all+false_pos_all+true_neg_all+false_neg_all)
print "Precision: ",float(true_pos_all)/float(true_pos_all+false_pos_all)
print "Recall: ", float(true_pos_all)/float(true_pos_all+false_neg_all)


print "\n\nAnd these are the results going through the test classifier:\n"
test_classifier(clf, df1, features_list,folds = 100)

Mean Precision: 0.2215 , Mean Recall: 0.2
STD Precision: 0.3498 , STD Recall: 0.2828
CI Precision: (0.15174165433084907, 0.29125834566915088)
CI_recall: (0.14359513660445378, 0.25640486339554625)
Overall calculation.

Total predictions 1500
True-positive:  40
False-positive:  116
True-negative:  1184
False-negative:  160

Accuracy:  0.816
Precision:  0.25641025641
Recall:  0.2


And these are the results going through the test classifier:

precision,recall =  0.276315789474 0.21
	Accuracy: 0.82133	Precision: 0.27632	Recall: 0.21000	F1: 0.23864	F2: 0.22059
	Total predictions: 1500	True positives:   42	False positives:  110	False negatives:  158	True negatives: 1190

