In [17]:
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")
import numpy as np
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
import tester

In [18]:
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
###features_list = ['poi','salary'] # You will need to use more features
features_list = ['poi',
                'salary',
                'bonus', 
                'long_term_incentive', 
                'deferred_income', 
                'deferral_payments',
                'loan_advances', 
                'other',
                'expenses', 
                'director_fees',
                'total_payments',
                'exercised_stock_options',
                'restricted_stock',
                'restricted_stock_deferred',
                'total_stock_value',
                'to_messages',
                'from_messages',
                'from_this_person_to_poi',
                'from_poi_to_this_person']

#ravi code
financial_features = [
                'salary',
                'bonus', 
                'long_term_incentive', 
                'deferred_income', 
                'deferral_payments',
                'loan_advances', 
                'other',
                'expenses', 
                'director_fees',
                'total_payments',
                'exercised_stock_options',
                'restricted_stock',
                'restricted_stock_deferred',
                'total_stock_value']

email_features = [  'to_messages',
                'from_messages',
                'from_this_person_to_poi',
                'from_poi_to_this_person']

In [19]:
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

In [20]:
# Transform data from dictionary to the Pandas DataFrame
df = pd.DataFrame.from_dict(data_dict, orient = 'index')
#Order columns in DataFrame, exclude email column
df = df[features_list].astype('float64')
df[financial_features]=df[financial_features].fillna(0)

In [21]:
##Ravi Code
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1,test_size=0.2,random_state=42)
for train_index, test_index in split.split(df,df['poi']):
    strat_train_set = df.iloc[train_index]
    strat_test_set = df.iloc[test_index]

In [22]:
## Ravi Code
##creating a custom transformer for column attributes
from sklearn.base import BaseEstimator,TransformerMixin
import pandas as pd
class CombinedAttributersAdder(BaseEstimator,TransformerMixin):
    def __init__(self,fraction_from_poi=False):
        
        self.fraction_from_poi = fraction_from_poi
    def fit(self,X,y=None):
        return self #nothing else to do
    
    def transform(self,X,y=None):
        X['fraction_to_poi']=X['from_this_person_to_poi'] / X['to_messages']
        X['fraction_from_poi']=X['from_poi_to_this_person'] / X['from_messages']
        X['fraction_of_overall_to_poi']=X['from_poi_to_this_person'] / X['from_poi_to_this_person'].sum()
        X['fraction_of_overall_from_poi']=X['from_poi_to_this_person'] / X['from_poi_to_this_person'].sum()
        return X

In [23]:
#Ravi code
#creating the pipeline and running it
#from sklearn.pipeline import Pipeline
#from sklearn.preprocessing import StandardScaler

#num_pipeline = Pipeline([
#    ('imputer',Imputer(strategy='median')),
#    ('attribs_adder',CombinedAttributersAdder()),
#    ('std_scaler',StandardScaler()),
#])

#train_data_pipeline = num_pipeline.fit_transform(strat_train_set)
#train_data_pipeline

In [24]:
df=strat_train_set.drop('TOTAL')

#df.iloc[:,:] = df.iloc[:,:].fillna(0)

imp = Imputer(missing_values='NaN', strategy='median', axis=0)

#impute missing values of email features 
df.loc[df[df.poi == 1].index,email_features] = imp.fit_transform(df[email_features][df.poi == 1])
df.loc[df[df.poi == 0].index,email_features] = imp.fit_transform(df[email_features][df.poi == 0])

attr=CombinedAttributersAdder()
df_new = attr.transform(df)
#df_new.to_csv("../df_new.csv")

In [25]:
scaler = StandardScaler()
df_norm = scaler.fit_transform(df_new.drop('poi',axis=1))


In [26]:
features=df_norm
target = df_new['poi']

In [27]:
#modifying the testdata
#impute the missing values
dftest=strat_test_set
imp = Imputer(missing_values='NaN', strategy='median', axis=0)

#impute missing values of email features
dftest.loc[dftest[dftest.poi == 1].index,email_features] = imp.fit_transform(dftest[email_features][dftest.poi == 1])
dftest.loc[dftest[dftest.poi == 0].index,email_features] = imp.fit_transform(dftest[email_features][dftest.poi == 0])
#attr2=CombinedAttributersAdder()
df_new_test = attr.transform(dftest)
#df_new.to_csv("../df_new.csv")

scaler = StandardScaler()
df_norm_test = scaler.fit_transform(df_new_test.drop('poi',axis=1))

features_test=df_norm_test
target_test=df_new_test['poi']

In [28]:
#Decision tree using features with non-null importance
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix,precision_score,recall_score,precision_recall_curve
clf = DecisionTreeClassifier(random_state = 75)
clf=clf.fit(features,target)
#target_score=cross_val_score(clf,features,target,cv=3,)
target_pred = cross_val_predict(clf,features,target,cv=3)
confusion_matrix = confusion_matrix(target,target_pred)
precision_score(target,target_pred)
recall_score(target,target_pred)
##does not seem to be the best algo to use



0.14285714285714285

In [29]:
# show the features with non null importance, sorted and create features_list of features for the model
features_importance = []
for i in range(len(clf.feature_importances_)):
    if clf.feature_importances_[i] > 0:
        features_importance.append([df.columns[i+1], clf.feature_importances_[i]])
features_importance.sort(key=lambda x: x[1], reverse = True)
for f_i in features_importance:
    print f_i
features_list = [x[0] for x in features_importance]
features_list.insert(0, 'poi')

['exercised_stock_options', 0.25997425997425994]
['from_this_person_to_poi', 0.23139324412308845]
['other', 0.19152935701733442]
['to_messages', 0.12941953536012948]
['total_payments', 0.11374763850011378]
['fraction_of_overall_from_poi', 0.040664780763790674]
['expenses', 0.033271184261283289]


In [30]:
# Searchgrid for random forest
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
# specify parameters and distributions to sample from
param_grid = {'bootstrap': [False],
 'criterion': ['entropy'],
 'max_depth': [None],
 'max_features': [1],
 'min_samples_leaf': [1],
 'min_samples_split': [9]}
forest_reg = RandomForestClassifier()
grid_search = GridSearchCV(forest_reg,param_grid=param_grid)
grid_search.fit(features,target)
predictions=grid_search.predict(features_test)
print classification_report(target_test,predictions)

             precision    recall  f1-score   support

        0.0       0.87      1.00      0.93        26
        1.0       0.00      0.00      0.00         4

avg / total       0.75      0.87      0.80        30



  'precision', 'predicted', average, warn_for)


In [31]:
grid_search.best_score_
grid_search
#grid_search.cv_results_['params'][grid_search.best_index_]

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'bootstrap': [False], 'min_samples_leaf': [1], 'min_samples_split': [9], 'criterion': ['entropy'], 'max_features': [1], 'max_depth': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [53]:
#code for radient boosting classifier
from sklearn.ensemble import GradientBoostingClassifier
clf_gbc = GradientBoostingClassifier(n_estimators=20,max_features=None,
                                     min_samples_split=2, min_samples_leaf=1,
                                     min_weight_fraction_leaf=0.0, max_depth=3)
#grid search for GBC..first ran the grid search for a # of parameters and then found the best estimate and redid the score
param_grid = {
   'n_estimators':[10,20],
    'max_features':[None],
    'min_samples_split':[2,4],
    'min_samples_leaf':[1,2],
    'min_weight_fraction_leaf':[0.0],
    'max_depth':[3,4]}
# param_grid = {'criterion':'friedman_mse', 'init':[None],
#               'learning_rate':[0.1], 'loss':'deviance', 'max_depth':[3],
#               'max_features':[None], 'max_leaf_nodes':[None],
#               'min_impurity_split':[1e-07], 'min_samples_leaf':[2],
#               'min_samples_split':[2], 'min_weight_fraction_leaf':[0.0],
#               'n_estimators':[20], 'presort':['auto'], 'random_state':None,
#               'subsample':[1.0],'verbose':[0], 'warm_start':[False]
#                }
grid_search_gbc = GridSearchCV(clf_gbc,param_grid=param_grid)
grid_search_gbc.fit(features,target)

#target_predict=clf_gbc.predict(features_test)
#clf_gbc.score(target_test,target_predict)


GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=20, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_leaf': [1, 2], 'n_estimators': [10, 20], 'min_samples_split': [2, 4], 'min_weight_fraction_leaf': [0.0], 'max_features': [None], 'max_depth': [3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [54]:
grid_search_gbc.best_estimator_

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=4, min_weight_fraction_leaf=0.0,
              n_estimators=20, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)

In [55]:
predictions_gbc=grid_search_gbc.predict(features_test)
print classification_report(target_test,predictions_gbc)

             precision    recall  f1-score   support

        0.0       0.93      0.96      0.94        26
        1.0       0.67      0.50      0.57         4

avg / total       0.89      0.90      0.89        30

