# POI Classification

In [1]:
import sys
import pickle
import pandas as pd
import numpy as np
from time import time
sys.path.append("../tools/")
sys.path.append("../impl/")

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

% matplotlib inline

In [3]:
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data, test_classifier
from preprocess import preprocess, ordered_columns



In [4]:
with open("../impl/final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

my_dataset = preprocess(data_dict)

In [5]:
my_dataset_df = pd.DataFrame.from_dict(my_dataset, orient="index")
my_dataset_df.replace('NaN', np.nan, inplace=True)

In [6]:
exclude = ["poi", "email_address"]
all_features_list = [f for f in my_dataset.items()[0][1].keys() if f not in exclude]

In [7]:
features_list_org = ['poi'] + ordered_columns + ["to_messages", "from_messages", "from_this_person_to_poi", 
                                                "shared_receipt_with_poi", "from_poi_to_this_person"]
features_list_ext = features_list_org + ["to_poi_perc", "from_poi_perc", "shared_with_poi_perc"]
features_list_full = ["poi"] + all_features_list

Dump cleaned dataset and features_list

In [8]:
with open("../impl/dev/my_dataset.pkl", "w") as dataset_outfile:
        pickle.dump(my_dataset, dataset_outfile)

with open("../impl/dev/features_list_org.pkl", "w") as featurelist_outfile:
        pickle.dump(features_list_org, featurelist_outfile)
with open("../impl/dev/features_list_ext.pkl", "w") as featurelist_outfile:
        pickle.dump(features_list_ext, featurelist_outfile)
with open("../impl/dev/features_list_full.pkl", "w") as featurelist_outfile:
        pickle.dump(features_list_full, featurelist_outfile)

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.feature_selection import  SelectPercentile, mutual_info_classif, f_classif
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler


## Original feature set

### Naive Bayes 

In [10]:
features_list = features_list_org
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [11]:
def constructNB(base_clf=None, param=None): 
    """
    Construct Naive Bayes classifier, define default construction proccess
    and deafault grid search setup 
    """    
    
    if not base_clf:
        base_clf = Pipeline([
                    ('feature_selection', SelectPercentile()),
                    ('classification', GaussianNB())
                  ])
    if not param:
        param = {
            "feature_selection__score_func" : [f_classif, mutual_info_classif],
            "feature_selection__percentile" : [30, 50, 70, 100],
            }
    
    clf = GridSearchCV(base_clf,
                    param_grid = param,
                    scoring = make_scorer(f1_score),
                    cv = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=32))
    return clf
        

In [12]:
clf_v1 = constructNB()

In [13]:
def load_or_fit(clf, features, labels, path, dump_new=True, features_path=None):
    """
    Load dumped version of classifier from provided path.
    Otherwise fit classifier, if dump_new is True, then dump it in a provided path.
    If features_path is specified, than dump current features_list to the features_path
    """
    
    try:
        with open(path, "r") as clf_infile:
            fitted_clf = pickle.load(clf_infile)
        print "Classifier was loaded from ", path
    except IOError:
        print "Failed to load fitted classifier\nStart fitting..."
        t0 = time()
        fitted_clf = clf
        fitted_clf.fit(features, labels)
        print "Classifier fitted in ", round(time()-t0, 3), "s"
        if dump_new:
            with open(path, "w") as clf_outfile:
                pickle.dump(fitted_clf, clf_outfile)
            print "Fitted classifier was dumped to ", path
        
    if features_path:
        with open(features_path, "w") as feat_outfile:
            pickle.dump(features_list, feat_outfile)
    
    return fitted_clf 
            

In [14]:
def test_clf(clf, my_dataset, features_list, n_splits=100, random_state=32, test_size=0.2):
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_list = []

    for train_index, test_index in sss.split(features, labels):
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_index:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_index:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)
        accuracy_list.append(accuracy_score(labels_test, pred))
        precision_list.append(precision_score(labels_test, pred))
        recall_list.append(recall_score(labels_test, pred))
        f1_list.append(f1_score(labels_test, pred))

    return np.mean(accuracy_list), np.mean(precision_list), np.mean(recall_list), \
            np.mean(f1_list)

In [15]:
def add_summary(fname, values, version):
    with open(fname, "a") as f:
        #if empty add header
        if f.tell() == 0: 
            f.write("Version, Accuracy, Precision, Recall, F1\n")
        line = version + ", %2.4f"*4 % values + "\n"
        f.write(line)

In [16]:
clf_v1 = load_or_fit(clf_v1, features, labels, path="../impl/dev/clf_v1.pkl", 
                     features_path = "../impl/dev/features_list_org.pkl")

Classifier was loaded from  ../impl/dev/clf_v1.pkl


In [17]:
clf_v1

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=32, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=Pipeline(steps=[('feature_selection', SelectPercentile(percentile=10,
         score_func=<function f_classif at 0x117394a28>)), ('classification', GaussianNB(priors=None))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'feature_selection__score_func': [<function f_classif at 0x117394a28>, <function mutual_info_classif at 0x11752e938>], 'feature_selection__percentile': [30, 50, 70, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=0)

In [18]:
clf_v1.best_params_

{'feature_selection__percentile': 70,
 'feature_selection__score_func': <function sklearn.feature_selection.univariate_selection.f_classif>}

In [19]:
clf_v1.best_score_

0.33256631930161346

Best score correspond to best F1 value. Next some other scores are presented

In [20]:
scores = test_clf(clf_v1.best_estimator_, my_dataset, features_list_org)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver1(NB)") 

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy: 0.831034482759, Precision: 0.411264041514, Recall: 0.3375, F1: 0.332566319302


In [21]:
pd.DataFrame(
{ "score" : clf_v1.cv_results_["mean_test_score"], 
"percentile" : clf_v1.cv_results_["param_feature_selection__percentile"],
"score_fun" : clf_v1.cv_results_['param_feature_selection__score_func']
})

Unnamed: 0,percentile,score,score_fun
0,30,0.303232,<function f_classif at 0x117394a28>
1,30,0.203847,<function mutual_info_classif at 0x11752e938>
2,50,0.316916,<function f_classif at 0x117394a28>
3,50,0.241352,<function mutual_info_classif at 0x11752e938>
4,70,0.332566,<function f_classif at 0x117394a28>
5,70,0.285572,<function mutual_info_classif at 0x11752e938>
6,100,0.329348,<function f_classif at 0x117394a28>
7,100,0.329348,<function mutual_info_classif at 0x11752e938>


What features were used by NB?

In [22]:
f, pval = f_classif(features, labels)
l = zip(features_list_org[1:], f)
n_features = int(round(0.3*len(l)))
sorted(l, key = lambda s: s[1], reverse=True )[:n_features]

[('exercised_stock_options', 24.815079733218194),
 ('total_stock_value', 24.182898678566879),
 ('bonus', 20.792252047181535),
 ('salary', 18.289684043404513),
 ('deferred_income', 11.458476579280369)]

Note that NB classifier assumes that featues are independent, however we have included total payments and total stock values along with their constitution. Next we we run NB on reduced set of features and compare results.

In [23]:
features_list = [feat for feat in features_list_org if feat not in ["total_payments", "total_stock_value"]]
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

clf_v12 = constructNB()

In [24]:
clf_v12 = load_or_fit(clf_v12, features, labels, "../impl/dev/clf_v12.pkl", "dev/features_reduced.pkl")

Classifier was loaded from  ../impl/dev/clf_v12.pkl


In [25]:
print "Optimal parameters\n{}\n{}".format(clf_v12.best_params_, "-"*100)
print "Best test score\n{}\n{}".format(clf_v12.best_score_, "-"*100)

Optimal parameters
{'feature_selection__score_func': <function f_classif at 0x117394a28>, 'feature_selection__percentile': 70}
----------------------------------------------------------------------------------------------------
Best test score
0.347924703023
----------------------------------------------------------------------------------------------------


Excluding correlated features did not change test score, however, we can notice that this time already 70% of features were included in the model. In next section we will try to resrtucture current features or add some new features in order to provide more information for the model.

In [26]:
scores = test_clf(clf_v12.best_estimator_, my_dataset, 
                                                       features_list)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver12(NB)") 

Accuracy: 0.808965517241, Precision: 0.397204517705, Recall: 0.395, F1: 0.347924703023


As for now, Naive Bayes gives nearly 0.35 test score. Both variants have variance and recall higher than 0.3.
The second classifier has higher recall - 0.42.

### Decision Tree

In [27]:
data = featureFormat(my_dataset, features_list_org, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [28]:
def constructDT(base_clf=None, param=None): 
    """
    Construct Naive Bayes classifier, define default construction proccess
    and deafault grid search setup 
    """    
    
    if not base_clf:
        base_clf = Pipeline([
                    ('feature_selection', SelectPercentile()),
                    ('classification', DecisionTreeClassifier(class_weight="balanced"))
                  ])
    if not param:
        param = {
            "feature_selection__score_func" : [f_classif, mutual_info_classif],
            "feature_selection__percentile" : [30, 50, 70, 100],
            "classification__min_samples_leaf" : [3, 5, 8, 15],
            "classification__criterion" : ["gini", "entropy"]
            }
    
    clf = GridSearchCV(base_clf,
                    param_grid = param,
                    scoring = make_scorer(f1_score),
                    cv = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=32))
    return clf

In [29]:
clf_v2 = constructDT()
clf_v2 = load_or_fit(clf_v2, features, labels, "../impl/dev/clf_v2.pkl")

Classifier was loaded from  ../impl/dev/clf_v2.pkl


In [30]:
clf_v2.best_params_

{'classification__criterion': 'entropy',
 'classification__min_samples_leaf': 15,
 'feature_selection__percentile': 30,
 'feature_selection__score_func': <function sklearn.feature_selection.mutual_info_.mutual_info_classif>}

In [31]:
clf_v2.best_score_

0.38996238565356212

In [32]:
scores = test_clf(clf_v2.best_estimator_, my_dataset, features_list_org)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver2(DT)") 

Accuracy: 0.732068965517, Precision: 0.284186396936, Recall: 0.6275, F1: 0.381473041664


### SVM

In [33]:
data = featureFormat(my_dataset, features_list_org, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [34]:
def constructSVM(base_clf=None, param=None): 
    """
    Construct Naive Bayes classifier, define default construction proccess
    and deafault grid search setup 
    """    
    
    if not base_clf:
        base_clf = Pipeline([
                    ('scaling', MinMaxScaler()),     
                    ('feature_selection', SelectPercentile()),
                    ('classification', SVC(class_weight="balanced"))
                  ])
    if not param:
        param = {
            "feature_selection__score_func" : [f_classif, mutual_info_classif],
            "feature_selection__percentile" : [30, 50, 70, 100],
            "classification__C" : np.logspace(-2,2,num=5,endpoint=True),
            "classification__gamma" : np.logspace(-1,1,num=5,endpoint=True)
            }
    
    clf = GridSearchCV(base_clf,
                    param_grid = param,
                    scoring = make_scorer(f1_score),
                    cv = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=32))
    return clf

In [35]:
clf_v3 = constructSVM()
clf_v3 = load_or_fit(clf_v3, features, labels, "../impl/dev/clf_v3.pkl")

Classifier was loaded from  ../impl/dev/clf_v3.pkl


In [36]:
clf_v3.best_params_

{'classification__C': 0.10000000000000001,
 'classification__gamma': 10.0,
 'feature_selection__percentile': 70,
 'feature_selection__score_func': <function sklearn.feature_selection.univariate_selection.f_classif>}

In [37]:
clf_v3.best_score_

0.4198014694993763

In [38]:
scores = test_clf(clf_v3.best_estimator_, my_dataset, features_list_org)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver3(SVM)") 

Accuracy: 0.648620689655, Precision: 0.276362503381, Recall: 0.9, F1: 0.419801469499


### Logistic Regression

In [39]:
data = featureFormat(my_dataset, features_list_org, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [40]:
def constructLogReg(base_clf=None, param=None): 
    """
    Construct Naive Bayes classifier, define default construction proccess
    and deafault grid search setup 
    """    
    
    if not base_clf:
        base_clf = Pipeline([
                    ('scaling', MinMaxScaler()),     
                    ('feature_selection', SelectPercentile()),
                    ('classification', LogisticRegression(class_weight="balanced"))
                  ])
    if not param:
        param = {
            "feature_selection__score_func" : [f_classif, mutual_info_classif],
            "feature_selection__percentile" : [30, 50, 70, 100],
            "classification__C" : np.logspace(-2,2,num=5,endpoint=True)
            }
    
    clf = GridSearchCV(base_clf,
                    param_grid = param,
                    scoring = make_scorer(f1_score),
                    cv = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=32))
    return clf

In [41]:
clf_v4 = constructLogReg()
clf_v4 = load_or_fit(clf_v4, features, labels, "../impl/dev/clf_v4.pkl")

Classifier was loaded from  ../impl/dev/clf_v4.pkl


In [42]:
clf_v4.best_params_

{'classification__C': 0.01,
 'feature_selection__percentile': 100,
 'feature_selection__score_func': <function sklearn.feature_selection.univariate_selection.f_classif>}

In [43]:
clf_v4.best_score_

0.42037373287262236

In [44]:
scores = test_clf(clf_v4.best_estimator_, my_dataset, features_list_org)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver4(LogReg)") 

Accuracy: 0.734482758621, Precision: 0.315266090892, Recall: 0.7, F1: 0.420373732873


Calculated on reduced set of features

In [45]:
features_list = [feat for feat in features_list_org if feat not in ["total_payments", "total_stock_value"]]
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

clf_v41 = constructLogReg()
clf_v41 = load_or_fit(clf_v41, features, labels, "../impl/dev/clf_v41.pkl")

Classifier was loaded from  ../impl/dev/clf_v41.pkl


In [46]:
clf_v41.best_params_

{'classification__C': 0.01,
 'feature_selection__percentile': 100,
 'feature_selection__score_func': <function sklearn.feature_selection.univariate_selection.f_classif>}

In [47]:
clf_v41.best_score_

0.42308613192867461

In [48]:
scores = test_clf(clf_v41.best_estimator_, my_dataset, features_list)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver41(LogReg)") 

Accuracy: 0.734827586207, Precision: 0.318005018631, Recall: 0.6975, F1: 0.423086131929


## Extended feature set

### Logistic Regression

In [49]:
data = featureFormat(my_dataset, features_list_full, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [50]:
clf_v5 = constructLogReg()
clf_v5 = load_or_fit(clf_v5, features, labels, "../impl/dev/clf_v5.pkl")

Classifier was loaded from  ../impl/dev/clf_v5.pkl


In [51]:
clf_v5.best_params_

{'classification__C': 0.10000000000000001,
 'feature_selection__percentile': 70,
 'feature_selection__score_func': <function sklearn.feature_selection.univariate_selection.f_classif>}

In [52]:
clf_v5.best_score_

0.47103806994519054

In [53]:
scores = test_clf(clf_v5.best_estimator_, my_dataset, features_list_full)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver5(LogReg)") 

Accuracy: 0.76724137931, Precision: 0.363692488067, Recall: 0.725, F1: 0.471038069945


Try logistic regression on manually selected list of features

In [54]:
features_list_pick = [
 'poi',
 'bonus',
 'deferred_income',
 'expenses',
 'from_poi_to_this_person_perc',
 'from_this_person_to_poi_perc',
 'gross_payments',
 'long_term_incentive',
 'long_term_incentive_perc',
 'other',
 'restricted_stock',
 'salary',
 'salary_perc',
 'shared_receipt_with_poi',
 'shared_receipt_with_poi_perc',
 'total_stock_value'
]

data = featureFormat(my_dataset, features_list_pick, sort_keys = True)
labels, features = targetFeatureSplit(data)


In [60]:
param_lr = {
            "feature_selection__score_func" : [f_classif],
            "feature_selection__percentile" : [100],
            "classification__C" : np.logspace(-2,2,num=5,endpoint=True)
            }
clf_v6 = constructLogReg(param=param_lr)
clf_v6 = load_or_fit(clf_v6, features, labels, "../impl/dev/clf_v6.pkl")

Failed to load fitted classifier
Start fitting...
Classifier fitted in  6.08 s
Fitted classifier was dumped to  ../impl/dev/clf_v6.pkl


In [61]:
clf_v6.best_params_

{'classification__C': 1.0,
 'feature_selection__percentile': 100,
 'feature_selection__score_func': <function sklearn.feature_selection.univariate_selection.f_classif>}

In [63]:
clf_v6.best_score_

0.4495104389074977

In [64]:
scores = test_clf(clf_v6.best_estimator_, my_dataset, features_list_pick)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver6(LogReg)") 

Accuracy: 0.772068965517, Precision: 0.353054556555, Recall: 0.67, F1: 0.449510438907


### Decision Tree

In [93]:
data = featureFormat(my_dataset, features_list_full, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [94]:
clf_v7 = constructDT()
clf_v7 = load_or_fit(clf_v7, features, labels, "../impl/dev/clf_v7.pkl")

Failed to load fitted classifier
Start fitting...
Classifier fitted in  753.692 s
Fitted classifier was dumped to  ../impl/dev/clf_v7.pkl


In [96]:
clf_v7.best_score_

0.43233605283605281

In [97]:
scores = test_clf(clf_v7.best_estimator_, my_dataset, features_list_full)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver7(DT)") 

Accuracy: 0.823103448276, Precision: 0.404313492063, Recall: 0.4975, F1: 0.429126567877


### SVM

In [98]:
clf_v8 = constructSVM(param = {
            "feature_selection__score_func" : [f_classif],
            "feature_selection__percentile" : [30, 50, 70, 100],
            "classification__C" : np.logspace(-2,2,num=5,endpoint=True),
            "classification__gamma" : np.logspace(-1,1,num=5,endpoint=True)
            })
clf_v8 = load_or_fit(clf_v8, features, labels, "../impl/dev/clf_v8.pkl")

Failed to load fitted classifier
Start fitting...
Classifier fitted in  159.923 s
Fitted classifier was dumped to  ../impl/dev/clf_v8.pkl


In [99]:
clf_v8.best_score_

0.47911654685184102

In [100]:
scores = test_clf(clf_v8.best_estimator_, my_dataset, features_list_full)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver8(SVM)") 

Accuracy: 0.817586206897, Precision: 0.417656898657, Recall: 0.615, F1: 0.479116546852


## Analysis of importance

Logistic regression was selected as a final classifier.
Next I will analyze feature importance based on coefficients of logistic regression.

In [66]:
data = featureFormat(my_dataset, features_list_full, sort_keys = True)
labels, features = targetFeatureSplit(data)

final_clf = clf_v5.best_estimator_
final_clf.fit(features, labels)

Pipeline(steps=[('scaling', MinMaxScaler(copy=True, feature_range=(0, 1))), ('feature_selection', SelectPercentile(percentile=70,
         score_func=<function f_classif at 0x117394a28>)), ('classification', LogisticRegression(C=0.10000000000000001, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])

In [72]:
len(features_list_full)

35

In [81]:
feat = np.array(features_list_full[1:])
sel_feat = feat[final_clf.named_steps['feature_selection'].get_support()]
sel_feat

array(['to_messages', 'shared_receipt_with_poi_perc', 'expenses',
       'long_term_incentive', 'from_poi_to_this_person', 'deferred_income',
       'shared_receipt_with_poi', 'other', 'bonus_perc',
       'long_term_incentive_perc', 'director_fees', 'gross_payments',
       'bonus', 'total_stock_value', 'from_this_person_to_poi',
       'other_perc', 'restricted_stock', 'salary', 'deferred_income_perc',
       'total_payments', 'from_poi_to_this_person_perc',
       'exercised_stock_options', 'salary_perc',
       'from_this_person_to_poi_perc'], 
      dtype='|S30')

In [88]:
sorted(zip(sel_feat, *final_clf.named_steps['classification'].coef_), key=lambda s: abs(s[1]) , reverse=True)

[('bonus_perc', 0.49050521879589942),
 ('deferred_income', -0.45157412939258657),
 ('exercised_stock_options', 0.3470091776306421),
 ('shared_receipt_with_poi_perc', 0.33435054735265907),
 ('from_this_person_to_poi_perc', 0.32917538230366422),
 ('long_term_incentive_perc', 0.30285327762752551),
 ('total_stock_value', 0.29567808814373142),
 ('expenses', 0.2925797147324109),
 ('salary', 0.28209926153241111),
 ('bonus', 0.26818112045952591),
 ('shared_receipt_with_poi', 0.19122736580832075),
 ('director_fees', -0.19003692718561885),
 ('salary_perc', 0.15384040116668052),
 ('long_term_incentive', 0.14994070178112423),
 ('other_perc', 0.14711408932559275),
 ('from_this_person_to_poi', 0.12217186623803525),
 ('restricted_stock', 0.1042811682825396),
 ('from_poi_to_this_person', 0.097986278115711825),
 ('from_poi_to_this_person_perc', 0.08446714298496634),
 ('gross_payments', 0.081422150985282868),
 ('total_payments', 0.067987331481177152),
 ('other', 0.058974615084958257),
 ('deferred_income

## Evaluation

In [89]:
test_classifier(final_clf, my_dataset, features_list_full)

Pipeline(steps=[('scaling', MinMaxScaler(copy=True, feature_range=(0, 1))), ('feature_selection', SelectPercentile(percentile=70,
         score_func=<function f_classif at 0x117394a28>)), ('classification', LogisticRegression(C=0.10000000000000001, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False))])
	Accuracy: 0.75969	Precision: 0.33056	Recall: 0.72400	F1: 0.45388	F2: 0.58479
	Total predictions: 29000	True positives: 2896	False positives: 5865	False negatives: 1104	True negatives: 19135



(0.7596896551724138, 0.33055587261728114, 0.724)