# POI Classification

In [1]:
import sys
import pickle
import pandas as pd
import numpy as np
from time import time
sys.path.append("../tools/")
sys.path.append("../impl/")

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

% matplotlib inline

In [2]:
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data, test_classifier
from preprocess import preprocess, ordered_columns



In [4]:
with open("../impl/final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

my_dataset = preprocess(data_dict)

In [5]:
my_dataset_df = pd.DataFrame.from_dict(my_dataset, orient="index")
my_dataset_df.replace('NaN', np.nan, inplace=True)

In [6]:
exclude = ["poi", "email_address"]
all_features_list = [f for f in my_dataset.items()[0][1].keys() if f not in exclude]

In [7]:
features_list_org = ['poi'] + ordered_columns + ["to_messages", "from_messages"]
features_list_ext = features_list_org + ["to_poi_perc", "from_poi_perc", "shared_with_poi_perc"]
features_list_full = ["poi"] + all_features_list

Dump cleaned dataset and features_list

In [9]:
with open("../impl/dev/my_dataset.pkl", "w") as dataset_outfile:
        pickle.dump(my_dataset, dataset_outfile)

with open("../impl/dev/features_list_org.pkl", "w") as featurelist_outfile:
        pickle.dump(features_list_org, featurelist_outfile)
with open("../impl/dev/features_list_ext.pkl", "w") as featurelist_outfile:
        pickle.dump(features_list_ext, featurelist_outfile)
with open("../impl/dev/features_list_full.pkl", "w") as featurelist_outfile:
        pickle.dump(features_list_full, featurelist_outfile)

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from sklearn.feature_selection import  SelectPercentile, mutual_info_classif, f_classif
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler


## Test of different classifiers

Next I will consider three different classifiers.
Before applying each classifier feature selection procedure is run. Note that parameters of feature selection are tuned along with other classification parameters. 
100% threshoold in SelectPercentile means that all features are used in classification

## Naive Bayes 

In [11]:
features_list = features_list_org
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [12]:
def constructNB(base_clf=None, param=None): 
    """
    Construct Naive Bayes classifier, define default construction proccess
    and deafault grid search setup 
    """    
    
    if not base_clf:
        base_clf = Pipeline([
                    ('feature_selection', SelectPercentile()),
                    ('classification', GaussianNB())
                  ])
    if not param:
        param = {
            "feature_selection__score_func" : [f_classif, mutual_info_classif],
            "feature_selection__percentile" : [30, 50, 70, 100],
            }
    
    clf = GridSearchCV(base_clf,
                    param_grid = param,
                    scoring = make_scorer(f1_score),
                    cv = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=32))
    return clf
        

In [13]:
clf_v1 = constructNB()

In [14]:
def load_or_fit(clf, features, labels, path, dump_new=True, features_path=None):
    """
    Load dumped version of classifier from provided path.
    Otherwise fit classifier, if dump_new is True, then dump it in a provided path.
    If features_path is specified, than dump current features_list to the features_path
    """
    
    try:
        with open(path, "r") as clf_infile:
            fitted_clf = pickle.load(clf_infile)
        print "Classifier was loaded from ", path
    except IOError:
        print "Failed to load fitted classifier\nStart fitting..."
        t0 = time()
        fitted_clf = clf
        fitted_clf.fit(features, labels)
        print "Classifier fitted in ", round(time()-t0, 3), "s"
        if dump_new:
            with open(path, "w") as clf_outfile:
                pickle.dump(fitted_clf, clf_outfile)
            print "Fitted classifier was dumped to ", path
        
    if features_path:
        with open(features_path, "w") as feat_outfile:
            pickle.dump(features_list, feat_outfile)
    
    return fitted_clf 
            

In [15]:
def test_clf(clf, my_dataset, feature_list, n_splits=100, random_state=32, test_size=0.2):
    data = featureFormat(my_dataset, features_list, sort_keys = True)
    labels, features = targetFeatureSplit(data)
    sss = StratifiedShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=random_state)
    accuracy_list = []
    precision_list = []
    recall_list = []
    f1_list = []

    for train_index, test_index in sss.split(features, labels):
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_index:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_index:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )
        
        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)
        accuracy_list.append(accuracy_score(labels_test, pred))
        precision_list.append(precision_score(labels_test, pred))
        recall_list.append(recall_score(labels_test, pred))
        f1_list.append(f1_score(labels_test, pred))

    return np.mean(accuracy_list), np.mean(precision_list), np.mean(recall_list), \
            np.mean(f1_list)

In [16]:
def add_summary(fname, values, version):
    with open(fname, "a") as f:
        #if empty add header
        if f.tell() == 0: 
            f.write("Version, Accuracy, Precision, Recall, F1\n")
        line = version + ", %2.4f"*4 % values + "\n"
        f.write(line)

In [18]:
clf_v1 = load_or_fit(clf_v1, features, labels, path="../impl/dev/clf_v1.pkl", 
                     features_path = "../impl/dev/features_list_org.pkl")

Classifier was loaded from  ../impl/dev/clf_v1.pkl


In [19]:
clf_v1

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=32, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=Pipeline(steps=[('feature_selection', SelectPercentile(percentile=10,
         score_func=<function f_classif at 0x117790758>)), ('classification', GaussianNB(priors=None))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'feature_selection__score_func': [<function f_classif at 0x117790758>, <function mutual_info_classif at 0x117a74c08>], 'feature_selection__percentile': [30, 50, 70, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=0)

In [20]:
clf_v1.best_params_

{'feature_selection__percentile': 30,
 'feature_selection__score_func': <function sklearn.feature_selection.univariate_selection.f_classif>}

In [21]:
clf_v1.best_score_

0.35413636363636364

Best score correspond to best F1 value. Next some other scores are presented

In [25]:
scores = test_clf(clf_v1.best_estimator_, my_dataset, features_list_org)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver1") 

Accuracy: 0.850344827586, Precision: 0.44494047619, Recall: 0.33, F1: 0.354136363636


In [26]:
pd.DataFrame(
{ "score" : clf_v1.cv_results_["mean_test_score"], 
"percentile" : clf_v1.cv_results_["param_feature_selection__percentile"],
"score_fun" : clf_v1.cv_results_['param_feature_selection__score_func']
})

Unnamed: 0,percentile,score,score_fun
0,30,0.354136,<function f_classif at 0x117790758>
1,30,0.221919,<function mutual_info_classif at 0x117a74c08>
2,50,0.315711,<function f_classif at 0x117790758>
3,50,0.236975,<function mutual_info_classif at 0x117a74c08>
4,70,0.325613,<function f_classif at 0x117790758>
5,70,0.29119,<function mutual_info_classif at 0x117a74c08>
6,100,0.316618,<function f_classif at 0x117790758>
7,100,0.316618,<function mutual_info_classif at 0x117a74c08>


What features were used by NB?

In [27]:
f, pval = f_classif(features, labels)
l = zip(features_list_org[1:], f)
n_features = int(round(0.3*len(l)))
sorted(l, key = lambda s: s[1], reverse=True )[:n_features]

[('exercised_stock_options', 24.815079733218194),
 ('total_stock_value', 24.182898678566879),
 ('bonus', 20.792252047181535),
 ('salary', 18.289684043404513),
 ('deferred_income', 11.458476579280369)]

Note that NB classifier assumes that featues are independent, however we have included total payments and total stock values along with their constitution. Next we we run NB on reduced set of features and compare results.

In [28]:
features_list = [feat for feat in features_list_org if feat not in ["total_payments", "total_stock_value"]]
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

clf_v12 = constructNB()

In [29]:
clf_v12 = load_or_fit(clf_v12, features, labels, "../impl/dev/clf_v12.pkl", "dev/features_reduced.pkl")

Classifier was loaded from  ../impl/dev/clf_v12.pkl


In [30]:
print "Optimal parameters\n{}\n{}".format(clf_v12.best_params_, "-"*100)
print "Best test score\n{}\n{}".format(clf_v12.best_score_, "-"*100)

Optimal parameters
{'feature_selection__score_func': <function f_classif at 0x117790758>, 'feature_selection__percentile': 70}
----------------------------------------------------------------------------------------------------
Best test score
0.354200438038
----------------------------------------------------------------------------------------------------


Excluding correlated features did not change test score, however, we can notice that this time already 70% of features were included in the model. In next section we will try to resrtucture current features or add some new features in order to provide more information for the model.

In [31]:
scores = test_clf(clf_v12.best_estimator_, my_dataset, 
                                                       features_list)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver12(NB)") 

Accuracy: 0.794482758621, Precision: 0.396085107703, Recall: 0.425, F1: 0.354200438038


As for now, Naive Bayes gives nearly 0.35 test score. Both variants have variance and recall higher than 0.3.
The second classifier has higher recall - 0.42.

## Decision Tree

In [32]:
data = featureFormat(my_dataset, features_list_org, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [33]:
def constructDT(base_clf=None, param=None): 
    """
    Construct Naive Bayes classifier, define default construction proccess
    and deafault grid search setup 
    """    
    
    if not base_clf:
        base_clf = Pipeline([
                    ('feature_selection', SelectPercentile()),
                    ('classification', DecisionTreeClassifier(class_weight="balanced"))
                  ])
    if not param:
        param = {
            "feature_selection__score_func" : [f_classif, mutual_info_classif],
            "feature_selection__percentile" : [30, 50, 70, 100],
            "classification__min_samples_leaf" : [3, 5, 8, 15],
            "classification__criterion" : ["gini", "entropy"]
            }
    
    clf = GridSearchCV(base_clf,
                    param_grid = param,
                    scoring = make_scorer(f1_score),
                    cv = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=32))
    return clf

In [34]:
clf_v2 = constructDT()
clf_v2 = load_or_fit(clf_v2, features, labels, "../impl/dev/clf_v2.pkl")

Classifier was loaded from  ../impl/dev/clf_v2.pkl


In [35]:
clf_v2.best_params_

{'classification__criterion': 'entropy',
 'classification__min_samples_leaf': 15,
 'feature_selection__percentile': 100,
 'feature_selection__score_func': <function sklearn.feature_selection.mutual_info_.mutual_info_classif>}

In [36]:
clf_v2.best_score_

0.41331916776034427

In [37]:
scores = test_clf(clf_v2.best_estimator_, my_dataset, features_list_org)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver2(DT)") 

Accuracy: 0.741034482759, Precision: 0.314670204795, Recall: 0.695, F1: 0.418500058765


## SVM

In [38]:
data = featureFormat(my_dataset, features_list_org, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [39]:
def constructSVM(base_clf=None, param=None): 
    """
    Construct Naive Bayes classifier, define default construction proccess
    and deafault grid search setup 
    """    
    
    if not base_clf:
        base_clf = Pipeline([
                    ('scaling', MinMaxScaler()),     
                    ('feature_selection', SelectPercentile()),
                    ('classification', SVC(class_weight="balanced"))
                  ])
    if not param:
        param = {
            "feature_selection__score_func" : [f_classif, mutual_info_classif],
            "feature_selection__percentile" : [30, 50, 70, 100],
            "classification__C" : np.logspace(-2,2,num=5,endpoint=True),
            "classification__gamma" : np.logspace(-1,1,num=5,endpoint=True)
            }
    
    clf = GridSearchCV(base_clf,
                    param_grid = param,
                    scoring = make_scorer(f1_score),
                    cv = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=32))
    return clf

In [40]:
clf_v3 = constructSVM()
clf_v3 = load_or_fit(clf_v3, features, labels, "../impl/dev/clf_v3.pkl")

Classifier was loaded from  ../impl/dev/clf_v3.pkl


In [41]:
clf_v3.best_params_

{'classification__C': 0.10000000000000001,
 'classification__gamma': 10.0,
 'feature_selection__percentile': 30,
 'feature_selection__score_func': <function sklearn.feature_selection.univariate_selection.f_classif>}

In [42]:
clf_v3.best_score_

0.40470640812799907

In [43]:
scores = test_clf(clf_v3.best_estimator_, my_dataset, features_list_org)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver3(SVM)") 

Accuracy: 0.660689655172, Precision: 0.264783136712, Recall: 0.79, F1: 0.390650660504


## Logistic Regression

In [44]:
data = featureFormat(my_dataset, features_list_org, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [45]:
def constructLogReg(base_clf=None, param=None): 
    """
    Construct Naive Bayes classifier, define default construction proccess
    and deafault grid search setup 
    """    
    
    if not base_clf:
        base_clf = Pipeline([
                    ('scaling', MinMaxScaler()),     
                    ('feature_selection', SelectPercentile()),
                    ('classification', LogisticRegression(class_weight="balanced"))
                  ])
    if not param:
        param = {
            "feature_selection__score_func" : [f_classif, mutual_info_classif],
            "feature_selection__percentile" : [30, 50, 70, 100],
            "classification__C" : np.logspace(-2,2,num=5,endpoint=True)
            }
    
    clf = GridSearchCV(base_clf,
                    param_grid = param,
                    scoring = make_scorer(f1_score),
                    cv = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=32))
    return clf

In [46]:
clf_v4 = constructLogReg()
clf_v4 = load_or_fit(clf_v4, features, labels, "../impl/dev/clf_v4.pkl")

Classifier was loaded from  ../impl/dev/clf_v4.pkl


In [47]:
clf_v4.best_params_

{'classification__C': 0.10000000000000001,
 'feature_selection__percentile': 50,
 'feature_selection__score_func': <function sklearn.feature_selection.mutual_info_.mutual_info_classif>}

In [48]:
clf_v4.best_score_

0.40310758754489395

In [49]:
scores = test_clf(clf_v4.best_estimator_, my_dataset, features_list_org)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver4(LogReg)") 

Accuracy: 0.724827586207, Precision: 0.300125976148, Recall: 0.6575, F1: 0.391908545101


Calculated on reduced set of features

In [50]:
features_list = [feat for feat in features_list_org if feat not in ["total_payments", "total_stock_value"]]
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

clf_v41 = constructLogReg()
clf_v41 = load_or_fit(clf_v41, features, labels, "../impl/dev/clf_v41.pkl")

Classifier was loaded from  ../impl/dev/clf_v41.pkl


In [51]:
clf_v41.best_params_

{'classification__C': 0.10000000000000001,
 'feature_selection__percentile': 30,
 'feature_selection__score_func': <function sklearn.feature_selection.mutual_info_.mutual_info_classif>}

In [52]:
clf_v4.best_score_

0.40310758754489395

In [53]:
scores = test_clf(clf_v41.best_estimator_, my_dataset, features_list_org)
print "Accuracy: {}, Precision: {}, Recall: {}, F1: {}".format(*scores)
add_summary("../impl/dev/summary.csv",scores, "ver41(LogReg)") 

Accuracy: 0.715862068966, Precision: 0.293816487434, Recall: 0.6475, F1: 0.384401632853


## Adding new features

TO DO: Add new features, run Logistic Regression and Decision Tree

## Analysis of importance

In [125]:
#TO DO: Draw decision tree

# from IPython.display import Image

# export_graphviz(clf_v2.best_estimator_, "tree.dot", feature_names=features_list_org[1:],  
#                 class_names = ["Not POI", "POI"])
# ! dot -Tpng tree.dot > tree.png
# Image(filename="tree.png")

## Summary

TO DO: Show summury of all runs. Select classifier