# POI Classification

In [94]:
import sys
import pickle
import pandas as pd
import numpy as np
sys.path.append("../tools/")

In [2]:
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data, test_classifier
from preprocess import preprocess, ordered_columns



In [3]:
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)

my_dataset = preprocess(data_dict)

In [14]:
exclude = ["poi", "email_address"]
all_features_list = [f for f in my_dataset.items()[0][1].keys() if f not in exclude]

In [16]:
features_list_org = ['poi'] + ordered_columns + ["to_messages", "from_messages"]
features_list_ext = features_list_org + ["to_poi_perc", "from_poi_perc", "shared_with_poi_perc"]
features_list_full = ["poi"] + all_features_list

Dump cleaned dataset and features_list

In [93]:
with open("dev/my_dataset.pkl", "w") as dataset_outfile:
        pickle.dump(my_dataset, dataset_outfile)

with open("dev/features_list_org.pkl", "w") as featurelist_outfile:
        pickle.dump(features_list_org, featurelist_outfile)
with open("dev/features_list_ext.pkl", "w") as featurelist_outfile:
        pickle.dump(features_list_ext, featurelist_outfile)
with open("dev/features_list_full.pkl", "w") as featurelist_outfile:
        pickle.dump(features_list_full, featurelist_outfile)

In [46]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
                f1_score, make_scorer
from sklearn.feature_selection import SelectKBest, SelectPercentile, \
        mutual_info_classif, VarianceThreshold, chi2, f_classif
from sklearn.pipeline import Pipeline


## Test of different classifiers

Next I will consider three different classifiers.
Before applying each classifier feature selection procedure is run. Note that parameters of feature selection are tuned along with other classification parameters. 
100% threshoold in SelectPercentile means that all features are used in classification

## Naive Bayes 

In [19]:
data = featureFormat(my_dataset, features_list_org, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [56]:
base_clf = Pipeline([
  ('feature_selection', SelectPercentile()),
  ('classification', GaussianNB())
])

param = {
    "feature_selection__score_func" : [f_classif, mutual_info_classif],
    "feature_selection__percentile" : [30, 50, 70, 100],
}

clf_v1 = GridSearchCV(base_clf,
                    param_grid = param,
                    scoring = make_scorer(f1_score),
                    cv = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=32))

In [57]:
clf_v1.fit(features, labels)

  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=32, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=Pipeline(steps=[('feature_selection', SelectPercentile(percentile=10,
         score_func=<function f_classif at 0x1137ca668>)), ('classification', GaussianNB(priors=None))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'feature_selection__score_func': [<function f_classif at 0x1137ca668>, <function mutual_info_classif at 0x102b5d668>], 'feature_selection__percentile': [30, 50, 70, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(f1_score), verbose=0)

In [59]:
clf_v1.best_estimator_

Pipeline(steps=[('feature_selection', SelectPercentile(percentile=30,
         score_func=<function f_classif at 0x1137ca668>)), ('classification', GaussianNB(priors=None))])

In [62]:
clf_v1.best_score_

0.35413636363636364

In [79]:
pd.DataFrame(
{ "score" : clf_v1.cv_results_["mean_test_score"], 
"percentile" : clf_v1.cv_results_["param_feature_selection__percentile"],
"score_fun" : clf_v1.cv_results_['param_feature_selection__score_func']
})

Unnamed: 0,percentile,score,score_fun
0,30,0.354136,<function f_classif at 0x1137ca668>
1,30,0.210464,<function mutual_info_classif at 0x102b5d668>
2,50,0.303934,<function f_classif at 0x1137ca668>
3,50,0.261888,<function mutual_info_classif at 0x102b5d668>
4,70,0.29916,<function f_classif at 0x1137ca668>
5,70,0.302222,<function mutual_info_classif at 0x102b5d668>
6,100,0.295472,<function f_classif at 0x1137ca668>
7,100,0.295472,<function mutual_info_classif at 0x102b5d668>


In [81]:
accuracy, precision, recall = test_classifier(clf_v1.best_estimator_, my_dataset, features_list_org, folds = 1000)

Pipeline(steps=[('feature_selection', SelectPercentile(percentile=30,
         score_func=<function f_classif at 0x1137ca668>)), ('classification', GaussianNB(priors=None))])
	Accuracy: 0.85120	Precision: 0.43004	Recall: 0.35650	F1: 0.38983	F2: 0.36912
	Total predictions: 15000	True positives:  713	False positives:  945	False negatives: 1287	True negatives: 12055



In [91]:
with open("clf_v1.pkl", "w") as clf_outfile:
        pickle.dump(clf_v1, clf_outfile)

## Decision Tree

In [82]:
data = featureFormat(my_dataset, features_list_org, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [83]:
base_clf = Pipeline([
  ('feature_selection', SelectPercentile()),
  ('classification', DecisionTreeClassifier(class_weight="balanced"))
])

param = {
    "feature_selection__score_func" : [f_classif, mutual_info_classif],
    "feature_selection__percentile" : [30, 50, 70, 100],
    "classification__min_samples_leaf" : [3, 5, 8, 15],
    "classification__criterion" : ["gini", "entropy"]
}

clf_v2 = GridSearchCV(base_clf,
                    param_grid = param,
                    scoring = make_scorer(f1_score),
                    cv = StratifiedShuffleSplit(n_splits=100, test_size=0.2, random_state=32))

In [85]:
clf_v2.fit(features, labels)

GridSearchCV(cv=StratifiedShuffleSplit(n_splits=100, random_state=32, test_size=0.2,
            train_size=None),
       error_score='raise',
       estimator=Pipeline(steps=[('feature_selection', SelectPercentile(percentile=10,
         score_func=<function f_classif at 0x1137ca668>)), ('classification', DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'classification__min_samples_leaf': [3, 5, 8, 15], 'feature_selection__score_func': [<function f_classif at 0x1137ca668>, <function mutual_info_classif at 0x102b5d668>], 'classification__criterion': ['gini', 'entropy'], 'feature_selection__percentile': [30, 50, 70, 100]},
       pre_dispatch='2*n_jobs', refi

In [86]:
clf_v2.best_estimator_

Pipeline(steps=[('feature_selection', SelectPercentile(percentile=50,
         score_func=<function mutual_info_classif at 0x102b5d668>)), ('classification', DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=15,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

In [87]:
clf_v2.best_score_

0.41488685007802656

In [88]:
accuracy, precision, recall = test_classifier(clf_v2.best_estimator_, my_dataset, features_list_org, folds = 1000)

Pipeline(steps=[('feature_selection', SelectPercentile(percentile=50,
         score_func=<function mutual_info_classif at 0x102b5d668>)), ('classification', DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=15,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])
	Accuracy: 0.75980	Precision: 0.33264	Recall: 0.79650	F1: 0.46929	F2: 0.62280
	Total predictions: 15000	True positives: 1593	False positives: 3196	False negatives:  407	True negatives: 9804



## SVM

In [18]:
data = featureFormat(my_dataset, features_list_org, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [None]:
base_clf = Pipeline([
  ('scaling', MinMaxScaler()),     
  ('feature_selection', SelectPercentile()),
  ('classification', SVC()
])

param = {
    "feature_selection__score_func" : [f_classif, mutual_info_classif],
    "feature_selection__percentile" : [30, 50, 70, 100],
}

clf_v1 = GridSearchCV(base_clf,
                    param_grid = param,
                    scoring = make_scorer(f1_score),
                    cv = StratifiedKFold(random_state=42))

In [89]:
import sklearn

In [90]:
sklearn.__file__

'//anaconda/lib/python2.7/site-packages/sklearn/__init__.pyc'