# Analysis of Enron Data
## Based on Udacity intro to machine learning course

### Data sources:
- Raw email text data can be found at: https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tgz
and a breakdown of emails by sender can be found [here](data/emails_by_address/)
- The financial data was compiled from [this file](data/financial_data.pdf)
- The persons of interest "pois" come from [this file](data/poi_names.txt)

### Support functions and classes

In [2]:
import numpy as np
from tools.feature_format import featureFormat

### Get the data already preprocessed in [outlier_removal](outlier_removal.ipynb) and [imputing_data](imputing_data.ipynb)

In [3]:
import pickle

with open('imputer_dicts.pkl', 'rb') as f:
    imputer_dicts = pickle.load(f)

mean_data_dict = imputer_dicts['mean']

# Start with all features except: 'email_address'
all_feature_names = ['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments',\
'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi',\
'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances',\
'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income',\
'long_term_incentive', 'from_poi_to_this_person']

mean_data = featureFormat(mean_data_dict, all_feature_names)
labels, features = mean_data[:,0], mean_data[:, 1:]

### Make a pipeline

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

pipe = Pipeline([
                 ('scaler', StandardScaler()),
                 ('polynomials', PolynomialFeatures(2)),
                 ('feature_selr', SelectFromModel(
                                                 ExtraTreesClassifier(
                                                     random_state=2,
                                                     class_weight='balanced'), 
                                                 threshold='mean')),
                ('estimator', GaussianNB())
                ])

In [6]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report

def score_display(clf, features, labels, title):
    print title, ':\n'
    for scoring in ['recall', 'precision', 'f1']:
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=24)
        scores = cross_val_score(pipe, features, labels, cv=cv, scoring=scoring)
        mean, std = np.mean(scores), np.std(scores)
        print '-', scoring, scores
        print 'Mean: %0.2f (+/-%0.2f)' % (mean, std)

score_display(pipe, features, labels, 'Naive-Bayes')

Naive-Bayes :

- recall [ 0.5  0.6  0.2]
Mean: 0.43 (+/-0.17)
- precision [ 0.42857143  0.375       0.33333333]
Mean: 0.38 (+/-0.04)
- f1 [ 0.46153846  0.46153846  0.25      ]
Mean: 0.39 (+/-0.10)


In [12]:
### Try a variety of classifiers

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

params = {'estimator': [SVC(), 
                        AdaBoostClassifier(),
                        ExtraTreesClassifier(),
                        RandomForestClassifier(),
                        LogisticRegression()]
         }

grid = GridSearchCV(pipe, param_grid=params,
                    cv=15, verbose=1,
                    n_jobs=-1, scoring='f1')

grid.fit(features, labels)
print grid.best_params_
print grid.best_score_

Fitting 15 folds for each of 5 candidates, totalling 75 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done  68 out of  75 | elapsed:    5.1s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:    5.2s finished


{'estimator': AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)}
0.19621749409


In [47]:
def find_top_scores(grid, cutoff=5, cv=5):

    grid_results = grid.cv_results_

    score_keys = []
    end_str = '_test_score'
    start_str = 'split'
    for num in range(cv):
        complete_str = start_str+str(num)+end_str
        score_keys.append(complete_str)

    ranks = grid_results['rank_test_score']
    for rank in range(1,cutoff+1):
        i = np.where(ranks == rank)[0][0]
        print type(grid_results['params'][i]['estimator'])
        best_test_scores = []
        for key in score_keys:
            best_test_scores.append(grid_results[key][i])
        print np.mean(best_test_scores), np.std(best_test_scores)
    return best_test_scores

best = find_top_scores(grid, 5, 15)

<class 'sklearn.ensemble.weight_boosting.AdaBoostClassifier'>
0.2 0.361068373539
<class 'sklearn.ensemble.forest.ExtraTreesClassifier'>
0.177777777778 0.362433476229
<class 'sklearn.linear_model.logistic.LogisticRegression'>
0.144444444444 0.303477784083
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
0.1 0.270801280155
<class 'sklearn.svm.classes.SVC'>
0.0 0.0


In [43]:
from sklearn.tree import DecisionTreeClassifier
DTC = DecisionTreeClassifier(class_weight='balanced', random_state=42)
ABC = AdaBoostClassifier(base_estimator=DTC, random_state=42)
ABC_params = {'polynomials__degree': [2,3],
              'estimator': [ABC],
              'estimator__base_estimator__max_depth': [None, 3, 6, 12, 24, 48],
              'estimator__base_estimator__min_samples_leaf': [1, 2, 4, 8, 16],
              'estimator__n_estimators': [25, 50, 100, 250],
              'estimator__learning_rate': [0.01, 0.1, 1, 10]}

ETC = ExtraTreesClassifier(class_weight='balanced', random_state=42)
ETC_params = {'polynomials__degree': [2,3],
              'estimator': [ETC],
              'estimator__max_depth': [None, 3, 6, 12, 24, 48],
              'estimator__min_samples_leaf': [1, 2, 4, 8, 16],
              'estimator__n_estimators': [25, 50, 100],
              'estimator__max_features': [1, 2, 4, 6, 8, 10]}

RF = RandomForestClassifier(class_weight='balanced', random_state=42)
RF_params = {'polynomials__degree': [2,3],
             'estimator': [RF],
             'estimator__max_depth': [None, 3, 6, 12, 24, 48],
             'estimator__min_samples_leaf': [1, 2, 4, 8, 16],
             'estimator__n_estimators': [25, 50, 100],
             'estimator__max_features': [1, 2, 4, 6, 8, 10]}

LR = LogisticRegression(solver='lbfgs')
LR_params = {'polynomials__degree': [2,3],
             'estimator': [LR],
             'estimator__C': [1e-2, 1e-1, 1, 1e1, 1e2],
             'estimator__class_weight': [None, 'balanced'],
             'estimator__max_iter': [100, 200],
             'estimator__tol': [1e-4, 5e-4, 1e-3],
             'estimator__warm_start': [False, True]}

large_grid_params = [ABC_params, ETC_params, RF_params, LR_params]

In [44]:
import warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    large_grid = GridSearchCV(pipe,
                              param_grid=large_grid_params,
                              cv=5, 
                              verbose=1,
                              n_jobs=-1,
                              scoring='f1')

    large_grid.fit(features, labels)

Fitting 5 folds for each of 3360 candidates, totalling 16800 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 326 tasks      | elapsed:   33.1s
[Parallel(n_jobs=-1)]: Done 576 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 926 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 1376 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 1926 tasks      | elapsed:  7.7min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 10.4min
[Parallel(n_jobs=-1)]: Done 3326 tasks      | elapsed: 13.2min
[Parallel(n_jobs=-1)]: Done 4176 tasks      | elapsed: 16.0min
[Parallel(n_jobs=-1)]: Done 5126 tasks      | elapsed: 19.6min
[Parallel(n_jobs=-1)]: Done 6176 tasks      | elapsed: 22.8min
[Parallel(n_jobs=-1)]: Done 7326 tasks      | elapsed: 26.4min
[Parallel(n_jobs=-1)]: Done 8576 tasks      | elapsed: 30.4min
[Parallel(n_jobs=-1)]: Done 9926 tasks      | elapsed: 34.7min
[Parallel(n_jobs=-1)]: Done 11376 tasks      | elapsed: 39.4min
[Parallel(n_jobs=-1)]: Done 12926 tasks      | elapsed: 44

In [46]:
with open('large_grid.pkl', 'wb') as f:
    pickle.dump(large_grid, f)

In [27]:
### Dump classifier, dataset, and features_list so anyone can check your results.

from tools.tester import dump_classifier_and_data
my_dataset = fin_data
features_list = selected_feature_names

dump_classifier_and_data(clf, my_dataset, features_list)