# Analysis of Enron Data
## Based on the Udacity Intro to Machine Learning course

### Data sources:
- Raw email text data can be found at: https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tgz
and a breakdown of emails by sender can be found [here](data/emails_by_address/)
- The financial data was compiled from [this file](data/financial_data.pdf)
- The persons of interest (POIs) come from [this file](data/poi_names.txt)

### Support functions and classes

In [2]:
import numpy as np
from tools.feature_format import featureFormat

### Get the data already preprocessed in [outlier_removal](outlier_removal.ipynb) and [imputing_data](imputing_data.ipynb)

In [31]:
import pickle

# Read the dictionaries saved by the preprocessing notebooks.
# NOTE(review): pickle.load can execute code embedded in the file; only load
# pickles that were generated locally.
with open('imputer_dicts.pkl', 'rb') as pickle_file:
    imputer_dicts = pickle.load(pickle_file)

# Work with the dataset stored under the 'mean' key.
mean_data_dict = imputer_dicts['mean']

# Start with all features except: 'email_address'
all_feature_names = [
    'poi',
    'salary', 'to_messages', 'deferral_payments', 'total_payments',
    'exercised_stock_options', 'bonus', 'restricted_stock',
    'shared_receipt_with_poi', 'restricted_stock_deferred',
    'total_stock_value', 'expenses', 'loan_advances', 'from_messages',
    'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income',
    'long_term_incentive', 'from_poi_to_this_person',
]

mean_data = featureFormat(mean_data_dict, all_feature_names)

# Column 0 becomes the label vector and the remaining columns the features.
# Presumably featureFormat keeps the column order of all_feature_names, which
# would make column 0 the 'poi' flag — confirm in tools/feature_format.py.
labels = mean_data[:, 0]
pre_features = mean_data[:, 1:]

### Make a pipeline

In [32]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier

# Estimator used by SelectFromModel to rank features; seeded so that the
# selection is repeatable.
feature_ranker = ExtraTreesClassifier(random_state=2, class_weight='balanced')

# Preprocessing chain: standardise, add degree-2 polynomial terms, then let
# SelectFromModel filter the terms using the 'mean' importance threshold.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('polynomials_addr', PolynomialFeatures(2)),
    ('feature_selr', SelectFromModel(feature_ranker, threshold='mean')),
])

In [33]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

def fit_print_scores(clf, features, labels, cv=6):
    """Cross-validate ``clf`` and print its F1, precision and recall.

    Each metric comes from its own ``cross_val_score`` run and is printed as
    ``mean (+/- 2 * standard deviation)`` of the per-fold scores.

    Parameters
    ----------
    clf : estimator handed to ``cross_val_score``.
    features : feature matrix handed to ``cross_val_score``.
    labels : target vector handed to ``cross_val_score``.
    cv : cross-validation setting forwarded to ``cross_val_score``.
        Defaults to 6, the value that used to be hard-coded.

    Returns
    -------
    None; the scores are only printed.
    """
    for metric in ('f1', 'precision', 'recall'):
        fold_scores = cross_val_score(clf, features, labels, cv=cv,
                                      scoring=metric)
        # A single parenthesised argument prints the same text whether
        # `print` is the Python 2 statement or the Python 3 function.
        print('-%s score: %0.2f (+/- %0.2f)' % (metric,
                                               fold_scores.mean(),
                                               fold_scores.std() * 2))

# GaussianNB is otherwise first imported in a later cell, so this cell raised
# NameError on a fresh kernel (Restart & Run All). Import it where it is used.
from sklearn.naive_bayes import GaussianNB

# Fit the preprocessing/selection pipeline and transform the raw features.
# NOTE(review): the selector sees every sample and label before
# cross-validation, so the scores below are likely optimistic — consider
# cross-validating a pipeline that includes the classifier instead.
features = pipe.fit_transform(pre_features, labels)

fit_print_scores(GaussianNB(), features, labels)

-f1 score: 0.33 (+/- 0.68)
-precision score: 0.36 (+/- 0.78)
-recall score: 0.31 (+/- 0.62)


In [36]:
### Try a variety of classifiers

from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# (heading, estimator) pairs, evaluated in this order.
# The two ensembles are seeded: fit_print_scores cross-validates once per
# metric, and without a fixed random_state each metric would be computed on
# differently randomised models and change from run to run.
candidates = [
    ('Gaussian naive Bayes:', GaussianNB()),
    ('\nSupport vector machine:', SVC()),
    ('\nExtra trees:', ExtraTreesClassifier(random_state=2)),
    ('\nAdaBoost:', AdaBoostClassifier(random_state=2)),
    ('\nLogistic regression:', LogisticRegression()),
]

# `clf` is deliberately the loop variable: as before, it ends up bound to the
# last estimator that was evaluated.
for heading, clf in candidates:
    # print(...) with one argument behaves the same on Python 2 and 3.
    print(heading)
    fit_print_scores(clf, features, labels)

Gaussian naive Bayes:
-f1 score: 0.33 (+/- 0.68)
-precision score: 0.36 (+/- 0.78)
-recall score: 0.31 (+/- 0.62)

Support vector machine:
-f1 score: 0.00 (+/- 0.00)
-precision score: 0.00 (+/- 0.00)
-recall score: 0.00 (+/- 0.00)

Extra trees:
-f1 score: 0.19 (+/- 0.56)
-precision score: 0.25 (+/- 0.76)
-recall score: 0.17 (+/- 0.33)

AdaBoost:
-f1 score: 0.06 (+/- 0.25)
-precision score: 0.06 (+/- 0.25)
-recall score: 0.06 (+/- 0.25)

Logistic regression:
-f1 score: 0.28 (+/- 0.57)
-precision score: 0.44 (+/- 0.92)
-recall score: 0.22 (+/- 0.50)


The most promising classifiers are AdaBoost, Naive Bayes, ExtraTrees, and logistic regression.
The next cells tune the parameters of AdaBoost, ExtraTrees, and logistic regression. Naive Bayes has no parameters to tune, so it is evaluated on the same train/test split as the other three to allow a direct comparison.

In [37]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Hold out 40% of the samples; the same split is reused by the tuning cells.
# NOTE(review): the split is not stratified, so the share of positive labels
# in the test set may differ from the training set.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.4, random_state=0)

# Untuned Gaussian naive Bayes, used as the reference for the tuned models.
clf = GaussianNB()
clf.fit(train_features, train_labels)
y_true, y_pred = test_labels, clf.predict(test_features)
# print(...) matches the form used by the later report cells and runs on
# both Python 2 and Python 3.
print(classification_report(y_true, y_pred))

             precision    recall  f1-score   support

        0.0       0.94      0.90      0.92        51
        1.0       0.38      0.50      0.43         6

avg / total       0.88      0.86      0.87        57



In [None]:
# Build the keys 'split0_test_score' ... 'split4_test_score'; the tuning
# cells use them to index each search's cv_results_.
start_str, end_str = 'split', '_test_score'
score_keys = [start_str + str(fold) + end_str for fold in range(5)]

In [71]:
### Tune classifiers to achieve better than .3 precision and recall

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Search space: 'base_estimator__*' entries are routed to the decision tree
# wrapped by AdaBoost, the others are AdaBoost's own parameters.
tuned_parameters = {'base_estimator__max_depth': [None, 3, 6, 12, 24, 48],
                    'base_estimator__min_samples_leaf': [1, 2, 4, 8, 16],
                    'n_estimators': [25, 50, 100, 250],
                    'learning_rate': [0.01, 0.1, 1, 10]}

# Metric optimised by the search. This cell used to define only `scores` and
# then read `score`, which raised NameError unless `score` happened to be
# left over in the kernel; define it explicitly.
score = 'f1'
scores = [score]

DTC = DecisionTreeClassifier(random_state=1, class_weight='balanced')

ABC = AdaBoostClassifier(base_estimator=DTC, random_state=2)

print("# Tuning hyper-parameters for %s\n" % score)
ABC_search = GridSearchCV(ABC, param_grid=tuned_parameters, cv=5,
                          verbose=1, n_jobs=-1,
                          scoring=score)
ABC_search.fit(train_features, train_labels)

print('Best parameters set found on development set:')
print(ABC_search.best_params_)
# print('') rather than print(): the latter would output "()" on Python 2.
print('')

# Per-fold test scores of the best candidate, read from cv_results_.
cv_results = ABC_search.cv_results_
best_idx = ABC_search.best_index_
best_test_scores = [cv_results[key][best_idx] for key in score_keys]

best_mean, best_std = np.mean(best_test_scores), np.std(best_test_scores)
print('Detailed scores:')
print('%0.2f +/-(%0.2f)' % (best_mean, best_std))
print(best_test_scores)
print('')

print('Scores on test set:')
y_true, y_pred = test_labels, ABC_search.predict(test_features)
print(classification_report(y_true, y_pred))

# Tuning hyper-parameters for f1

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Done 460 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 639 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1125 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1890 tasks      | elapsed:  3.6min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed:  4.6min finished


Best parameters set found on development set:
{'n_estimators': 100, 'learning_rate': 0.1, 'base_estimator__max_depth': 3, 'base_estimator__min_samples_leaf': 8}

Detailed scores:
0.43 +/-(0.25)
[0.66666666666666663, 0.5, 0.0, 0.33333333333333331, 0.66666666666666663]

Scores on test set:
             precision    recall  f1-score   support

        0.0       0.93      0.98      0.95        51
        1.0       0.67      0.33      0.44         6

avg / total       0.90      0.91      0.90        57



In [72]:
# Search space for the extra-trees classifier.
tuned_parameters = {'max_depth': [None, 3, 6, 12, 24, 48],
                    'min_samples_leaf': [1, 2, 4, 8],
                    'n_estimators': [25, 50, 100],
                    'max_features': [1, 2, 4, 6, 8, 10]}

# Metric optimised by the search. This cell used to define only `scores` and
# then read `score`, which raised NameError unless `score` happened to be
# left over in the kernel; define it explicitly.
score = 'f1'
scores = [score]

# Seeded like the other estimators in this notebook so that the selected
# parameters and the reported scores can be reproduced.
ETC = ExtraTreesClassifier(class_weight='balanced', random_state=2)

print("# Tuning hyper-parameters for %s\n" % score)
ETC_search = GridSearchCV(ETC, param_grid=tuned_parameters, cv=5,
                          verbose=1, n_jobs=-1,
                          scoring=score)
ETC_search.fit(train_features, train_labels)

print('Best parameters set found on development set:')
print(ETC_search.best_params_)
# print('') rather than print(): the latter would output "()" on Python 2.
print('')

# Per-fold test scores of the best candidate, read from cv_results_.
cv_results = ETC_search.cv_results_
best_idx = ETC_search.best_index_
best_test_scores = [cv_results[key][best_idx] for key in score_keys]

best_mean, best_std = np.mean(best_test_scores), np.std(best_test_scores)
print('Detailed scores:')
print('%0.2f +/-(%0.2f)' % (best_mean, best_std))
print(best_test_scores)
print('')

print('Scores on test set:')
y_true, y_pred = test_labels, ETC_search.predict(test_features)
print(classification_report(y_true, y_pred))

# Tuning hyper-parameters for f1

Fitting 5 folds for each of 432 candidates, totalling 2160 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   31.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done 2160 out of 2160 | elapsed:  4.9min finished


Best parameters set found on development set:
{'max_features': 1, 'n_estimators': 100, 'max_depth': 3, 'min_samples_leaf': 4}

Detailed scores:
0.54 +/-(0.34)
[0.5, 0.80000000000000004, 0.40000000000000002, 0.0, 1.0]

Scores on test set:
             precision    recall  f1-score   support

        0.0       0.90      0.86      0.88        51
        1.0       0.12      0.17      0.14         6

avg / total       0.82      0.79      0.80        57



In [73]:
# Search space for logistic regression.
# NOTE(review): 'warm_start' probably has no effect here, since the search
# fits a fresh copy of the estimator for every candidate; dropping it would
# halve the grid — confirm before removing.
tuned_parameters = {
    'C': [1e-2, 1e-1, 1, 1e1, 1e2],
    'class_weight': [None, 'balanced'],
    'max_iter': [100, 200],
    'tol': [1e-4, 5e-4, 1e-3],
    'warm_start': [False, True],
}

LR = LogisticRegression(solver='lbfgs')

LR_search = GridSearchCV(LR, param_grid=tuned_parameters, cv=5,
                         verbose=1, n_jobs=-1,
                         scoring='f1')

LR_search.fit(train_features, train_labels)

# All output goes through print(...) with a single argument so the cell runs
# on Python 2 and Python 3, matching the report call at the end of the cell.
print('Best parameters set found on development set:')
print(LR_search.best_params_)
# print('') rather than print(): the latter would output "()" on Python 2.
print('')

# Per-fold test scores of the best candidate, read from cv_results_.
cv_results = LR_search.cv_results_
best_idx = LR_search.best_index_
best_test_scores = [cv_results[key][best_idx] for key in score_keys]

best_mean, best_std = np.mean(best_test_scores), np.std(best_test_scores)
print('Detailed scores:')
print('%0.2f +/-(%0.2f)' % (best_mean, best_std))
print(best_test_scores)
print('')

print('Scores on test set:')
y_true, y_pred = test_labels, LR_search.predict(test_features)
print(classification_report(y_true, y_pred))

Fitting 5 folds for each of 120 candidates, totalling 600 fits
Best parameters set found on development set:
{'warm_start': False, 'C': 0.1, 'max_iter': 100, 'tol': 0.0001, 'class_weight': None}

Detailed scores:
0.33 +/-(0.28)
[0.0, 0.5, 0.5, 0.0, 0.66666666666666663]

Scores on test set:
             precision    recall  f1-score   support

        0.0       0.89      0.98      0.93        51
        1.0       0.00      0.00      0.00         6

avg / total       0.80      0.88      0.84        57



[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:    3.5s finished


In [27]:
### Dump classifier, dataset, and features_list so anyone can check your results.

from sklearn.base import clone
from tools.tester import dump_classifier_and_data

# `fin_data` and `selected_feature_names` are not defined anywhere in this
# notebook, so this cell failed on a fresh kernel. Use the dataset and the
# feature list that the features above were actually built from.
my_dataset = mean_data_dict
features_list = all_feature_names

# `clf` was fitted on the output of `pipe`, not on the raw columns named in
# features_list, so dump unfitted copies of the preprocessing steps together
# with the classifier as one estimator.
# NOTE(review): assumes tools.tester refits the dumped estimator on features
# extracted with features_list — confirm against tools/tester.py.
final_clf = Pipeline(clone(pipe).steps + [('classifier', clone(clf))])

dump_classifier_and_data(final_clf, my_dataset, features_list)