# Analysis of Enron Data
## Based on Udacity intro to machine learning course

### Data sources:
- Raw email text data can be found at: https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tgz
and a breakdown of emails by sender can be found [here](data/emails_by_address/)
- The financial data was compiled from [this file](data/financial_data.pdf)
- The persons of interest "pois" come from [this file](data/poi_names.txt)

### Support functions and classes

In [35]:
import numpy as np

# For make_features_labels
from tools.feature_format import featureFormat, targetFeatureSplit

# For IsolationForestTransformer
from sklearn.base import TransformerMixin
from sklearn.ensemble import IsolationForest

def make_features_labels(dataset, feature_names, remove_nan=False):
    """
    Quick way to split a dataset into features and labels based on feature names
    """
    data = featureFormat(dataset, feature_names, 
                         sort_keys=True, remove_NaN=remove_nan, remove_all_zeroes=False)
    labels, features = targetFeatureSplit(data)
    labels = np.array(labels)
    features = np.array(features)
    
    return features, labels

class IsolationForestTransformer(IsolationForest):
    """
    Makes IsolationForest classifier into a transformer that returns an array with
    potential outliers removed
    
    Inputs:
    X: Array with (n_samples, n_features)
    
    arguments:
    num_to_remove[=False]: an int of how many potential sample outliers should be 
    removed from X
    """
    
    def __init__(self, num_to_remove=None, **kwargs):
        self.num_to_remove = num_to_remove
        super(IsolationForestTransformer, self).__init__(**kwargs)
          
    def get_num_to_remove(self):
        return self.num_to_remove
    
    def transform(self, X, y):
        scores = self.decision_function(X)
        
        try:
            start = self.get_num_to_remove()
        except AttributeError:
            print 'num_to_remove was not provided'
        
        idxs = np.argsort(scores)
        
        # returns X minus the features with the lowest anomaly scores
        return X[idxs[start:],:], y[idxs[start:]]

### Get the data

In [65]:
# Load the dictionary containing financial data and some email features (to_ from_poi)
import pickle


with open('data/final_project_dataset.pkl', 'rb') as f:
    fin_data = pickle.load(f)

# Remove 'TOTAL' from data
del fin_data['TOTAL']

# Start with all features except: 'email_address'
all_feature_names = ['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments',\
'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi',\
'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances',\
'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income',\
'long_term_incentive', 'from_poi_to_this_person']

all_features, all_labels = make_features_labels(fin_data, all_feature_names)

In [68]:
sum(all_features == 0)

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 20,  0,  0,
        0, 12])

### Make a pipeline

In [37]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, Imputer, PolynomialFeatures
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

pipe1 = Pipeline([
                ('imputer', Imputer(strategy='mean',
                                    verbose=1)),
                ('scaler', StandardScaler())
                ])


In [45]:
X_imped_scaled = pipe.fit_transform(all_features, all_labels)
IFT = IsolationForestTransformer(num_to_remove=4, random_state=0)
IFT.fit(X_imped_scaled)
X_outlied, y_outlied = IFT.transform(X_imped_scaled, all_labels)
print X_outlied.shape, y_outlied.shape

(141, 19) (141,)


In [54]:
pipe2 = Pipeline([
                ('polynomials_addr', PolynomialFeatures(2)),
                ('feature_selr', SelectFromModel(
                                                 ExtraTreesClassifier(
                                                     random_state=2,
                                                     class_weight='balanced'), 
                                                 threshold='mean'))
                ])

In [55]:
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X_outlied)
print X_poly.shape

X_ready = pipe2.fit_transform(X_outlied, y_outlied)
print X_ready.shape

(141, 210)
(141, 66)


In [57]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn import metrics

def fit_print_scores(clf, features, labels):
    
    f1_scores = cross_val_score(clf, features, labels, cv=100, scoring='f1')
    print '-f1 score: %0.2f (+/- %0.2f)' % (f1_scores.mean(), 
                                           f1_scores.std() * 2)
    precision_scores = cross_val_score(clf, features, labels, cv=5, scoring='precision')
    print '-precision score: %0.2f (+/- %0.2f)' % (precision_scores.mean(), 
                                                  precision_scores.std() * 2)
    recall_scores = cross_val_score(clf, features, labels, cv=5, scoring='recall')
    print '-recall score: %0.2f (+/- %0.2f)' % (recall_scores.mean(), 
                                                  recall_scores.std() * 2)

print 'All features:'
fit_print_scores(GaussianNB(), X_ready, y_outlied)

All features:


  'recall', 'true', average, warn_for)


-f1 score: 0.04 (+/- 0.33)
-precision score: 0.10 (+/- 0.27)
-recall score: 0.22 (+/- 0.59)


### Create new features from the email text

word_dict_subset is a dictionary that contains emails from all of the people
with financial data AND email data available separated into 'to' and 'from' categories. 
It was made like this:
```python
from tools.email_analysis import find_email_paths, word_dict_maker
data_emails = [item['email_address'] for key, item in fin_data.items()]
emailpath_tuples = find_emailpaths()
fin_emailpath_tuples = [tup for tup in emailpath_tuples if tup[1] in data_emails]
word_dict_subset = word_dict_maker(fin_emailpath_tuples)
```
word-dict_subset has 86 people total, 14 pois

In [None]:
import pickle
with open('data/word_dict_subset.pkl', 'rb') as f:
    data_text = pickle.load(f)

#Separate data into emails from, to, or all-together
from tools.email_analysis import email_list_and_labels
to_emails, to_email_labels = email_list_and_labels(data_text, 'to')
from_emails, from_email_labels = email_list_and_labels(data_text, 'from')
all_emails, all_email_labels = email_list_and_labels(data_text, 'all')

# Create a tf-idf vectors of each person's to, from, and all-together emails
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5)

to_feature_matrix = vectorizer.fit_transform(to_emails)
from_feature_matrix = vectorizer.fit_transform(from_emails)
all_feature_matrix = vectorizer.fit_transform(all_emails)

In [None]:
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
print 'To emails scores:'
fit_print_scores(clf, to_feature_matrix.toarray(), to_email_labels)
print '\nFrom emails scores:'
fit_print_scores(clf, from_feature_matrix.toarray(), from_email_labels)
print '\nAll emails scores:'
fit_print_scores(clf, all_feature_matrix.toarray(), all_email_labels)

Looks like email word extraction does not yield very promising results, will drop for now.

In [59]:
### Try a variety of classifiers
# for ease rename balanced_selected_features and balanced_selected_labels
from sklearn.preprocessing import MinMaxScaler
features = X_ready
labels = y_outlied

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
print 'Gaussian naive Bayes:'
fit_print_scores(clf, features, labels)

#Use scaled features for SVC
from sklearn.svm import SVC 
clf = SVC()
print '\nSupport vector machine:'
fit_print_scores(clf, features, labels)

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
clf = RandomForestClassifier()
print '\nRandom forest:'
fit_print_scores(clf, features, labels)

clf = ExtraTreesClassifier()
print '\nExtra trees:'
fit_print_scores(clf, features, labels)

clf = AdaBoostClassifier()
print '\nAdaBoost:'
fit_print_scores(clf, features, labels)

from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()
print('\nLogistic regression:')
fit_print_scores(clf, features, labels)

Gaussian naive Bayes:
-f1 score: 0.04 (+/- 0.33)
-precision score: 0.10 (+/- 0.27)
-recall score: 0.22 (+/- 0.59)

Support vector machine:
-f1 score: 0.00 (+/- 0.00)
-precision score: 0.00 (+/- 0.00)
-recall score: 0.00 (+/- 0.00)

Random forest:
-f1 score: 0.01 (+/- 0.13)
-precision score: 0.30 (+/- 0.80)
-recall score: 0.17 (+/- 0.42)

Extra trees:
-f1 score: 0.04 (+/- 0.36)
-precision score: 0.63 (+/- 0.74)
-recall score: 0.22 (+/- 0.59)

AdaBoost:
-f1 score: 0.02 (+/- 0.24)
-precision score: 0.24 (+/- 0.41)
-recall score: 0.28 (+/- 0.55)

Logistic regression:
-f1 score: 0.01 (+/- 0.14)
-precision score: 0.24 (+/- 0.78)
-recall score: 0.22 (+/- 0.59)


The most promising classifiers are:
AdaBoost, Naive Bayes, and ExtraTrees. Will tune parameters for AdaBoost and ExtraTrees. Naive Bayes does not have parameters to tune. So will test on the same data as the other two to have a direct comparison.  

In [60]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels =\
train_test_split(features, labels, test_size = 0.4, random_state=0)

clf = GaussianNB()
clf.fit(train_features, train_labels)
y_true, y_pred = test_labels, clf.predict(test_features)
print classification_report(y_true, y_pred)

             precision    recall  f1-score   support

        0.0       0.92      0.96      0.94        50
        1.0       0.60      0.43      0.50         7

avg / total       0.88      0.89      0.89        57



In [62]:
### Tune classifiers to achieve better than .3 precision and recall 

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

tuned_parameters = {'base_estimator__max_depth' : [None, 3, 6, 12, 24, 48],
                    'base_estimator__min_samples_leaf': [1, 2, 4, 8, 16],
                    'n_estimators': [25, 50, 100, 250],
                    'learning_rate': [0.01, 0.1, 1, 10]}

scores = ['precision', 'recall']

DTC = DecisionTreeClassifier(random_state=1, class_weight='balanced')

ABC = AdaBoostClassifier(base_estimator=DTC, random_state=2)

clf_dict = {'ABC': {}}
for score in scores:
    print "# Tuning hyper-parameters for %s\n" % score
    clf = GridSearchCV(ABC, param_grid=tuned_parameters, cv=5, 
                       verbose=1, n_jobs=-1,
                       scoring='%s' % score)
    clf.fit(train_features, train_labels)

    print 'Best parameters set found on development set:'
    print clf.best_params_

    print 'Detailed classification report:'
    
    y_true, y_pred = test_labels, clf.predict(test_features)
    print(classification_report(y_true, y_pred))

# Tuning hyper-parameters for precision

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Done 252 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 414 tasks      | elapsed:   42.1s
[Parallel(n_jobs=-1)]: Done 664 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1431 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed:  4.1min finished


Best parameters set found on development set:
{'n_estimators': 25, 'learning_rate': 1, 'base_estimator__max_depth': 3, 'base_estimator__min_samples_leaf': 1}
Detailed classification report:
             precision    recall  f1-score   support

        0.0       0.89      0.98      0.93        50
        1.0       0.50      0.14      0.22         7

avg / total       0.84      0.88      0.85        57

# Tuning hyper-parameters for recall

Fitting 5 folds for each of 480 candidates, totalling 2400 fits


[Parallel(n_jobs=-1)]: Done 377 tasks      | elapsed:   29.4s
[Parallel(n_jobs=-1)]: Done 538 tasks      | elapsed:   47.4s
[Parallel(n_jobs=-1)]: Done 788 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 1977 tasks      | elapsed:  3.0min


Best parameters set found on development set:
{'n_estimators': 25, 'learning_rate': 10, 'base_estimator__max_depth': None, 'base_estimator__min_samples_leaf': 16}
Detailed classification report:
             precision    recall  f1-score   support

        0.0       0.00      0.00      0.00        50
        1.0       0.12      1.00      0.22         7

avg / total       0.02      0.12      0.03        57



[Parallel(n_jobs=-1)]: Done 2400 out of 2400 | elapsed:  3.6min finished
  'precision', 'predicted', average, warn_for)


In [64]:
tuned_parameters = {'max_depth' : [None, 3, 6, 12, 24, 48],
                    'min_samples_leaf': [1, 2, 4, 8, 16],
                    'n_estimators': [25, 50, 100, 250],
                    'max_features': [1, 2, 4, 6, 8, 10]}

scores = ['precision', 'recall']

ETC = ExtraTreesClassifier(random_state=2, class_weight='balanced')

clf_dict['ETC'] = {}
for score in scores:
    print "# Tuning hyper-parameters for %s\n" % score
    clf = GridSearchCV(ETC, param_grid=tuned_parameters, cv=5, 
                       verbose=1, n_jobs=-1,
                       scoring='%s_macro' % score)
    clf.fit(train_features, train_labels)

    print 'Best parameters set found on development set:'
    print clf.best_params_

    print 'Detailed classification report:'
    
    y_true, y_pred = test_labels, clf.predict(test_features)
    print(classification_report(y_true, y_pred))

# Tuning hyper-parameters for precision

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.7s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  5.9min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 12.1min finished


Best parameters set found on development set:
{'max_features': 2, 'n_estimators': 50, 'max_depth': 3, 'min_samples_leaf': 2}
Detailed classification report:
             precision    recall  f1-score   support

        0.0       0.92      0.98      0.95        50
        1.0       0.75      0.43      0.55         7

avg / total       0.90      0.91      0.90        57

# Tuning hyper-parameters for recall

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


[Parallel(n_jobs=-1)]: Done  59 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done 209 tasks      | elapsed:   40.2s
[Parallel(n_jobs=-1)]: Done 459 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 809 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 1259 tasks      | elapsed:  4.3min
[Parallel(n_jobs=-1)]: Done 1809 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 2459 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 3209 tasks      | elapsed: 10.5min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 11.8min finished


Best parameters set found on development set:
{'max_features': 2, 'n_estimators': 25, 'max_depth': None, 'min_samples_leaf': 4}
Detailed classification report:
             precision    recall  f1-score   support

        0.0       0.91      0.86      0.89        50
        1.0       0.30      0.43      0.35         7

avg / total       0.84      0.81      0.82        57



In [69]:
tuned_parameters = {'max_depth' : [None, 3, 6, 12, 24, 48],
                    'min_samples_leaf': [1, 2, 4, 8, 16],
                    'n_estimators': [25, 50, 100, 250],
                    'max_features': [1, 2, 4, 6, 8, 10]}

scores = ['f1']

ETC = ExtraTreesClassifier(random_state=2, class_weight='balanced')

clf_dict['ETC'] = {}
for score in scores:
    print "# Tuning hyper-parameters for %s\n" % score
    clf = GridSearchCV(ETC, param_grid=tuned_parameters, cv=5, 
                       verbose=1, n_jobs=-1,
                       scoring='%s_macro' % score)
    clf.fit(train_features, train_labels)

    print 'Best parameters set found on development set:'
    print clf.best_params_

    print 'Detailed classification report:'
    
    y_true, y_pred = test_labels, clf.predict(test_features)
    print(classification_report(y_true, y_pred))

# Tuning hyper-parameters for f1

Fitting 5 folds for each of 720 candidates, totalling 3600 fits


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   37.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed:  7.9min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 3600 out of 3600 | elapsed: 11.6min finished


Best parameters set found on development set:
{'max_features': 2, 'n_estimators': 50, 'max_depth': 3, 'min_samples_leaf': 2}
Detailed classification report:
             precision    recall  f1-score   support

        0.0       0.92      0.98      0.95        50
        1.0       0.75      0.43      0.55         7

avg / total       0.90      0.91      0.90        57



In [26]:
# Set random_state to None
clf.set_params(random_state=None)

ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
           criterion='gini', max_depth=None, max_features=6,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=16, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [27]:
### Dump classifier, dataset, and features_list so anyone can check your results.

from tools.tester import dump_classifier_and_data
clf = clf_dict['ETC']['recall'].best_estimator_
my_dataset = fin_data
features_list = selected_feature_names

dump_classifier_and_data(clf, my_dataset, features_list)

In [28]:
run tools/tester.py

Loading data
Done loading
Start testing


. . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 

 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .