Analysis of Enron Data. Based on Udacity intro to machine learning course

email data can be found at: https://www.cs.cmu.edu/~./enron/enron_mail_20150507.tgz
and 'data/emails_by_address/'
financial data can be found in 'financial_data.pdf' in this repository

In [None]:
# Suppress package warnings, most of which are caused by deprecations.
# No category or message is given, so the 'ignore' filter applies to every
# warning raised after this cell runs, not only deprecation notices.
import warnings
warnings.filterwarnings('ignore')

In [None]:
### Read the pickled dictionary holding the financial data and some
### email features (to/from POI counts).
import pickle

with open('data/final_project_dataset.pkl', 'rb') as dataset_file:
    fin_data_dict = pickle.load(dataset_file)

# Drop the 'TOTAL' entry, which is treated as an outlier. A default is
# supplied so that pop() does not raise when the key is already gone.
fin_data_dict.pop('TOTAL', None)

# my_dataset is the name used for the export at the end; it refers to the
# same dictionary object, not a copy.
my_dataset = fin_data_dict


In [None]:
### Select features to use

# features_list is a list of feature names in the financial data.
# The first feature must be "poi".

import numpy as np
from tools.feature_format import featureFormat, targetFeatureSplit

def make_features_labels(dataset, feature_names):
    """
    Turn a dataset dictionary into NumPy feature and label arrays.

    `dataset` and `feature_names` are handed to featureFormat (with
    sort_keys=True) and the result is separated by targetFeatureSplit.
    Presumably the first entry of `feature_names` is the label ('poi');
    confirm against tools.feature_format.

    Returns a (features, labels) tuple -- features first -- both as
    numpy arrays.
    """
    formatted = featureFormat(dataset, feature_names, sort_keys=True)
    label_list, feature_rows = targetFeatureSplit(formatted)
    return np.array(feature_rows), np.array(label_list)

# Candidate features: everything except 'email_address'.
# 'poi' comes first, as required for the feature list.
all_feature_names = [
    'poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments',
    'exercised_stock_options', 'bonus', 'restricted_stock',
    'shared_receipt_with_poi', 'restricted_stock_deferred',
    'total_stock_value', 'expenses', 'loan_advances', 'from_messages',
    'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income',
    'long_term_incentive', 'from_poi_to_this_person',
]

all_features, all_labels = make_features_labels(my_dataset, all_feature_names)

# Select the most important features based on ExtraTreesClassifier.
# The leading 'poi' entry is the label, so it is left out of the names
# passed along with the feature matrix.
from feature_selection import importance_plotter
candidate_names = np.array(all_feature_names[1:])
selected_feature_names = importance_plotter(all_features, all_labels,
                                            candidate_names)

In [None]:
### Confirm that the feature selection did help performance:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, StratifiedShuffleSplit
from sklearn import metrics

# Make features and labels based on the new selection.
# 'poi' has to be the first name in the list. Only prepend it when it is
# not already there, so re-running this cell does not add a duplicate.
if selected_feature_names[:1] != ['poi']:
    selected_feature_names.insert(0, 'poi')
selected_features, selected_labels = make_features_labels(my_dataset, selected_feature_names)

clf = GaussianNB()
def fit_print_scores(clf, features, labels):
    """
    Cross-validate `clf` and print its f1, precision and recall scores.

    Parameters:
        clf: an unfitted scikit-learn classifier.
        features: feature matrix accepted by cross_val_score.
        labels: class labels, one per row of `features`.

    Each metric is printed as "mean (+/- 2 * standard deviation)" over the
    cross-validation splits. Nothing is returned.
    """
    # The splitter was previously built but never passed on (cv=5 was used
    # instead). The fixed random_state makes it yield the same stratified
    # splits on every call, so all three metrics are computed on identical
    # train/test partitions.
    cv = StratifiedShuffleSplit(test_size=0.3, random_state=42)
    for metric in ('f1', 'precision', 'recall'):
        scores = cross_val_score(clf, features, labels, cv=cv, scoring=metric)
        print('-%s score: %0.2f (+/- %0.2f)' % (metric, scores.mean(),
                                                scores.std() * 2))

# Score the same classifier on the full feature set and on the selected
# subset so the two can be compared side by side.
print('All features:')
fit_print_scores(clf, all_features, all_labels)

print('\nSelected features:')
fit_print_scores(clf, selected_features, selected_labels)

Looks like this feature selection approach does not really improve performance, especially since the recall score suffers a lot. 

Try again, but this time select features by training the decision trees that determine feature importance on a data set with equal numbers of POIs and non-POIs. This should address imbalanced trees that are biased towards non-POIs. 

In [None]:
from balanced_trees import data_balancer
from sklearn.model_selection import train_test_split

# Take a 70/30 split and keep only the training part; the held-out
# features and labels are discarded.
train_bal_features, _, train_bal_labels, _ = train_test_split(
    all_features, all_labels, test_size=0.3, random_state=42)

# Presumably data_balancer equalises the number of POI and non-POI rows;
# confirm against balanced_trees.
balanced_features, balanced_labels = data_balancer(train_bal_features,
                                                   train_bal_labels)

In [None]:
# Repeat the importance-based selection on the balanced training data.
# The leading 'poi' label is skipped so only feature names are passed.
balanced_candidate_names = np.array(all_feature_names[1:])
balanced_selected_feature_names = importance_plotter(
    balanced_features, balanced_labels, balanced_candidate_names)

In [None]:
# 'poi' has to be the first name in the list. Only prepend it when it is
# not already there, so re-running this cell does not add a duplicate;
# later cells slice the list with [1:] and rely on exactly one leading
# 'poi' entry.
if balanced_selected_feature_names[:1] != ['poi']:
    balanced_selected_feature_names.insert(0, 'poi')
balanced_selected_features, balanced_selected_labels = make_features_labels(
    my_dataset, balanced_selected_feature_names)

# Compare the full feature set against the subset chosen on balanced data.
print('All features:')
fit_print_scores(clf, all_features, all_labels)
print('\nSelected features after balancing:')
fit_print_scores(clf, balanced_selected_features, balanced_selected_labels)

This seems better: at least the recall score doesn't suffer as much, and the f1 and precision scores are still decent. The confidence intervals have also gotten smaller.

In [None]:
# Plot the most highly correlated features to see if there is any redundancy or outliers
from feature_selection import correlation_plotter

# Leave out the leading 'poi' label so only feature names are passed.
plotted_feature_names = balanced_selected_feature_names[1:]
correlation_plotter(plotted_feature_names, my_dataset)

From the correlation graphs it seems like there is not much redundancy in the selected features.

In [None]:
### Create new features
# word_dict_subset is a dictionary that contains emails from all of the
# people with financial data AND email data available, separated into
# 'to' and 'from' categories.
# 86 people total, 14 pois
import pickle

with open('data/word_dict_subset.pkl', 'rb') as word_file:
    data_text = pickle.load(word_file)

# Separate the data into emails from, to, or all together.
from tools.email_analysis import email_list_and_labels
to_emails, to_email_labels = email_list_and_labels(data_text, 'to')
from_emails, from_email_labels = email_list_and_labels(data_text, 'from')
all_emails, all_email_labels = email_list_and_labels(data_text, 'all')

# Create tf-idf vectors of each person's to, from, and all-together emails.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5)

# fit_transform refits the shared vectorizer on each corpus in turn, so
# the vectorizer is left fitted on all_emails after this line.
to_feature_matrix, from_feature_matrix, all_feature_matrix = [
    vectorizer.fit_transform(corpus)
    for corpus in (to_emails, from_emails, all_emails)
]

In [None]:
from sklearn.naive_bayes import GaussianNB

clf = GaussianNB()

# Each tf-idf matrix is converted with .toarray() before scoring.
print('To emails scores:')
fit_print_scores(clf, to_feature_matrix.toarray(), to_email_labels)

print('\nFrom emails scores:')
fit_print_scores(clf, from_feature_matrix.toarray(), from_email_labels)

print('\nAll emails scores:')
fit_print_scores(clf, all_feature_matrix.toarray(), all_email_labels)

It looks like email word extraction does not yield very promising results, so I will drop it for now.

In [None]:
### Try a variety of classifiers
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

# For ease, rename balanced_selected_features and balanced_selected_labels.
features = balanced_selected_features
labels = balanced_selected_labels

# Only the support vector machine is given min-max scaled features.
# NOTE(review): the scaler is fitted on the full data set before
# cross-validation, so the held-out folds influence the scaling.
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(features)

# (heading, classifier, feature matrix) for each candidate. The ensemble
# classifiers get a fixed random_state so that repeated runs of this cell
# print the same scores and the comparison is reproducible.
candidates = [
    ('Gaussian naive Bayes:', GaussianNB(), features),
    ('\nSupport vector machine:', SVC(), scaled_features),
    ('\nRandom forest:', RandomForestClassifier(random_state=42), features),
    ('\nExtra trees:', ExtraTreesClassifier(random_state=42), features),
    ('\nAdaBoost:', AdaBoostClassifier(random_state=42), features),
    ('\nLogistic regression:', LogisticRegression(), features),
]

# After the loop `clf` refers to the last candidate (logistic regression),
# exactly as it did when the classifiers were created one by one.
for heading, clf, clf_features in candidates:
    print(heading)
    fit_print_scores(clf, clf_features, labels)

The most promising classifiers are:
AdaBoost, Naive Bayes, and Random Forest. Will tune parameters for these. 

In [None]:
### Tune classifier to achieve better than .3 precision and recall
from sklearn.model_selection import GridSearchCV
# classification_report was called below without ever being imported,
# which raised a NameError at the end of the first loop iteration.
from sklearn.metrics import classification_report

# Hold out 40% of the data for the final classification report.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.4, random_state=0)

# NOTE(review): the grid is searched on the unscaled features, whereas the
# SVC in the classifier comparison was scored on min-max scaled features.
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                     'C': [1, 10, 100, 1000]},
                    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

scores = ['precision', 'recall']

# Run one grid search per metric. `clf` is rebound on every iteration, so
# after the loop it holds the search that optimised the last metric.
for score in scores:
    print("# Tuning hyper-parameters for %s\n" % score)
    cv = StratifiedShuffleSplit(test_size=0.3, random_state=1)
    clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=cv,
                       scoring='%s_macro' % score)
    clf.fit(train_features, train_labels)

    print('Best parameters set found on development set:')
    print(clf.best_params_)
    print('\n')
    print('Grid scores on development set:')
    print('\n')
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print('%0.3f (+/-%0.03f) for %r' % (mean, std * 2, params))
    print('\n')

    print('Detailed classification report:')
    print('\n')

    # Evaluate the search on the held-out split made above.
    y_true, y_pred = test_labels, clf.predict(test_features)
    print(classification_report(y_true, y_pred))

In [None]:
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
from tools.tester import dump_classifier_and_data

# features_list was never defined, so the dump raised a NameError. The
# features the classifiers were trained on come from
# balanced_selected_feature_names, which has 'poi' as its first entry.
# NOTE(review): clf here is the GridSearchCV object from the last tuning
# iteration; confirm that this is the classifier intended for export.
features_list = balanced_selected_feature_names

dump_classifier_and_data(clf, my_dataset, features_list)