# Using Machine Learning to Identify Fraud in Enron Emails

By Trevor Cook

In [1]:
# %load poi_id.py
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

import numpy as np
import matplotlib.pyplot as plt
from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data

# Select what features to use.
# The first feature must be "poi".
features_list = ['poi', 'salary', 'to_messages', 'deferral_payments', 'total_payments', 'exercised_stock_options', 'bonus', 'restricted_stock', 'shared_receipt_with_poi', 'restricted_stock_deferred', 'total_stock_value', 'expenses', 'loan_advances', 'from_messages', 'other', 'from_this_person_to_poi', 'director_fees', 'deferred_income', 'long_term_incentive', 'from_poi_to_this_person']

# Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "r") as data_file:
    data_dict = pickle.load(data_file)


In [28]:
# Explore the dataset
poi = 0
non_poi = 0
for v in data_dict.itervalues():
    if v['poi'] == True:
        poi += 1
    else:
        non_poi += 1

print 'Number of data points: ', sum(len(v) for v in data_dict.itervalues())
print 'Number of features: ', len(features_list)
print 'Number of POIs: ', poi
print 'Number of non POIs: ', non_poi
print 'Number of employees: ', poi + non_poi

Number of data points:  3066
Number of features:  20
Number of POIs:  18
Number of non POIs:  128
Number of employees:  146


In [None]:
# Remove outliers
data_dict.pop('TOTAL', 0)
data_dict.pop('THE TRAVEL AGENCY IN THE PARK', 0)

# Create new features
import math
# Add new features to data_dict
for name in data_dict:
    from_ratio = float(data_dict[name]['from_poi_to_this_person']) / float(data_dict[name]['to_messages']) 
    # Corrects for non-string NaN values resulting from new features
    if math.isnan(from_ratio):
        data_dict[name]['percent_from_poi'] = 0
    else:
        data_dict[name]['percent_from_poi'] = from_ratio

for name in data_dict:
    to_ratio = float(data_dict[name]['from_this_person_to_poi']) / float(data_dict[name]['from_messages'])
    if math.isnan(to_ratio):
        data_dict[name]['percent_to_poi'] = 0
    else:
        data_dict[name]['percent_to_poi'] = to_ratio

# Append new features to features_list
features_list.append('percent_from_poi')
features_list.append('percent_to_poi')


# Store to my_dataset for easy export below.
my_dataset = data_dict

# Extract features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

In [2]:
# Try a vareity of classifiers

# Split data into training and testing sets
from sklearn.cross_validation import train_test_split
features_train, features_test, labels_train, labels_test = \
    train_test_split(features, labels, test_size=0.3, random_state=42)

from sklearn.naive_bayes import GaussianNB
from sklearn import tree
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier

# Create classifiers
nb_clf = GaussianNB()
tree_clf = tree.DecisionTreeClassifier()
svm_clf = svm.SVC()
knn_clf = KNeighborsClassifier()

# Set range of parameters for classifiers
nb_params = {
    'feature_selection__k': [8, 10, 12, 14]
}
tree_params = {
    'feature_selection__k': [8, 10, 12, 14],
    'algorithm__criterion': ['gini', 'entropy'],
    'algorithm__max_depth': range(2, 12, 2)
}
svm_params = {
    'feature_selection__k': [8, 10, 12, 14],
    'algorithm__kernel': ['rbf', 'poly'],
    'algorithm__C': [1, 10, 100],
    'algorithm__gamma': [.001, .01, 1]
}
knn_params = {
    'feature_selection__k': [8, 10, 12, 14],
    'algorithm__n_neighbors': range(2, 6)
}

from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from pprint import pprint

# Choose an algorithm
def choose_algorithm(classifier, params):
    '''
    Function that takes in an algorithm classifier and their respective parameters as inputs.
    Performs SelectKBest feature selection, MinMaxScaler preprocessing, and GridSearchCV
    for parameter selection into a pipeline. Prints parameter options, pipeline steps, f1 score,
    optimal parameters, and evaluation metrics. Returns pipeline classifier.
    '''
    select = SelectKBest()
    scaler = MinMaxScaler()
    
    # Steps to be fed into pipeline
    steps = [('feature_selection', select),
             ('min_max_scaler', scaler),
             ('algorithm', classifier)]
    
    pipeline = Pipeline(steps)
    
    folds = 100
    # StratifiedShuffleSplit returns stratified randomized folds
    sss = StratifiedShuffleSplit(labels_train, n_iter=folds, random_state=42)
    gs = GridSearchCV(pipeline, param_grid = params, cv=sss, scoring = 'f1')
    
    print 'Parameters:'
    pprint(params)
    print ""
    
    # Print out pipeline steps
    print"Pipeline: \n", [step for step, _ in pipeline.steps], '\n'
    
    # Fit training data to GridSearchCV
    gs.fit(features_train, labels_train)
    
    # Print f1 score
    score = gs.best_score_
    print 'Score: \n', score, '\n'
    
    # Fetch optimal parameters found
    best_params = gs.best_estimator_.get_params()
    print 'Best Parameters: '
    for name in params.keys():
        print name, ': ', best_params[name]
    
    pred = gs.predict(features_test)
    # Print Confusion Matrix and Classification Report evaluation metrics
    print '\n Confusion Matrix:'
    print confusion_matrix(labels_test, pred)
    
    report = classification_report(labels_test, pred)
    print '\n Classification Report:'
    print report
    
    clf = gs.best_estimator_
    return clf
        

In [11]:
print 'Naive Bayes Algorithm: \n'
choose_algorithm(nb_clf, nb_params)

Naive Bayes Algorithm: 

Parameters:
{'feature_selection__k': [8, 10, 12, 14]}

Pipeline: 
['feature_selection', 'min_max_scaler', 'algorithm'] 

Score: 
0.134333333333 

Best Parameters: 
feature_selection__k :  8

 Confusion Matrix:
[[35  3]
 [ 2  3]]

 Classification Report:
             precision    recall  f1-score   support

        0.0       0.95      0.92      0.93        38
        1.0       0.50      0.60      0.55         5

avg / total       0.89      0.88      0.89        43



Pipeline(steps=[('feature_selection', SelectKBest(k=8, score_func=<function f_classif at 0x10e99c668>)), ('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('algorithm', GaussianNB())])

In [12]:
print 'Decision Tree Algorithm: \n'
choose_algorithm(tree_clf, tree_params)

Decision Tree Algorithm: 

Parameters:
{'algorithm__criterion': ['gini', 'entropy'],
 'algorithm__max_depth': [2, 4, 6, 8, 10],
 'feature_selection__k': [8, 10, 12, 14]}

Pipeline: 
['feature_selection', 'min_max_scaler', 'algorithm'] 

Score: 
0.423 

Best Parameters: 
feature_selection__k :  8
algorithm__criterion :  gini
algorithm__max_depth :  6

 Confusion Matrix:
[[34  4]
 [ 2  3]]

 Classification Report:
             precision    recall  f1-score   support

        0.0       0.94      0.89      0.92        38
        1.0       0.43      0.60      0.50         5

avg / total       0.88      0.86      0.87        43



Pipeline(steps=[('feature_selection', SelectKBest(k=8, score_func=<function f_classif at 0x10e99c668>)), ('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('algorithm', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])

In [15]:
print 'Support Vector Machine Algorithm: \n'
choose_algorithm(svm_clf, svm_params)

Support Vector Machine Algorithm: 

Parameters:
{'algorithm__C': [1, 10, 100],
 'algorithm__gamma': [0.001, 0.01, 1],
 'algorithm__kernel': ['rbf', 'poly'],
 'feature_selection__k': [8, 10, 12, 14]}

Pipeline: 
['feature_selection', 'min_max_scaler', 'algorithm'] 

Score: 
0.154666666667 

Best Parameters: 
algorithm__gamma :  1
feature_selection__k :  14
algorithm__C :  100
algorithm__kernel :  rbf

 Confusion Matrix:
[[36  2]
 [ 4  1]]

 Classification Report:
             precision    recall  f1-score   support

        0.0       0.90      0.95      0.92        38
        1.0       0.33      0.20      0.25         5

avg / total       0.83      0.86      0.84        43



Pipeline(steps=[('feature_selection', SelectKBest(k=14, score_func=<function f_classif at 0x10e99c668>)), ('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('algorithm', SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [14]:
print 'K Nearest Neighbor Algorithm: \n'
choose_algorithm(knn_clf, knn_params)

K Nearest Neighbor Algorithm: 

Parameters:
{'algorithm__n_neighbors': [2, 3, 4, 5],
 'feature_selection__k': [8, 10, 12, 14]}

Pipeline: 
['feature_selection', 'min_max_scaler', 'algorithm'] 

Score: 
0.0983333333333 

Best Parameters: 
feature_selection__k :  8
algorithm__n_neighbors :  3

 Confusion Matrix:
[[36  2]
 [ 4  1]]

 Classification Report:
             precision    recall  f1-score   support

        0.0       0.90      0.95      0.92        38
        1.0       0.33      0.20      0.25         5

avg / total       0.83      0.86      0.84        43



Pipeline(steps=[('feature_selection', SelectKBest(k=8, score_func=<function f_classif at 0x10e99c668>)), ('min_max_scaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('algorithm', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'))])

In [13]:
### Task 5: Tune your classifier to achieve better than .3 precision and recall 
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.StratifiedShuffleSplit.html

# Assign the Decision Tree algorithm to clf as it produced the best f1 score
clf = choose_algorithm(tree_clf, tree_params)
    
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

dump_classifier_and_data(clf, my_dataset, features_list)

Parameters:
{'algorithm__criterion': ['gini', 'entropy'],
 'algorithm__max_depth': [2, 4, 6, 8, 10],
 'feature_selection__k': [8, 10, 12, 14]}

Pipeline: 
['feature_selection', 'min_max_scaler', 'algorithm'] 

Score: 
0.400666666667 

Best Parameters: 
feature_selection__k :  8
algorithm__criterion :  gini
algorithm__max_depth :  8

 Confusion Matrix:
[[35  3]
 [ 2  3]]

 Classification Report:
             precision    recall  f1-score   support

        0.0       0.95      0.92      0.93        38
        1.0       0.50      0.60      0.55         5

avg / total       0.89      0.88      0.89        43



# Enron Response Questions

1: Summarize for us the goal of this project and how machine learning is useful in trying to accomplish it. As part of your answer, give some background on the dataset and how it can be used to answer the project question. Were there any outliers in the data when you got it, and how did you handle those?  [relevant rubric items: “data exploration”, “outlier investigation”]

The purpose of this project is to performing machine learning techniques by building an algorithm that can identify Enron employees who may have been involved in the company's fraud. A dataset containing information on various financial and email indicators of different Enron employees was used to help build the model. This includes features such as the employee name, salary, bonus, number of emails sent, and whether or not they are a POI (Person of Interest). The next step is to explore the dataset to look for any relationships or outliers between the data. For example, I found that there was a 'Total' column included in the data, as well as a datapoint called 'The Travel Agency in the Park'. These datapoints are irrelevant to our investigation as they are not Enron employees. Once the dataset has been cleaned for outliers, I was able to use the relevant features to help make predictions on the likelihood that a given employee was a POI.

2: What features did you end up using in your POI identifier, and what selection process did you use to pick them? Did you have to do any scaling? Why or why not? As part of the assignment, you should attempt to engineer your own feature that does not come ready-made in the dataset -- explain what feature you tried to make, and the rationale behind it. (You do not necessarily have to use it in the final analysis, only engineer and test it.) In your feature selection step, if you used an algorithm like a decision tree, please also give the feature importances of the features that you use, and if you used an automated feature selection function like SelectKBest, please report the feature scores and reasons for your choice of parameter values.  [relevant rubric items: “create new features”, “properly scale features”, “intelligently select feature”]

My initial features list contained all 20 available features within the dataset. I also decided to create two additional features called 'percent_from_poi' and 'percent_to_poi' using the original data provided. These features represent the ratio between the number of emails received (sent) from a POI to the total number of emails received (sent). The idea behind this is that I would like to investigate whether employees are likely to be POI's themselves if they communicate frequently with other POI's. I then preprocessed my features list by optimizing feature selection and feature importance. I used SelectKBest to select the best K features, as well as MinMaxScaler to add weights to the remaining features. Preprocessing data in machine learning is an important step before picking an algorithm as it  reduces the processing time of the algorithms. For example, I noticed that using MinMaxScaler reduced the completion time of the support vector machine algorithm from several hours to several minutes. Another reason is that some features may not be relevant when making predictions, and can therefore be ignored.

3: What algorithm did you end up using? What other one(s) did you try? How did model performance differ between algorithms?  [relevant rubric item: “pick an algorithm”]

I tested four differect algorithms: Naive Bayes, Decision Tree, Support Vector Machine, and K Nearest Neighbor.  I noticed when running these algorithms that they each took different amount of times to execute, and also made different predictions. The support vector machine took the longest time to run, and K Nearest Neighbor returned the poorest classification results. Based on the evaluation metrics of each algorithm, I decided to choose the Decision Tree algorithm as it returned the highest f1 score.

4: What does it mean to tune the parameters of an algorithm, and what can happen if you don’t do this well?  How did you tune the parameters of your particular algorithm? (Some algorithms do not have parameters that you need to tune -- if this is the case for the one you picked, identify and briefly explain how you would have done it for the model that was not your final choice or a different model that does utilize parameter tuning, e.g. a decision tree classifier).  [relevant rubric item: “tune the algorithm”]

Several machine learning algorithms take in parameters that determine how it will be performed. This is an important step in the machine learning process as it will determine the results of the model. If not done properly, the algorithm may run the risk of over-fitting or under-fitting the data. This will cause the results from the training data to differ from the testing data. Fortunately, scikit-learn provides a class called GridSearchCV to help find the optimal parameters of the model. For the Decision Tree algorithm, I tested the 'criterion' parameters of 'gini' and 'entropy', as well as 'max_depth' of 2, 4, 6, 8, and 10. After simulating each of the possible parameters through brute force, GridSearchCV returned the optimal parameters to be 'criterion':'gini' and 'max_depth': 6.

5: What is validation, and what’s a classic mistake you can make if you do it wrong? How did you validate your analysis?  [relevant rubric item: “validation strategy”]

In machine learning, data is split up into training and testing sets. The training set is used to train the model, while the remaining test set is used for predictions. We 'pretend' to not know the results of the test set, and compare predictions to actual results to measure the results. However, there is a risk of overfitting the data as knowledge about the test set can 'leak' into the model. To solve this problem, cross-validation may be used by splitting the test data into smaller subsets. After several rounds of validation, the average results are returned. For this project, I used Stratified Shuffle Split, which splits up a random portion of the data several times. 

6: Give at least 2 evaluation metrics and your average performance for each of them.  Explain an interpretation of your metrics that says something human-understandable about your algorithm’s performance. [relevant rubric item: “usage of evaluation metrics”]

Evaluation metrics are used to measure the results of an algorithm. I chose to include a classification report for each algorith since this returns information on precision, recall, and f1 score. I also included a confustion matrix, which is a matrix of actual classes vs. predicted classes. By looking at the matrix, we can get a visual representation of the algorithms' true positives, true negatives, false positives, and false negatives. The Decision Tree algorithm identified 34/38 non-POIs correctly (recall = 0.89), and 3/5 POIs correctly (recall = 0.6) in the training dataset. It inacurately classified 2 POIs as non-POIs (precision = 0.94), and 4 non-POIs as POIs (precision = 0.43).

<h3>References:</h3>

https://civisanalytics.com/blog/data-science/2016/01/06/workflows-python-using-pipeline-gridsearchcv-for-compact-code/
http://abshinn.github.io/python/sklearn/2014/06/08/grid-searching-in-all-the-right-places/
https://github.com/amueller/scipy_2015_sklearn_tutorial/tree/master/notebooks
https://www.youtube.com/watch?v=80fZrVMurPM
https://www.youtube.com/watch?v=Ud-FsEWegmA&t=8173s