# Identifying Fraud From Enron Email

A nanodegree project.

In [1]:
#!/usr/bin/python

import sys
import pickle
sys.path.append("../tools/")

from feature_format import featureFormat, targetFeatureSplit
from tester import dump_classifier_and_data
from pprint import pprint
import ggplot

In [2]:
### Load the dictionary containing the dataset
# The file is opened in a `with` block so the handle is closed as soon as the
# pickle has been read, instead of being left open for the whole session.
# The original text mode ("r") is kept on purpose because this notebook is
# written for Python 2.  NOTE(review): Python 3 would need mode "rb" here.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load a dataset file that comes from a trusted source.
with open("final_project_dataset.pkl", "r") as dataset_file:
    data_dict = pickle.load(dataset_file)

## Part One - Understanding the Dataset and Question

### Data Exploration
To better understand the dataset, an exploration was performed; the results are summarized as follows:
- there are 146 data points with 21 features.
- there are 18 people who are persons of interest (POIs).
- 1,358 feature values are missing (recorded as "NaN").
- the top 3 features with the most missing values are "loan_advances", "director_fees", and "restricted_stock_deferred".

A more detailed exploration and analysis is given below.

In [5]:
# number of data points (the length of a dict is its number of keys)
len(data_dict)

146

In [3]:
# number of features available, measured on one sample record
sample_record = data_dict['METTS MARK']
len(sample_record)

21

In [15]:
# available features (the key names of one sample record)
list(data_dict["METTS MARK"])

['salary',
 'to_messages',
 'deferral_payments',
 'total_payments',
 'exercised_stock_options',
 'bonus',
 'restricted_stock',
 'shared_receipt_with_poi',
 'restricted_stock_deferred',
 'total_stock_value',
 'expenses',
 'loan_advances',
 'from_messages',
 'other',
 'from_this_person_to_poi',
 'poi',
 'director_fees',
 'deferred_income',
 'long_term_incentive',
 'email_address',
 'from_poi_to_this_person']

In [6]:
# persons of interest: print every POI's name, then show how many there are
poi_names = [name for name, record in data_dict.items() if record["poi"]]
for name in poi_names:
    print(name)
count = len(poi_names)
count

HANNON KEVIN P
COLWELL WESLEY
RIEKER PAULA H
KOPPER MICHAEL J
SHELBY REX
DELAINEY DAVID W
LAY KENNETH L
BOWEN JR RAYMOND M
BELDEN TIMOTHY N
FASTOW ANDREW S
CALGER CHRISTOPHER F
RICE KENNETH D
SKILLING JEFFREY K
YEAGER F SCOTT
HIRKO JOSEPH
KOENIG MARK E
CAUSEY RICHARD A
GLISAN JR BEN F


18

In [28]:
# tally, per feature, how many records carry the "NaN" placeholder
missing = {}
for record in data_dict.values():
    for feature, entry in record.items():
        if entry != "NaN":
            continue
        missing[feature] = missing.get(feature, 0) + 1

In [30]:
# total number of missing values, summed over all features
number_of_missing = sum(missing.values())
number_of_missing

1358

In [27]:
# display the per-feature counts of "NaN" values held in `missing`
missing

{'bonus': 64,
 'deferral_payments': 107,
 'deferred_income': 97,
 'director_fees': 129,
 'email_address': 35,
 'exercised_stock_options': 44,
 'expenses': 51,
 'from_messages': 60,
 'from_poi_to_this_person': 60,
 'from_this_person_to_poi': 60,
 'loan_advances': 142,
 'long_term_incentive': 80,
 'other': 53,
 'restricted_stock': 36,
 'restricted_stock_deferred': 128,
 'salary': 51,
 'shared_receipt_with_poi': 60,
 'to_messages': 60,
 'total_payments': 21,
 'total_stock_value': 20}

In [41]:
# check who isn't missing the feature 'load_advances'
# outputs the person's name and a boolean value indicated whether the person is a poi.
for item, value in data_dict.iteritems():
    if value["loan_advances"] != "NaN":
        print "name: ", item, "poi:", value["poi"]

name:  LAY KENNETH L poi: True
name:  PICKERING MARK R poi: False
name:  TOTAL poi: False
name:  FREVERT MARK A poi: False


In [42]:
# check who isn't missing the feature 'director_fees'
# outputs the person's name and a boolean value indicated whether the person is a poi.
for item, value in data_dict.iteritems():
    if value["director_fees"] != "NaN":
        print "name", item, "poi:", value["poi"]

name CHAN RONNIE poi: False
name BELFER ROBERT poi: False
name URQUHART JOHN A poi: False
name MENDELSOHN JOHN poi: False
name WAKEHAM JOHN poi: False
name POWERS WILLIAM poi: False
name DUNCAN JOHN H poi: False
name LEMAISTRE CHARLES poi: False
name MEYER JEROME J poi: False
name PEREIRA PAULO V. FERRAZ poi: False
name BLAKE JR. NORMAN P poi: False
name TOTAL poi: False
name JAEDICKE ROBERT poi: False
name WINOKUR JR. HERBERT S poi: False
name BHATNAGAR SANJAY poi: False
name SAVAGE FRANK poi: False
name GRAMM WENDY L poi: False


In [43]:
# check who isn't missing the feature 'restricted_stock_deferred'
# outputs the person's name and a boolean value indicated whether the person is a poi.
for item, value in data_dict.iteritems():
    if value["restricted_stock_deferred"] != "NaN":
        print "name", item, "poi:", value["poi"]

name PIPER GREGORY F poi: False
name LOWRY CHARLES P poi: False
name CHAN RONNIE poi: False
name BELFER ROBERT poi: False
name CLINE KENNETH W poi: False
name DETMERING TIMOTHY J poi: False
name BANNANTINE JAMES M poi: False
name GATHMANN WILLIAM D poi: False
name HAEDICKE MARK E poi: False
name NOLES JAMES L poi: False
name TOTAL poi: False
name ALLEN PHILLIP K poi: False
name JAEDICKE ROBERT poi: False
name REYNOLDS LAWRENCE poi: False
name BHATNAGAR SANJAY poi: False
name CARTER REBECCA C poi: False
name DERRICK JR. JAMES V poi: False
name BAY FRANKLIN R poi: False


In [44]:
# check who isn't missing the feature 'deferral_payments'
# outputs the person's name and a boolean value indicated whether the person is a poi.
for item, value in data_dict.iteritems():
    if value["deferral_payments"] != "NaN":
        print "name", item, "poi:", value["poi"]

name BAXTER JOHN C poi: False
name MEYER ROCKFORD G poi: False
name HORTON STANLEY C poi: False
name PIPER GREGORY F poi: False
name HUMPHREY GENE E poi: False
name GIBBS DANA R poi: False
name COLWELL WESLEY poi: True
name MULLER MARK S poi: False
name WALTERS GARETH W poi: False
name BELFER ROBERT poi: False
name RIEKER PAULA H poi: True
name HAYES ROBERT E poi: False
name DETMERING TIMOTHY J poi: False
name SULLIVAN-SHAKLOVITZ COLLEEN poi: False
name LINDHOLM TOD A poi: False
name LAY KENNETH L poi: True
name OLSON CINDY K poi: False
name GAHN ROBERT S poi: False
name HAEDICKE MARK E poi: False
name BAZELIDES PHILIP J poi: False
name BELDEN TIMOTHY N poi: True
name THORN TERENCE H poi: False
name FOY JOE poi: False
name PRENTICE JAMES poi: False
name GRAY RODNEY poi: False
name NOLES JAMES L poi: False
name TOTAL poi: False
name WASAFF GEORGE poi: False
name ALLEN PHILLIP K poi: False
name SHARP VICTORIA T poi: False
name BADUM JAMES P poi: False
name REYNOLDS LAWRENCE poi: False
na

As shown above, there does not seem to be a clear pattern relating whether a value is missing to whether the person is a POI. The investigation of missing values ends here; the missing values will be replaced with '0' after feature formatting.

### Outlier Investigation

#### Task 1: Select Features
As a starting point, all the available features will be selected and put into the model. Later in this report, some features will be removed based on their PCA importance score.

In [None]:
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
### "poi" is the label, so it appears exactly once, at index 0.  Listing it a
### second time among the inputs would hand the label to the model as a
### feature.  'email_address' is not included (presumably because it is a
### text identifier rather than a numeric feature).
features_list = ['poi',
                 'salary',
                 'to_messages',
                 'deferral_payments',
                 'total_payments',
                 'exercised_stock_options',
                 'bonus',
                 'restricted_stock',
                 'shared_receipt_with_poi',
                 'restricted_stock_deferred',
                 'total_stock_value',
                 'expenses',
                 'loan_advances',
                 'from_messages',
                 'other',
                 'from_this_person_to_poi',
                 'director_fees',
                 'deferred_income',
                 'long_term_incentive',
                 'from_poi_to_this_person']

## Task 2: Remove outliers

In [None]:
def outlierCleaner(predictions, ages, net_worths):
    """
        Clean away the 10% of points that have the largest
        residual errors (difference between the prediction
        and the actual net worth).

        predictions, ages and net_worths are parallel sequences:
        element i of each one describes the same data point.

        Return a list of tuples named cleaned_data where
        each tuple is of the form (age, net_worth, error),
        with error being the squared residual. The list is
        sorted by ascending error and holds int(0.9 * n) points,
        n being the number of predictions. The number of kept
        points is also printed.
    """

    # One (age, net_worth, squared residual) tuple per data point.
    cleaned_data = [
        (age, net_worth, (net_worth - prediction) ** 2)
        for prediction, age, net_worth in zip(predictions, ages, net_worths)
    ]

    # Smallest errors first, so the worst 10% end up at the tail.
    cleaned_data.sort(key=lambda point: point[2])

    # Keep the best 90% (rounded down) and drop the rest.
    length = int(len(predictions) * 0.9)
    cleaned_data = cleaned_data[:length]

    # Single-argument call form prints the same on Python 2 and Python 3.
    print(len(cleaned_data))
    return cleaned_data

In [None]:
# Build (but do not yet fit) an EllipticEnvelope outlier detector.
# contamination = 0.1 is, per the scikit-learn documentation, the assumed
# proportion of outliers in the data set (10% here).
from sklearn.covariance import EllipticEnvelope
outlier_cleaner = EllipticEnvelope(contamination = 0.1)


## Part Two - Optimize Feature Selection

## Part Three - Pick and Tune an Algorithm

## Part Four - Validate and Evaluate

## Task 3: Create new feature(s)

In [None]:
import numpy as np
from matplotlib import pyplot as plt

from sklearn import linear_model, datasets


# Demonstration on synthetic data (not the Enron dataset): compare an ordinary
# least-squares line with a RANSAC-fitted line when outliers are present.
n_samples = 1000
n_outliers = 50


# Generate a one-feature regression problem; coef=True also returns the
# coefficient used to generate the data.
X, y, coef = datasets.make_regression(n_samples=n_samples, n_features=1,
                                      n_informative=1, noise=10,
                                      coef=True, random_state=0)

# Add outlier data: overwrite the first n_outliers samples with points drawn
# around X = 3, y = -3. The seed is fixed so the two draws are reproducible.
np.random.seed(0)
X[:n_outliers] = 3 + 0.5 * np.random.normal(size=(n_outliers, 1))
y[:n_outliers] = -3 + 10 * np.random.normal(size=n_outliers)

# Fit line using all data
model = linear_model.LinearRegression()
model.fit(X, y)

# Robustly fit linear model with RANSAC algorithm
model_ransac = linear_model.RANSACRegressor(linear_model.LinearRegression())
model_ransac.fit(X, y)
# inlier_mask_ is used below as a boolean index; its negation selects the
# samples that RANSAC did not treat as inliers.
inlier_mask = model_ransac.inlier_mask_
outlier_mask = np.logical_not(inlier_mask)

# Predict data of estimated models on the integer grid -5..4; the added axis
# turns the 1-D grid into a single-column 2-D array for predict().
line_X = np.arange(-5, 5)
line_y = model.predict(line_X[:, np.newaxis])
line_y_ransac = model_ransac.predict(line_X[:, np.newaxis])

# Compare estimated coefficients
# NOTE(review): under Python 2 without `from __future__ import print_function`
# the second print shows the three values as one tuple.
print("Estimated coefficients (true, normal, RANSAC):")
print(coef, model.coef_, model_ransac.estimator_.coef_)

# Plot inliers (green) and outliers (red) together with both fitted lines.
plt.plot(X[inlier_mask], y[inlier_mask], '.g', label='Inliers')
plt.plot(X[outlier_mask], y[outlier_mask], '.r', label='Outliers')
plt.plot(line_X, line_y, '-k', label='Linear regressor')
plt.plot(line_X, line_y_ransac, '-b', label='RANSAC regressor')
plt.legend(loc='lower right')
plt.show()

In [None]:
### Store to my_dataset for easy export below.
### This binds a second name to the same dictionary; no copy is made, so any
### change made through my_dataset is also visible through data_dict.
### NOTE(review): the exploration output lists a 'TOTAL' entry next to the
### people's names; presumably an aggregate row. Consider removing it before
### formatting.
my_dataset = data_dict

### Extract features and labels from dataset for local testing
### featureFormat and targetFeatureSplit come from the course's feature_format
### module; the first split result is the label, the second the remaining
### features (presumably in features_list order; verify in feature_format.py).
data = featureFormat(my_dataset, features_list, sort_keys = True)
labels, features = targetFeatureSplit(data)

## Task 4: Try a variety of classifiers

In [None]:
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html

In [None]:
# Provided to give you a starting point. Try a variety of classifiers.
# A Gaussian Naive Bayes classifier with default settings is bound to the
# name clf, which is the object later passed to dump_classifier_and_data.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()

## Task 5: Tune your classifier to achieve better than .3 precision and recall 

In [None]:
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info: 
### http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html

# Example starting point. Try investigating other evaluation techniques!
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20 in favour of sklearn.model_selection. Try the current location first
# and fall back to the old one so older installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 30% of the samples for testing; the fixed random_state makes the
# split reproducible from run to run.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

## Task 6: Dump your classifier, dataset, and features_list so anyone can

In [None]:
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.

# Hand the classifier, the dataset and the feature list to the course's
# tester helper (presumably this is what writes the .pkl files mentioned
# above; see tester.py).
dump_classifier_and_data(clf, my_dataset, features_list)