In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.cross_validation  import train_test_split
from sklearn.metrics import classification_report
from tools import tester
from tools.tester import dump_classifier_and_data
from sklearn import preprocessing
from sklearn.svm import SVC

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

##### Finding POI's in the Enron Dataset

## Understanding the Dataset and Question

### Data Exploration

##### Dataset Contents

**final_project_dataset.pkl**
Financial data from the included `enron61712insiderpay.pdf` have been combined into a dictionary in the included final_project_dataset.pkl file. In the dictionary, the key is the person's name, and the value is another dictionary, which contains the names of all the features and their values for that person. The features in the data fall into three major types, namely financial features, email features and POI labels.

**financial features**:
```python
# (all units are in US dollars)
[
 'salary',
 'deferral_payments',
 'total_payments',
 'loan_advances',
 'bonus',
 'restricted_stock_deferred',
 'deferred_income',
 'total_stock_value',
 'expenses',
 'exercised_stock_options',
 'other',
 'long_term_incentive',
 'restricted_stock',
 'director_fees'
]
```

**email features**:
```python
# (units are generally number of emails messages that reference the ; notable exception is ‘email_address’, which is a text string)
[
 'to_messages',
 'email_address',
 'from_poi_to_this_person',
 'from_messages',
 'from_this_person_to_poi',
 'poi', # POI Label (boolean, represented as integer).
 'shared_receipt_with_poi'
] 
```

#### Persons of Interest

In [2]:
cat ./poi_names.txt

http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm
First column indicates whether or not the email inbox can be found in the Enron dataset

(y) Lay, Kenneth
(y) Skilling, Jeffrey
(n) Howard, Kevin
(n) Krautz, Michael
(n) Yeager, Scott
(n) Hirko, Joseph
(n) Shelby, Rex
(n) Bermingham, David
(n) Darby, Giles
(n) Mulgrew, Gary
(n) Bayley, Daniel
(n) Brown, James
(n) Furst, Robert
(n) Fuhs, William
(n) Causey, Richard
(n) Calger, Christopher
(n) DeSpain, Timothy
(n) Hannon, Kevin
(n) Koenig, Mark
(y) Forney, John
(n) Rice, Kenneth
(n) Rieker, Paula
(n) Fastow, Lea
(n) Fastow, Andrew
(y) Delainey, David
(n) Glisan, Ben
(n) Richter, Jeffrey
(n) Lawyer, Larry
(n) Belden, Timothy
(n) Kopper, Michael
(n) Duncan, David
(n) Bowen, Raymond
(n) Colwell, Wesley
(n) Boyle, Dan
(n) Loehr, Christopher


This file contains a list of 35 people who were a person of interest in the Enron scandal. A POI is defined as someone who was:
* indicted
* settled without admitting guilt
* testified in exchange for immunity

#### Summary of Dataset

In [3]:
enron_data = pickle.load(open("./final_project_dataset.pkl"))

In [4]:
enron_data.iteritems().next()

('METTS MARK',
 {'bonus': 600000,
  'deferral_payments': 'NaN',
  'deferred_income': 'NaN',
  'director_fees': 'NaN',
  'email_address': 'mark.metts@enron.com',
  'exercised_stock_options': 'NaN',
  'expenses': 94299,
  'from_messages': 29,
  'from_poi_to_this_person': 38,
  'from_this_person_to_poi': 1,
  'loan_advances': 'NaN',
  'long_term_incentive': 'NaN',
  'other': 1740,
  'poi': False,
  'restricted_stock': 585062,
  'restricted_stock_deferred': 'NaN',
  'salary': 365788,
  'shared_receipt_with_poi': 702,
  'to_messages': 807,
  'total_payments': 1061827,
  'total_stock_value': 585062})

#### Features in the dataset:

In the dataset there are 146 employees with 20 features and 'poi' label for each employee. To further summarize the features in the dataset, we will convert the dictionary to a pandas DataFrame.

In [5]:
# Replace "Nan" with NaN
for columns in enron_data.itervalues():
    for k,v in columns.iteritems():
        if type(v) is str and v.lower() == "nan":
            columns[k] = np.nan

In [6]:
enron_df = pd.DataFrame.from_dict(enron_data, orient="index")
enron_df

Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
ALLEN PHILLIP K,201955.0,2902.0,2869717.0,4484442.0,1729541.0,4175000.0,126027.0,1407.0,-126027.0,1729541.0,13868.0,,2195.0,152.0,65.0,False,,-3081055.0,304805.0,phillip.allen@enron.com,47.0
BADUM JAMES P,,,178980.0,182466.0,257817.0,,,,,257817.0,3486.0,,,,,False,,,,,
BANNANTINE JAMES M,477.0,566.0,,916197.0,4046157.0,,1757552.0,465.0,-560222.0,5243487.0,56301.0,,29.0,864523.0,0.0,False,,-5104.0,,james.bannantine@enron.com,39.0
BAXTER JOHN C,267102.0,,1295738.0,5634343.0,6680544.0,1200000.0,3942714.0,,,10623258.0,11200.0,,,2660303.0,,False,,-1386055.0,1586055.0,,
BAY FRANKLIN R,239671.0,,260455.0,827696.0,,400000.0,145796.0,,-82782.0,63014.0,129142.0,,,69.0,,False,,-201641.0,,frank.bay@enron.com,
BAZELIDES PHILIP J,80818.0,,684694.0,860136.0,1599641.0,,,,,1599641.0,,,,874.0,,False,,,93750.0,,
BECK SALLY W,231330.0,7315.0,,969068.0,,700000.0,126027.0,2639.0,,126027.0,37172.0,,4343.0,566.0,386.0,False,,,,sally.beck@enron.com,144.0
BELDEN TIMOTHY N,213999.0,7991.0,2144013.0,5501630.0,953136.0,5249999.0,157569.0,5521.0,,1110705.0,17355.0,,484.0,210698.0,108.0,True,,-2334434.0,,tim.belden@enron.com,228.0
BELFER ROBERT,,,-102500.0,102500.0,3285.0,,,,44093.0,-44093.0,,,,,,False,3285.0,,,,
BERBERIAN DAVID,216582.0,,,228474.0,1624396.0,,869220.0,,,2493616.0,11892.0,,,,,False,,,,david.berberian@enron.com,


There is a row with the name, "TOTAL". This row should be removed.

In [7]:
# Omit the TOTAL index
enron_df.drop('TOTAL', inplace=True)

**Financial Features**

In [8]:
enron_df.loc[:, ['salary',
                 'deferral_payments',
                 'total_payments',
                 'loan_advances',
                 'bonus',
                 'restricted_stock_deferred',
                 'deferred_income',]].describe()

Unnamed: 0,salary,deferral_payments,total_payments,loan_advances,bonus,restricted_stock_deferred,deferred_income
count,94.0,38.0,124.0,3.0,81.0,17.0,48.0
mean,284087.5,841602.5,2623421.0,27975000.0,1201773.0,621892.8,-581049.8
std,177131.1,1289323.0,9488106.0,46382560.0,1441679.0,3845528.0,942076.4
min,477.0,-102500.0,148.0,400000.0,70000.0,-1787380.0,-3504386.0
25%,211802.0,79644.5,386380.2,1200000.0,425000.0,-329825.0,-611209.2
50%,258741.0,221063.5,1100246.0,2000000.0,750000.0,-140264.0,-151927.0
75%,308606.5,867211.2,2084663.0,41762500.0,1200000.0,-72419.0,-37926.0
max,1111258.0,6426990.0,103559800.0,81525000.0,8000000.0,15456290.0,-833.0


In [9]:
enron_df.loc[:, ['total_stock_value',
                 'expenses',
                 'exercised_stock_options',
                 'other',
                 'long_term_incentive',
                 'restricted_stock',
                 'director_fees']].describe()

Unnamed: 0,total_stock_value,expenses,exercised_stock_options,other,long_term_incentive,restricted_stock,director_fees
count,125.0,94.0,101.0,92.0,65.0,109.0,16.0
mean,3352073.0,54192.010638,2959559.0,465276.7,746491.2,1147424.0,89822.875
std,6532883.0,46108.377454,5499450.0,1389719.0,862917.4,2249770.0,41112.700735
min,-44093.0,148.0,3285.0,2.0,69223.0,-2604490.0,3285.0
25%,494136.0,22479.0,506765.0,1209.0,275000.0,252055.0,83674.5
50%,1095040.0,46547.5,1297049.0,51984.5,422158.0,441096.0,106164.5
75%,2606763.0,78408.5,2542813.0,357577.2,831809.0,985032.0,112815.0
max,49110080.0,228763.0,34348380.0,10359730.0,5145434.0,14761690.0,137864.0


**Email Features**

In [10]:
enron_df.loc[:, ['to_messages',
                 'email_address',
                 'from_poi_to_this_person',
                 'from_messages',
                 'from_this_person_to_poi',
                 'shared_receipt_with_poi']].describe()

Unnamed: 0,to_messages,from_poi_to_this_person,from_messages,from_this_person_to_poi,shared_receipt_with_poi
count,86.0,86.0,86.0,86.0,86.0
mean,2073.860465,64.895349,608.790698,41.232558,1176.465116
std,2582.700981,86.979244,1841.033949,100.073111,1178.317641
min,57.0,0.0,12.0,0.0,2.0
25%,541.25,10.0,22.75,1.0,249.75
50%,1211.0,35.0,41.0,8.0,740.5
75%,2634.75,72.25,145.5,24.75,1888.25
max,15149.0,528.0,14368.0,609.0,5521.0


**Persons of Interest**

In [11]:
enron_poi = enron_df[enron_df['poi']==True]
print("Number of POI's: " + str(len(enron_poi)))
enron_poi

Number of POI's: 18


Unnamed: 0,salary,to_messages,deferral_payments,total_payments,exercised_stock_options,bonus,restricted_stock,shared_receipt_with_poi,restricted_stock_deferred,total_stock_value,expenses,loan_advances,from_messages,other,from_this_person_to_poi,poi,director_fees,deferred_income,long_term_incentive,email_address,from_poi_to_this_person
BELDEN TIMOTHY N,213999.0,7991.0,2144013.0,5501630.0,953136.0,5249999.0,157569.0,5521.0,,1110705.0,17355.0,,484.0,210698.0,108.0,True,,-2334434.0,,tim.belden@enron.com,228.0
BOWEN JR RAYMOND M,278601.0,1858.0,,2669589.0,,1350000.0,252055.0,1593.0,,252055.0,65907.0,,27.0,1621.0,15.0,True,,-833.0,974293.0,raymond.bowen@enron.com,140.0
CALGER CHRISTOPHER F,240189.0,2598.0,,1639297.0,,1250000.0,126027.0,2188.0,,126027.0,35818.0,,144.0,486.0,25.0,True,,-262500.0,375304.0,christopher.calger@enron.com,199.0
CAUSEY RICHARD A,415189.0,1892.0,,1868758.0,,1000000.0,2502063.0,1585.0,,2502063.0,30674.0,,49.0,307895.0,12.0,True,,-235000.0,350000.0,richard.causey@enron.com,58.0
COLWELL WESLEY,288542.0,1758.0,27610.0,1490344.0,,1200000.0,698242.0,1132.0,,698242.0,16514.0,,40.0,101740.0,11.0,True,,-144062.0,,wes.colwell@enron.com,240.0
DELAINEY DAVID W,365163.0,3093.0,,4747979.0,2291113.0,3000000.0,1323148.0,2097.0,,3614261.0,86174.0,,3069.0,1661.0,609.0,True,,,1294981.0,david.delainey@enron.com,66.0
FASTOW ANDREW S,440698.0,,,2424083.0,,1300000.0,1794412.0,,,1794412.0,55921.0,,,277464.0,,True,,-1386055.0,1736055.0,andrew.fastow@enron.com,
GLISAN JR BEN F,274975.0,873.0,,1272284.0,384728.0,600000.0,393818.0,874.0,,778546.0,125978.0,,16.0,200308.0,6.0,True,,,71023.0,ben.glisan@enron.com,52.0
HANNON KEVIN P,243293.0,1045.0,,288682.0,5538001.0,1500000.0,853064.0,1035.0,,6391065.0,34039.0,,32.0,11350.0,21.0,True,,-3117011.0,1617011.0,kevin.hannon@enron.com,32.0
HIRKO JOSEPH,,,10259.0,91093.0,30766064.0,,,,,30766064.0,77978.0,,,2856.0,,True,,,,joe.hirko@enron.com,


**Number of NaN's**:

In [12]:
enron_df.isnull().sum()

salary                        51
to_messages                   59
deferral_payments            107
total_payments                21
exercised_stock_options       44
bonus                         64
restricted_stock              36
shared_receipt_with_poi       59
restricted_stock_deferred    128
total_stock_value             20
expenses                      51
loan_advances                142
from_messages                 59
other                         53
from_this_person_to_poi       59
poi                            0
director_fees                129
deferred_income               97
long_term_incentive           80
email_address                 34
from_poi_to_this_person       59
dtype: int64

In [13]:
sum(enron_df.isnull().sum())

1352

### Data Exploration Findings
* 145 people
* 20 features
* 18 persons of interest
* 1352 NaN entries
  * director_fees, loan_advances, and restricted_stock_deferred are the top 3 columns with the most NaN entries

Since there were so many 'NaN' entries, I went back to the pdf that the data was derived from and noticed that entries with '-' were being interpreted as NaN. To fix this I will be replacing the 'NaN's with the value zero.

In [14]:
enron_df.fillna(0, inplace=True)

### Outlier Investigation

One outlier that was removed earlier was the "Total" index, which was represented the total sums of each columns. There were two entries that were invalid data points. One was "THE TRAVEL AGENCY IN THE PARK", which can not be a person and definitely not a person of interest. The other invalid entry was "LOCKHART EUGENE E", which had NaN values for all the features.

In [15]:
# Drop email_address column
enron_df.drop('email_address', axis=1, inplace=True)

enron_df.drop("THE TRAVEL AGENCY IN THE PARK", inplace=True)
enron_df.drop("LOCKHART EUGENE E", inplace=True)

**Indexes Removed**
* TOTAL
* THE TRAVEL AGENCY IN THE PARK
* LOCKHART EUGENE E

The 'email_address' column was also removed since it is irrelevant data.

## Optimize Feature Selection/Engineering

### Create new features

Do POI's receive more emails from other POI's compared to non POI's?

In [16]:
enron_df['from_poi_ratio'] = enron_df['from_poi_to_this_person'] / enron_df['from_messages']
enron_df.fillna(0, inplace=True)

Do POI's write more emails to other POI's compared to non POI's?

In [17]:
enron_df['to_poi_ratio'] = enron_df['from_this_person_to_poi'] / enron_df['to_messages']
enron_df.fillna(0, inplace=True)

Do POI's have a bigger bonus to salary ratio?

In [18]:
enron_df['bonus_ratio'] = enron_df['bonus'] / enron_df['salary']

In [19]:
enron_df[['poi','bonus_ratio']]

Unnamed: 0,poi,bonus_ratio
ALLEN PHILLIP K,False,20.672922
BADUM JAMES P,False,
BANNANTINE JAMES M,False,0.0
BAXTER JOHN C,False,4.492666
BAY FRANKLIN R,False,1.668955
BAZELIDES PHILIP J,False,0.0
BECK SALLY W,False,3.02598
BELDEN TIMOTHY N,True,24.53282
BELFER ROBERT,False,
BERBERIAN DAVID,False,0.0


For NaN values, the labels are more POI than not so these values will be filled with 0, since there seems to be a weak correlation between POI's and a large bonus_ratio.

In [20]:
enron_df.fillna(0, inplace=True)

### Feature Engineering Conclusion:
* Three features were created:
  * The ratio of (emails from poi's sent to the person) to (from received)
  * The ratio of (emails from this person sent to poi's) to (emails sent)
  * The ratio of bonus to salary
* In the next step feature selection will be done with selectKBest, PCA, and inherently with the feature importance in various tree classifier algorithms. 

## Classifier Pipelines

In [21]:
# Separate labels and features
enron_df_labels = enron_df['poi']
enron_df_features = enron_df[enron_df.columns.difference(['poi'])]

### Baseline Classifier: Gaussian Naive Bayes

In [22]:
pipeline = Pipeline([
        ('kbest', SelectKBest()),
        ('gnb', GaussianNB())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {"kbest__k": [1, 2, 3, 5, 8, 13, 19], "kbest__score_func": [f_classif]}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)

  'precision', 'predicted', average, warn_for)


GridSearchCV(cv=StratifiedShuffleSplit(labels=[False False ...,  True False], n_iter=100, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('kbest', SelectKBest(k=10, score_func=<function f_classif at 0x114f330c8>)), ('gnb', GaussianNB())]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kbest__k': [1, 2, 3, 5, 8, 13, 19], 'kbest__score_func': [<function f_classif at 0x114f330c8>]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [23]:
kbest = clf.best_estimator_.steps[0][1]
kbest.get_support()
features = sorted(zip(enron_df_features.columns, kbest.scores_, kbest.get_support()), key=lambda x: x[1])

my_list = [x[0] for x in features if x[2] == True]
my_list = ['poi'] + my_list
my_list

['poi',
 'deferred_income',
 'salary',
 'bonus',
 'total_stock_value',
 'exercised_stock_options']

In [24]:
data = enron_df[my_list].transpose().to_dict()

In [25]:
dump_classifier_and_data(GaussianNB(), data, my_list)

In [26]:
tester.main()

GaussianNB()
	Accuracy: 0.85464	Precision: 0.48876	Recall: 0.38050	F1: 0.42789	F2: 0.39814
	Total predictions: 14000	True positives:  761	False positives:  796	False negatives: 1239	True negatives: 11204



Using PCA instead of selectKBest:

In [27]:
pipeline = Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ('pca', PCA()),
        ('gnb', GaussianNB())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {
    "pca__n_components": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19],
}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)

GridSearchCV(cv=StratifiedShuffleSplit(labels=[False False ...,  True False], n_iter=100, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('gnb', GaussianNB())]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [28]:
pca = clf.best_estimator_.steps[1][1]

In [29]:
pca = clf.best_estimator_.steps[1][1]
pca.n_components

12

In [30]:
pca_nb = Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ('pca', PCA(n_components=pca.n_components)),
        ('gnb', GaussianNB())])

In [31]:
features_list = list(enron_df.columns)
features_list.remove('poi')
features_list = ['poi'] + features_list

In [32]:
dump_classifier_and_data(pca_nb, enron_df.transpose().to_dict(), features_list)

In [33]:
tester.main()

Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=12, whiten=False)), ('gnb', GaussianNB())])
	Accuracy: 0.79773	Precision: 0.28075	Recall: 0.33100	F1: 0.30381	F2: 0.31956
	Total predictions: 15000	True positives:  662	False positives: 1696	False negatives: 1338	True negatives: 11304



PCA in our case performs poorly when compared to selectKBest. This indicates that variance is needed in the dataset.

### Next Classifier: Decision Tree

#### With SelectKBest Feature Selection 

In [34]:
pipeline = Pipeline([
        ('kbest', SelectKBest()),
        ('dt', DecisionTreeClassifier())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {"kbest__k": [1, 2, 3, 5, 8, 13, 19], 'dt__max_features': [None, 'auto', 'log2'],
              'dt__criterion': ['gini', 'entropy']}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)

GridSearchCV(cv=StratifiedShuffleSplit(labels=[False False ...,  True False], n_iter=100, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('kbest', SelectKBest(k=10, score_func=<function f_classif at 0x114f330c8>)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kbest__k': [1, 2, 3, 5, 8, 13, 19], 'dt__criterion': ['gini', 'entropy'], 'dt__max_features': [None, 'auto', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [35]:
kbest = clf.best_estimator_.steps[0][1]
kbest.get_support()
features = sorted(zip(enron_df_features.columns, kbest.scores_, kbest.get_support()), key=lambda x: x[1])

my_list = [x[0] for x in features if x[2] == True]
my_list = ['poi'] + my_list
my_list

['poi',
 'to_messages',
 'director_fees',
 'from_this_person_to_poi',
 'to_poi_ratio',
 'other',
 'from_poi_ratio',
 'from_poi_to_this_person',
 'expenses',
 'loan_advances',
 'shared_receipt_with_poi',
 'total_payments',
 'restricted_stock',
 'long_term_incentive',
 'bonus_ratio',
 'deferred_income',
 'salary',
 'bonus',
 'total_stock_value',
 'exercised_stock_options']

In [36]:
clf.best_estimator_.steps[1][1]

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='log2', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [37]:
data = enron_df[my_list].transpose().to_dict()
dump_classifier_and_data(clf.best_estimator_.steps[1][1], data, my_list)

In [38]:
tester.main()

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='log2', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.81080	Precision: 0.29739	Recall: 0.30750	F1: 0.30236	F2: 0.30542
	Total predictions: 15000	True positives:  615	False positives: 1453	False negatives: 1385	True negatives: 11547



#### Without SelectKBest Feature Selection 

In [39]:
pipeline = Pipeline([
        ('dt', DecisionTreeClassifier())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {'dt__max_features': [1, 2, 3, 5, 8, 13, 19],
              'dt__criterion': ['gini', 'entropy']}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)

GridSearchCV(cv=StratifiedShuffleSplit(labels=[False False ...,  True False], n_iter=100, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'dt__criterion': ['gini', 'entropy'], 'dt__max_features': [1, 2, 3, 5, 8, 13, 19]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [40]:
data = enron_df.transpose().to_dict()
dump_classifier_and_data(clf.best_estimator_.steps[0][1], data, features_list)

In [41]:
tester.main()

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=1, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
	Accuracy: 0.82073	Precision: 0.31144	Recall: 0.28450	F1: 0.29736	F2: 0.28951
	Total predictions: 15000	True positives:  569	False positives: 1258	False negatives: 1431	True negatives: 11742



#### With PCA

In [42]:
pipeline = Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ('pca', PCA()),
        ('dt', DecisionTreeClassifier())])
folds = 100
cv = StratifiedShuffleSplit(enron_df_labels, n_iter= folds, random_state = 42, test_size=0.20)
parameters = {"pca__n_components": [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19],
              'dt__max_features': [None, 'auto', 'log2'],
              'dt__criterion': ['gini', 'entropy']}
clf = GridSearchCV(pipeline, param_grid=parameters, cv=cv, scoring='f1')
clf.fit(enron_df_features, enron_df_labels)

GridSearchCV(cv=StratifiedShuffleSplit(labels=[False False ...,  True False], n_iter=100, test_size=0.2, random_state=42),
       error_score='raise',
       estimator=Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=None, whiten=False)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'pca__n_components': [2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19], 'dt__criterion': ['gini', 'entropy'], 'dt__max_features': [None, 'auto', 'log2']},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1', verbose=0)

In [43]:
pca = clf.best_estimator_.steps[1][1]
pca.n_components

15

In [44]:
pca_dt = Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ('pca', PCA(n_components=pca.n_components)),
        ('dt', clf.best_estimator_.steps[2][1])])

In [45]:
dump_classifier_and_data(pca_dt, enron_df.transpose().to_dict(), features_list)
tester.main()

Pipeline(steps=[('scale', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, n_components=15, whiten=False)), ('dt', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))])
	Accuracy: 0.79620	Precision: 0.25269	Recall: 0.27000	F1: 0.26106	F2: 0.26635
	Total predictions: 15000	True positives:  540	False positives: 1597	False negatives: 1460	True negatives: 11403

