#!/usr/bin/python
# coding: utf-8
# # Machine Learning Project
import sys
import pickle
import time
import copy
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D   #registers the '3d' projection used below
sys.path.append("../tools/")
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import SelectKBest, chi2
#sklearn.cross_validation was deprecated and removed; all splitters now live in model_selection
from sklearn.model_selection import (train_test_split, GridSearchCV, cross_val_predict,
                                     KFold, ShuffleSplit, StratifiedShuffleSplit, StratifiedKFold)
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, classification_report
from feature_format import featureFormat, targetFeatureSplit
#PERF_FORMAT_STRING and RESULTS_FORMAT_STRING come from the project's tester.py;
#grid_search_test_classifier below relies on them
from tester import test_classifier, dump_classifier_and_data, PERF_FORMAT_STRING, RESULTS_FORMAT_STRING
### Task 1: Select what features you'll use.
### features_list is a list of strings, each of which is a feature name.
### The first feature must be "poi".
t=time.time()
# these are all the features
features_list_all = ['poi','salary', 'total_payments', 'bonus', 'deferred_income',
'total_stock_value', 'expenses', 'exercised_stock_options', 'long_term_incentive',
'restricted_stock', 'director_fees', 'to_messages', 'from_poi_to_this_person',
'from_messages','from_this_person_to_poi', 'shared_receipt_with_poi']
### Load the dictionary containing the dataset
with open("final_project_dataset.pkl", "rb") as data_file:
data_dict = pickle.load(data_file)
### Task 2: Remove outliers
### Task 3: Create new feature(s)
### Store to my_dataset for easy export below.
my_dataset = data_dict
#exploratory functions
#to look at the structure of the dictionary:
def find_by_name(name):
'''
    Enter the name of the person as it appears in the dictionary to retrieve all of their information.
For instance, to find Jeff Skilling, use:
find_by_name("SKILLING JEFFREY K")
'''
for key,value in my_dataset.items():
if key==name:
print(key, value)
break
return 1
def find_an_interesting_person(fieldname, limit):
'''
    This function finds all the people whose value for "fieldname" is at or above a certain "limit".
    For instance, to find everyone with a salary of at least 1000000, use:
find_an_interesting_person("salary", 1000000)
'''
names=[]
for key,value in my_dataset.items():
if value[fieldname]!="NaN":
if int(value[fieldname]) >= limit:
print(key)
names.append(key)
return names
def find_outliers(fieldname):
'''
    Finds and prints statistical outliers for a particular feature using the bounds
    q1 - 1.5*iqr and q3 + 1.5*iqr, where q1 and q3 are the first and third quartiles and iqr is the interquartile range.
    If a bound falls below zero or above the maximum value, the 5th or 95th percentile is used instead.
    This function does not remove outliers - it just prints each person's name and value for that field
    so we can study what to remove and what is a legitimate data point.
'''
value_list=[]
name_list=[]
outlier_names=[]
outlier_values=[]
poi_nan_value=0
non_poi_nan_value=0
poi=0
non_poi=0
    #Arrange the fieldname values into a list, ignoring NaNs, and tally POI/non-POI counts
for key,value in my_dataset.items():
if value["poi"]==1:
poi+=1
else:
non_poi+=1
if value[fieldname]!="NaN":
value_list.append(value[fieldname])
name_list.append(key)
else:
if value["poi"]==1:
poi_nan_value+=1
else:
non_poi_nan_value+=1
#statistical analysis: calculate IQR and Q1, Q3
iqr = stats.iqr(value_list)
q1=np.percentile(value_list, 25)
q3=np.percentile(value_list, 75)
low_range=q1-1.5*iqr
high_range=q3+1.5*iqr
if low_range < 0:
low_range = np.percentile(value_list, 5)
if high_range > max(value_list):
high_range = np.percentile(value_list, 95)
    #Find all the outlier indices
    values = np.asarray(value_list)
    outliers = np.where((values <= low_range) | (values >= high_range))[0]
#print out results
print("*************"+fieldname+"******************")
print("total_numbers", len(my_dataset.keys()), poi, non_poi)
print("Number of NaN: ", poi_nan_value, non_poi_nan_value)
print(high_range, low_range)
for i in outliers:
print(name_list[i]+": ",value_list[i])
outlier_names.append(name_list[i])
outlier_values.append(value_list[i])
return outlier_names,outlier_values
#only getting rid of one outlier, "TOTAL", which is a spreadsheet total row rather than a person
#there are some other extreme values, but they tend to belong to POIs, and there are so few POIs
#to train on that throwing them away seems like a waste
my_dataset.pop("TOTAL", 0)
'''
find_outliers('salary')
find_outliers('total_payments')
find_outliers('bonus')
find_outliers('deferred_income')
find_outliers('total_stock_value')
find_outliers('expenses')
find_outliers('exercised_stock_options')
find_outliers('long_term_incentive')
find_outliers('restricted_stock')
find_outliers('director_fees')
find_outliers('to_messages')
find_outliers('from_poi_to_this_person')
find_outliers('from_messages')
find_outliers('from_this_person_to_poi')
find_outliers('shared_receipt_with_poi')
'''
#this function lets us compare three features by making a 3D scatter plot
def compare_three_features(feature_names, scaled=False, xlimit=0, ylimit=0, zlimit=0, ts_size=0.3):
'''
    the feature names should be in the following format:
    feature_names=['poi', 'from_this_person_to_poi', 'from_poi_to_this_person', 'salary']
    where the first field 'poi' is used to distinguish POIs from non-POIs
    and the rest are the axes of the 3D plot.
    For example, try this:
    feature_names=['poi', 'from_this_person_to_poi', 'from_poi_to_this_person', 'salary']
    compare_three_features(feature_names)
    It allows scaling each feature (by its maximum value) to show whether it would be meaningful
    to scale this feature when combining features.
'''
short_data = featureFormat(my_dataset, feature_names, sort_keys = True)
short_labels, short_features = targetFeatureSplit(short_data)
#simple train-test split:
    #why split? Because I want to understand the trend without looking at all the data,
    #otherwise I might mentally overfit
x_train, x_test, y_train, y_test = train_test_split(short_features, short_labels, test_size=ts_size, random_state=42)
    #find the correlation among the three features:
first=[]
second=[]
third=[]
for item in x_train:
first.append(item[0])
second.append(item[1])
third.append(item[2])
    if scaled:
        #plain lists don't support element-wise division; convert to numpy arrays first
        first = np.asarray(first) / max(first)
        second = np.asarray(second) / max(second)
        third = np.asarray(third) / max(third)
#print("first", first)
#print("second", second)
    corr_matrix = np.corrcoef([first, second, third])
    print("correlation coefficients", corr_matrix)
#let's divide up the training sample into the two classes:
#all pois:
first_poi=[]
first_non_poi=[]
second_poi=[]
second_non_poi=[]
third_poi=[]
third_non_poi=[]
for i in range(0,len(y_train)):
if y_train[i] == 1:
first_poi.append(first[i])
second_poi.append(second[i])
third_poi.append(third[i])
else:
first_non_poi.append(first[i])
second_non_poi.append(second[i])
third_non_poi.append(third[i])
#plot
fig = plt.figure()
ax=fig.add_subplot(111, projection='3d')
    if xlimit!=0:
        ax.set_xlim(-10, xlimit)
    if ylimit!=0:
        ax.set_ylim(-10, ylimit)
    if zlimit!=0:
        #pyplot has no plt.zlim; z limits must be set on the 3D axes object
        ax.set_zlim(-10, zlimit)
ax.scatter(first_non_poi, second_non_poi, third_non_poi, facecolor="blue", s=20, edgecolors="blue", alpha=0.5)
ax.scatter(first_poi, second_poi, third_poi, facecolor="none", edgecolors="red", s=20)
ax.set_xlabel(feature_names[1])
ax.set_ylabel(feature_names[2])
ax.set_zlabel(feature_names[3])
plt.show()
return 1
#making a composite feature either by linearly combining scaled features
#or by combining them radially
def combine_features(feature_names, new_feature, scaled=False, ignore_nan=False, method="linear"):
'''
    This is an attempt to make a composite feature by combining features either radially (x^2 + y^2)
    or linearly (x + y).
    Any number of features can be combined this way, and a dictionary is returned with the new feature added.
    If scaled is True, each field is min-max scaled before combining.
    ignore_nan set to True means that even if one field has a NaN value, the others are still combined,
    treating the NaN as 0. Not ignoring NaN (ignore_nan=False) means the new feature is set to 0
    if any of the component features is NaN.
'''
    #work on a deep copy so the original my_dataset is not mutated in place
    edited_dict = copy.deepcopy(my_dataset)
list_of_features={}
    #This part builds per-feature value lists that will help us scale the features:
for name in feature_names:
list_of_features[name]=[]
for item in edited_dict.values():
if item[name]!="NaN":
list_of_features[name].append(item[name])
#combining features:
if method=="linear":
pow_ind=1
elif method=="radial":
pow_ind=2
else:
raise NameError("Method must be 'linear' or 'radial'")
#adding new feature:
for item in edited_dict.values():
combined_value=0
nan_flag = 0
for name in feature_names:
if item[name]=="NaN":
nan_flag = 1
numer = 0
item[name] = 0
else:
numer = item[name]-min(list_of_features[name])
if scaled:
combined_value+=pow(numer/(max(list_of_features[name]) - min(list_of_features[name])), pow_ind)
else:
combined_value+=pow(item[name], pow_ind)
item[new_feature]=combined_value
#if one of the fields is nan, and we are not supposed to ignore nan,
#then set new feature to zero
if nan_flag==1 and not ignore_nan:
item[new_feature]=0
return edited_dict
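#a minimal usage sketch of combine_features (the feature name "salary_bonus_radial" is an
#illustrative choice, not one used elsewhere in this script):
#new_dict = combine_features(["salary", "bonus"], "salary_bonus_radial", scaled=True,
#                            ignore_nan=False, method="radial")
#each person then carries salary_scaled^2 + bonus_scaled^2, or 0 if either component was NaN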
#finding the best features:
def find_best_features(dataset=my_dataset, feature_names=features_list_all):
### Extract features and labels from dataset for local testing
short_data = featureFormat(dataset, feature_names, sort_keys = True) #my_dataset or new_dict
#print("short_data", short_data)
short_labels, short_features = targetFeatureSplit(short_data)
short_features=MinMaxScaler().fit_transform(short_features)
x_train, x_test, y_train, y_test = train_test_split(short_features, short_labels, test_size=0.3, random_state=None, stratify=short_labels)
print("***********To check automated best features:*****************")
selector = SelectKBest(chi2, k='all').fit(x_train,y_train)
#names = selector.get_feature_names()
scores = selector.scores_
print(feature_names)
print(scores)
return scores
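#a minimal usage sketch of find_best_features, ranking the full feature list on the unmodified
#dataset (commented out so it does not run on every execution):
#scores = find_best_features(my_dataset, features_list_all)
#the chi2 scores line up index-for-index with features_list_all[1:], since 'poi' is the label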
#visualize:
#feature_names = ["poi","bonus", "total_payments", 'shared_receipt_with_poi']
#compare_three_features(feature_names, True, "radial")
### Task 4: Try a variety of classifiers
### Please name your classifier clf for easy export below.
### Note that if you want to do PCA or other multi-stage operations,
### you'll need to use Pipelines. For more info:
### http://scikit-learn.org/stable/modules/pipeline.html
### Task 5: Tune your classifier to achieve better than .3 precision and recall
### using our testing script. Check the tester.py script in the final project
### folder for details on the evaluation method, especially the test_classifier
### function. Because of the small size of the dataset, the script uses
### stratified shuffle split cross validation. For more info:
### http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html
#---------------------------------------------------------------------------------------
#Many rounds of trial and error with many combinations of features finally boiled down to:
#feature_names=["poi", 'from_this_person_to_poi', 'long_term_incentive', "salary"]
'''
print("ATTEMPT 3")
print('Adding salary and bonus linearly:')
feature_names=["salary", "bonus"]
new_dict=combine_features(feature_names, "salary_plus_bonus", True, False, method="linear")
features_list_new = list(features_list_all)   #copy, so features_list_all is not mutated
features_list_new.append("salary_plus_bonus")
find_best_features(new_dict, features_list_new)
best_automated_classifier("DecisionTree", new_dict, features_list_new)
'''
print("time elapsed to try all the new features:")
print(time.time()-t)
t=time.time()
#simple decision tree for feature selection:
def simple_decision_tree(dataset=my_dataset, feature_names=["poi", "shared_receipt_with_poi", 'from_this_person_to_poi', 'from_poi_to_this_person','long_term_incentive', "salary",'exercised_stock_options', "bonus"]):
'''
Just a simple decision tree for feature selection
'''
    #a plain decision tree - no pipeline needed here:
clf=tree.DecisionTreeClassifier(min_samples_split=5)
test_classifier(clf, dataset, feature_names)
return 1
print("**************************Trying different combinations of features*****************************")
print("***************Initial Intuition:***************")
print("These features:", ['from_this_person_to_poi', 'long_term_incentive', "salary"])
simple_decision_tree(my_dataset, ["poi", 'from_this_person_to_poi', 'long_term_incentive', "salary"])
print("***************Let's try different combinations of the best features using selectKbest***************")
print("These features:", ['bonus', 'long_term_incentive', "salary"])
simple_decision_tree(my_dataset, ["poi", 'bonus', 'long_term_incentive', "salary"])
print("These features:", ['exercised_stock_options', 'long_term_incentive', "salary", "bonus"])
simple_decision_tree(my_dataset, ["poi", 'exercised_stock_options', 'long_term_incentive', "salary", "bonus"])
print("These features:", ['exercised_stock_options', 'total_stock_value', "salary", "bonus"])
simple_decision_tree(my_dataset, ["poi", 'exercised_stock_options', 'total_stock_value', "salary", "bonus"])
print("These features:", ['exercised_stock_options', 'total_stock_value', "salary", "bonus", "shared_receipt_with_poi"])
simple_decision_tree(my_dataset, ["poi", 'exercised_stock_options', 'total_stock_value', "salary", "bonus", "shared_receipt_with_poi"])
print("SEVEN FEATURES")
simple_decision_tree(my_dataset)
#Maybe the ratio of 'from_messages' to 'from_this_person_to_poi' would be an indicator.
#No - I do not see any particular trend there when I make a plot.
#-----------------------------------------------------------------------------------------
# ## Automated version
#slightly modified version of test_classifier, to be used with
#GridSearchCV to see what the best parameters are for each fold
def grid_search_test_classifier(clf, dataset, feature_list, folds = 5):
data = featureFormat(dataset, feature_list, sort_keys = True)
labels, features = targetFeatureSplit(data)
    #new-style splitter API: build the splitter, then iterate over cv.split(features, labels)
    cv = StratifiedShuffleSplit(n_splits=folds, random_state=42)
true_negatives = 0
false_negatives = 0
true_positives = 0
false_positives = 0
param_dict_list=[]
    for train_idx, test_idx in cv.split(features, labels):
features_train = []
features_test = []
labels_train = []
labels_test = []
for ii in train_idx:
features_train.append( features[ii] )
labels_train.append( labels[ii] )
for jj in test_idx:
features_test.append( features[jj] )
labels_test.append( labels[jj] )
### fit the classifier using training set, and test on test set
clf.fit(features_train, labels_train)
print("Training length", len(features_train), "Test set", len(features_test))
param_dict_list.append(clf.best_params_)
print("best params", clf.best_params_)
predictions = clf.predict(features_test)
for prediction, truth in zip(predictions, labels_test):
if prediction == 0 and truth == 0:
true_negatives += 1
elif prediction == 0 and truth == 1:
false_negatives += 1
elif prediction == 1 and truth == 0:
false_positives += 1
elif prediction == 1 and truth == 1:
true_positives += 1
else:
print("Warning: Found a predicted label not == 0 or 1.")
print("All predictions should take value 0 or 1.")
print("Evaluating performance for processed predictions:")
break
    #aggregate the best parameter values chosen across folds and report the median
    #for the integer-valued ones
    best_params = clf.best_params_
    real_dict = {}
    for param, value in best_params.items():
        real_dict[param] = [d[param] for d in param_dict_list if param in d]
        if isinstance(value, int):
            print("*******MEDIAN********* ", param, np.median(real_dict[param]))
try:
total_predictions = true_negatives + false_negatives + false_positives + true_positives
accuracy = 1.0*(true_positives + true_negatives)/total_predictions
precision = 1.0*true_positives/(true_positives+false_positives)
recall = 1.0*true_positives/(true_positives+false_negatives)
f1 = 2.0 * true_positives/(2*true_positives + false_positives+false_negatives)
f2 = (1+2.0*2.0) * precision*recall/(4*precision + recall)
print(PERF_FORMAT_STRING.format(accuracy, precision, recall, f1, f2, display_precision = 5))
print(RESULTS_FORMAT_STRING.format(total_predictions, true_positives, false_positives, false_negatives, true_negatives))
print("")
    except ZeroDivisionError:
        print("Got a divide by zero when trying out:", clf)
        print("Precision or recall may be undefined due to a lack of true positive predictions.")
def automated_fitting(feature_names=["poi", "shared_receipt_with_poi", 'from_this_person_to_poi', 'from_poi_to_this_person','long_term_incentive', "salary",'exercised_stock_options', "bonus"]):
'''
    This function takes in a list of features, and runs PCA and a list of models along with grid searches
    for the best-fit parameters to find the best possible results.
    It prints out both the results of cross validation and validation using the tester.py script.
    It uses DecisionTree, AdaBoost and RandomForest; ExtraTrees, MLP and SVC are commented out.
    I removed KNeighbors - it probably does poorly due to the "0" values substituted for "NaN".
'''
    #setting up the pipeline estimator lists:
print("******************This is the code to tune parameters using grid search CV******************")
models_to_try = [[('reduce_dim', PCA()), ('clf', tree.DecisionTreeClassifier())],
[('reduce_dim', PCA()), ('clf', AdaBoostClassifier())],
#[('reduce_dim', PCA()), ('clf', KNeighborsClassifier())],
[('reduce_dim', PCA()), ('clf', RandomForestClassifier())]]
#[('reduce_dim', PCA()), ('clf', ExtraTreesClassifier())],
#[('reduce_dim', PCA()), ('clf', MLPClassifier())],
#[('reduce_dim', PCA()), ('clf', SVC())]]
#setting up parameters
    p_grid=[dict(reduce_dim__n_components=[1, 2, 3, 4, 5, 6], clf__min_samples_split=[2,3,4,5,8,10], clf__presort=[True, False]),
dict(reduce_dim__n_components=[1, 2, 3, 4, 5, 6], clf__n_estimators=[2,3,5,7,10,40,50,90,120,160], clf__random_state=[15,54,42,5,0]),#, clf__base_estimator=["None", " DecisionTreeClassifier()"], clf__base_estimator__max_depth=[2, 3, 5, 10, 20, 40]),
#dict(reduce_dim__n_components=[1, 2, 3, 4, 5, 6], clf__n_neighbors=[5,10,20, 30],clf__weights=["uniform","distance"],clf__algorithm=["auto"]),
dict(reduce_dim__n_components=[1, 2, 3, 4, 5, 6], clf__n_estimators=[2,3,5,7,10,40,50], clf__min_samples_split=[2,3,4,5,8,10], clf__random_state=[15,54,42,5,0])]
#dict(reduce_dim__n_components=[1, 2, 3, 4, 5, 6], clf__n_estimators=[2,3,5,7,10,40,50], clf__min_samples_split=[2,3,4,5,8,10], clf__random_state=[15,54,42,5,0]),
#dict(reduce_dim__n_components=[1, 2, 3, 4, 5, 6],clf__solver=['lbfgs','sgd','adam'], clf__alpha=[1e-5, 1e-4, 1e-3, 1, 10, 100]),
#dict(reduce_dim__n_components=[1, 2, 3, 4, 5, 6], clf__C=[0.1, 1, 5], clf__gamma=[1e-6, 1e-3, 0.1, 1, 10, 100])]
model_names=["Decision Tree", "Adaboost", "RandomForest"]#, "ExtraTrees", "MLP", "SVC"]
ind=0
for model in models_to_try:
pipe=Pipeline(model)
clf=GridSearchCV(pipe, param_grid=p_grid[ind])
grid_search_test_classifier(clf, my_dataset, feature_names)
ind+=1
#automated_fitting()
### Extract features and labels from dataset for local testing
feature_names=['exercised_stock_options', 'total_stock_value', "salary", "bonus"]
#["poi", "shared_receipt_with_poi", 'from_this_person_to_poi', 'from_poi_to_this_person','long_term_incentive', "salary",'exercised_stock_options', "bonus"] #
def best_automated_classifier(algorithm="DecisionTree", dataset=my_dataset, feature_names=["poi", 'exercised_stock_options', 'total_stock_value', "salary", "bonus"]):
'''
    After trying several classifiers (DecisionTree, RandomForest, AdaBoost)
    I decided RandomForest does the best job,
    but the best parameter combinations for the other two can still be tried here as well.
    RandomForest without any new features shows the best performance,
    but DecisionTree with "bonus", "total_payments" and "shared_receipt_with_poi" scaled and
    radially added also comes pretty close.
'''
    #feature scaling would matter only for SVC - decision trees and ensemble methods built on
    #decision-tree weak estimators aren't sensitive to scaling:
#Best RandomForest:
if algorithm=="RandomForest":
pipe_model=[('reduce_dim', PCA(n_components=2)), ('clf', RandomForestClassifier(n_estimators=5, min_samples_split=5, random_state=15, class_weight={1:0.85, 0:0.15}))]
#0.5 all around
elif algorithm=="DecisionTree":
#Best Decision tree:
        pipe_model=[('reduce_dim', PCA(n_components=4)), ('clf', tree.DecisionTreeClassifier(min_samples_split=3, presort=True))]
#Accuracy: 0.78462 Precision: 0.33333 Recall: 0.40000 F1: 0.36364 F2: 0.38462
elif algorithm=="AdaBoost":
#Best Adaboost:
pipe_model=[('reduce_dim', PCA(n_components=4)), ('clf', AdaBoostClassifier(n_estimators=7, random_state=15))]
#Accuracy: 0.84615 Precision: 0.50000 Recall: 0.40000 F1: 0.44444 F2: 0.41667
else:
raise NameError("Algorithm must be RandomForest, DecisionTree or AdaBoost")
    print("best automated with "+algorithm)
    #the GridSearchCV wrapper used while tuning is kept here for reference:
    #pipe=Pipeline(pipe_model)
    #p_grid=dict(reduce_dim__n_components=[2], clf__n_estimators=[7], clf__random_state=[15])
    #clf=GridSearchCV(pipe, param_grid=p_grid)#, cv=StratifiedKFold(3))
    clf=Pipeline(pipe_model)
    #print("Best params",clf.best_params_)
test_classifier(clf, dataset, feature_names)
return clf, feature_names
print("*************************Creating and Testing Composite Features:*************************")
print("ATTEMPT 1")
print("test by radially adding the following two features:")
feature_names=['shared_receipt_with_poi', 'from_poi_to_this_person']
print(feature_names)
new_dict=combine_features(feature_names, "shared_receipt_from_this_person_to_poi", True, False, method="radial")
features_list_new = list(features_list_all)   #copy, so features_list_all itself is not mutated
features_list_new.append("shared_receipt_from_this_person_to_poi")
find_best_features(new_dict, features_list_new)
best_automated_classifier("AdaBoost", new_dict, features_list_new)
print("ATTEMPT 2")
print('combine "bonus", "total_payments", "shared_receipt_with_poi" radially')
new_dict=combine_features(["bonus", "total_payments", 'shared_receipt_with_poi'], "best_3", True, False, "radial")
features_list_new = list(features_list_all)   #fresh copy for the second composite feature
features_list_new.append("best_3")
find_best_features(new_dict, features_list_new)
best_automated_classifier("AdaBoost", new_dict, features_list_new)
best_automated_classifier("RandomForest", new_dict, features_list_new)
best_automated_classifier("DecisionTree", new_dict, features_list_new)
print("********************************************************************************************")
print("Final Best automated classifier (not using composite features here) is RandomForest")
print("RandomForest with the seven features I picked is written in .pkl file")
print("But here I print out results for Decision Tree and Adaboost as well.")
print(time.time()-t)
t=time.time()
clf, features_list = best_automated_classifier("RandomForest")
best_automated_classifier("DecisionTree")
best_automated_classifier("AdaBoost")
### Task 6: Dump your classifier, dataset, and features_list so anyone can
### check your results. You do not need to change anything below, but make sure
### that the version of poi_id.py that you submit can be run on its own and
### generates the necessary .pkl files for validating your results.
dump_classifier_and_data(clf, my_dataset, features_list)
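#the dumped my_classifier.pkl / my_dataset.pkl / my_feature_list.pkl files can then be validated
#standalone (assuming the standard Udacity tester.py, which reloads them and re-runs
#test_classifier):
#    python tester.py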
'''
#Future suggestions to explore:
#GridSearchCV along with SelectKBest could be used to find the best features. If this is of interest, take a look at this template:
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
cv = StratifiedShuffleSplit(n_splits = 1000, test_size = 0.30, train_size = 0.70, random_state = 42)
kbest = SelectKBest()
values = # TODO: create a list containing values to be tuned for 'k'. For eg. [1, 2, 3]
clf = # TODO: specify a classifier to be used in the pipeline below
param_grid = {'kbest__k': values}
pipe = Pipeline([('kbest', kbest), ('name_classifier', clf)])
def select_k_best(pipe, cv, param_grid):
"""Perform feature selection process using GridSearchCV and SelectKBest
Args:
pipe: pipeline object
cv: int, cross-validation generator or an iterable, optional
param_grid : dict or list of dictionaries
Dictionary with parameters names (string) as keys and lists of parameter
settings to try as values, or a list of such dictionaries, in which case the
grids spanned by each dictionary in the list are explored. This enables searching
over any sequence of parameter settings.
Returns:
best_clf: estimator object
best_features: tuple
Contains a list of the best features
"""
grid_search = GridSearchCV(estimator = pipe,
param_grid = param_grid,
cv = cv,
verbose = 1)
    grid_search.fit(features, labels)   # TODO: build features, labels with featureFormat/targetFeatureSplit first
best_clf = grid_search.best_estimator_
best_features = [features_list[i+1] for i in best_clf.named_steps['kbest'].get_support(indices=True)]
print('Best Features:', best_features)
return best_clf, best_features
best_clf, best_features = select_k_best(pipe, cv, param_grid)
'''