In [15]:
%run utils.ipynb

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

In [2]:
## Utils

def get_features(search):
    return pd.DataFrame(search.best_estimator_.feature_importances_, 
                        index=x.columns, 
                        columns=['importance']).sort_values(by=['importance'], ascending=False)

In [3]:
## Setup classifier

random_forest = RandomForestClassifier()

In [4]:
## Setup RandomSearchCV

# TODO: These are currently some basic examples from a tutorial, need to tweak these
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# How to measure quality of split
criterion = ['gini', 'entropy']

# Create the params grid
params_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion}

In [5]:
## Run RandomSearchCV (for binned y)

random_search_bins = run_random_search(random_forest, params_grid, x, y_abs['binned_y'])

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.3min finished


In [6]:
print('Best accuracy:', random_search_bins.best_score_)
print('Best params:', random_search_bins.best_params_)

Best accuracy: 0.7161304347826086
Best params: {'n_estimators': 1400, 'min_samples_leaf': 2, 'criterion': 'entropy', 'bootstrap': True, 'min_samples_split': 5, 'max_depth': 60, 'max_features': 'sqrt'}


In [8]:
## Run RandomSearchCV (for all y)

random_search_all = run_random_search(random_forest, params_grid, x, y_abs['y'])

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   25.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  4.2min finished


In [9]:
print('Best accuracy:', random_search_all.best_score_)
print('Best params:', random_search_all.best_params_)

Best accuracy: 0.617144927536232
Best params: {'n_estimators': 200, 'min_samples_leaf': 4, 'criterion': 'gini', 'bootstrap': True, 'min_samples_split': 5, 'max_depth': 20, 'max_features': 'auto'}


In [10]:
## Check features

binned_features = get_features(random_search_bins)
all_features = get_features(random_search_all)

print('Binned features:', binned_features.head())
print('All features:', all_features.head())

Binned features:                                  importance
work_num                           0.042604
value.Fluency of Ideas             0.040680
value.Computers and Electronics    0.038838
value.Memorization                 0.034062
value.Technology Design            0.026440
All features:                                      importance
value.Service Orientation              0.049277
value.Persuasion                       0.048918
value.Systems Evaluation               0.047835
value.Fluency of Ideas                 0.042264
value.Customer and Personal Service    0.032433


In [16]:
## Run Grid Search (because why not)

grid_search_bins = run_grid_search(random_forest, params_grid, x, y_abs['binned_y'])

Fitting 5 folds for each of 8640 candidates, totalling 43200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  5.4min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 12.3min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 21.8min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 27.5min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 34.1min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 41.5min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 49.5min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed: 58.2min
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed: 67.6min
[Parallel(n_jobs=-1)]: Done 11242 tasks      |

In [17]:
grid_search_all = run_grid_search(random_forest, params_grid, x, y_abs['y'])

Fitting 5 folds for each of 8640 candidates, totalling 43200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  8.6min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed: 12.4min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done 3192 tasks      | elapsed: 22.2min
[Parallel(n_jobs=-1)]: Done 4042 tasks      | elapsed: 28.0min
[Parallel(n_jobs=-1)]: Done 4992 tasks      | elapsed: 34.6min
[Parallel(n_jobs=-1)]: Done 6042 tasks      | elapsed: 41.9min
[Parallel(n_jobs=-1)]: Done 7192 tasks      | elapsed: 49.9min
[Parallel(n_jobs=-1)]: Done 8442 tasks      | elapsed: 58.5min
[Parallel(n_jobs=-1)]: Done 9792 tasks      | elapsed: 67.9min
[Parallel(n_jobs=-1)]: Done 11242 tasks      |

In [18]:
print('Best accuracy (bins):', grid_search_bins.best_score_)
print('Best params (bins):', grid_search_bins.best_params_)

Best accuracy (bins): 0.732463768115942
Best params (bins): {'n_estimators': 200, 'max_depth': 20, 'min_samples_leaf': 2, 'criterion': 'gini', 'min_samples_split': 10, 'bootstrap': False, 'max_features': 'auto'}


In [19]:
print('Best accuracy (all):', grid_search_all.best_score_)
print('Best params (all):', grid_search_all.best_params_)

Best accuracy (all): 0.641840579710145
Best params (all): {'n_estimators': 200, 'max_depth': 40, 'min_samples_leaf': 4, 'criterion': 'gini', 'min_samples_split': 2, 'bootstrap': True, 'max_features': 'auto'}


In [20]:
binned_features = get_features(grid_search_bins)
all_features = get_features(grid_search_all)

print('Binned features:', binned_features.head())
print('All features:', all_features.head())

Binned features:                                  importance
value.Computers and Electronics    0.059780
value.Memorization                 0.047503
value.Fluency of Ideas             0.040716
value.Technology Design            0.036253
value.Static Strength              0.030326
All features:                                      importance
value.Persuasion                       0.061186
value.Fluency of Ideas                 0.055118
value.Systems Evaluation               0.044734
value.Service Orientation              0.040202
value.Customer and Personal Service    0.032836


In [68]:
model = grid_search_bins.best_estimator_

per_tree_pred = [tree.predict(x) for tree in model.estimators_]

per_sample_pred = [[]]*120
for tree in per_tree_pred:
    for index, pred in enumerate(tree):
        per_sample_pred[index].append(pred)
    
# confidence = []    
# for sample in per_sample_pred:
#     confidence.append(sum(sample)/len(sample))

In [86]:
sample = [[]]*120
for index, pred in enumerate(per_tree_pred[0]):
    print(pred)
    
type(sample[0])

0.0
1.0
0.0
1.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
0.0
1.0
1.0
1.0
1.0
1.0
1.0


list

In [55]:
preds = model.predict(x)

predictions = {'predictions': preds,
               'confidence': confidence}

In [60]:
predictions

{'confidence': [0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.6410833333333333,
  0.64108333333333

In [None]:
pd.DataFrame(predictions, index=x.index)