# Gannett Peak: Test Random Forest classifier model

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

  from numpy.core.umath_tests import inner1d


In [3]:
from dataPrep import *

# Tune hyperparameters

In [18]:
def paramTune():
    dOb = dataPrep()
    # Number of trees in random forest
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    # Number of features to consider at every split
    max_features = ['auto', None]
    # Maximum number of levels in tree
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    max_depth.append(None)
    # Minimum number of samples required to split a node
    min_samples_split = [2, 5, 10]
    # Minimum number of samples required at each leaf node
    min_samples_leaf = [1, 2, 4]
    # Method of selecting samples for training each tree
    bootstrap = [True, False]
    # Create the random grid
    random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
    model = RandomForestClassifier()
    tuned = RandomizedSearchCV(estimator = model, 
                                   param_distributions = random_grid, 
                                   n_iter = 100, cv = 5, verbose=2, 
                                   random_state=42, 
                                   n_jobs = -1)
    X,y,Xtest,ytest = dOb.dataClass(0,False)
    tuned.fit(X,y)
    return tuned
    

In [14]:
tmodel = paramTune()

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=None, max_depth=30, bootstrap=True 
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=None, max_depth=30, bootstrap=True 
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=None, max_depth=30, bootstrap=True 
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=None, max_depth=30, bootstrap=True 
[CV] n_estimators=400, min_samples_split=5, min_samples_leaf=1, max_features=None, max_depth=30, bootstrap=True 
[CV] n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=None, max_depth=10, bootstrap=True 
[CV] n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=None, max_depth=10, bootstrap=True 
[CV] n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=None, max_depth=10, bootstrap=True 
[CV] n_estimators=2000, min_sa

[Parallel(n_jobs=-1)]: Done 114 tasks      | elapsed: 42.5min


[CV]  n_estimators=400, min_samples_split=2, min_samples_leaf=1, max_features=None, max_depth=None, bootstrap=False, total= 4.6min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_features=None, max_depth=20, bootstrap=True 
[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=40, bootstrap=False, total= 4.2min
[CV] n_estimators=1000, min_samples_split=5, min_samples_leaf=2, max_features=None, max_depth=20, bootstrap=True 
[CV]  n_estimators=1400, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=40, bootstrap=False, total= 3.8min
[CV] n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=100, bootstrap=False 
[CV]  n_estimators=1600, min_samples_split=5, min_samples_leaf=1, max_features=None, max_depth=70, bootstrap=False, total=17.8min
[CV] n_estimators=2000, min_samples_split=10, min_samples_leaf=4, max_features=auto, max_depth=100, bootstrap=False 
[CV]  n_estimator

[Parallel(n_jobs=-1)]: Done 317 tasks      | elapsed: 106.6min


[CV]  n_estimators=1400, min_samples_split=10, min_samples_leaf=2, max_features=None, max_depth=80, bootstrap=True, total=12.3min
[CV] n_estimators=1000, min_samples_split=10, min_samples_leaf=1, max_features=None, max_depth=80, bootstrap=False 
[CV]  n_estimators=1800, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10, bootstrap=False, total= 2.6min
[CV] n_estimators=1000, min_samples_split=10, min_samples_leaf=1, max_features=None, max_depth=80, bootstrap=False 
[CV]  n_estimators=1800, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=10, bootstrap=False, total= 2.8min
[CV] n_estimators=1000, min_samples_split=10, min_samples_leaf=1, max_features=None, max_depth=80, bootstrap=False 
[CV]  n_estimators=1400, min_samples_split=10, min_samples_leaf=2, max_features=None, max_depth=80, bootstrap=True, total=12.6min
[CV] n_estimators=1000, min_samples_split=10, min_samples_leaf=1, max_features=None, max_depth=80, bootstrap=False 
[CV]  n_estimato

[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 166.8min finished


In [15]:
tmodel.best_params_

{'n_estimators': 1000,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': None,
 'max_depth': 10,
 'bootstrap': True}

In [7]:
model = RandomForestClassifier(n_estimators=1000,
                                    min_samples_split=10,
                                    min_samples_leaf=2,
                                    max_features = None,
                                    max_depth = 10,
                                    bootstrap = True)

# Determine which lagging period performs the best

In [10]:
def trainLags(i):
    dOb = dataPrep()
    #for i in range(6):
    X,y,Xtest,ytest,Xval,yVal = dOb.dataClass(i,True)
    model = RandomForestClassifier(n_estimators=1000,
                                          min_samples_split=10,
                                          min_samples_leaf=2,
                                          max_features = None,
                                          max_depth = 10,
                                          bootstrap = True)
    model.fit(X, y)
    y_pred=model.predict(Xval)
    print(confusion_matrix(yVal, y_pred))
    print(model.score(Xval, yVal))

In [11]:
trainLags(0)

[[ 750 3827]
 [1096 4666]]
0.5238417641938292


In [12]:
trainLags(1)

[[ 787 3478]
 [1232 4529]]
0.5302214242968283


In [13]:
trainLags(2)

[[ 840 3525]
 [ 694 4827]]
0.5732348776046935


In [17]:
trainLags(3)

[[1115 2976]
 [1121 4629]]
0.5836805202723301


In [18]:
trainLags(4)

[[1117 3205]
 [1214 4321]]
0.5516891549152886


In [19]:
trainLags(5)

[[1453 2741]
 [ 562 4998]]
0.6613696944843142


### Lag of 5 months performs best on validation data, with lag of 2 months and 3 months also performing better than 55%, which is the average number of positives that we have in the training data. 

# Model with complete data

In [4]:
dOb = dataPrep()

In [5]:
X,y,Xtest,ytest,Xval,yVal = dOb.dataClass(2,True)
model = RandomForestClassifier(n_estimators=1000,
                                    min_samples_split=10,
                                    min_samples_leaf=2,
                                    max_features = None,
                                    max_depth = 10,
                                    bootstrap = True)
model.fit(X, y)
y_pred=model.predict(Xval)
print(confusion_matrix(yVal, y_pred))
print(model.score(Xval, yVal))

[[ 854 3511]
 [ 712 4809]]
0.5728302650212421


# Model excluding Salary, Important Roles, Intern Roles

In [6]:
X,y,Xtest,ytest,Xval,yVal = dOb.dataClass(2,True)
X.drop('SalaryChange', axis=1)
X.drop('weight_Important',axis=1)
X.drop('weight_Intern',axis=1)
Xval.drop('SalaryChange', axis=1)
Xval.drop('weight_Important',axis=1)
Xval.drop('weight_Intern',axis=1)
Xtest.drop('SalaryChange', axis=1)
Xtest.drop('weight_Important',axis=1)
Xtest.drop('weight_Intern',axis=1)
model = RandomForestClassifier(n_estimators=1000,
                                    min_samples_split=10,
                                    min_samples_leaf=2,
                                    max_features = None,
                                    max_depth = 10,
                                    bootstrap = True)
model.fit(X, y)
y_pred=model.predict(Xval)
print(confusion_matrix(yVal, y_pred))
print(model.score(Xval, yVal))

[[ 841 3524]
 [ 722 4799]]
0.5705037426663969


### Comparing the two models for Cary, since salary and role data are not part of their competitors' data. However, the two models performed very similarly on the validation data. 

### Note: We proceeded with a lag of 2 months based on the Vector Auto Regression model's performance