In [20]:
import itertools
import os

import numpy  as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import RFfeatureEval
from calcMAPE import calcMAPE

# Directory that holds the input CSV. The default is the original location; set the
# HR_DEMO_DATA_DIR environment variable to run the notebook on a different machine
# without editing the code.
# os.path.join(<dir>, "") guarantees a trailing path separator, so code that appends
# the file name directly to `dataloc` keeps working whatever value is supplied.
dataloc = os.path.join(
    os.environ.get("HR_DEMO_DATA_DIR", "/home/tbrownex/data/Hackett/HRdemo/"),
    "",
)
# Name of the CSV file inside `dataloc`.
file    = "data.csv"
# Fraction of the rows held out as the test set.
TESTPCT = .2

In [21]:
# Read the CSV named by `file` from the `dataloc` directory into a DataFrame.
# os.path.join inserts the separator only when it is missing, so the load does not
# depend on `dataloc` ending with a slash.
df = pd.read_csv(os.path.join(dataloc, file))

In [22]:
# Display ten randomly drawn rows as a quick visual check of what was loaded.
df.sample(n=10)

Unnamed: 0,Stubborn,Enthusiasm,Autonomous,Dreamer,Commitment,Extrovert,Team Player,Social Network,Industry Yrs,Role Yrs,Competitor Yrs,Current Job Level,Annual Review Score,University Prestige,Degree,Grade
731,4,6,4,4,4,7,5,4,6,10,0,3,7,7,1,3
149,5,5,6,7,4,5,6,6,21,6,8,3,5,4,1,3
381,6,4,3,7,7,4,5,5,8,4,0,7,4,6,1,2
96,7,6,6,8,6,3,6,7,13,16,0,8,8,5,1,3
849,4,5,8,5,4,4,7,3,13,1,0,5,8,0,0,5
769,7,6,7,5,5,6,5,4,15,13,0,6,4,5,1,4
835,5,6,5,7,6,7,7,6,15,1,0,7,4,0,0,4
853,5,6,7,7,8,3,5,6,12,20,0,5,5,5,1,3
378,6,4,9,7,7,7,6,5,5,22,0,9,7,4,1,4
80,6,6,6,6,6,6,7,4,12,24,0,5,4,4,1,3


In [13]:
# Create Train and Test sets.
# A fixed random_state makes the split, and every score computed from it,
# repeatable across kernel restarts.
train_full, test_full = train_test_split(df, test_size=TESTPCT, random_state=42)

# Separate the features and labels.
# drop() returns new frames, so the frames produced by the split are not modified
# in place and this cell can be re-run safely.
trainY = train_full["Grade"]
testY  = test_full["Grade"]
train  = train_full.drop("Grade", axis=1)
test   = test_full.drop("Grade", axis=1)

In [14]:
# Calculate the accuracy for a naive forecaster to use as a performance baseline.
# The forecaster draws one random integer from 1 to 5 for every row of df
# (randint's `high` bound is exclusive).
# A dedicated, seeded generator keeps the printed baseline identical between runs.
# NOTE(review): calcMAPE is called here as (actuals, predictions), whereas the
# random-forest cell calls it as (predictions, actuals). Confirm which order
# calcMAPE expects -- a percentage error is normally not symmetric in its arguments.
baseline_rng = np.random.RandomState(42)
preds = baseline_rng.randint(low=1, high=6, size=df.shape[0])
naive = calcMAPE(df["Grade"], preds)
print(naive)

0.48


In [15]:
# Figure out if you need all the features.
# Fit a 100-tree forest on every feature, hand it to RFfeatureEval.process, and keep
# only the columns whose reported score is above MIN_IMPORTANCE.
# NOTE(review): RFfeatureEval.process is not visible here; the code below assumes it
# returns a sequence whose items hold the column name at [0] and its importance at
# [1] -- confirm against that module.
MIN_IMPORTANCE = .01

# random_state pins the forest so the set of kept features does not change from
# run to run.
regr = RandomForestRegressor(n_estimators=100, random_state=42)

rf   = regr.fit(train, trainY)

cols = RFfeatureEval.process(train.columns, rf)

keep  = [col[0] for col in cols if col[1] > MIN_IMPORTANCE]    # discard non-useful features

# train/test are narrowed in place (same names), so re-running this cell refits on
# the already-reduced feature set.
train = train[keep]
test  = test[keep]

Feature        importance
University Prestige0.30
Commitment     0.12
Enthusiasm     0.12
Autonomous     0.10
Role Yrs       0.05
Social Network 0.05
Team Player    0.05
Industry Yrs   0.04
Competitor Yrs 0.03
Extrovert      0.03
Current Job Level0.03
Stubborn       0.03
Dreamer        0.02
Annual Review Score0.02
Degree         0.01


In [16]:
# Hyper-parameter grid: every combination of the three lists below is fitted on the
# training data and scored on the test data.
node_size = [.0002]   # min_samples_split; scikit-learn treats a float as a fraction of the samples
trees     = [300]     # n_estimators
leaf_size = [4]       # min_samples_leaf

parms = itertools.product(node_size, trees, leaf_size)

# One (parameter tuple, MAPE) entry per combination tried.
results = []

for x in parms:
    # Unpack into fresh names so the grid lists above are not overwritten by scalars.
    split_frac, n_trees, min_leaf = x
    # random_state pins the forest so each score is repeatable.
    regr = RandomForestRegressor(n_estimators=n_trees,
                                 min_samples_split=split_frac,
                                 min_samples_leaf=min_leaf,
                                 random_state=42)
    rf    = regr.fit(train, trainY)
    preds = rf.predict(test)
    # NOTE(review): the arguments here are (predictions, actuals), but the naive
    # baseline cell calls calcMAPE(actuals, predictions). Confirm which order
    # calcMAPE expects and make both calls agree.
    score = calcMAPE(preds, testY)
    results.append((x, score))

In [19]:
# Two-column summary: forecaster name left-aligned in a 15-character field,
# followed by its MAPE. The naive baseline comes first, then one line per
# random-forest configuration stored in `results`.
row = "{:<15}{}".format

print(row("Forecaster", "MAPE"))
print(row("Naive", naive))
for params, rf_score in results:
    print(row("RF", rf_score))

Forecaster     MAPE
Naive          0.48
RF             0.11
