#### Run Random Forest
The data consists of run times for matrix multiplication under many different run-time options. The goal is to find the option set that performs best (fastest run time).

Create Train and Test datasets


Run a generic version of RF to serve as a performance baseline

Run "randomSearch" to narrow down the parameter space

Run "gridSearch" to optimize the best parameters from randomSearch

In [1]:
import itertools
import os
import pickle
import time

import numpy  as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

import evaluate
import RFfeatureEval
from calcRMSE import calcRMSE
from getConfig import getConfig
from getData import getData
from prepData import prepData
from setPandasOptions import setPandasOptions

#setPandasOptions()

In [2]:
def saveModel(model, count, out_dir="/home/tbrownex/"):
    """Pickle ``model`` to the file ``RFmodel_<count>`` inside ``out_dir``.

    Args:
        model: Any picklable object (in this notebook, a fitted random forest).
        count: Run identifier; ``str(count)`` is appended to the file name.
        out_dir: Directory to write into. Defaults to the location the notebook
            originally hard-coded, so existing two-argument calls behave the same.

    Returns:
        None.

    Raises:
        OSError: If the target file cannot be opened for writing.
    """
    fname = "RFmodel_" + str(count)
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises; previously the file object was opened and never closed.
    with open(os.path.join(out_dir, fname), "wb") as fh:
        pickle.dump(model, fh)

In [3]:
# Load the run configuration, read the raw data, and build the modelling inputs.
# getConfig / getData / prepData are project helpers whose bodies are not shown here.
# NOTE(review): other cells index data["trainX"], data["trainY"], data["testX"] and
# data["testY"], so prepData is expected to return a mapping with those keys - confirm.
config = getConfig()
df = getData(config)
data = prepData(df, config)

In [4]:
# Figure out if you need all the features:
# fit a quick 50-tree forest on the training split and hand it to RFfeatureEval
# together with the column labels.
# random_state is pinned so the fitted forest (and anything derived from it) is
# the same after every Restart & Run All; an unseeded forest differs on each fit.
regr = RandomForestRegressor(n_estimators=50, random_state=42)
rf   = regr.fit(data["trainX"], data["trainY"])   # fit() returns the fitted estimator

# NOTE(review): RFfeatureEval.process is a project helper; presumably it prints the
# importance table and returns (feature, importance) pairs - confirm.
cols = RFfeatureEval.process(data["trainX"].columns, rf)

# Optional pruning step, intentionally left disabled: keep only the features whose
# importance exceeds 0.01 and drop the rest from both splits.
#keep  = [col[0] for col in cols if col[1] > .01]    # discard non-useful features
#data["trainX"] = data["trainX"][keep]
#data["testX"]  = data["testX"][keep]

Feature        importance
cycle          0.39
sensor11       0.10
sensor13       0.09
sensor15       0.07
sensor6        0.04
sensor12       0.02
sensor4        0.02
mean_sensor13  0.02
sensor9        0.02
sensor14       0.02
mean_sensor8   0.01
sensor7        0.01
mean_sensor14  0.01
sensor8        0.01
mean_sensor12  0.01
mean_sensor7   0.01
sensor2        0.01
sensor3        0.01
sensor21       0.01
mean_sensor9   0.01
mean_sensor11  0.01
mean_sensor18  0.01
mean_sensor4   0.01
setting1       0.01
mean_sensor6   0.01
sensor20       0.01
mean_sensor2   0.01
mean_sensor15  0.01
mean_sensor1   0.01
mean_sensor17  0.01
mean_sensor3   0.01
mean_sensor20  0.00
setting2       0.00
mean_sensor10  0.00
mean_sensor21  0.00
mean_sensor5   0.00
sensor17       0.00
mean_sensor19  0.00
sensor10       0.00
mean_sensor16  0.00
sensor16       0.00
sensor18       0.00
sensor5        0.00
sensor1        0.00
sensor19       0.00
setting3       0.00


In [5]:
# Hyper-parameter grid: every combination of the three lists below is fitted once.
# The grids have their own names so the per-iteration scalars unpacked inside the
# loop no longer overwrite the lists they were drawn from.
node_size_grid = [10]   # candidates for min_samples_split
tree_grid      = [50]   # candidates for n_estimators
leaf_size_grid = [4]    # candidates for min_samples_leaf
parms = list(itertools.product(node_size_grid, tree_grid, leaf_size_grid))
results = []   # one (run number, parameter tuple, score) entry per fitted model
count = 0      # remains 0 if the grid turns out to be empty

# count is 1-based: it is passed to saveModel and shown in the progress message.
for count, x in enumerate(parms, start=1):
    node_size, trees, leaf_size = x
    # A fixed random_state keeps scores comparable between parameter combinations
    # and reproducible between runs.
    regr = RandomForestRegressor(n_estimators=trees,
                                 min_samples_split=node_size,
                                 min_samples_leaf=leaf_size,
                                 random_state=42)
    rf    = regr.fit(data["trainX"], data["trainY"])
    preds = rf.predict(data["testX"])
    #score = evaluate.process(preds, data["testY"])
    score = calcRMSE(data["testY"], preds)   # evaluated on the held-out test split
    results.append((count, x, score))
    saveModel(rf, count)
    print("Done with {} of {}".format(count, len(parms)))

Done with 1 of 1


In [7]:
# Write the search results as a pipe-delimited text file: one header row followed
# by one row per entry in `results`.
with open("/home/tbrownex/results.csv", "w") as f:
    f.write("|".join(["count", "nodes", "trees", "leafs", "score"]) + "\n")
    # Each entry is (run number, (nodes, trees, leafs), score). Row-local names are
    # used so that exporting does not rebind count / parms / trees from the search
    # cell (previously `parms` ended up as a single parameter tuple).
    for run_id, (n_nodes, n_trees, n_leafs), run_score in results:
        fields = (run_id, n_nodes, n_trees, n_leafs, run_score)
        f.write("|".join(str(value) for value in fields) + "\n")