#### Run Random Forest
The data consists of run times for matrix multiplication, measured under many different run-time options. The goal is to find the set of options that performs best (i.e., gives the fastest run time).

Create Train and Test datasets


Run a generic version of RF to serve as a performance baseline

Run "randomSearch" to narrow down the parameter space

Run "gridSearch" to optimize the best parameters from randomSearch

In [7]:
import pandas as pd
import numpy  as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from calcMAPE import calcMAPE
import RFfeatureEval
import itertools

# Input location and hold-out fraction used by the later cells.
dataloc, file = "/home/tbrownex/data/test/cpu/", "data.csv"
TESTPCT = 0.2   # share of rows reserved for the test set

In [8]:
# Read the raw run-time table from the configured directory + file name
csv_path = dataloc + file
df = pd.read_csv(csv_path)

In [9]:
# Keep a random 40% of the rows for the rest of the notebook.
# random_state pins the draw so a fresh "Restart & Run All" selects the same rows.
# NOTE: this overwrites df, so re-running only this cell subsamples the already
# reduced frame again -- re-run the cell that reads the CSV first.
df = df.sample(frac=0.4, random_state=42)

In [10]:
# Create Train and Test sets.
# random_state fixes the shuffle so the split, and every score computed from
# it, is the same on each run.
train, test = train_test_split(df, test_size=TESTPCT, random_state=42)

# Separate the features and labels.
# drop() builds new feature frames rather than deleting the label column in
# place from the frames returned by train_test_split.
trainY = train["MeanRunTime"]
testY  = test["MeanRunTime"]
train  = train.drop("MeanRunTime", axis=1)
test   = test.drop("MeanRunTime", axis=1)

In [5]:
# Run a simplistic version of RF to serve as a baseline.
# Only random_state is set (for a repeatable score); every other
# hyper-parameter is left at the library default.
regr = RandomForestRegressor(random_state=42)
rf   = regr.fit(train, trainY)

preds = rf.predict(test)

# Score with calcMAPE, the metric helper imported at the top of the notebook
# and called the same way by the parameter sweep. The previous
# `evaluate.process` call referenced a module that is never imported here and
# raised NameError on a fresh kernel.
score = calcMAPE(preds, testY)
print("baseline MAPE is {:.3f}".format(score))

baseline MAPE is 0.032


In [5]:
# Fit a 100-tree forest and use it to decide which features are worth keeping
regr = RandomForestRegressor(n_estimators=100)
rf = regr.fit(train, trainY)

# RFfeatureEval.process is a project helper; the code below relies on each
# returned entry exposing the column name at index 0 and its score at index 1.
cols = RFfeatureEval.process(train.columns, rf)

# Retain only the columns whose score exceeds 0.01
keep = []
for col in cols:
    if col[1] > .01:
        keep.append(col[0])

# Restrict both feature frames to the retained columns
train, test = train[keep], test[keep]

Feature        importance
NDIMC          0.22
NWG            0.19
MDIMC          0.18
MWG            0.16
SA             0.08
SB             0.08
KWG            0.06
KWI            0.03
VWN            0.00
VWM            0.00
STRM           0.00
STRN           0.00
MDIMA          0.00
NDIMB          0.00


In [11]:
# Candidate values for a small exhaustive sweep (5 x 1 x 3 = 15 forests).
# min_samples_split is passed as a float, which scikit-learn interprets as a
# fraction of the training rows rather than an absolute count.
node_size = list(np.linspace(start=.0002, stop=.001, num=5))
trees     = [100]
leaf_size = [7, 9, 13]

parms = itertools.product(node_size, trees, leaf_size)

# Collected as (parameter_tuple, score) pairs, one per candidate
results = []

# NOTE(review): candidates are scored on the test set, so the best score found
# here is an optimistic estimate -- consider a separate validation split.
for combo in parms:
    # Distinct loop names so the candidate lists above are not overwritten
    split_frac, n_trees, min_leaf = combo
    # Same seed for every candidate so score differences come from the
    # parameters, not from the forest's random number generator.
    regr = RandomForestRegressor(n_estimators=n_trees,
                                 min_samples_split=split_frac,
                                 min_samples_leaf=min_leaf,
                                 random_state=42)
    rf    = regr.fit(train, trainY)
    preds = rf.predict(test)
    score = calcMAPE(preds, testY)
    results.append((combo, score))

In [13]:
# Dump the sweep results to a delimited text file: a header line followed by
# one line per (parameter set, score) pair held in `results`.
delim = ","
header_fields = ("nodes", "trees", "leafs", "score")
with open("/home/tbrownex/results.csv", "w") as f:
    f.write(delim.join(header_fields) + "\n")
    for (nodes, n_trees, leafs), mape in results:
        row_fields = (str(nodes), str(n_trees), str(leafs), str(mape))
        f.write(delim.join(row_fields) + "\n")