In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import evaluate
import XGBoost

# Input data: directory (with trailing slash) and file name. The two are
# concatenated to form the path of the CSV that is read.
DATALOC = "/home/tbrownex/data/test/"
FILE = "data.csv"

# Fraction of the (sub-sampled) rows held out as the test set.
TESTPCT = 0.2
# Fraction of the file's rows to keep; use this to process only part of the whole file.
DATASUBSET = 0.4

In [2]:
def getData(labelCol="MeanRunTime", seed=None):
    """Load the CSV, sub-sample it and split it into train/test features and labels.

    Reads DATALOC+FILE, keeps a random DATASUBSET fraction of its rows, then
    holds out a TESTPCT fraction of those rows as the test set.

    Parameters
    ----------
    labelCol : str, optional
        Name of the column to use as the label. Defaults to "MeanRunTime".
    seed : int or None, optional
        Passed as ``random_state`` to both the row sampling and the
        train/test split. With the default (None) neither step is seeded, so
        every call produces a different subset and split; pass an int to make
        the run repeatable.

    Returns
    -------
    dict
        "trainY" / "testY": the label column of each split.
        "trainX" / "testX": each split with the label column removed.

    Raises
    ------
    KeyError
        If labelCol is not a column of the loaded file.
    """
    df = pd.read_csv(DATALOC + FILE)
    df = df.sample(frac=DATASUBSET, random_state=seed)

    # Create Train and Test sets
    train, test = train_test_split(df, test_size=TESTPCT, random_state=seed)

    # Separate the features and labels. The labels are looked up first so a
    # missing column surfaces as a KeyError; drop() returns new frames, so the
    # split frames themselves are not modified in place.
    return {
        "trainY": train[labelCol],
        "trainX": train.drop(labelCol, axis=1),
        "testY": test[labelCol],
        "testX": test.drop(labelCol, axis=1),
    }

In [3]:
# Build the list of parameter configurations. Each configuration should already have been tuned on its own, and the configurations should differ substantially from one another.
def getConfigs():
    """Return the list of parameter dictionaries, one per configuration.

    The two configurations share every setting except "n_estimators"
    (30 and 300). A separate dict is built for each entry, so a caller can
    modify one without affecting the other.
    """
    # Settings common to every configuration, in the order they appear in
    # each resulting dict (after "n_estimators").
    shared = (
        ("max_depth", 12),
        ("learning_rate", 0.2),
        ("subsample", 0.7),
        ("colsample_bytree", 1.0),
    )

    configs = []
    for nEstimators in (30, 300):
        cfg = {"n_estimators": nEstimators}
        cfg.update(shared)
        configs.append(cfg)
    return configs

In [4]:
# Run every configuration through XGBoost.process and score its output
# against the test labels with evaluate.process.
dataDict = getData()
parmList = getConfigs()

results  = []   # one dict per configuration: its parameters plus a "score" entry
predList = []   # output of XGBoost.process for each configuration, in parmList order

for p in parmList:
    preds = XGBoost.process(p, dataDict)
    predList.append(preds)
    score = evaluate.process(preds, dataDict["testY"])
    # Store the score on a copy: writing it into p would add a non-parameter
    # key to the dicts held by parmList and make results alias them.
    results.append(dict(p, score=score))

In [5]:
# Print each configuration's result dict on its own line.
for entry in results:
    print(entry)

{'n_estimators': 30, 'learning_rate': 0.2, 'max_depth': 12, 'colsample_bytree': 1.0, 'score': 0.03862151460044706, 'subsample': 0.7}
{'n_estimators': 300, 'learning_rate': 0.2, 'max_depth': 12, 'colsample_bytree': 1.0, 'score': 0.04204164127562907, 'subsample': 0.7}


In [6]:
# Equal-weight ensemble: average the outputs of all configurations.
# sum()/len() covers however many entries predList holds rather than assuming
# exactly two; with two entries it equals (predList[0]+predList[1])/2.
# Assumes the entries support element-wise + and / (e.g. NumPy arrays), as the
# two-model expression did -- TODO confirm against XGBoost.process.
ensemble = sum(predList) / len(predList)

score = evaluate.process(ensemble, dataDict["testY"])
print("ensemble score is {:.3f}".format(score))

ensemble score is 0.037
