In [13]:
import numpy as np
import pandas as pd
from config import getClient
import xgboost as xgb
import time
import random

# Field separator used when hand-writing the predictions CSV records
DELIM = ","

# Columns that are bookkeeping or the target ("population") rather than model
# inputs; formatPanel drops them when it builds the feature matrices.
nonFeatures = [
    "date",
    "hour",
    "population",
]

##### Prepare the Train and Test sets

In [14]:
# Base location for the "OutFront" client, as resolved by the project's config helper.
# It is used as a string prefix for every input and output path in this notebook.
dataloc = getClient("OutFront")

# Only the first rows of each file are read: 900,000 for training, 450,000 for testing.
train = pd.read_csv(
    dataloc + "testData/train.csv",
    nrows=900000,
)
test = pd.read_csv(
    dataloc + "testData/test.csv",
    nrows=450000,
)

In [15]:
# Collect every distinct panel id while "panel" is still an ordinary column;
# the driver loop iterates over this array.
panelArray = train["panel"].unique()
# Uncomment to work on a random handful of panels instead of all of them:
#panelArray = np.random.choice(panelArray, size=4)

# Parse the vendor's date strings into real datetimes, then index both frames
# by panel so a single panel's rows can be pulled out with .loc.
# NOTE: this rebinds train/test, so the cell cannot be re-run without reloading the data.
train = train.assign(date=pd.to_datetime(train["date"])).set_index("panel")
test  = test.assign(date=pd.to_datetime(test["date"])).set_index("panel")

In [16]:
# Predictions file: opened once here, appended to by writePreds, and closed by
# the driver cell after all panels have been processed.
save = open(dataloc+"predictions.csv", "w")

# Header row; one column per field that writePreds emits
hdr = DELIM.join(["panel", "date", "hour", "prediction", "actual"]) + "\n"
save.write(hdr)

34

In [17]:
def setParms():
    """Build the full grid of hyper-parameter combinations to try per panel.

    Returns:
        list[list]: One ``[cbt, LR, alpha, maxDepth, numTrees]`` list per
        combination (the order the driver loop unpacks them in). The grid is
        the Cartesian product of the candidate values below, with the last
        field varying fastest.
    """
    # Candidate values for each XGBoost setting (see runModel for how they are used)
    colsampleChoices = [0.4, 0.6, 0.8]
    rateChoices      = [.05, .06, .07]
    alphaChoices     = [6]
    depthChoices     = [7,8,9]
    treeChoices      = [40, 50, 60]

    grid = []
    for colsample in colsampleChoices:
        for rate in rateChoices:
            for l1 in alphaChoices:
                for depth in depthChoices:
                    for trees in treeChoices:
                        grid.append([colsample, rate, l1, depth, trees])
    return grid

In [18]:
def getError(predictions, Y):
    """Return the mean absolute percentage error, as a fraction rounded to 2 places.

    Args:
        predictions: Array-like of predicted values (must support element-wise
            arithmetic with ``Y``).
        Y: Array-like of actual values, same length as ``predictions``.

    Returns:
        ``mean(|predictions - Y| / Y)`` rounded to two decimals.

    Note:
        Each error is divided by its actual value, so a zero in ``Y`` is not
        guarded against here.
    """
    relative = abs(predictions - Y) / Y
    return round(relative.mean(), 2)

In [19]:
def writePreds(panel, testDates, preds, testY):
    """Append one CSV record per test row to the open predictions file.

    Args:
        panel: Panel identifier (a string; it is concatenated into the record).
        testDates: Pair ``(dates, hours)`` of pandas Series for the test rows.
        preds: Predicted values, indexable by position; truncated to int on output.
        testY: Actual values, indexable by position.

    Writes to the module-level ``save`` handle using ``DELIM`` as separator,
    matching the header order: panel, date, hour, prediction, actual.
    """
    # Renumber from 0 so the Series can be indexed by row position below
    dateCol = testDates[0].reset_index(drop=True)
    hourCol = testDates[1].reset_index(drop=True)

    for row in range(len(dateCol)):
        fields = [
            panel,
            str(dateCol[row]),
            str(hourCol[row]),
            str(int(preds[row])),
            str(testY[row]),
        ]
        save.write(DELIM.join(fields) + "\n")

In [20]:
# for a panel, split train and test, then X and Y
def formatPanel(panel):
    """Extract one panel's train/test rows and split them into X and Y arrays.

    Reads the module-level ``train`` and ``test`` frames (both indexed by
    panel) and the ``nonFeatures`` column list.

    Args:
        panel: Index label of the panel to extract.

    Returns:
        tuple: ``(dataDict, testDates)`` where ``dataDict`` maps
        ``"trnX"``/``"tstX"`` to 2-D feature arrays and ``"trnY"``/``"tstY"``
        to 1-D arrays of the ``population`` column, and ``testDates`` is the
        pair ``(date Series, hour Series)`` for the test rows.

    Raises:
        KeyError: If ``panel`` is missing from either frame's index.
    """
    # Select with a one-element list so the result is always a DataFrame.
    # A scalar .loc returns a Series when the panel has exactly one row,
    # which would break the column lookups below.
    trn = train.loc[[panel]]
    tst = test.loc[[panel]]

    features = [x for x in trn.columns if x not in nonFeatures]

    dataDict = {}
    dataDict["trnX"] = np.array(trn[features])
    dataDict["trnY"] = np.array(trn["population"])
    dataDict["tstX"] = np.array(tst[features])
    dataDict["tstY"] = np.array(tst["population"])
    testDates = (tst["date"], tst["hour"])   # This is only for the random predictions
    return dataDict, testDates

In [21]:
# For a single panel, determine the best parameters
# NOTE: the text below is a disabled, unfinished draft kept inside a string
# literal. It defines nothing and runs nothing; the notebook simply echoes the
# string back as this cell's output. The grid actually used comes from setParms().
'''def optimizeModel(panel, dataDict):
    test_params = {
    "colsample_bytree":[.60, .7],
    "learning_rate":[.060, .065],
    'max_depth':[8,9],
    "reg_alpha":[6],
    "n_estimators":[45, 55]
    }'''

'def optimizeModel(panel, dataDict):\n    test_params = {\n    "colsample_bytree":[.60, .7],\n    "learning_rate":[.060, .065],\n    \'max_depth\':[8,9],\n    "reg_alpha":[6],\n    "n_estimators":[45, 55]\n    }'

In [22]:
# Input is the data for a panel and the parameters for the model
# Run the model and get the predictions for the test set
def runModel(dataDict, parms):
    """Fit an XGBoost regressor on one panel's training data and predict its test set.

    Args:
        dataDict: Mapping with ``"trnX"``, ``"trnY"`` and ``"tstX"`` entries,
            as produced by formatPanel.
        parms: Mapping with ``"cbt"``, ``"LR"``, ``"maxDepth"``, ``"alpha"``
            and ``"numTrees"`` entries for this run.

    Returns:
        The fitted model's predictions for ``dataDict["tstX"]``.
    """
    # Translate the notebook's short parameter names into XGBRegressor keywords.
    # NOTE(review): newer XGBoost releases call this objective 'reg:squarederror'
    # and treat 'reg:linear' as deprecated; confirm against the installed version.
    settings = {
        "objective":        'reg:linear',
        "colsample_bytree": parms["cbt"],
        "learning_rate":    parms["LR"],
        "max_depth":        parms["maxDepth"],
        "alpha":            parms["alpha"],
        "n_estimators":     parms["numTrees"],
    }
    fitted = xgb.XGBRegressor(**settings).fit(dataDict["trnX"], dataDict["trnY"])
    return fitted.predict(dataDict["tstX"])

In [23]:
# Report on the best set of parameters for each panel
def writeResults(results):
    """Write the per-panel tuning summary to ``XGBresults.csv`` under ``dataloc``.

    Args:
        results: Non-empty sequence of ``(panel, parmDict, errorRate)`` entries.
            The header's parameter columns come from the first entry's dict
            keys; every row writes its own dict's values in dict order.
    """
    with open(dataloc+"XGBresults.csv", 'w') as summary:
        # Header: panel, then one column per parameter name, then the error rate
        parmNames = results[0][1].keys()
        summary.write("panel" + "," + ",".join(parmNames) + "," + "errorRate" + "\n")

        for entry in results:
            parmText = ",".join(map(str, entry[1].values()))
            # entry[2] is the error rate recorded for this panel
            summary.write(entry[0] + "," + parmText + "," + str(entry[2]) + "\n")

In [24]:
# For each panel, run every parameter combination and keep the combination
# with the lowest test-set error.
start = time.time()
results = []

parms = setParms()

try:
    for panel in panelArray:
        bestParms = None
        lowError = np.inf
        dataDict, testDates = formatPanel(panel)
        for parm in parms:
            # A fresh dict per combination, so bestParms never aliases a later one
            parmDict = {
                "cbt":      parm[0],
                "LR":       parm[1],
                "alpha":    parm[2],
                "maxDepth": parm[3],
                "numTrees": parm[4],
            }

            preds = runModel(dataDict, parmDict)

            error = getError(preds, dataDict["tstY"])
            # Always accept the first combination so bestParms is never None,
            # and let any later result replace a NaN "best": NaN never
            # compares as smaller, so the selection would otherwise get stuck.
            if bestParms is None or np.isnan(lowError) or error < lowError:
                lowError  = error
                bestParms = parmDict
            # Just so we can see a graph, save once in a while
            # (roughly 1 run in 500; the value 51 itself is arbitrary)
            if random.randint(1,500) == 51:
                writePreds(panel, testDates, preds, dataDict["tstY"])
        # Record the winning combination, not whichever one happened to run last
        tup = (panel, bestParms, lowError)
        results.append(tup)
finally:
    # Close the predictions file even if a model run fails part-way through
    save.close()

end = time.time()
print("Processing time: {:.0f} minutes".format((end-start)/60))

writeResults(results)

Processing time: 95 minutes
