#### Generate list of panels and get the forecasts using different methods

In [1]:
import numpy  as np
import pandas as pd
from config import getClient
import forecast
import warnings
warnings.filterwarnings('ignore')
import time

# Field separator used when assembling rows of the predictions output
DELIM = ","

# Settings passed to forecast.RF
RFparms = dict(numTrees=200, depth=8, features=16)

# Settings passed to forecast.XGB
XGBparms = dict(cbt=0.8, LR=0.7, alpha=6, maxDepth=9, numTrees=60)

In [2]:
# Look up the data location for the "OutFront" client; it is used as a
# string prefix for every file read or written by this notebook
dataloc = getClient("OutFront")

In [3]:
# Load a capped number of rows from the train and test extracts
train = pd.read_csv(dataloc + "testData/train.csv", nrows=1600000)
test = pd.read_csv(dataloc + "testData/test.csv", nrows=800000)

# Parse the "date" column of both frames into pandas datetimes
for frame in (train, test):
    frame["date"] = pd.to_datetime(frame["date"])

In [4]:
# Text file that receives the panels rejected by the forecasting loop, one per
# line; the handle stays open across cells and is closed after that loop
outlier  = open(dataloc+"outliers.csv", "w")

In [5]:
def calcError(predictions, Y):
    """Return the mean absolute error of *predictions* relative to *Y*.

    The absolute difference between the two inputs is divided element-wise
    by Y, averaged, and rounded to two decimal places. The result is a
    fraction, so 1.0 means the forecasts are off by 100% on average.
    """
    relative = abs(predictions - Y) / Y
    return round(relative.mean(), 2)

In [6]:
# Key both frames by panel so a panel's rows can be selected by label
train = train.set_index("panel")
test = test.set_index("panel")

# One group of training rows per panel
grp = train.groupby(level="panel")

In [None]:
# Columns that are not model inputs: row identifiers plus the target itself
nonFeatures = ["date", "hour", "population"]

# Accepted panels, with the test dates and hours that belong to each of them
panelList = []
datesList = []
hoursList = []

# Error rate of each forecaster, one entry per accepted panel
RFerrors    = []
STLerrors   = []
XGBerrors   = []
naiveerrors = []

# Forecasts and actual values, one entry per accepted panel
RFpreds    = []
STLpreds   = []
XGBpreds   = []
actuals    = []

# Every group shares train's columns, so the feature list is loop-invariant
features = [x for x in train.columns if x not in nonFeatures]

start = time.time()
try:
    for panel, data in grp:
        # A list indexer always yields a DataFrame, even for a one-row panel
        tst = test.loc[[panel]]

        # These are simplistic forecasters: just use the Time Block
        meanMorning   = data.loc[data["block_morning"]   == 1, "population"].mean()
        meanAfternoon = data.loc[data["block_afternoon"] == 1, "population"].mean()
        meanEvening   = data.loc[data["block_evening"]   == 1, "population"].mean()

        trainX = np.array(data[features])
        testX  = np.array(tst[features])
        trainY = data["population"]
        testY  = tst["population"]

        # holds the data to pass to the forecasters
        d = {"trainX": trainX,
             "trainY": trainY,
             "testX":  testX,
             "testY":  testY}

        predictions = forecast.RF(d, RFparms)
        errorRate   = calcError(predictions, testY)
        # A very high error rate usually means the data is messed up
        if errorRate < 4:
            RFerrors.append(errorRate)
            RFpreds.append(predictions)

            predictions = forecast.STL(d)
            STLpreds.append(predictions)
            STLerrors.append(calcError(predictions, testY))

            predictions = forecast.XGB(d, XGBparms)
            XGBpreds.append(predictions)
            XGBerrors.append(calcError(predictions, testY))

            # Naive forecaster: predict the training mean of the row's time
            # block. Built as a separate array so the slice taken from `test`
            # is never written to; rows in no block stay NaN.
            naive = np.full(len(tst), np.nan)
            naive[np.asarray(tst["block_morning"]   == 1)] = meanMorning
            naive[np.asarray(tst["block_afternoon"] == 1)] = meanAfternoon
            naive[np.asarray(tst["block_evening"]   == 1)] = meanEvening
            naiveerrors.append(calcError(naive, testY))

            panelList.append(panel)
            # Dates and hours must come from this panel's rows (tst), not from
            # the whole test set, so they line up with the forecasts
            datesList.append(tst["date"].reset_index(drop=True))
            hoursList.append(tst["hour"].reset_index(drop=True))
            actuals.append(testY)
        else:
            # str() keeps this working when panel ids are not strings
            outlier.write(str(panel) + "\n")
finally:
    # Close the outlier file even if a forecaster raises part-way through
    outlier.close()
end = time.time()
print("Processing time: {:.0f} minutes".format((end-start)/60))

In [None]:
# Summary has one row per accepted panel with the error rate of each forecaster
with open(dataloc+"summary.csv", "w") as output:
    hdr = ["panel","RF", "STL","XGB", "Naive"]
    # Use the shared DELIM so header and rows always agree with predictions.csv
    hdr = DELIM.join(hdr) + "\n"
    output.write(hdr)
    errs = [list(a) for a in zip(panelList,RFerrors, STLerrors, XGBerrors, naiveerrors)]

    for x in errs:
        rec = DELIM.join(map(str, x)) + "\n"
        output.write(rec)

In [None]:
# "Predictions" has one row per panel and test observation: the actual value
# alongside the RF, STL and XGB forecasts
with open(dataloc+"predictions.csv", "w") as output:
    hdr = ["panel","date", "hour","actual","RF","STL","XGB"]
    # Header and rows share DELIM so they cannot drift apart
    hdr = DELIM.join(hdr) + "\n"
    output.write(hdr)
    for x, p in enumerate(panelList):
        # zip stops at the shortest input, so each panel yields as many rows
        # as its shortest column
        preds = [list(a) for a in zip(datesList[x], hoursList[x], actuals[x], RFpreds[x], STLpreds[x], XGBpreds[x])]
        for y in preds:
            # str() keeps this working when panel ids are not strings
            rec = str(p) + DELIM + DELIM.join(map(str, y)) + "\n"
            output.write(rec)