#### Create the CSV input so the optimizer routines don't have to do all the formatting

In [1]:
import numpy as np
import pandas as pd
from datetime import timedelta

from getConfig import getConfig
from getData   import getData

# Length of each panel's training window, in weeks, counted from that panel's earliest date
trainWeeks = 52
# Columns that formatX one-hot encodes via pd.get_dummies
COLS = ["block", "dow", "weekNum", "month"]

In [None]:
# Column name -> dtype string mapping.
# NOTE(review): no visible cell passes this to pd.read_csv (e.g. dtype=types) --
# confirm whether it is still meant to be used.
types = {
    "hour": "int8",
    "population": "float32",
    "dow": "int8",
    "weekNum": "int8",
    "month": "int8",
    "holiday": "int8",
}

In [3]:
# Load the input table. `dataloc` is not defined in the visible cells --
# presumably it is set from getConfig in a cell not shown here (TODO confirm).
df = pd.read_csv(dataloc+"3_final.csv")

# Convert their date format to a standard date (pandas datetime), so the dates
# can be compared and offset with timedelta when splitting train/test
df["date"] = pd.to_datetime(df["date"])

In [5]:
# Index the rows by panel, then group on that index (level 0) so each panel
# can be split into train/test independently
df  = df.set_index("panel")
grp = df.groupby(level=0)

In [6]:
def normalizeColumns(test, trainCols):
    ''' Give Test every column that Train has.

    Train and Test usually cover different date ranges, so Train may contain
    one-hot columns (e.g. "Week 28") that never occur in Test. Each such
    column is added to Test and filled with 0; the columns are then sorted by
    name.

    test      -- DataFrame to complete; it is modified in place
    trainCols -- iterable of the column labels present in Train

    Returns the same (mutated) `test` object. Columns that exist only in Test
    are kept as they are.
    '''
    for name in trainCols:
        if name not in test.columns:
            test[name] = 0
    test.sort_index(axis=1, inplace=True)
    return test

In [7]:
# X values need to be "one-hot encoded" for RF and NN
def formatX(train, test):
    ''' One-hot encode the COLS columns of Train and Test.

    Both frames go through pd.get_dummies and come back with their columns
    sorted by name. Dummy columns that exist in Train but not in Test are
    added to Test as all-zero columns by normalizeColumns.

    Returns the tuple (train, test). An AssertionError is raised when the two
    frames still differ in their number of columns.
    '''
    train = pd.get_dummies(train, columns=COLS).sort_index(axis=1)

    # Test must end up with the same columns as the encoded Train frame
    test = normalizeColumns(pd.get_dummies(test, columns=COLS), train.columns)
    assert (test.shape[1] == train.shape[1])
    return train, test

In [8]:
# Output file listing the panels that get skipped by the train/test split loop
# (one panel per line); that loop writes to it and closes it.
skipping = open(dataloc+"skipped.csv", "w")

In [9]:
# Per-panel encoded frames, collected for the final concatenation
tr = []
te = []

# try/finally guarantees the skip log is closed even when a sanity check or
# formatX raises part-way through the panels.
try:
    for panel, data in grp:
        # Training window: the first `trainWeeks` weeks after the panel's earliest date
        trainEnd = data["date"].min() + timedelta(weeks=trainWeeks)
        inTrain = data["date"] < trainEnd
        train = data.loc[inTrain]
        test  = data.loc[~inTrain]
        # Sanity check: the two parts must account for the whole panel. abs() makes
        # the check catch a discrepancy in either direction.
        assert (abs(data["population"].sum() - train["population"].sum() - test["population"].sum()) < 1)
        if test.shape[0] > 0:
            train, test = formatX(train, test)
            tr.append(train)
            te.append(test)
        else:
            # Nothing falls after the training window: record the panel and skip it.
            # str() keeps this working when the panel ids are not strings.
            rec = str(panel) + "\n"
            skipping.write(rec)
finally:
    skipping.close()

In [11]:
# Stack the per-panel train frames into one table and write it out
tmp = pd.concat(tr)
tmp.to_csv(dataloc+"testData/"+"train.csv")
# Same for the per-panel test frames
tmp = pd.concat(te)
tmp.to_csv(dataloc+"testData/"+"test.csv")