### Prepare Data
- convert categorical fields to dummies
- normalize data
- test/train/validation set split
    - use the last 3-4 years for testing and validation

### Train and Evaluate
- Train the model using 1) RandomForest and 2) XGBoost
- Evaluate the predictions against the test set

### Prediction API
- Implement a function that takes raw data as input and produces the hourly predictions
    - implement a function that converts the dummies back to categorical
    - denormalize the data back to the original scale    

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dateutil import parser
from sklearn import model_selection, preprocessing, ensemble
%matplotlib inline

In [2]:
# data parameters
# columns that get one-hot encoded before training
categorical_columns = [
    "vehicle_type",
    "location_id",
    "direction",
    "weekday",
    "week",
]
# prediction targets: one column per hour of the day, hour_1 .. hour_24
dependent_columns = list(map("hour_{}".format, range(1, 25)))

# functions
def to_train_format(raw_data, columns=None):
    """
    Converts categorical fields to dummy (one-hot) columns.

    Each encoded column is removed and replaced by indicator columns
    named "<column>_<value>", appended after the remaining columns in
    the order the encoded columns are listed. No normalization of the
    values is performed. The input DataFrame is not modified.

    Parameters:
        raw_data: DataFrame containing every column to be encoded.
        columns: names of the columns to encode; defaults to the
            module-level `categorical_columns`.

    Returns a new DataFrame.
    """
    if columns is None:
        columns = categorical_columns
    # a single get_dummies call encodes all listed columns (whatever
    # their dtype) and avoids rebuilding the frame once per column
    return pd.get_dummies(raw_data, columns=list(columns))

def train_test_split(data, test_size=0.10):
    """
    Separates the formatted data into features and targets and splits
    both into train and test parts.

    The target columns are the ones listed in `dependent_columns`; all
    other columns are treated as features. The split itself is
    delegated to sklearn's `model_selection.train_test_split`.

    Returns the tuple (X_train, X_test, Y_train, Y_test) of DataFrames.
    """
    targets = data[dependent_columns]
    features = data.drop(columns=dependent_columns)
    parts = model_selection.train_test_split(features, targets, test_size=test_size)
    return tuple(parts)

def train(X_train, Y_train, model):
    """
    Fits the given model to the training data in place.

    After fitting, the feature and target column labels are stored on
    the model as `model.features` and `model.targets`.
    Returns nothing.
    """
    model.fit(X_train, Y_train)
    # remember the column layout the model was fitted with
    model.features, model.targets = X_train.columns, Y_train.columns

def evaluate(X_test, Y_test, model):
    """
    Scores the model against the test data.
    Returns whatever `model.score` reports (the coefficient of
    determination R^2 for the regressors used here).
    """
    return model.score(X_test, Y_test)

def predict(row, model):
    """
    Uses the trained model to predict with the given DataFrame.

    The input is one-hot encoded with `to_train_format` and then aligned
    to the feature columns stored on the model by `train`: columns the
    model does not know are dropped, and feature columns missing from
    the input are filled with 0.

    Parameters:
        row: DataFrame in raw (un-encoded) format.
        model: fitted model exposing `features`, `targets` and
            `predict`.

    Returns a DataFrame of predictions, one row per input row, with
    the model's target labels as columns and a fresh 0..n-1 index.
    """
    encoded = to_train_format(row)
    # reindex aligns all columns in one step. Assigning a scalar 0 to an
    # empty frame before its index exists creates a zero-row column that
    # turns into NaN once a real column is added.
    aligned = encoded.reindex(columns=model.features, fill_value=0)
    pred = pd.DataFrame(model.predict(aligned))
    pred.columns = model.targets
    return pred

def sample_prediction(sample_data, model):
    """
    Picks one random row from the sample data and predicts it.
    Returns a transposed DataFrame with two columns: the picked row's
    original values (labelled by its index) and "prediction".
    Cells with no value on one side are shown as empty strings.
    """
    picked = sample_data.sample(1)
    predicted = predict(picked, model)
    # stack original and prediction as two rows, then flip to columns
    stacked = pd.concat([picked, predicted])
    result = stacked.fillna("").transpose()
    result.columns = [result.columns[0], "prediction"]
    return result

def generate_prediction_series(
        model,
        dates=("2018-06-22", "2018-06-23", "2018-06-24", "2018-06-25", "2018-06-26"),
        location_ids=(168, 1403, 110),
        vehicle_types=("13 Linja-autot", "12 KAIP", "11 HA-PA"),
    ):
    """
    Generates a predicted data series for the given dates,
    locations, and vehicle types.

    One prediction is made for every combination of date, location,
    vehicle type and direction (1 and 2).

    Parameters:
        model: fitted model accepted by `predict`.
        dates: iterable of date strings parseable by dateutil.
        location_ids: iterable of location ids.
        vehicle_types: iterable of vehicle type labels.

    Returns a DataFrame with one row per combination, holding the input
    fields followed by the `dependent_columns` predictions. Returns an
    empty DataFrame when there are no combinations.
    """
    frames = []
    for date_str in dates:
        date = parser.parse(date_str)
        # the date-derived fields are the same for every inner combination
        year = date.year
        week = date.isocalendar()[1]
        weekday = date.weekday()
        for location_id in location_ids:
            for vehicle_type in vehicle_types:
                for direction in (1, 2):
                    # NOTE: weather inputs (temperature, rain) are not
                    # supplied to the model here.
                    row = pd.DataFrame({
                        "year": [year],
                        "week": [week],
                        "weekday": [weekday],
                        "location_id": [location_id],
                        "direction": [direction],
                        "vehicle_type": [vehicle_type],
                    })
                    prediction = predict(row, model)
                    prediction.columns = dependent_columns
                    frames.append(pd.concat([row, prediction], axis=1))
    if not frames:
        # pd.concat raises on an empty list
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenating once is
    # also linear instead of quadratic
    return pd.concat(frames, ignore_index=True)

def to_ts_format(generated_data, value_columns=None):
    """
    Converts the generated dataset to time series (long) format.

    Every input row is expanded into one output row per value column.
    The output keeps all other columns and adds "hour" (the name of
    the value column) and "value" (its content). Rows are ordered by
    input row first, then by value column.

    Parameters:
        generated_data: DataFrame in wide format, one column per hour.
        value_columns: names of the hourly columns to unpivot; defaults
            to the module-level `dependent_columns`.

    Returns a new DataFrame with a fresh 0..n-1 index.
    Raises KeyError if a value column is missing from the input.
    """
    if value_columns is None:
        value_columns = dependent_columns
    value_columns = list(value_columns)
    n_rows = len(generated_data)
    # repeat every row position once per value column (row-major order)
    positions = np.repeat(np.arange(n_rows), len(value_columns))
    ts_data = (
        generated_data.drop(columns=value_columns)
        .iloc[positions]
        .reset_index(drop=True)
    )
    ts_data["hour"] = np.tile(np.asarray(value_columns, dtype=object), n_rows)
    # ravel() flattens row by row, matching the repeat/tile order above
    ts_data["value"] = generated_data[value_columns].to_numpy().ravel()
    return ts_data
    

In [3]:
# load the raw data sorted by date and drop the columns not used for modelling
raw_data = (
    pd.read_csv("refined_dataset.csv")
    .sort_values(by="date")
    .drop(["sum", "location_name", "date"], axis=1)
)

# the last 100 rows are held back for sample predictions
model_data = raw_data[:-100]
validation_data = raw_data[-100:]

# encode the categorical columns, then split into train and test sets
train_formatted_data = to_train_format(model_data)
X_train, X_test, Y_train, Y_test = train_test_split(train_formatted_data)

# fit a random forest regression model
model = ensemble.RandomForestRegressor()
train(X_train, Y_train, model)

# R^2 score on the test set
evaluate(X_test, Y_test, model)

0.97473409109581599

In [19]:
# compare one randomly picked validation row with its prediction
sample_prediction(sample_data=validation_data, model=model)

Unnamed: 0,100168,prediction
direction,1,
hour_1,0,1.2
hour_10,15,18.8
hour_11,16,22.3
hour_12,17,21.1
hour_13,10,24.2
hour_14,21,27.1
hour_15,13,26.9
hour_16,18,24.9
hour_17,11,21.9


In [7]:
# peek at the first five rows of the modelling data
model_data.head(n=5)

Unnamed: 0,location_id,direction,vehicle_type,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,...,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,hour_24,year,weekday,week
35913,110,2,11 HA-PA,55.0,72.0,53.0,46.0,30.0,32.0,40.0,...,569.0,548.0,454.0,311.0,220.0,131.0,91.0,2010,4,53
35919,110,2,17 HA + AV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,3.0,0.0,0.0,1.0,0.0,2010,4,53
35918,110,2,16 HA + PK,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,11.0,6.0,5.0,4.0,1.0,0.0,0.0,2010,4,53
35917,110,2,15 KATP,5.0,1.0,8.0,3.0,5.0,2.0,2.0,...,2.0,2.0,2.0,5.0,3.0,1.0,4.0,2010,4,53
35916,110,2,14 KAPP,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,1.0,0.0,2010,4,53


In [8]:
# predictions for the default dates, locations and vehicle types
generated = generate_prediction_series(model=model)
generated

Unnamed: 0,year,week,weekday,location_id,direction,vehicle_type,hour_1,hour_2,hour_3,hour_4,...,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,hour_24
0,2018,25,4,168,1,13 Linja-autot,1.3,2.5,1.5,1.7,...,3.0,2.7,5.4,4.0,2.0,2.8,3.8,1.7,9.1,3.2
1,2018,25,4,168,2,13 Linja-autot,2.8,3.0,1.6,1.8,...,2.1,3.5,3.8,2.2,0.7,2.1,3.1,1.3,3.1,2.0
2,2018,25,4,168,1,12 KAIP,2.8,2.3,2.3,2.2,...,5.5,5.3,2.8,4.1,2.7,2.1,4.0,1.7,1.0,1.9
3,2018,25,4,168,2,12 KAIP,3.7,2.7,1.1,3.1,...,14.6,8.5,8.2,7.7,5.7,3.8,1.6,1.9,0.5,3.6
4,2018,25,4,168,1,11 HA-PA,117.5,63.1,42.9,65.1,...,1364.6,1540.6,1314.2,996.4,745.5,556.2,460.1,416.9,303.9,210.2
5,2018,25,4,168,2,11 HA-PA,194.0,171.6,77.4,48.3,...,1144.0,1092.4,975.1,739.9,575.5,493.5,338.9,293.6,232.7,275.8
6,2018,25,4,1403,1,13 Linja-autot,0.0,0.0,0.0,0.0,...,0.2,0.2,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
7,2018,25,4,1403,2,13 Linja-autot,0.0,0.0,0.0,0.0,...,0.0,0.2,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0
8,2018,25,4,1403,1,12 KAIP,0.0,0.0,0.0,0.1,...,1.8,0.0,1.6,1.7,0.0,0.1,0.0,0.0,0.0,0.0
9,2018,25,4,1403,2,12 KAIP,0.0,0.8,0.0,0.0,...,0.8,1.2,1.6,0.6,1.4,1.4,0.2,0.2,0.0,0.2


In [10]:
# Reshape the generated predictions from wide (one column per hour) to
# long format: one row per (generated row, hour) pair, ordered by
# generated row first and hour second.
# Built with one vectorized repeat/tile instead of DataFrame.append in a
# loop: append was removed in pandas 2.0 and made the loop quadratic.
n_generated = len(generated)
row_positions = np.repeat(np.arange(n_generated), len(dependent_columns))
# each row keeps the index label of the generated row it came from
ts_format = generated.drop(dependent_columns, axis=1).iloc[row_positions].copy()
ts_format["hour"] = np.tile(np.asarray(dependent_columns, dtype=object), n_generated)
# ravel() flattens row by row, matching the repeat/tile order above
ts_format["value"] = generated[dependent_columns].to_numpy().ravel()
ts_format.reset_index()

Unnamed: 0,index,year,week,weekday,location_id,direction,vehicle_type,hour,value
0,0,2018,25,4,168,1,13 Linja-autot,hour_1,1.3
1,0,2018,25,4,168,1,13 Linja-autot,hour_2,2.5
2,0,2018,25,4,168,1,13 Linja-autot,hour_3,1.5
3,0,2018,25,4,168,1,13 Linja-autot,hour_4,1.7
4,0,2018,25,4,168,1,13 Linja-autot,hour_5,1.2
5,0,2018,25,4,168,1,13 Linja-autot,hour_6,3.2
6,0,2018,25,4,168,1,13 Linja-autot,hour_7,3.6
7,0,2018,25,4,168,1,13 Linja-autot,hour_8,3.9
8,0,2018,25,4,168,1,13 Linja-autot,hour_9,2.5
9,0,2018,25,4,168,1,13 Linja-autot,hour_10,3.8
