### Prepare Data
- convert categorical fields to dummies
- normalize data
- test/train/validation set split
    - use the last 3-4 years for testing and validation

### Train and Evaluate
- Train the model using 1) RandomForest and 2) XGBoost
- Evaluate the predictions against the test set

### Prediction API
- Implement a function that takes raw data as input and produces the hourly predictions
    - implement a function that converts the dummies back to categorical
    - denormalize the data back to the original scale    

In [109]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from dateutil import parser
from sklearn import model_selection, preprocessing, ensemble
%matplotlib inline

In [154]:
# data parameters
# columns that get one-hot encoded by to_train_format
categorical_columns = [
    "vehicle_type",
    "location_id",
    "direction",
    "weekday",
    "week",
]
# target columns, one per hour: hour_1 .. hour_24
dependent_columns = list(map("hour_{}".format, range(1, 25)))

# functions
def to_train_format(raw_data):
    """
    Converts the categorical fields to dummy (one-hot) columns.

    Every column listed in ``categorical_columns`` is replaced by indicator
    columns named ``<column>_<value>``. The indicator columns are placed
    after the remaining columns, grouped in the order of
    ``categorical_columns``. No scaling or normalization is applied and the
    input DataFrame is not modified.

    Returns a new DataFrame.
    Raises KeyError if a categorical column is missing from ``raw_data``.
    """
    # get_dummies drops the encoded columns and appends their indicators,
    # which is what the previous per-column concat/drop loop did by hand.
    return pd.get_dummies(raw_data, columns=categorical_columns)

def train_test_split(data, test_size=0.10, random_state=None):
    """
    Splits the formatted data into train and test sets.

    The columns in ``dependent_columns`` become the targets (Y); every
    other column is a feature (X). ``test_size`` is forwarded to
    scikit-learn's ``train_test_split``. ``random_state`` is forwarded as
    well: pass an int to get the same split on every run, or leave it as
    None for a different random split each time.

    NOTE(review): scikit-learn shuffles the rows before splitting, so the
    test set is a random sample and not the most recent rows.

    Returns X_train, X_test, Y_train, Y_test as DataFrames.
    """
    X = data.drop(dependent_columns, axis=1)
    Y = data[dependent_columns]
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=test_size, random_state=random_state)
    return X_train, X_test, Y_train, Y_test

def train(X_train, Y_train, model):
    """
    Fits ``model`` on the training data, in place.

    After fitting, the column labels of the inputs are stored on the model
    as ``model.features`` (from X_train) and ``model.targets`` (from
    Y_train). Returns nothing.
    """
    model.fit(X_train, Y_train)
    # remember the column layout the model was fitted with
    for attribute, frame in (("features", X_train), ("targets", Y_train)):
        setattr(model, attribute, frame.columns)

def evaluate(X_test, Y_test, model):
    """
    Scores the model on the test data.

    Returns the value of ``model.score(X_test, Y_test)``; for scikit-learn
    regressors this is the coefficient of determination R^2.
    """
    return model.score(X_test, Y_test)

def predict(row, model):
    """
    Uses the trained model to predict with the given DataFrame.

    ``row`` holds raw (un-encoded) values; one prediction is produced per
    row. The data is encoded with ``to_train_format`` and then aligned to
    ``model.features``: features the input does not produce are filled
    with 0, columns not in ``model.features`` are dropped, and the columns
    are put in the order of ``model.features``.

    Returns a DataFrame with the predictions, one column per entry of
    ``model.targets``, on a fresh 0..n-1 index.
    """
    encoded = to_train_format(row)
    # Align in one step. Adding columns one at a time to an empty frame
    # turns 0 placeholders created before the first real column into NaN.
    aligned = encoded.reindex(columns=model.features, fill_value=0)
    return pd.DataFrame(model.predict(aligned), columns=model.targets)

def sample_prediction(sample_data, model):
    """
    Predicts one randomly drawn row of ``sample_data``.

    Returns a DataFrame with one line per field and two columns: the
    original values (labelled with the drawn row's index) and the
    predicted values (labelled "prediction"). Cells without a value are
    shown as empty strings.
    """
    picked = sample_data.sample(1)
    predicted = predict(picked, model)
    # stack original and prediction as rows, then flip so fields run down
    stacked = pd.concat([picked, predicted])
    table = stacked.fillna("").transpose()
    original_label = table.columns[0]
    table.columns = [original_label, "prediction"]
    return table

def generate_prediction_series(
        model,
        dates=("2018-06-22", "2018-06-23", "2018-06-24", "2018-06-25", "2018-06-26"),
        location_ids=(168, 1403, 110),
        vehicle_types=("13 Linja-autot", "12 KAIP", "11 HA-PA"),
        temperature=15,
        rain=0,
    ):
    """
    Generates a predicted data series for the given dates,
    locations, and vehicle types.

    One row is produced for every combination of date, location,
    vehicle type, and direction (1 and 2). ``dates`` are date strings
    parsed with ``dateutil``; year, ISO week number, and weekday
    (Monday = 0) are derived from them. ``temperature`` and ``rain`` are
    the weather values assumed for every generated row.

    Returns a DataFrame holding the input fields followed by the
    ``dependent_columns`` predictions, or an empty DataFrame when there
    are no combinations to predict.
    """
    frames = []
    for date_str in dates:
        date = parser.parse(date_str)
        week = date.isocalendar()[1]
        weekday = date.weekday()
        for location_id in location_ids:
            for vehicle_type in vehicle_types:
                for direction in (1, 2):
                    row = pd.DataFrame({
                        "year": [date.year],
                        "week": [week],
                        "weekday": [weekday],
                        "location_id": [location_id],
                        "direction": [direction],
                        "vehicle_type": [vehicle_type],
                        "temperature": [temperature],
                        "rain": [rain],
                    })
                    prediction = predict(row, model)
                    prediction.columns = dependent_columns
                    frames.append(pd.concat([row, prediction], axis=1))
    if not frames:
        return pd.DataFrame()
    # A single concat replaces the per-row DataFrame.append, which was
    # removed in pandas 2.0 and copied the whole frame on every call.
    return pd.concat(frames, ignore_index=True)

In [89]:
# load the refined dataset, ordered by date (oldest first)
raw_data = pd.read_csv("refined_dataset.csv")
raw_data = raw_data.sort_values(by="date")

# discard the columns that are not passed on to the model
unused_columns = ["sum", "location_name", "midsummer_week", "date"]
raw_data = raw_data.drop(unused_columns, axis=1)

# hold back the last 100 rows for validation; the rest is used for modelling
holdout_rows = 100
model_data = raw_data[:-holdout_rows]
validation_data = raw_data[-holdout_rows:]

# encode the modelling data and split it into train and test sets
train_formatted_data = to_train_format(model_data)
X_train, X_test, Y_train, Y_test = train_test_split(train_formatted_data)

# random forest regression model, fitted on the train set
model = ensemble.RandomForestRegressor()
train(X_train, Y_train, model)

# score the fitted model on the test set
evaluate(X_test, Y_test, model)

0.97346974817986931

In [104]:
# predict one randomly drawn row of the held-back validation data and
# show the original values next to the predicted ones
sample_prediction(validation_data, model)

Unnamed: 0,4679,prediction
direction,1,
hour_1,11,11.7
hour_10,57,63.4
hour_11,59,69.7
hour_12,57,61.4
hour_13,59,64.6
hour_14,69,67.8
hour_15,60,53.8
hour_16,56,59.9
hour_17,62,53.2


In [108]:
# preview the first rows of the data used for training and testing
model_data.head()

Unnamed: 0,location_id,direction,vehicle_type,hour_1,hour_2,hour_3,hour_4,hour_5,hour_6,hour_7,...,hour_20,hour_21,hour_22,hour_23,hour_24,rain,temperature,year,weekday,week
99237,1403,2,16 HA + PK,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,-1.0,-12.9,2010,4,53
57628,110,1,16 HA + PK,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,-1.0,-12.9,2010,4,53
57627,110,1,15 KATP,5.0,3.0,1.0,2.0,1.0,0.0,0.0,...,2.0,1.0,5.0,3.0,2.0,-1.0,-12.9,2010,4,53
57626,110,1,14 KAPP,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,-1.0,-12.9,2010,4,53
57625,110,1,13 Linja-autot,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,2.0,0.0,1.0,1.0,1.0,-1.0,-12.9,2010,4,53


In [155]:
# generate predictions for the default dates, locations, and vehicle types
generate_prediction_series(model)

Unnamed: 0,year,week,weekday,location_id,direction,vehicle_type,temperature,rain,hour_1,hour_2,...,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23,hour_24
0,2018,25,4,168,1,13 Linja-autot,15,0,2.0,0.3,...,8.4,8.1,11.6,8.6,5.1,4.5,2.8,3.9,3.4,2.5
1,2018,25,4,168,2,13 Linja-autot,15,0,2.9,2.9,...,5.6,6.3,6.2,4.9,2.9,3.1,3.0,1.6,2.2,3.0
2,2018,25,4,168,1,12 KAIP,15,0,2.0,2.4,...,4.5,4.5,2.1,3.4,2.6,2.3,3.7,1.7,2.0,1.6
3,2018,25,4,168,2,12 KAIP,15,0,1.1,1.9,...,4.5,2.5,3.9,4.2,1.6,2.1,1.9,0.3,0.9,2.3
4,2018,25,4,168,1,11 HA-PA,15,0,123.6,66.3,...,1266.0,1379.9,1210.7,914.3,726.1,563.4,499.5,456.8,331.3,230.5
5,2018,25,4,168,2,11 HA-PA,15,0,203.1,168.9,...,1286.0,1308.0,1220.8,928.4,689.3,577.2,417.0,341.0,258.2,299.1
6,2018,25,4,1403,1,13 Linja-autot,15,0,0.0,0.0,...,0.4,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0
7,2018,25,4,1403,2,13 Linja-autot,15,0,0.0,0.0,...,0.2,0.2,0.1,0.1,1.0,0.4,0.1,0.0,0.3,0.0
8,2018,25,4,1403,1,12 KAIP,15,0,0.0,0.0,...,0.8,0.8,0.8,1.2,0.3,0.2,0.3,0.3,0.2,0.0
9,2018,25,4,1403,2,12 KAIP,15,0,0.0,0.1,...,2.4,1.6,1.1,1.2,0.3,0.7,0.0,0.3,0.0,0.4
