# Ensemble Methods: Challenge Session

In [None]:
import numpy  as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt

## Bike Sharing Demand

Source:
https://www.kaggle.com/c/bike-sharing-demand

> You are provided hourly rental data spanning two years. 
> ... the training set is comprised of the first 19 days of each month,
> while the test set is the 20th to the end of the month. You must predict
> the total count of bikes rented during each hour covered by the test set,
> using only information available prior to the rental period.

This prediction challenge uses RMSE of log-targets as the quality metric:
$$ \mathtt{RMSLE}
    = \sqrt{ \frac1T \sum_{t=1}^T (\log (y_t+1) - \log (\hat{y}_t+1))^2}
    \,. $$

In [None]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log(y_pred + 1) - log(y_true + 1)) ** 2))``.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, compared element-wise with ``y_true``.

    Returns
    -------
    float
        The RMSLE. NaN terms (e.g. from a NaN target) are left out of
        the mean.
    """
    # Coerce up front so that plain lists/tuples are accepted as well
    # (``[1, 2] + 1`` would raise a TypeError).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    # ``nan_to_num`` keeps invalid predictions from poisoning the score:
    # a NaN log (prediction below -1) becomes 0 and a -inf log
    # (prediction equal to -1) becomes a large finite negative number.
    log_pred = np.nan_to_num(np.log(y_pred + 1))
    delta = log_pred - np.log(y_true + 1)
    return np.sqrt(np.nanmean(delta ** 2))

### Load the data

In [None]:
def load_data(filelike):
    """Read a CSV sample and parse its ``datetime`` column.

    Parameters
    ----------
    filelike : str or file-like
        Path or open handle passed to ``pd.read_csv``; the first row is
        taken as the header.

    Returns
    -------
    pd.DataFrame
        The loaded frame with ``datetime`` converted by
        ``pd.to_datetime`` and a fresh 0..n-1 index.
    """
    frame = pd.read_csv(filelike, header=0)
    frame["datetime"] = pd.to_datetime(frame["datetime"])
    return frame.reset_index(drop=True)

# Labelled training sample and the sample to predict for.
data = load_data('bike/train.csv')
test = load_data('bike/test.csv')

Inspect the data

In [None]:
# First five rows of the training frame.
data.head(n=5)

In [None]:
# First five rows of the test frame.
test.head(n=5)

Print the shapes

In [None]:
# Rows x columns of both frames, printed on a single line.
print("Train %dx%d" % data.shape,
      "Test %dx%d" % test.shape)

#### Data Fields

* **datetime** -- hourly date + timestamp  
* **season** --  1 = spring, 2 = summer, 3 = fall, 4 = winter 
* **holiday** -- whether the day is considered a holiday
* **workingday** -- whether the day is neither a weekend nor holiday
* **temp** -- temperature in Celsius
* **atemp** -- "feels like" temperature in Celsius
* **humidity** -- relative humidity
* **windspeed** -- wind speed
* **casual** -- number of non-registered user rentals initiated
* **registered** -- number of registered user rentals initiated
* **count** -- number of total rentals


* **weather**:
    1. Clear, Few clouds, Partly cloudy, Partly cloudy 
    2. Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 
    3. Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 
    4. Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog

Create the full samples and drop from `X_full` the columns absent in `test`

In [None]:
# Target: the total number of rentals.
y_full = data["count"].copy()

# Features: keep only the columns that the test frame also provides,
# in the test frame's column order.
X_test = test.copy()
X_full = data.drop(columns="count", errors="ignore")[X_test.columns]

Create the train/validation split.
Train on the data up to (and including) the **10**-th of each month.

In [None]:
# Days 1-10 of every month go to training, the remaining days to validation.
train_mask = X_full["datetime"].dt.day <= 10
X_train, X_valid = X_full[train_mask], X_full[~train_mask]
y_train, y_valid = y_full[train_mask], y_full[~train_mask]

# Extract features for train/validation/test samples

A compact one-hot encoder for dataframes

In [None]:
from sklearn.preprocessing import OneHotEncoder

def one_hot(series, n_values, name):
    """One-hot encode an integer-coded series into an indicator dataframe.

    The encoding is done with plain NumPy so that it does not depend on
    the ``OneHotEncoder`` constructor signature, which has changed
    between scikit-learn releases.

    Parameters
    ----------
    series : pd.Series
        Integer codes in ``0 .. n_values - 1``.
    n_values : int
        Number of categories, i.e. the number of output columns.
    name : str
        Column prefix; columns are named ``"<name>_00"``, ``"<name>_01"``, ...

    Returns
    -------
    pd.DataFrame
        Integer 0/1 frame with ``n_values`` columns that shares the index
        of ``series``.

    Raises
    ------
    ValueError
        If a code lies outside ``0 .. n_values - 1``.
    """
    codes = np.asarray(series.values).astype(int)
    # Reject unknown categories explicitly; negative codes would otherwise
    # silently wrap around through NumPy's negative indexing.
    if codes.size and (codes.min() < 0 or codes.max() >= n_values):
        raise ValueError("values of %r must lie in [0, %d)" % (name, n_values))

    indicators = np.zeros((codes.shape[0], n_values), dtype=int)
    # Set a single 1 per row, in the column given by that row's code.
    indicators[np.arange(codes.shape[0]), codes] = 1
    return pd.DataFrame(indicators, index=series.index,
                        columns=["%s_%02d"%(name, v,)
                                 for v in range(n_values)])

Extract datetime-related features

In [None]:
def extract_features(df, encode=False):
    df_ = df.copy()
    df_["year"] = df_.datetime.dt.year - 2010
    if not encode:
        df_["month"] = df_.datetime.dt.month
        df_["weekday"] = df_.datetime.dt.weekday
        df_["hour"] = df_.datetime.dt.hour
    else:
        df_ = pd.concat(
            [df_,
             one_hot(df_.datetime.dt.hour, 24, "hour"),
             one_hot(df_.datetime.dt.month - 1, 12, "month"),
             one_hot(df_["season"] - 1, 4, "season"),
             one_hot(df_.datetime.dt.weekday, 7, "weekday"),
            ], axis=1)
        df_.drop("season", axis=1, inplace=True)
    df_.drop("datetime", axis=1, inplace=True)
    return df_.values

Get the train, validation and test datasets. 

In [None]:
def get_X_y(df_X, df_y=None):
    """Turn a feature frame (and optionally its target) into arrays.

    Parameters
    ----------
    df_X : pd.DataFrame
        Frame passed to ``extract_features`` with ``encode=True``.
    df_y : pd.Series, optional
        Target values; omitted for the unlabelled test sample.

    Returns
    -------
    The feature array alone when ``df_y`` is None, otherwise the pair
    ``(features, df_y.values)``.
    """
    features = extract_features(df_X, encode=True)
    if df_y is not None:
        return features, df_y.values
    return features

# Replace each dataframe sample by its array counterpart. The names are
# rebound, so this cell can only be run once per kernel session.
X_full, y_full = get_X_y(X_full, y_full)
X_test = get_X_y(X_test)

X_train, y_train = get_X_y(X_train, y_train)
X_valid, y_valid = get_X_y(X_valid, y_valid)

Check the dimensions

In [None]:
# Shapes of the train, validation and test feature matrices.
print(*(features.shape for features in (X_train, X_valid, X_test)))

# Train, validate & apply regression

Train/validate your ensemble model here

Samples:
* ``(X_train, y_train)`` -- train
* ``(X_valid, y_valid)`` -- validation

In [None]:
###############################
##### PUT YOUR MODEL HERE #####
###############################

## Make a submission

Samples:
* ``(X_full, y_full)`` -- full train dataset
* ``X_test`` -- the test dataset (no target $y$)

In [None]:
###############################
##### PUT YOUR MODEL HERE #####
###############################

# Placeholder all-zero predictions, one per test row. The builtin ``int``
# is used because the ``np.int`` alias has been removed from NumPy.
y_pred = np.zeros(X_test.shape[0], dtype=int)

Write the submission to ``"my_submission.csv"``
* ``y_pred`` -- predictions on the test ``X_test``

In [None]:
# Two-column submission: the test timestamps and the integer predictions.
(
    pd.DataFrame(
        {"datetime": test["datetime"], "count": y_pred.astype(int)},
        columns=["datetime", "count"],
    )
    .to_csv("my_submission.csv", index=False)
)