# Simple baselines

In [1]:
import sklearn
from sklearn.model_selection import cross_val_score, KFold

import pandas as pd
import matplotlib.pyplot as plt
from helpers import Baseline, per_station_models_cross_val_mean, load_all_processed_data

# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None

In [2]:
# Load the processed data set through the project helper. The cells below read
# the `station`, `weekhour`, `bikes`, `bikes_3h_ago`, `full_profile_bikes` and
# `full_profile_3h_diff_bikes` columns from it and add filled/derived columns
# to this same object.
# NOTE(review): presumably a pandas DataFrame with one row per station
# observation -- confirm against helpers.load_all_processed_data.
df = load_all_processed_data()

## Predict value from 3h ago (i.e. `bikes_3h_ago`)

First, fill in the missing values of `bikes_3h_ago` from early in the data set using the mean of the values from the same station and weekhour (this assumes there is some weekly pattern in bike usage).

In [12]:
# Fill gaps in bikes_3h_ago with the mean of the same (station, weekhour)
# group. The filled column is written back onto df, so later cells that read
# bikes_3h_ago see the filled values.
df['bikes_3h_ago'] = (
    df.groupby(['station', 'weekhour'])
    .bikes_3h_ago
    .transform(lambda x: x.fillna(x.mean()))
)

# NOTE(review): Baseline is a project helper; presumably it predicts the named
# column unchanged -- confirm in helpers.Baseline.
regr = Baseline('bikes_3h_ago')

# Seed the shuffle so the fold assignment -- and therefore the reported
# score -- is identical on every Restart & Run All.
scores = cross_val_score(
    regr,
    df,
    df.bikes,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='neg_mean_absolute_error',
)

# The scorer returns the negated MAE, so values closer to 0 are better.
print(scores.mean())

-3.277201314217443


In [13]:
# Fill gaps in bikes_3h_ago with the mean of the same (station, weekhour)
# group. Once a group's gaps are filled this is a no-op, so re-running it is
# harmless.
df['bikes_3h_ago'] = (
    df.groupby(['station', 'weekhour'])
    .bikes_3h_ago
    .transform(lambda x: x.fillna(x.mean()))
)

regr = Baseline('bikes_3h_ago')

# Pass the estimator built above instead of constructing a second, identical
# Baseline inline; this matches how the other per-station cells call the helper.
# NOTE(review): per_station_models_cross_val_mean is a project helper; its name
# suggests it cross-validates one model per station and averages the scores --
# confirm in helpers.
score = per_station_models_cross_val_mean(regr, df)
print(score)

-3.277124150996634


## Predict full_profile_bikes

1. Fill in missing values of `full_profile_bikes` (mean of the same station and weekhour)
2. Use the filled column as the Baseline

In [6]:
# Fill gaps in full_profile_bikes with the mean of the same
# (station, weekhour) group and write the result back onto df.
df['full_profile_bikes'] = (
    df.groupby(['station', 'weekhour'])
    .full_profile_bikes
    .transform(lambda x: x.fillna(x.mean()))
)

# NOTE(review): Baseline is a project helper; presumably it predicts the named
# column unchanged -- confirm in helpers.Baseline.
regr = Baseline('full_profile_bikes')

# Seed the shuffle so the fold assignment -- and therefore the reported
# scores -- are identical on every Restart & Run All.
scores = cross_val_score(
    regr,
    df,
    df.bikes,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='neg_mean_absolute_error',
)

# Per-fold negated MAE first, then the mean across folds (closer to 0 is better).
print(scores)
print(scores.mean())

[-3.35697991 -3.30680319 -3.32330185 -3.3213243  -3.31210237]
-3.3241023247710073


In [7]:
# Replace missing full_profile_bikes values with the mean of their
# (station, weekhour) group. Once a group's gaps are filled this is a no-op.
df['full_profile_bikes'] = (
    df.groupby(['station', 'weekhour'])['full_profile_bikes']
    .transform(lambda group: group.fillna(group.mean()))
)

# Score the same baseline column through the per-station helper.
# NOTE(review): per_station_models_cross_val_mean is a project helper; its name
# suggests it cross-validates one model per station and averages the scores --
# confirm in helpers.
regr = Baseline('full_profile_bikes')
score = per_station_models_cross_val_mean(regr, df)

print(score)

-3.3241019719383815


## 3 hours ago plus full_profile_3h_diff_bikes
Attempt to adjust the value from 3 hours ago by the mean change seen at the same station at the same time of the week.

1. Fill early empty full_profile_3h_diff_bikes values. This time filling with 0 for now.
2. Add column that is sum of bikes_3h_ago and full_profile_3h_diff_bikes
3. Use this new column as Baseline

In [10]:
# Treat a missing profile diff as "no expected change" by filling it with 0.
# Alternative considered but not used here: fill with the mean of the same
# (station, weekhour) group, as is done for the other profile columns.
df['full_profile_3h_diff_bikes'] = df.full_profile_3h_diff_bikes.fillna(0)

# Candidate prediction: the count from 3 hours ago shifted by the profile diff.
# This reads bikes_3h_ago as it currently stands on df, so any gap filling of
# that column has to happen before this cell runs.
df['3h_plus_diff'] = df['bikes_3h_ago'] + df['full_profile_3h_diff_bikes']

# NOTE(review): Baseline is a project helper; presumably it predicts the named
# column unchanged -- confirm in helpers.Baseline.
regr = Baseline('3h_plus_diff')

# Seed the shuffle so the fold assignment -- and therefore the reported
# scores -- are identical on every Restart & Run All.
scores = cross_val_score(
    regr,
    df,
    df.bikes,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='neg_mean_absolute_error',
)

# Per-fold negated MAE first, then the mean across folds (closer to 0 is better).
print(scores)
print(scores.mean())

[-3.0062724  -3.01493429 -2.99301822 -3.02810633 -3.03317652]
-3.01510155316607


We see a roughly 8–9% reduction in MAE (from about 3.28–3.32 for the two previous baselines down to about 3.02) with this combination of `bikes_3h_ago` and the rolling diff at similar times of the week.

In [11]:
# Missing profile diffs are filled with 0 (i.e. no expected change). Filling
# with the (station, weekhour) group mean is a possible alternative that is
# not used here.
df['full_profile_3h_diff_bikes'] = df['full_profile_3h_diff_bikes'].fillna(0)

# Shift the count from 3 hours ago by the profile diff to get the candidate
# prediction column.
df['3h_plus_diff'] = df['bikes_3h_ago'].add(df['full_profile_3h_diff_bikes'])

# Score the combined column through the per-station helper.
# NOTE(review): per_station_models_cross_val_mean is a project helper; its name
# suggests it cross-validates one model per station and averages the scores --
# confirm in helpers.
regr = Baseline('3h_plus_diff')
score = per_station_models_cross_val_mean(regr, df)

print(score)

-3.0151336940967814
