# Simple baselines

In [1]:
import sklearn
from sklearn.model_selection import cross_val_score

In [2]:
import glob
import os
import pandas as pd
import matplotlib.pyplot as plt

# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# Collect every processed CSV. glob returns paths in arbitrary order, so sort
# them to make the row order of the concatenated frame deterministic.
all_files = sorted(glob.glob(os.path.join('Processed', '*.csv')))
if not all_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate".
    raise FileNotFoundError("No CSV files found in the 'Processed' directory")

# One frame holding the rows of all files, with a fresh 0..n-1 index.
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

## Predict value from 3h ago (i.e. bikes_3h_ago)

First, fill in the missing values of bikes_3h_ago from early in the data set using the mean of the values for the same station and weekhour (this assumes there is some weekly pattern in bike usage).

In [3]:
def _fill_gaps_with_group_mean(values):
    """Replace the missing entries of one group's values with that group's mean."""
    return values.fillna(values.mean())


# Impute within each (station, weekhour) group, so a gap is filled with the
# mean of the known values for that station at that hour of the week.
bikes_3h_by_slot = df.groupby(['station', 'weekhour'])['bikes_3h_ago']
df['bikes_3h_ago'] = bikes_3h_by_slot.transform(_fill_gaps_with_group_mean)

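Shuffle the data. With an integer `cv`, `cross_val_score` splits a regression-style estimator's data into consecutive, unshuffled blocks of rows, so shuffling first stops the folds from following the order in which the files were loaded.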

In [4]:
# Shuffle the rows before cross-validation. The fixed seed keeps the row order,
# and therefore the fold membership and the reported scores, reproducible
# across kernel restarts.
df = sklearn.utils.shuffle(df, random_state=0)

In [5]:
class Baseline(sklearn.base.BaseEstimator):
    """Trivial estimator that "predicts" by echoing one existing column of X.

    Nothing is learned from the data; the class only exists so that a
    pre-computed column can be scored through the scikit-learn API.

    Parameters
    ----------
    column : label
        Name of the column of X whose values are returned as predictions.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Do nothing and return the estimator itself.

        Returning ``self`` follows the scikit-learn estimator contract, so
        ``Baseline(...).fit(X, y).predict(X)`` works as it does for any other
        estimator. ``X`` and ``y`` are accepted but ignored.
        """
        return self

    def predict(self, X):
        """Return ``X[self.column]`` unchanged as the prediction."""
        return X[self.column]


In [6]:
# Score the "same as three hours ago" baseline with 5-fold cross-validation.
# The scorer is the negated mean absolute error, so values closer to zero are better.
regr = Baseline(column='bikes_3h_ago')
scores = cross_val_score(
    estimator=regr,
    X=df,
    y=df['bikes'],
    scoring='neg_mean_absolute_error',
    cv=5,
)

# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[-3.28294504 -3.3479764  -3.25244176 -3.26084229 -3.24180108]
-3.277201314217443


## Predict full_profile_bikes

1. Fill in missing values of full_profile_bikes
2. Shuffle data

In [7]:
# Fill gaps in full_profile_bikes with the mean of the known values for the
# same station and weekhour.
df['full_profile_bikes'] = (
    df.groupby(['station', 'weekhour'])['full_profile_bikes']
    .transform(lambda x: x.fillna(x.mean()))
)

# Shuffle with a fixed seed so the fold membership, and hence the scores
# printed below, are reproducible from run to run.
df = sklearn.utils.shuffle(df, random_state=0)

# Baseline: predict the number of bikes directly from full_profile_bikes.
regr = Baseline('full_profile_bikes')
scores = cross_val_score(regr, df, df.bikes, cv=5, scoring='neg_mean_absolute_error')

# Negated MAE per fold, then the mean over the folds.
print(scores)
print(scores.mean())

[-3.34071709 -3.35279147 -3.30447282 -3.26926959 -3.35326065]
-3.3241023247710073


## 3 hours ago plus full_profile_3h_diff_bikes
Attempt to adjust the value from 3 hours ago by the mean change seen at the same station at the same time of the week.

1. Fill early empty full_profile_3h_diff_bikes values. This time filling with 0 for now.
2. Shuffle data
3. Add column that is sum of bikes_3h_ago and full_profile_3h_diff_bikes
4. Use this new column as Baseline

In [8]:
# Missing diff values are filled with 0, i.e. "no adjustment" to the value from
# three hours ago. Filling with the station/weekhour group mean, as is done for
# the other columns, is left as an alternative.
df['full_profile_3h_diff_bikes'] = df['full_profile_3h_diff_bikes'].fillna(0)

# Candidate prediction: the value three hours ago plus the profile's 3h diff.
df['3h_plus_diff'] = df['bikes_3h_ago'] + df['full_profile_3h_diff_bikes']

# Shuffle with a fixed seed so the fold membership, and hence the scores
# printed below, are reproducible from run to run.
df = sklearn.utils.shuffle(df, random_state=0)

regr = Baseline('3h_plus_diff')
scores = cross_val_score(regr, df, df.bikes, cv=5, scoring='neg_mean_absolute_error')

# Negated MAE per fold, then the mean over the folds.
print(scores)
print(scores.mean())

[-3.00198626 -3.00926673 -3.04877539 -2.970908   -3.04457139]
-3.01510155316607


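Combining bikes_3h_ago with the rolling diff at similar times of the week reduces the MAE by roughly 8% relative to bikes_3h_ago alone (3.28 → 3.02), and by about 9% relative to full_profile_bikes (3.32 → 3.02), in the runs shown above.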