# Simple baselines

In [1]:
import sklearn
from sklearn.model_selection import cross_val_score

In [2]:
import glob
import os
import pandas as pd
import matplotlib.pyplot as plt

# Show every column when a wide frame is displayed.
pd.set_option('display.max_columns', None)

# glob returns paths in arbitrary (OS-dependent) order; sort them so the row
# order of the concatenated frame is the same on every run.
all_files = sorted(glob.glob(os.path.join('Train/Train', '*.csv')))
if not all_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError("No CSV files found in 'Train/Train'")

# Read every CSV and stack them into one frame with a fresh 0..n-1 index.
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

## Predict value from 3h ago (i.e. bikes_3h_ago)

First fill in missing values of bikes_3h_ago from early in the data set using the mean of the values from the same station and weekhour (assuming here there's some weekly pattern in bike usage).

In [3]:
def _fill_with_group_mean(values):
    """Return `values` with its NaNs replaced by the mean of `values`."""
    return values.fillna(values.mean())


# Fill gaps in bikes_3h_ago per (station, weekhour) group, using the mean of
# that group's own values.
lagged_by_station_hour = df.groupby(['station', 'weekhour'])['bikes_3h_ago']
df['bikes_3h_ago'] = lagged_by_station_hour.transform(_fill_with_group_mean)

Remove the entries with no bikes value from when the clocks change (one per station)

In [4]:
# Keep only the rows whose target value (bikes) is present.
has_target = df['bikes'].notna()
df = df[has_target]

Shuffle the data

In [5]:
# Shuffle the rows with a fixed seed so the cross-validation folds (and hence
# the per-fold scores) are reproducible after Restart & Run All.
df = sklearn.utils.shuffle(df, random_state=0)

In [6]:
class Baseline(sklearn.base.BaseEstimator):
    """Trivial estimator that predicts by returning an existing column of X.

    Parameters
    ----------
    column : label
        Name of the column of ``X`` whose values are returned as predictions.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Do nothing: there are no parameters to learn.

        Returns ``self``, as the scikit-learn estimator API expects, so the
        estimator can be chained or used inside meta-estimators.
        """
        return self

    def predict(self, X):
        """Return the ``self.column`` column of ``X`` as the predictions."""
        return X[self.column]


In [7]:
# Score the "same value as 3 hours ago" baseline with 5-fold cross-validation.
# The scorer is negated MAE, so values closer to zero are better.
regr = Baseline(column='bikes_3h_ago')
scores = cross_val_score(
    estimator=regr,
    X=df,
    y=df['bikes'],
    scoring='neg_mean_absolute_error',
    cv=5,
)

print(scores)
print(scores.mean())

[-3.27115442 -3.25831093 -3.30951314 -3.26560633 -3.28142174]
-3.277201314217444


## Predict full_profile_bikes

1. Fill in missing values of full_profile_bikes
2. Remove entries with no bikes value from when clocks change (one per station)
3. Shuffle data

In [8]:
# 1. Fill missing full_profile_bikes values per (station, weekhour) group,
#    using the mean of that group's own values.
df['full_profile_bikes'] = (
    df.groupby(['station', 'weekhour'])['full_profile_bikes']
    .transform(lambda x: x.fillna(x.mean()))
)

# 2. Keep only the rows whose target value (bikes) is present.
df = df[df['bikes'].notna()]

# 3. Shuffle with a fixed seed so the fold scores are reproducible after
#    Restart & Run All.
df = sklearn.utils.shuffle(df, random_state=0)

# Score the baseline with 5-fold cross-validation (negated MAE: closer to
# zero is better).
regr = Baseline('full_profile_bikes')
scores = cross_val_score(regr, df, df['bikes'], cv=5, scoring='neg_mean_absolute_error')

print(scores)
print(scores.mean())

[-3.3362505  -3.30197755 -3.30360041 -3.36674072 -3.31194245]
-3.3241023247710073


## 3 hours ago plus full_profile_3h_diff_bikes
Adjust the value from 3 hours ago by the mean change seen at the same station at the same time of the week.

1. Fill early empty full_profile_3h_diff_bikes values. This time filling with 0 for now.
2. Add column that is sum of bikes_3h_ago and full_profile_3h_diff_bikes
3. Remove entries with target value missing
4. Shuffle data
5. Use this new column as Baseline

In [9]:
# 1. Fill missing full_profile_3h_diff_bikes values with 0, i.e. "no adjustment".
#    (An alternative, not used here, is the per-(station, weekhour) mean.)
df['full_profile_3h_diff_bikes'] = df['full_profile_3h_diff_bikes'].fillna(0)

# 2. Prediction column: the value from 3 hours ago plus the 3h diff column.
df['3h_plus_diff'] = df['bikes_3h_ago'] + df['full_profile_3h_diff_bikes']

# 3. Keep only the rows whose target value (bikes) is present.
df = df[df['bikes'].notna()]

# 4. Shuffle with a fixed seed so the fold scores are reproducible after
#    Restart & Run All.
df = sklearn.utils.shuffle(df, random_state=0)

# 5. Score the baseline with 5-fold cross-validation (negated MAE: closer to
#    zero is better).
regr = Baseline('3h_plus_diff')
scores = cross_val_score(regr, df, df['bikes'], cv=5, scoring='neg_mean_absolute_error')

print(scores)
print(scores.mean())

[-3.04696087 -3.03126493 -2.98303465 -2.96600956 -3.04823775]
-3.0151015531660694
