# Nearest neighbours

In [27]:
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from helpers import per_station_models_cross_val_mean, load_all_processed_data, ManualFeatureSelector


In [4]:
import glob
import os
import pandas as pd
import matplotlib.pyplot as plt

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

# Collect every processed CSV. The list is sorted because glob returns matches in
# arbitrary, filesystem-dependent order, which would otherwise make the row order
# of the concatenated frame differ between runs or machines.
all_files = sorted(glob.glob(os.path.join('Processed', '*.csv')))

# Stack all files into a single frame with a fresh 0..n-1 index.
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

## Default KNeighborsRegressor

1. Shuffle the data
2. Extract the latitude, longitude, weekhour and numDocks features (spatiotemporal nearest neighbour)
3. Cross-validate

In [21]:
# Optional: uncomment the next two lines to restrict the experiment to one station.
# station_id = 249
# df = df[df.station == station_id]

# Shuffle the rows before cross-validation. A fixed random_state keeps the
# permutation -- and therefore the CV folds and the reported scores -- identical
# after a kernel restart.
df = sklearn.utils.shuffle(df, random_state=0)

# Fraction of docks occupied by a bike.
# NOTE(review): assumes numDocks > 0 on every row, otherwise this yields inf/NaN -- TODO confirm.
df["pct_full"] = df['bikes'] / df['numDocks']

# Feature matrix: station position, hour of the week, and station capacity.
X = df[['latitude', 'longitude', 'weekhour', 'numDocks']]
# Target: raw number of bikes.
y = df['bikes']

In [10]:
# Baseline: k-nearest-neighbours regressor with default settings, no feature scaling.
regr = KNeighborsRegressor()

# 5-fold cross-validation; the scorer returns negated MAE, so values closer to
# zero are better.
scores = cross_val_score(
    estimator=regr,
    X=X,
    y=y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[-3.90451613 -4.00543011 -3.88062724 -3.87860215 -3.93591398]
-3.921017921146954


In [14]:
# Same regressor, but every feature is first passed through MinMaxScaler inside
# the pipeline, so the scaler is fitted on each training fold only.
regr = make_pipeline(
    MinMaxScaler(),
    KNeighborsRegressor(),
)

# 5-fold cross-validation with negated MAE (closer to zero is better).
scores = cross_val_score(
    estimator=regr,
    X=X,
    y=y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[-3.32277778 -3.36077061 -3.34062724 -3.29731183 -3.25084229]
-3.3144659498207885


In [13]:
# Variant of the scaled pipeline that uses StandardScaler in place of MinMaxScaler
# ahead of the k-nearest-neighbours regressor.
regr = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(),
)

# 5-fold cross-validation with negated MAE (closer to zero is better).
scores = cross_val_score(
    estimator=regr,
    X=X,
    y=y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[-3.34792115 -3.35209677 -3.33969534 -3.31654122 -3.30663082]
-3.332577060931899


Let's try predicting how full each station is (bikes / numDocks) in order to normalize across stations, then convert the error back into a mean absolute error on the number of bikes at scoring time.

In [22]:
def neg_mae_from_percent(estimator, X, y):
    """Score an estimator by negated mean absolute error scaled by dock count.

    Usable as a ``scoring`` callable: it takes ``(estimator, X, y)`` and returns
    a single number where larger (closer to zero) is better.

    Parameters
    ----------
    estimator : object
        Fitted model exposing ``predict(X)``.
    X : object with a ``numDocks`` attribute
        Feature table; ``X.numDocks`` is multiplied element-wise with the
        prediction error (e.g. a pandas DataFrame with a ``numDocks`` column).
    y : array-like
        True target values, in the same units as the estimator's predictions.
        NOTE(review): presumably the fraction bikes / numDocks, which would make
        the result an error in bikes -- confirm against the caller.

    Returns
    -------
    float
        ``-mean(|numDocks * (prediction - y)|)``.
    """
    prediction_error = estimator.predict(X) - y
    # Weight each row's error by that row's dock count before averaging.
    scaled_error = X.numDocks * prediction_error
    return -abs(scaled_error).mean()

# Default k-nearest-neighbours regressor, no feature scaling.
regr = KNeighborsRegressor()

# Switch the target to the pct_full column. Note this rebinds the notebook-wide `y`.
y = df['pct_full']

# 5-fold cross-validation scored with the custom callable, which multiplies the
# prediction error by numDocks before averaging.
scores = cross_val_score(
    estimator=regr,
    X=X,
    y=y,
    cv=5,
    scoring=neg_mae_from_percent,
)

# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[-3.81498088 -3.85014337 -3.79047043 -3.8246595  -3.8724552 ]
-3.8305418757467145


In [43]:
# Branch 1: select position and time-of-week columns and standardise them.
scaled_branch = make_pipeline(
    ManualFeatureSelector(['latitude', 'longitude', 'weekhour']),
    StandardScaler(),
)
# Branch 2: select numDocks and pass it on without a scaling step.
unscaled_branch = make_pipeline(
    ManualFeatureSelector(['numDocks']),
)
# Concatenate the outputs of both branches into one feature matrix.
transformer = make_union(scaled_branch, unscaled_branch)

# Full model: partially scaled features feeding a default k-NN regressor.
regr = make_pipeline(transformer, KNeighborsRegressor())

# Target is the pct_full column. Note this rebinds the notebook-wide `y`.
y = df['pct_full']

# 5-fold cross-validation scored with the custom callable, which multiplies the
# prediction error by numDocks before averaging.
scores = cross_val_score(
    estimator=regr,
    X=X,
    y=y,
    cv=5,
    scoring=neg_mae_from_percent,
)

# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[-3.2794086  -3.3422043  -3.31286738 -3.34413978 -3.3916129 ]
-3.334046594982079


In [39]:
# Default k-nearest-neighbours regressor, no feature scaling.
regr = KNeighborsRegressor()

# Target is the pct_full column. Note this rebinds the notebook-wide `y`.
y = df['pct_full']

# 5-fold cross-validation with the built-in negated MAE. The error here is
# measured in units of pct_full itself, not in bikes, so these numbers are not
# directly comparable with scores computed on the 'bikes' target or with
# neg_mae_from_percent.
scores = cross_val_score(
    estimator=regr,
    X=X,
    y=y,
    cv=5,
    scoring='neg_mean_absolute_error',
)

# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[-0.20264103 -0.20429291 -0.20247106 -0.20365299 -0.20530213]
-0.20367202262871725
