# Linear models

In [28]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import cross_val_score, KFold
from sklearn.pipeline import make_pipeline

from helpers import per_station_models_cross_val_mean, load_all_processed_data, ManualFeatureSelector

# Lift the column limit so wide frames are displayed with every column.
pd.options.display.max_columns = None

In [29]:
import glob
import os

# Load the processed training data through the project helper.
df = load_all_processed_data()

# If bikes_3h_ago is missing, assume the number of bikes hasn't changed and
# reuse the current count.
current_bikes = df['bikes']
df['bikes_3h_ago'] = df['bikes_3h_ago'].fillna(current_bikes)


def _fill_with_group_mean(values):
    """Replace the missing entries of one group with that group's mean."""
    return values.fillna(values.mean())


# Use station & weekhour group means to fill in the remaining missing
# profile and 3h-ago values.
group_keys = ['station', 'weekhour']
columns_to_impute = ['bikes_3h_ago', 'full_profile_3h_diff_bikes', 'full_profile_bikes']
for column in columns_to_impute:
    per_group = df.groupby(group_keys)[column]
    df[column] = per_group.transform(_fill_with_group_mean)

In [30]:
def single_model_regr(lin_model=None):
    """Build the pipeline used when one model is trained on all stations.

    The hand-picked feature list is passed to ``ManualFeatureSelector`` and
    that step is chained with ``lin_model`` via ``make_pipeline``.

    Args:
        lin_model: Estimator used as the final pipeline step. When omitted,
            a fresh ``LinearRegression()`` is created for this call.

    Returns:
        The pipeline returned by ``make_pipeline``.
    """
    # Build the default here rather than in the signature: a default written
    # in the signature is evaluated once, so every default call would share
    # (and re-fit) the very same estimator object.
    if lin_model is None:
        lin_model = LinearRegression()
    features = ['bikes_3h_ago', 'full_profile_3h_diff_bikes', 'full_profile_bikes', 'weekhour', 'temperature.C', 'windMeanSpeed.m.s', 'relHumidity.HR', 'airPressure.mb', 'hour', 'day', 'windMaxSpeed.m.s']
    return make_pipeline(ManualFeatureSelector(features), lin_model)

def per_station_model_regr(lin_model=None):
    """Build the pipeline used when a separate model is trained per station.

    The hand-picked feature list is passed to ``ManualFeatureSelector`` and
    that step is chained with ``lin_model`` via ``make_pipeline``.

    Args:
        lin_model: Estimator used as the final pipeline step. When omitted,
            a fresh ``LinearRegression()`` is created for this call.

    Returns:
        The pipeline returned by ``make_pipeline``.
    """
    # Build the default here rather than in the signature: a default written
    # in the signature is evaluated once, so every default call would share
    # (and re-fit) the very same estimator object.
    if lin_model is None:
        lin_model = LinearRegression()
    features = ['bikes_3h_ago', 'full_profile_3h_diff_bikes', 'full_profile_bikes', 'weekhour', 'temperature.C', 'windMeanSpeed.m.s', 'relHumidity.HR', 'airPressure.mb', 'hour', 'day', 'windMaxSpeed.m.s']
    return make_pipeline(ManualFeatureSelector(features), lin_model)

## Default LinearRegression

In [31]:
# Target is the 'bikes' column; every other column is handed to the pipeline.
X = df.drop(columns=['bikes'])
y = df['bikes']

# Seed the shuffle so the five folds are the same on every run; without a
# seed the scores of different models are measured on different splits.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(single_model_regr(), X, y, cv=cv, scoring='neg_mean_absolute_error')

# Scores are negated mean absolute errors, one per fold, then their mean.
print(scores)
print(scores.mean())

[-2.41751236 -2.38583146 -2.40872898 -2.43888591 -2.43588448]
-2.4173686381069603


In [32]:
# Score the default (LinearRegression) per-station pipeline with the project
# helper and show the resulting value.
per_station_pipeline = per_station_model_regr()
score = per_station_models_cross_val_mean(per_station_pipeline, df)
print(score)

-2.3242606151431207


## Lasso regression

In [33]:
# Target is the 'bikes' column; every other column is handed to the pipeline.
X = df.drop(columns=['bikes'])
y = df['bikes']

# Seed the shuffle so the five folds are the same on every run; without a
# seed the scores of different models are measured on different splits.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(single_model_regr(Lasso(alpha=0.1)), X, y, cv=cv, scoring='neg_mean_absolute_error')

# Scores are negated mean absolute errors, one per fold, then their mean.
print(scores)
print(scores.mean())

[-2.44658645 -2.40865589 -2.41150741 -2.41667611 -2.42187835]
-2.4210608435647716


In [34]:
# Score the per-station pipeline with a Lasso(alpha=0.1) final step using the
# project helper and show the resulting value.
per_station_pipeline = per_station_model_regr(Lasso(alpha=0.1))
score = per_station_models_cross_val_mean(per_station_pipeline, df)
print(score)

-2.3265785195570734


## Ridge regression

In [35]:
# Target is the 'bikes' column; every other column is handed to the pipeline.
X = df.drop(columns=['bikes'])
y = df['bikes']

# Seed the shuffle so the five folds are the same on every run; without a
# seed the scores of different models are measured on different splits.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(single_model_regr(Ridge(alpha=0.5)), X, y, cv=cv, scoring='neg_mean_absolute_error')

# Scores are negated mean absolute errors, one per fold, then their mean.
print(scores)
print(scores.mean())

[-2.44645508 -2.41957706 -2.41347559 -2.36493263 -2.44092418]
-2.4170729072463337


In [36]:
# Score the per-station pipeline with a Ridge(alpha=0.5) final step using the
# project helper and show the resulting value.
per_station_pipeline = per_station_model_regr(Ridge(alpha=0.5))
score = per_station_models_cross_val_mean(per_station_pipeline, df)
print(score)

-2.3226427340364215


# Test submissions

In [37]:
from datetime import datetime

# Rows that need a prediction.
dft = pd.read_csv('test.csv')

# Train on the whole training frame: 'bikes' is the target, the remaining
# columns are the inputs given to the pipeline.
y = df['bikes']
X = df.drop(columns=['bikes'])

# One Lasso pipeline fitted on all stations at once.
regr = single_model_regr(Lasso(alpha=0.1))
model = regr.fit(X, y)

predicted_bikes = model.predict(dft)
dft['bikes'] = predicted_bikes

# Day/hour/minute/second stamp so each run writes to a new file name.
time = datetime.now().strftime("%d%H%M%S")

submission = dft[['Id', 'bikes']]
submission.to_csv(f'Predictions/single_model_submission_{time}.csv', index=False)

In [38]:
from datetime import datetime

# Rows that need a prediction.
dft = pd.read_csv('test.csv')

predictions_dfs = []

# Fit one independent Lasso pipeline per station id in 201..275 and predict
# only that station's test rows with it.
for station in np.arange(201, 276):
    stationdf = df[df['station'] == station]
    # Copy so adding the 'bikes' column does not write into a slice of dft.
    test_stationdf = dft[dft['station'] == station].copy()

    # No test rows for this station means there is nothing to predict.
    if test_stationdf.empty:
        continue

    X = stationdf.drop(columns=['bikes'])
    y = stationdf.bikes

    regr = per_station_model_regr(Lasso(alpha=0.1))

    model = regr.fit(X, y)
    # Round to the nearest whole bike before the integer cast: a bare
    # astype(int) truncates toward zero, which biases the predictions.
    test_stationdf['bikes'] = np.rint(model.predict(test_stationdf)).astype(int)
    predictions_dfs.append(test_stationdf[['Id', 'bikes']])

# Stack the per-station predictions into a single submission frame.
predictions_df = pd.concat(predictions_dfs)

# Day/hour/minute/second stamp so each run writes to a new file name.
time = datetime.now().strftime("%d%H%M%S")

predictions_df.to_csv(f'Predictions/per_station_models_submission_{time}.csv', index=False)