# Decision Tree

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import cross_val_score, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline

from helpers import per_station_models_cross_val_mean, load_all_processed_data, ManualFeatureSelector

# Lift pandas' column display limit (None = no limit) so wide frames are
# shown with every column instead of being elided with "...".
pd.set_option('display.max_columns', None)

In [2]:
import glob
import os

# Load the processed dataset through the project helper. `df` is the frame the
# model cells below read their features and the 'bikes' target from.
# NOTE(review): the helper's exact column schema is not visible in this
# notebook - confirm against helpers.load_all_processed_data.
df = load_all_processed_data()

# --- Disabled imputation steps, kept for reference (currently not executed) ---

# # if bikes_3h_ago is missing then assume number of bikes hasn't changed
# df['bikes_3h_ago'] = df['bikes_3h_ago'].fillna(df['bikes'])

# # use station & weekhour group means to fill in missing profile and 3h ago values
# for column in ['bikes_3h_ago', 'full_profile_3h_diff_bikes', 'full_profile_bikes']:
#     df[column] = df.groupby(['station', 'weekhour'])[column].transform(lambda x: x.fillna(x.mean()))

## Default DecisionTreeRegressor

In [17]:
# Global model: one decision tree trained on all rows, evaluated with
# 5-fold cross-validation.
X = df.drop(columns=['bikes'])
y = df['bikes']

# The pipeline receives the full feature frame; ManualFeatureSelector is given
# the list of columns the tree should use.
# random_state is fixed on both the tree and the fold splitter: without it the
# shuffled fold assignment (and the tree's tie-breaking between equally good
# splits) changes on every run, so the reported scores are not reproducible.
regr = make_pipeline(
    ManualFeatureSelector([
        'latitude',
        'longitude',
        'weekhour',
        'windMeanSpeed.m.s',
        'relHumidity.HR',
        'airPressure.mb',
        'hour',
        'day',
        'windMaxSpeed.m.s',
    ]),
    DecisionTreeRegressor(random_state=42),
)
scores = cross_val_score(
    regr,
    X,
    y,
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    # sklearn reports the *negated* MAE, so values closer to 0 are better.
    scoring='neg_mean_absolute_error',
)

print(scores)
print(scores.mean())

Pipeline(steps=[('manualfeatureselector',
                 <helpers.ManualFeatureSelector object at 0x7fd36ec9e0d0>),
                ('decisiontreeregressor', DecisionTreeRegressor())])
[-2.33494624 -2.25564516 -2.29551971 -2.2547491  -2.24856631]
-2.277885304659498


In [4]:
# Per-station evaluation of the same default tree.
# This feature list differs from the global model's: no latitude/longitude,
# and temperature.C is included.
# NOTE(review): per_station_models_cross_val_mean is a project helper whose
# internals are not visible here - presumably it cross-validates one model per
# station and averages the scores; confirm in helpers.
regr = make_pipeline(
    ManualFeatureSelector(
        [
            'weekhour',
            'temperature.C',
            'windMeanSpeed.m.s',
            'relHumidity.HR',
            'airPressure.mb',
            'hour',
            'day',
            'windMaxSpeed.m.s',
        ]
    ),
    DecisionTreeRegressor(),
)

score = per_station_models_cross_val_mean(regr, df)
print(score)

-2.092013906523974


In [54]:
from datetime import datetime

# Build a submission file: fit the global decision tree on the whole training
# frame, predict for test.csv, and write Id/bikes pairs to Predictions/.

# Test set: keep the Id column aside for the submission, then narrow the frame
# to the columns the model uses.
dft = pd.read_csv('test.csv')
ids = dft['Id']
dft = dft[['latitude', 'longitude', 'weekhour', 'windMeanSpeed.m.s', 'relHumidity.HR', 'airPressure.mb', 'hour', 'day', 'windMaxSpeed.m.s']]

X = df.drop(columns=['bikes'])
y = df['bikes']

# Seeded so that re-running the cell regenerates the same submission.
regr = make_pipeline(
    ManualFeatureSelector(['latitude', 'longitude', 'weekhour', 'windMeanSpeed.m.s', 'relHumidity.HR', 'airPressure.mb', 'hour', 'day', 'windMaxSpeed.m.s']),
    DecisionTreeRegressor(random_state=42),
)

model = regr.fit(X, y)
predictions = model.predict(dft)

predictions_df = pd.DataFrame(ids)
# Round to the nearest whole bike before casting: a bare astype(int) truncates
# toward zero, which pushes every fractional prediction downward.
predictions_df['bikes'] = np.rint(predictions).astype(int)

# Day + time stamp keeps successive submissions from overwriting each other.
time = datetime.now().strftime("%d%H%M%S")

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('Predictions', exist_ok=True)
predictions_df.to_csv(f'Predictions/submission_{time}.csv', index=False)

# Cross-validation of this same pipeline lives in the
# "Default DecisionTreeRegressor" cell above.

# Quick visual sanity check of what was written.
predictions_df.head()