In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
pd.options.mode.chained_assignment = None

In [59]:
# weer dataset
weer_df = pd.read_csv("weer.csv", index_col='day', names=['STN','day','windspeed','temperature','SQ','rain'], header=0)
weer_df.drop(['STN', 'SQ'], axis=1, inplace=True)
weer_df.index = pd.to_datetime(weer_df.index,format="%Y%m%d")
weer_df.head()

Unnamed: 0_level_0,windspeed,temperature,SQ,rain
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,76,77,13,5
2019-01-02,48,56,4,-1
2019-01-03,23,31,46,-1
2019-01-04,45,54,0,-1
2019-01-05,65,72,0,1


In [60]:
# train dataset
train_df = pd.read_csv("train.csv", index_col=0)
train_df["date"] = pd.to_datetime(train_df["date"], format="%Y-%m-%d %H:%M:%S")

In [61]:
# rented bikes counted
daily_rentals_df = train_df[["tripid"]].groupby([train_df["date"].dt.date]).count()
daily_rentals_df.index = pd.to_datetime(daily_rentals_df.index, format="%Y-%m-%d")
daily_rentals_df['weekday'] = daily_rentals_df.index.weekday

# merge with weather
daily_rentals_df = weer_df.merge(daily_rentals_df, left_index=True, right_index=True)
daily_rentals_df.rename(columns={'tripid':'rented'},inplace=True)
daily_rentals_df['heavyrain'] = daily_rentals_df['rain'].map(lambda x: x > 250)
daily_rentals_df['verycold'] = daily_rentals_df['temperature'].map(lambda x: x < 10)
daily_rentals_df['hardwind'] = daily_rentals_df['windspeed'].map(lambda x: x > 80)
daily_rentals_df['season'] = daily_rentals_df.index.to_series().apply(lambda x: (x.month%12 + 3) // 3)

# show current df
daily_rentals_df.head()

Unnamed: 0,windspeed,temperature,SQ,rain,rented,weekday,heavyrain,verycold,hardwind,season
2019-01-01,76,77,13,5,8,1,False,False,False,1
2019-01-02,48,56,4,-1,61,2,False,False,False,1
2019-01-03,23,31,46,-1,90,3,False,False,False,1
2019-01-04,45,54,0,-1,64,4,False,False,False,1
2019-01-05,65,72,0,1,17,5,False,False,False,1


In [62]:
train_cols = ['weekday', 'heavyrain', 'verycold', 'hardwind']

In [63]:
# model trainen
X = daily_rentals_df[train_cols]
y = daily_rentals_df[['rented']]
rfc = RandomForestClassifier(n_estimators=500, random_state=0).fit(X, y)

  after removing the cwd from sys.path.


In [64]:
# test and verify dataset
verify_df = pd.read_csv("test.csv", index_col=0)
verify_df["date"] = pd.to_datetime(verify_df["date"], format="%Y-%m-%d %H:%M:%S")

In [65]:
# prepare test dataset
test_df = pd.DataFrame(pd.to_datetime(verify_df["date"].dt.date.unique()), columns=['date'])
test_df['weekday'] = pd.to_datetime(test_df['date']).dt.dayofweek
test_df.set_index("date", inplace=True)
test_df = weer_df.merge(test_df, left_index=True, right_index=True)
test_df['heavyrain'] = test_df['rain'].map(lambda x: x > 200)
test_df['verycold'] = test_df['temperature'].map(lambda x: x < 20)
test_df['hardwind'] = test_df['windspeed'].map(lambda x: x > 70)
test_df['season'] = test_df.index.to_series().apply(lambda x: (x.month%12 + 3) // 3)
test_df.head()

Unnamed: 0,windspeed,temperature,SQ,rain,weekday,heavyrain,verycold,hardwind,season
2019-11-01,46,79,0,38,4,False,False,False,4
2019-11-02,85,128,35,33,5,False,False,True,4
2019-11-03,38,101,20,46,6,False,False,False,4
2019-11-04,40,103,20,1,0,False,False,False,4
2019-11-05,25,96,5,36,1,False,False,False,4


In [66]:
y = verify_df[["tripid"]].groupby([verify_df["date"].dt.date]).count()['tripid'].values
X = test_df[train_cols]
root_mean_squared_error = mean_squared_error(y, rfc.predict(X)) ** 0.5

In [67]:
print(root_mean_squared_error)

135.95417780097256
