In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
pd.options.mode.chained_assignment = None

In [2]:
# weer dataset
weer_df = pd.read_csv("weer.csv", index_col='day', names=['STN','day','windspeed','temperature','SQ','rain'], header=0)
weer_df.drop(['STN', 'SQ'], axis=1, inplace=True)
weer_df.index = pd.to_datetime(weer_df.index,format="%Y%m%d")
weer_df.head()

Unnamed: 0_level_0,windspeed,temperature,rain
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-01-01,76,77,5
2019-01-02,48,56,-1
2019-01-03,23,31,-1
2019-01-04,45,54,-1
2019-01-05,65,72,1


In [3]:
# train dataset
train_df = pd.read_csv("train.csv", index_col=0)
train_df["date"] = pd.to_datetime(train_df["date"], format="%Y-%m-%d %H:%M:%S")

# rented bikes counted
daily_rentals_df = train_df[["tripid"]].groupby([train_df["date"].dt.date]).count()
daily_rentals_df.index = pd.to_datetime(daily_rentals_df.index, format="%Y-%m-%d")
daily_rentals_df['weekday'] = daily_rentals_df.index.weekday

# merge with weather
daily_rentals_df = weer_df.merge(daily_rentals_df, left_index=True, right_index=True)
daily_rentals_df.rename(columns={'tripid':'rented'},inplace=True)

# show current df
daily_rentals_df.head()

Unnamed: 0,windspeed,temperature,rain,rented,weekday
2019-01-01,76,77,5,8,1
2019-01-02,48,56,-1,61,2
2019-01-03,23,31,-1,90,3
2019-01-04,45,54,-1,64,4
2019-01-05,65,72,1,17,5


In [4]:
# model trainen
X = daily_rentals_df[['windspeed', 'weekday']]
y = daily_rentals_df[['rented']]
rfc = RandomForestClassifier().fit(X, y)

  after removing the cwd from sys.path.


In [5]:
# test dataset
test_df = pd.read_csv("test.csv", index_col=0)
test_df["date"] = pd.to_datetime(test_df["date"], format="%Y-%m-%d %H:%M:%S")
test_df.head()

Unnamed: 0,tripid,userid,bikeid,account,bikenumber,start_time,end_time,start_lat,start_lng,end_lat,end_lng,date
1,151899,18734,1221,AT,6631000941,2019-11-01 00:17:17,2019-11-01 00:39:04,52.285242,4.853709,52.283334,4.853159,2019-11-01 00:17:13
2,151901,14414,749,AV,6631000096,2019-11-01 01:35:09,2019-11-01 01:37:53,52.306448,4.800563,52.30646,4.800592,2019-11-01 01:35:05
3,151903,23773,758,AV,6631000273,2019-11-01 06:13:18,2019-11-01 06:16:54,52.307922,4.806427,52.307627,4.804996,2019-11-01 06:13:14
4,151905,15420,318,AT,6631000009,2019-11-01 06:39:14,2019-11-01 06:41:01,52.295016,4.790957,52.297227,4.787585,2019-11-01 06:39:10
5,151909,12504,498,AV,6631000702,2019-11-01 06:49:33,2019-11-01 06:51:17,52.294959,4.792023,52.295575,4.790326,2019-11-01 06:49:29


In [6]:
# get dates from test dataset and merge with weather data
test_df = pd.DataFrame(pd.to_datetime(test_df["date"].dt.date.unique()), columns=['date'])
test_df['weekday'] = pd.to_datetime(test_df['date']).dt.dayofweek
test_df.set_index("date", inplace=True)
test_df = weer_df.merge(test_df, left_index=True, right_index=True)
test_df.head()

Unnamed: 0,windspeed,temperature,rain,weekday
2019-11-01,46,79,38,4
2019-11-02,85,128,33,5
2019-11-03,38,101,46,6
2019-11-04,40,103,1,0
2019-11-05,25,96,36,1


In [7]:
# to verify
verify = pd.read_csv("test.csv", index_col=0)
verify["date"] = pd.to_datetime(verify["date"], format="%Y-%m-%d %H:%M:%S")

# test model
X = test_df[['windspeed', 'weekday']]
y = verify[["tripid"]].groupby([verify["date"].dt.date]).count()['tripid'].values
mean_squared_error(y, rfc.predict(X))

7826.538461538462