In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import list_holidays as hd

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from helpers import listCombinations
pd.options.mode.chained_assignment = None

In [6]:
# weer dataset
weer_df = pd.read_csv("weer.csv", index_col='day', names=['STN','day','windspeed','temperature','SQ','rain'], header=0)
weer_df.drop(['STN', 'SQ'], axis=1, inplace=True)
weer_df.index = pd.to_datetime(weer_df.index,format="%Y%m%d")

In [7]:
# holidays, vacations
holidays, vacations = hd.holidays(), hd.vacations()

In [8]:
# train dataset
train_df = pd.read_csv("train.csv", index_col=0)
train_df["date"] = pd.to_datetime(train_df["date"], format="%Y-%m-%d %H:%M:%S")
train_df.head()

Unnamed: 0,tripid,userid,bikeid,account,bikenumber,start_time,end_time,start_lat,start_lng,end_lat,end_lng,date
1,33838,13452,382,AT,6631000433,2019-01-01 06:46:03,2019-01-01 06:46:03,52.296065,4.787667,52.2962,4.787679,2019-01-01 06:46:00
2,33839,13452,812,AT,6631000146,2019-01-01 06:47:39,2019-01-01 06:47:39,52.295938,4.788336,52.29594,4.788557,2019-01-01 06:47:35
3,33851,13182,238,AT,6631000443,2019-01-01 14:15:36,2019-01-01 14:15:36,52.307337,4.807633,52.307336,4.80763,2019-01-01 14:15:32
4,33852,13182,238,AT,6631000443,2019-01-01 14:33:50,2019-01-01 14:33:50,52.308478,4.80997,52.308537,4.810099,2019-01-01 14:33:46
5,33867,12674,787,AI,6631000283,2019-01-01 20:11:05,2019-01-01 20:11:05,52.51034,4.725336,52.508565,4.726276,2019-01-01 20:11:01


In [9]:
# test and verify dataset
verify_df = pd.read_csv("test.csv", index_col=0)
verify_df["date"] = pd.to_datetime(verify_df["date"], format="%Y-%m-%d %H:%M:%S")
verify_df.head()

Unnamed: 0,tripid,userid,bikeid,account,bikenumber,start_time,end_time,start_lat,start_lng,end_lat,end_lng,date
1,151899,18734,1221,AT,6631000941,2019-11-01 00:17:17,2019-11-01 00:39:04,52.285242,4.853709,52.283334,4.853159,2019-11-01 00:17:13
2,151901,14414,749,AV,6631000096,2019-11-01 01:35:09,2019-11-01 01:37:53,52.306448,4.800563,52.30646,4.800592,2019-11-01 01:35:05
3,151903,23773,758,AV,6631000273,2019-11-01 06:13:18,2019-11-01 06:16:54,52.307922,4.806427,52.307627,4.804996,2019-11-01 06:13:14
4,151905,15420,318,AT,6631000009,2019-11-01 06:39:14,2019-11-01 06:41:01,52.295016,4.790957,52.297227,4.787585,2019-11-01 06:39:10
5,151909,12504,498,AV,6631000702,2019-11-01 06:49:33,2019-11-01 06:51:17,52.294959,4.792023,52.295575,4.790326,2019-11-01 06:49:29


In [10]:
# sample submission
submission_df = pd.read_csv("sampleSubmission.csv", index_col=0)
submission_df["date"] = pd.to_datetime(submission_df.index, format="%Y%m%d")
submission_df.head()

Unnamed: 0_level_0,Predicted,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
20191101,0,2019-11-01
20191102,0,2019-11-02
20191103,0,2019-11-03
20191104,0,2019-11-04
20191105,0,2019-11-05


In [11]:
## PREPARE TRAIN DATASET

# rented bikes counted
daily_rentals_df = train_df[["tripid"]].groupby([train_df["date"].dt.date]).count()
daily_rentals_df.index = pd.to_datetime(daily_rentals_df.index, format="%Y-%m-%d")
daily_rentals_df['weekday'] = daily_rentals_df.index.weekday
daily_rentals_df.rename(columns={'tripid':'rented'},inplace=True)

# # merge with weather and create new columns
# daily_rentals_df = weer_df.merge(daily_rentals_df, left_index=True, right_index=True)
# daily_rentals_df['heavyrain'] = daily_rentals_df['rain'].map(lambda x: x > 200)
# daily_rentals_df['verycold'] = daily_rentals_df['temperature'].map(lambda x: x < 15)
# daily_rentals_df['hardwind'] = daily_rentals_df['windspeed'].map(lambda x: x > 50)
# daily_rentals_df['season'] = daily_rentals_df.index.to_series().map(lambda x: (x.month%12 + 3) // 3)

# holidays and vacations
daily_rentals_df["vacation"]=0
daily_rentals_df["holiday"]=0
for i in vacations:
    daily_rentals_df["vacation"][daily_rentals_df.index.isin(vacations[i])]=i
for i in holidays:
    daily_rentals_df["holiday"][daily_rentals_df.index.isin(holidays[i])]=i
    
# show current df
daily_rentals_df.head()

Unnamed: 0_level_0,rented,weekday,vacation,holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-01-01,8,1,5,3
2019-01-02,61,2,5,0
2019-01-03,90,3,5,0
2019-01-04,64,4,5,0
2019-01-05,17,5,5,0


In [12]:
## PREPARE TEST DATASET

# prepare test dataset
test_df = pd.DataFrame(pd.to_datetime(verify_df["date"].dt.date.unique()), columns=['date'])
test_df['weekday'] = pd.to_datetime(test_df['date']).dt.dayofweek
test_df.set_index("date", inplace=True)

# # merge with weather and create new columns
# test_df = weer_df.merge(test_df, left_index=True, right_index=True)
# test_df['heavyrain'] = test_df['rain'].map(lambda x: x > 100)
# test_df['verycold'] = test_df['temperature'].map(lambda x: x < 40)
# test_df['hardwind'] = test_df['windspeed'].map(lambda x: x > 70)
# test_df['season'] = test_df.index.to_series().map(lambda x: (x.month%12 + 3) // 3)

# holidays and vacations
test_df["vacation"]=0
test_df["holiday"]=0
for i in vacations:
    test_df["vacation"][test_df.index.isin(vacations[i])]=i
for i in holidays:
    test_df["holiday"][test_df.index.isin(holidays[i])]=i

# show current df
test_df.head()

Unnamed: 0_level_0,weekday,vacation,holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-11-01,4,0,0
2019-11-02,5,0,0
2019-11-03,6,0,0
2019-11-04,0,0,0
2019-11-05,1,0,0


In [13]:
## PREPARE SUBMISSION DATASET

# prepare submission dataset
submission_df = pd.DataFrame(pd.to_datetime(submission_df["date"].dt.date.unique()), columns=['date'])
submission_df['weekday'] = pd.to_datetime(submission_df['date']).dt.dayofweek
submission_df.set_index("date", inplace=True)

# # merge with weather and create new columns
# submission_df = weer_df.merge(submission_df, left_index=True, right_index=True)
# submission_df['heavyrain'] = submission_df['rain'].map(lambda x: x > 100)
# submission_df['verycold'] = submission_df['temperature'].map(lambda x: x < 40)
# submission_df['hardwind'] = submission_df['windspeed'].map(lambda x: x > 70)
# submission_df['season'] = submission_df.index.to_series().map(lambda x: (x.month%12 + 3) // 3)

# holidays and vacations
submission_df["vacation"]=0
submission_df["holiday"]=0
for i in vacations:
    submission_df["vacation"][submission_df.index.isin(vacations[i])]=i
for i in holidays:
    submission_df["holiday"][submission_df.index.isin(holidays[i])]=i

# show current df
submission_df.head()

Unnamed: 0_level_0,weekday,vacation,holiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-11-01,4,0,0
2019-11-02,5,0,0
2019-11-03,6,0,0
2019-11-04,0,0,0
2019-11-05,1,0,0


In [14]:
# model training
train_X = daily_rentals_df[['weekday', 'vacation', 'holiday']]
train_y = daily_rentals_df[['rented']]
rfc = RandomForestClassifier(n_estimators=150, criterion='entropy', random_state=0).fit(train_X, train_y)

# model testing
test_X = test_df[['weekday', 'vacation', 'holiday']]
test_y = verify_df[["tripid"]].groupby([verify_df["date"].dt.date]).count()['tripid'].values
root_mean_squared_error = mean_squared_error(test_y, rfc.predict(test_X)) ** 0.5

# submission to csv
submission_X = submission_df[['weekday', 'vacation', 'holiday']]
pred = rfc.predict(submission_X)

print(root_mean_squared_error)
print(pred)
# new_df = pd.DataFrame(index=submission_df.index)
# new_df.rename(columns={'date':'id'}, inplace=True)
# new_df['Prediction'] = pred
# new_df.to_csv('C:\sampleSubmission.csv')

[180  14  20 313 499 324 289 252  19  13 267 206 211 301 225  26  22 249
 297 336 327 218  15   9 278 317 219 242 189  10   5 256 342 368 310 203
  14  19 253]
57.744796635381256
[193  25  10 226 268 231 304 193  25  10 226 268 231 304 193  25  10 226
 268 231 304 193  25  10 226 268 231 304 193  25  10 226 268 231 304 193
  25  10 226 268 231 304 193  25  10 226 268 231 304 193  25   8 123 114
  61  90  64  17   8 123 114 231 304 193  25  10 226 268 231 304 193  25
  10 226 268 231 304 193  25  10 226 268 231 304 193  25  10 226 268 231
 304 193]


  after removing the cwd from sys.path.
