In [200]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import list_holidays as hd

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from helpers import listCombinations
pd.options.mode.chained_assignment = None

In [201]:
# weer dataset
weer_df = pd.read_csv("weer.csv", index_col='day', names=['STN','day','windspeed','temperature','SQ','rain'], header=0)
weer_df.drop(['STN', 'SQ'], axis=1, inplace=True)
weer_df.index = pd.to_datetime(weer_df.index,format="%Y%m%d")

In [202]:
# holidays, vacations
holidays, vacations = hd.holidays(), hd.vacations()

In [203]:
# train dataset
train_df = pd.read_csv("train.csv", index_col=0)
train_df["date"] = pd.to_datetime(train_df["date"], format="%Y-%m-%d %H:%M:%S")

In [204]:
# test and verify dataset
verify_df = pd.read_csv("test.csv", index_col=0)
verify_df["date"] = pd.to_datetime(verify_df["date"], format="%Y-%m-%d %H:%M:%S")

In [205]:
# sample submission
submission_df = pd.read_csv("sampleSubmission.csv", index_col=0)
submission_df.index = pd.to_datetime(submission_df.index, format="%Y%m%d")

In [206]:
## PREPARE TRAIN DATASET

# rented bikes counted
daily_rentals_df = train_df[["tripid"]].groupby([train_df["date"].dt.date]).count()
daily_rentals_df.index = pd.to_datetime(daily_rentals_df.index, format="%Y-%m-%d")
daily_rentals_df['weekday'] = daily_rentals_df.index.weekday

# merge with weather and create new columns
daily_rentals_df = weer_df.merge(daily_rentals_df, left_index=True, right_index=True)
daily_rentals_df.rename(columns={'tripid':'rented'},inplace=True)
daily_rentals_df['heavyrain'] = daily_rentals_df['rain'].map(lambda x: x > 200)
daily_rentals_df['verycold'] = daily_rentals_df['temperature'].map(lambda x: x < 15)
daily_rentals_df['hardwind'] = daily_rentals_df['windspeed'].map(lambda x: x > 50)
daily_rentals_df['season'] = daily_rentals_df.index.to_series().map(lambda x: (x.month%12 + 3) // 3)

# holidays and vacations
daily_rentals_df["vacation"]=0
daily_rentals_df["holiday"]=0
for i in vacations:
    daily_rentals_df["vacation"][daily_rentals_df.index.isin(vacations[i])]=i
for i in holidays:
    daily_rentals_df["holiday"][daily_rentals_df.index.isin(holidays[i])]=i
    
# show current df
daily_rentals_df.head()

Unnamed: 0,windspeed,temperature,rain,rented,weekday,heavyrain,verycold,hardwind,season,vacation,holiday
2019-01-01,76,77,5,8,1,False,False,True,1,5,3
2019-01-02,48,56,-1,61,2,False,False,False,1,5,0
2019-01-03,23,31,-1,90,3,False,False,False,1,5,0
2019-01-04,45,54,-1,64,4,False,False,False,1,5,0
2019-01-05,65,72,1,17,5,False,False,True,1,5,0


In [207]:
## PREPARE TEST DATASET

# prepare test dataset
test_df = pd.DataFrame(pd.to_datetime(verify_df["date"].dt.date.unique()), columns=['date'])
test_df['weekday'] = pd.to_datetime(test_df['date']).dt.dayofweek
test_df.set_index("date", inplace=True)

# merge with weather and create new columns
test_df = weer_df.merge(test_df, left_index=True, right_index=True)
test_df['heavyrain'] = test_df['rain'].map(lambda x: x > 100)
test_df['verycold'] = test_df['temperature'].map(lambda x: x < 40)
test_df['hardwind'] = test_df['windspeed'].map(lambda x: x > 70)
test_df['season'] = test_df.index.to_series().map(lambda x: (x.month%12 + 3) // 3)

# holidays and vacations
test_df["vacation"]=0
test_df["holiday"]=0
for i in vacations:
    test_df["vacation"][test_df.index.isin(vacations[i])]=i
for i in holidays:
    test_df["holiday"][test_df.index.isin(holidays[i])]=i

# show current df
test_df.head()

Unnamed: 0,windspeed,temperature,rain,weekday,heavyrain,verycold,hardwind,season,vacation,holiday
2019-11-01,46,79,38,4,False,False,False,4,0,0
2019-11-02,85,128,33,5,False,False,True,4,0,0
2019-11-03,38,101,46,6,False,False,False,4,0,0
2019-11-04,40,103,1,0,False,False,False,4,0,0
2019-11-05,25,96,36,1,False,False,False,4,0,0


In [208]:
## PREPARE SUBMISSION DATASET

# prepare submission dataset
submission_df = pd.DataFrame(pd.to_datetime(submission_df.index.weekday.unique()), columns=['date'])
submission_df['weekday'] = pd.to_datetime(submission_df.index).weekday
submission_df.set_index("date", inplace=True)

# merge with weather and create new columns
submission_df = weer_df.merge(test_df, left_index=True, right_index=True)
submission_df['heavyrain'] = submission_df['rain'].map(lambda x: x > 100)
submission_df['verycold'] = submission_df['temperature'].map(lambda x: x < 40)
submission_df['hardwind'] = submission_df['windspeed'].map(lambda x: x > 70)
submission_df['season'] = submission_df.index.to_series().map(lambda x: (x.month%12 + 3) // 3)

# holidays and vacations
submission_df["vacation"]=0
submission_df["holiday"]=0
for i in vacations:
    submission_df["vacation"][submission_df.index.isin(vacations[i])]=i
for i in holidays:
    submission_df["holiday"][submission_df.index.isin(holidays[i])]=i

# show current df
submission_df.head()

KeyError: 'rain'

In [None]:
train_X = daily_rentals_df[['weekday', 'vacation', 'holiday']]
train_y = daily_rentals_df[['rented']]
rfc = RandomForestClassifier(n_estimators=150, criterion='entropy', random_state=0).fit(train_X, train_y)

test_X = test_df[['weekday', 'vacation', 'holiday']]
test_y = verify_df[["tripid"]].groupby([verify_df["date"].dt.date]).count()['tripid'].values
root_mean_squared_error = mean_squared_error(test_y, rfc.predict(test_X)) ** 0.5

print(root_mean_squared_error)