In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
test = pd.read_excel('bike_test.xlsx')
test.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,12000,2012-05-20,2,1,5,4,0,0,0,1,0.52,0.5,0.68,0.0896
1,12001,2012-05-20,2,1,5,5,0,0,0,1,0.5,0.4848,0.72,0.1045
2,12002,2012-05-20,2,1,5,6,0,0,0,1,0.5,0.4848,0.63,0.1343
3,12003,2012-05-20,2,1,5,7,0,0,0,1,0.52,0.5,0.68,0.194
4,12004,2012-05-20,2,1,5,8,0,0,0,1,0.56,0.5303,0.56,0.1642


In [3]:
test_clean = test.drop(['instant', 'dteday'], axis=1)
test_clean = test_clean.sort_values(by=['yr', 'mnth', 'hr'], ascending=True)
test_clean.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
20,2,1,5,0,0,1,1,3,0.58,0.5455,0.88,0.2985
44,2,1,5,0,0,2,1,1,0.58,0.5455,0.83,0.2537
68,2,1,5,0,0,3,1,1,0.62,0.5909,0.78,0.1343
92,2,1,5,0,0,4,1,1,0.6,0.5606,0.83,0.0896
116,2,1,5,0,0,5,1,1,0.62,0.5758,0.83,0.1343


In [4]:
train = pd.read_excel('bike_train.xlsx')
train.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [5]:
train_clean = train.drop(['instant', 'dteday', 'casual', 'registered'], axis=1)
train_clean = train_clean.sort_values(by=['yr', 'mnth', 'hr'], ascending=True)
train_clean.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
24,1,0,1,0,0,0,0,2,0.46,0.4545,0.88,0.2985,17
47,1,0,1,0,0,1,1,1,0.22,0.197,0.44,0.3582,5
69,1,0,1,0,0,2,1,1,0.16,0.1818,0.55,0.1045,5
92,1,0,1,0,0,3,1,1,0.2,0.2576,0.64,0.0,6


In [6]:
X, y = train_clean.drop('cnt', axis=1), train_clean['cnt']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
from sklearn.model_selection import RandomizedSearchCV

# Different rfr hyperparameters
rf_grid = {'n_estimators':np.arange(10,500,5),
          'max_depth': [None, 3, 5, 10, 15],
          'min_samples_split': np.arange(2,20,2),
          'min_samples_leaf': np.arange(1, 20, 2),
          'max_features': [0.5, 1.0, 'sqrt'],
          'max_samples': [7000]}

# Instantiate RSCV model
rs_model = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1,
                                                   random_state=42),
                             param_distributions=rf_grid,
                             n_iter=50,
                             cv=20,
                             verbose=True)

# Fit the RSCV model
rs_model.fit(X_train, y_train)

Fitting 20 folds for each of 50 candidates, totalling 1000 fits


In [12]:
rs_model.best_params_

{'n_estimators': 350,
 'min_samples_split': 10,
 'min_samples_leaf': 3,
 'max_samples': 7000,
 'max_features': 1.0,
 'max_depth': 15}

In [13]:
pred = rs_model.predict(X_test)
rs_model.score(X_test, y_test)

0.9324834193679457

In [14]:
rmse = mean_squared_error(y_test, pred, squared = False)
rmse

39.1243070949887

In [15]:
X_final = test_clean

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
20,2,1,5,0,0,1,1,3,0.58,0.5455,0.88,0.2985
44,2,1,5,0,0,2,1,1,0.58,0.5455,0.83,0.2537
68,2,1,5,0,0,3,1,1,0.62,0.5909,0.78,0.1343
92,2,1,5,0,0,4,1,1,0.6,0.5606,0.83,0.0896
116,2,1,5,0,0,5,1,1,0.62,0.5758,0.83,0.1343


In [24]:
pred_final = rs_model.predict(X_final)
pred_final = pred_final.astype(int)
pred_final

array([26, 30, 34, ..., 71, 39, 50])

In [25]:
df_pred = pd.DataFrame(pred_final, columns={'pred'})
df_pred

Unnamed: 0,pred
0,26
1,30
2,34
3,43
4,59
...,...
5375,60
5376,68
5377,71
5378,39


In [26]:
df_pred.to_csv('tmendesdiz.csv')