In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import os
# Show which entries exist under the relative input directory before loading.
print(os.listdir("../input"))

**Data Wrangling**

ETL (Extract, Transform, Load)

In [None]:
# Load the raw train and test tables from CSV.
train = pd.read_csv('../input/bike-share-demand/train.csv')
test = pd.read_csv('../input/bike-share-demand/test.csv')
# Work on copies so the raw frames (`train`, `test`) stay untouched by the
# feature engineering applied to `train_data` / `test_data` below.
train_data = train.copy()
test_data = test.copy()

In [None]:
# Report (rows, columns) for the training frame first, then the test frame.
for frame in (train_data, test_data):
    print(frame.shape)

In [None]:
# Preview the first rows of the working training frame.
train_data.head()

In [None]:
# Summary statistics of the raw training frame.
train.describe()

In [None]:
# Count missing values per column of the raw training frame.
train.isna().sum()


In [None]:
# Print the column dtypes and non-null counts of the raw training frame.
train.info()

Visualizations showing how each variable relates to the target variable 'cnt'

In [None]:
# One regression scatter per continuous feature against the target 'cnt',
# laid out side by side in a single row of four panels.
fig, axes = plt.subplots(ncols=4)
fig.set_size_inches(12, 5)
for feature, axis in zip(("temp", "atemp", "windspeed", "humidity"), axes):
    sns.regplot(x=feature, y="cnt", data=train_data, ax=axis)

In [None]:
# Correlate numeric columns only: the raw frame still carries 'datetime' as
# read from the CSV, and recent pandas versions raise on non-numeric columns
# instead of silently skipping them.
cor = train.select_dtypes(include=[np.number]).corr()
# Explicit boolean mask hiding the strict upper triangle, so every pair is
# shown once (lower triangle plus diagonal). The previous float matrix used
# as a mask would leave any upper cell with exactly 0 correlation visible.
upper_mask = np.triu(np.ones(cor.shape, dtype=bool), k=1)
fig = plt.gcf()
fig.set_size_inches(30, 12)
sns.heatmap(data=cor, mask=upper_mask, square=True, annot=True, cbar=True)

Reshaping Data

In [None]:
# Parse the timestamp once per frame and derive calendar features from it.
# The same columns are added, in the same order, to both train and test.
for frame in (train_data, test_data):
    frame['datetime'] = pd.to_datetime(frame['datetime'])
    # `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` yields
    # the same English day names ('Monday', ...).
    frame['weekday'] = frame['datetime'].dt.day_name()
    # Year is stored as a string so it is later one-hot encoded rather than
    # treated as a number.
    frame['year'] = frame['datetime'].dt.year.astype(str)
    frame['hour'] = frame['datetime'].dt.hour
    frame['month'] = frame['datetime'].dt.month

In [None]:
# Inspect the column set after the datetime features were added.
train_data.columns

In [None]:
# Bar chart of 'cnt' by hour of day. `factorplot` and its `size` argument were
# deprecated and later removed from seaborn; `catplot` with `height` is the
# direct replacement.
sns.catplot(x="hour", y="cnt", data=train_data, kind='bar', height=5, aspect=1.5)

Add feature: hour_group

In [None]:
def hour_group(s):
    """Map an hour of the day to a coarse demand-bucket label.

    Buckets:
        1 -> hours 0 through 6
        2 -> hours 7 and 9
        5 -> hours 8, 16 and 19
        4 -> hours 10 through 15
        6 -> hours 17 and 18
        3 -> hour 20 and later

    Parameters
    ----------
    s : number
        Hour value to classify.

    Returns
    -------
    int or None
        The bucket label, or None when ``s`` matches no bucket
        (for example a negative value).
    """
    if 0 <= s <= 6:
        return 1
    if s in (7, 9):
        return 2
    if s in (8, 16, 19):
        return 5
    if 10 <= s <= 15:
        return 4
    if s in (17, 18):
        return 6
    if s >= 20:
        return 3
    # No bucket matched; the caller receives None.
    return None
# Bucket every row's hour and store the label as a string so it is treated as
# a categorical value by the later one-hot encoding.
for frame in (train_data, test_data):
    frame['hour_group'] = frame['hour'].map(hour_group).astype(str)

In [None]:
# Bar chart of 'cnt' by day of week. `factorplot` and its `size` argument were
# deprecated and later removed from seaborn; `catplot` with `height` is the
# direct replacement.
sns.catplot(x="weekday", y='cnt', kind='bar', data=train_data, height=7, aspect=1)

In [None]:
# Bar chart of 'cnt' by month. `factorplot` and its `size` argument were
# deprecated and later removed from seaborn; `catplot` with `height` is the
# direct replacement.
sns.catplot(x="month", y="cnt", data=train_data, kind='bar', height=5, aspect=1.5)

In [None]:
# Bar chart of 'cnt' by year. `factorplot` and its `size` argument were
# deprecated and later removed from seaborn; `catplot` with `height` is the
# direct replacement.
sns.catplot(x="year", y="cnt", data=train_data, kind='bar', height=5, aspect=1.5)

Change the data type of the categorical columns to string

In [None]:
# Cast the coded categorical columns to strings in both frames so the
# following one-hot encoding treats them as categories, not numbers.
for frame in (train_data, test_data):
    for column in ('season', 'weather', 'holiday', 'workingday'):
        frame[column] = frame[column].astype(str)

Transform to dummy variables

In [None]:
# One-hot encode the string-typed columns of each frame.
# NOTE(review): the two frames are encoded independently, so a category value
# present in only one of them produces a dummy column the other frame lacks —
# verify that the resulting feature columns line up before modelling.
train_data = pd.get_dummies(train_data)
test_data = pd.get_dummies(test_data)

In [None]:
# Preview the training frame after one-hot encoding.
train_data.head()

**Manually splitting data and Dropping columns**

In [None]:
# Training features: drop the target ('cnt'), its two components ('casual',
# 'registered'), the raw timestamp, and 'windspeed'.
x_train = train_data.drop(columns=['casual', 'registered', 'cnt', 'datetime', 'windspeed'])

# Test features: the frames were one-hot encoded separately, so force the test
# matrix onto exactly the training columns (same set, same order). Dummy
# columns missing from the test frame are filled with 0; columns the model
# never saw are dropped.
x_test = (
    test_data
    .drop(columns=['datetime', 'windspeed'])
    .reindex(columns=x_train.columns, fill_value=0)
)

y_train = train_data['cnt']

# NOTE(review): `y_test` is built by pairing test rows with *training* 'cnt'
# values that share the same row index, then dropping rows containing NaN.
# These are presumably not observed targets for the test rows, so any score
# computed against `y_test` is not a genuine hold-out estimate.
# TODO: replace with a proper train/validation split.
y_test = pd.concat([test_data, train_data['cnt']], axis=1)
y_test = y_test.dropna(axis=0)
y_test = y_test['cnt']


**Scaling Variables**

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Learn the min/max ranges from the training features only ...
x_train = scaler.fit_transform(x_train)
# ... and apply those same ranges to the test features. Re-fitting on the test
# set would scale the two matrices differently, so identical raw values would
# land on different scaled values.
x_test = scaler.transform(x_test)
# The models are fitted on log1p(count); predictions are mapped back with
# expm1 in the modelling cells.
y_train = np.log1p(y_train)
# `y_test` intentionally stays on the original count scale: the evaluation
# cells compare it with predictions that have already been passed through
# expm1, so log-transforming it here would mix two different scales.

**Modelling and Evaluation**

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest fitted on the log1p-transformed target.
model = RandomForestRegressor(
    n_estimators=1000,
    min_samples_leaf=2,
    random_state=0,
    n_jobs=-1,
)
model.fit(x_train, y_train)
# Undo the log1p transform and round to whole numbers.
pred = np.round(np.expm1(model.predict(x_test)))

In [None]:
from sklearn.metrics import mean_squared_log_error
# Root mean squared log error of the random-forest predictions.
# Arguments follow the scikit-learn signature (y_true, y_pred).
rmsle = np.sqrt(mean_squared_log_error(y_test, pred))
# Display the metric itself; dividing it by 10 showed a number that matched
# neither the metric nor the comparison table built from `rmsle` later on.
rmsle

In [None]:
from sklearn.ensemble import BaggingRegressor
# Bagging ensemble fitted on the same features and log1p-transformed target.
model2 = BaggingRegressor(n_estimators=1000, random_state=0)
model2.fit(x_train, y_train)
# Undo the log1p transform and round to whole numbers.
pred2 = np.round(np.expm1(model2.predict(x_test)))
# Arguments follow the scikit-learn signature (y_true, y_pred).
rmsle2 = np.sqrt(mean_squared_log_error(y_test, pred2))
# Display the metric itself; dividing it by 10 showed a number that matched
# neither the metric nor the comparison table built from `rmsle2` later on.
rmsle2

In [None]:
from sklearn import ensemble
# Gradient boosting fitted on the same features and log1p-transformed target.
model3 = ensemble.GradientBoostingRegressor(
    max_features=10,
    learning_rate=0.01,
    n_estimators=1000,
    subsample=0.7,
    random_state=0,
)
model3.fit(x_train, y_train)
# Undo the log1p transform and round to whole numbers.
pred3 = np.round(np.expm1(model3.predict(x_test)))
# Arguments follow the scikit-learn signature (y_true, y_pred).
rmsle3 = np.sqrt(mean_squared_log_error(y_test, pred3))
# Display the metric itself; dividing it by 10 showed a number that matched
# neither the metric nor the comparison table built from `rmsle3` later on.
rmsle3

In [None]:
# Gather the three scores into one table, one row per model, in the order the
# models were fitted.
model_names = [
    'RandomForestRegressor',
    'BaggingRegressor',
    'GradientBoostingRegressor',
]
RSMLE = [rmsle, rmsle2, rmsle3]
rmsle_frame = pd.DataFrame({'Modelling Algo': model_names, 'RMSLE': RSMLE})
rmsle_frame

In [None]:
# Horizontal bar chart comparing the models' RMSLE. `factorplot` and its
# `size` argument were deprecated and later removed from seaborn; `catplot`
# with `height` is the direct replacement.
sns.catplot(y='Modelling Algo', x='RMSLE', data=rmsle_frame, kind='bar', height=5, aspect=2)

**Submission**

In [None]:
# Pair each test timestamp with the random-forest prediction and write the
# table to CSV without the index column.
output1 = pd.DataFrame({'datetime': test_data['datetime'], 'count': pred})
output1.to_csv('submission.csv', index=False)

In [None]:
# Pair each test timestamp with the bagging prediction and write the table to
# CSV without the index column.
output2 = pd.DataFrame({'datetime': test_data['datetime'], 'count': pred2})
output2.to_csv('submission2.csv', index=False)

In [None]:
# Pair each test timestamp with the gradient-boosting prediction and write the
# table to CSV without the index column.
output3 = pd.DataFrame({'datetime': test_data['datetime'], 'count': pred3})
output3.to_csv('submission3.csv', index=False)