In [1]:
# data analysis stack
import pandas as pd
import numpy as np

# data visualization stack
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set() # set seaborn as default style

# data pre-processing stack
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    PolynomialFeatures
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

#machine learning stack
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression


from sklearn.model_selection import GridSearchCV

# miscellaneous
import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Hourly training data; the first CSV column (datetime) is parsed into the index.
bike = pd.read_csv("./bike_train.csv", parse_dates=True, index_col=0)
bike.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [3]:
# Both user counts are modelled on a log(1 + x) scale; log1p keeps zero counts finite.
bike["log_casual"] = bike["casual"].pipe(np.log1p)
bike["log_registered"] = bike["registered"].pipe(np.log1p)

In [4]:
def yoda(df):
    """Add calendar features to ``df`` in place and merge weather class 4 into 3.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index is a ``DatetimeIndex`` and which has a ``weather``
        column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying ``year``, ``month`` and ``hour``
        columns derived from its index.
    """
    df["year"] = df.index.year
    df["month"] = df.index.month
    df["hour"] = df.index.hour
    # Assign the result back instead of calling ``replace(..., inplace=True)`` on
    # ``df["weather"]``: the chained in-place form acts on a temporary Series and
    # does not update ``df`` when pandas Copy-on-Write is active.
    df["weather"] = df["weather"].replace(4, 3)
    return df

In [6]:
# yoda() adds year/month/hour to `bike` in place; its return value (the same
# frame) is discarded here and the updated frame is displayed instead.
yoda(bike)
bike

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,log_casual,log_registered,year,month,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,1.386294,2.639057,2011,1,0
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2.197225,3.496508,2011,1,1
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,1.791759,3.332205,2011,1,2
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,1.386294,2.397895,2011,1,3
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,0.000000,0.693147,2011,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,2.079442,5.799093,2012,12,19
2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,2.397895,5.446737,2012,12,20
2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168,1.609438,5.105945,2012,12,21
2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,2.564949,4.770685,2012,12,22


In [7]:
def month_index(year, month, base_year=2011):
    """Return the 1-based number of months counted from January of ``base_year``.

    The arithmetic is element-wise, so scalars, NumPy arrays and pandas Series
    are all accepted.

    Parameters
    ----------
    year, month : int or array-like
        Calendar year and calendar month.
    base_year : int, default 2011
        Year whose January maps to index 1.

    Returns
    -------
    int or array-like
        ``(year - base_year) * 12 + month``.
    """
    return (year - base_year) * 12 + month
    
# Running month counter used as a numeric trend feature.
bike['month_idx'] = month_index(year=bike["year"], month=bike["month"])

**CASUAL USERS**

In [8]:
# Feature selection for the casual-user model.
# Only one of "temp" / "atemp" is kept because the two are highly correlated.
numerical_features = [
    'temp',       # author's note: scored about 2 points better in training than 'atemp'
    'humidity',   # author's note: adds roughly 3 points
    'windspeed',  # author's note: adds a little more
    'month_idx',
    'hour',
]

# 'season' is left out: per the author's note it adds little once weather is included.
categorical_features = [
    'holiday',
    'workingday',
    'weather',    # treated as categorical (author's note: training score 78)
]

features = [*numerical_features, *categorical_features]

# Author's note: with these features (no 'atemp') registered scored around 20
# and casual around 45.
target_variable = 'log_casual'


In [9]:
# Design matrix and log-scale target for the casual-user model.
X_casual_train = bike[features]
y_casual_train = bike[target_variable]

In [10]:
# Numeric branch: standardise, then expand into polynomial terms.
# The polynomial settings are left at their defaults here and set by the grid search.
numerical_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('polynomial', PolynomialFeatures()),
])

In [11]:
# Categorical branch: one-hot encode, dropping the first level of each feature.
categorical_transformer = Pipeline([
    ('ohe', OneHotEncoder(drop='first')),
])

In [12]:
# Send each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

In [13]:
# Full model: preprocessing followed by a Lasso regression.
estimator = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso', Lasso()),
])

In [14]:
# Search space: 1 degree x 2 interaction settings x 5 alphas x 3 iteration caps
# = 30 candidates.
param_grid = dict(
    preprocessor__num__polynomial__degree=[4],
    preprocessor__num__polynomial__interaction_only=[False, True],
    lasso__alpha=[100., 10., 1., 0.1, 0.01],
    lasso__max_iter=[5_000, 10_000, 20_000],
)

In [15]:
# Exhaustive 5-fold search over param_grid, ranked by R^2, using all CPU cores.
gscv = GridSearchCV(
    estimator,
    param_grid,
    scoring='r2',  # alternative left by the author: "neg_root_mean_squared_error"
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [16]:
# Run the casual-user grid search and report its wall-clock duration.
ti = time.time()
gscv.fit(X_casual_train, y_casual_train)
tf = time.time()

print(f"time taken: {round(tf-ti,2)} sec")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
time taken: 11.26 sec


In [17]:
gscv.cv_results_

{'mean_fit_time': array([0.18540058, 0.04885025, 0.12908936, 0.06881771, 0.13720469,
        0.06655836, 0.14454455, 0.07023058, 0.14583368, 0.07154894,
        0.11272216, 0.06127372, 0.14250035, 0.07099938, 0.11650023,
        0.05325742, 0.10948281, 0.04936662, 0.17134562, 0.05744071,
        0.17157879, 0.05579171, 0.17363343, 0.0546288 , 0.61675677,
        0.06509123, 0.61603303, 0.06206708, 0.74719706, 0.08247714]),
 'std_fit_time': array([0.03810106, 0.00350193, 0.01497373, 0.00446323, 0.00430912,
        0.00679712, 0.00790428, 0.0099932 , 0.00705494, 0.02003095,
        0.00687347, 0.0067986 , 0.00905819, 0.0096792 , 0.00858386,
        0.00602451, 0.00681653, 0.00297314, 0.01845052, 0.00738875,
        0.01802946, 0.00502292, 0.01800294, 0.00336549, 0.1450446 ,
        0.00583416, 0.1510798 , 0.00693352, 0.20819964, 0.02471326]),
 'mean_score_time': array([0.03514848, 0.01634579, 0.02801366, 0.02004223, 0.02748613,
        0.02336316, 0.02948952, 0.02132664, 0.03068538, 0.01

In [18]:
# Columns of cv_results_ worth showing.
column_list = [
    'param_preprocessor__num__polynomial__degree',
    'param_preprocessor__num__polynomial__interaction_only',
    'param_lasso__alpha',
    'param_lasso__max_iter',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]

# Build the summary table in one chain: select, shorten each column name to the
# part after its last "__", then sort so the best rank comes first.
result_df = (
    pd.DataFrame(gscv.cv_results_)[column_list]
    .rename(columns=lambda name: name.split('__')[-1])
    .sort_values(by='rank_test_score', ascending=True, ignore_index=True)
)

result_df

Unnamed: 0,degree,interaction_only,alpha,max_iter,mean_test_score,std_test_score,rank_test_score
0,4,False,0.01,10000,0.632971,0.132168,1
1,4,False,0.01,5000,0.632971,0.132168,1
2,4,False,0.01,20000,0.632971,0.132168,1
3,4,True,0.01,20000,0.509126,0.051741,4
4,4,True,0.01,10000,0.509126,0.051741,4
5,4,True,0.01,5000,0.509126,0.051741,4
6,4,False,0.1,20000,0.487723,0.081381,7
7,4,False,0.1,10000,0.487723,0.081381,7
8,4,False,0.1,5000,0.487723,0.081381,7
9,4,True,0.1,20000,0.456614,0.030571,10


In [19]:
gscv.best_params_

{'lasso__alpha': 0.01,
 'lasso__max_iter': 5000,
 'preprocessor__num__polynomial__degree': 4,
 'preprocessor__num__polynomial__interaction_only': False}

In [20]:
round(gscv.best_score_,6)

0.632971

In [21]:
# Best pipeline found by the casual-user grid search.
best_model = gscv.best_estimator_
best_model

In [22]:
# NOTE(review): gscv was built without `refit`, so GridSearchCV's default
# (refit=True) has already fitted best_estimator_ on this same data; this call
# repeats that fit.
best_model.fit(X_casual_train,y_casual_train);

In [23]:
# In-sample R^2: scored on the same rows the model was fitted on.
casual_training_score = best_model.score(X_casual_train, y_casual_train)
print(f'Casual Train score: {round(casual_training_score,6)}')


Casual Train score: 0.817461


**Kaggle Test for Casual**

In [24]:
# Kaggle test set; the first CSV column (datetime) is parsed into the index.
kaggle_test = pd.read_csv("./bike_test.csv", parse_dates=True, index_col=0)
kaggle_test.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0
2011-01-20 03:00:00,1,0,1,1,10.66,12.88,56,11.0014
2011-01-20 04:00:00,1,0,1,1,10.66,12.88,56,11.0014


In [25]:
# Apply the same calendar-feature / weather preparation used on the training
# frame; kaggle_test is updated in place.
yoda(kaggle_test)
kaggle_test

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,1
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,2
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,3
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,4
...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,19
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,20
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012,12,21
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012,12,22


In [26]:
# Same running month counter as on the training frame.
kaggle_test['month_idx'] = month_index(year=kaggle_test["year"], month=kaggle_test["month"])
kaggle_test

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,1,1
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,2,1
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,3,1
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,19,24
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,20,24
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012,12,21,24
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012,12,22,24


In [27]:
kaggle_test.shape

(6493, 12)

In [28]:
# Same feature columns, in the same order, as the casual training matrix.
X_casual_kaggle = kaggle_test.loc[:, features]

In [29]:
# The model was fitted on 'log_casual', so these predictions are on the
# log1p scale, not raw counts.
y_casual_kaggle = best_model.predict(X_casual_kaggle)
y_casual_kaggle

array([0.81301831, 0.38658337, 0.11192421, ..., 1.93150857, 1.79625497,
       1.69271096])

In [30]:
# Keep the log-scale casual predictions on the test frame for the later
# back-transform.
kaggle_test["log_casual"] = y_casual_kaggle
kaggle_test

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx,log_casual
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1,0.813018
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,1,1,0.386583
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,2,1,0.111924
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,3,1,0.053746
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,4,1,0.132718
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,19,24,2.394014
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,20,24,2.149482
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012,12,21,24,1.931509
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012,12,22,24,1.796255


**REGISTERED USERS**

In [32]:
# Feature selection for the registered-user model (same columns as the casual model).
# Only one of "temp" / "atemp" is kept because the two are highly correlated.
numerical_features = [
    'temp',       # author's note: scored about 2 points better in training than 'atemp'
    'humidity',   # author's note: adds roughly 3 points
    'windspeed',  # author's note: adds a little more
    'month_idx',
    'hour',
]

# 'season' is left out: per the author's note it adds little once weather is included.
categorical_features = [
    'holiday',
    'workingday',
    'weather',    # treated as categorical (author's note: training score 78)
]

features = [*numerical_features, *categorical_features]

# Author's note: with these features (no 'atemp') registered scored around 20
# and casual around 45.
target_variable = 'log_registered'


In [33]:
# Design matrix and log-scale target for the registered-user model.
X_registered_train = bike[features]
y_registered_train = bike[target_variable]

In [34]:
# Numeric branch: standardise, then expand into polynomial terms.
# The polynomial settings are left at their defaults here and set by the grid search.
numerical_transformer = Pipeline([
    ('scaler', StandardScaler()),
    ('polynomial', PolynomialFeatures()),
])

In [35]:
# Categorical branch: one-hot encode, dropping the first level of each feature.
categorical_transformer = Pipeline([
    ('ohe', OneHotEncoder(drop='first')),
])

In [36]:
# Send each column group through its own branch and concatenate the results.
preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_features),
    ("cat", categorical_transformer, categorical_features),
])

In [37]:
# Full model: preprocessing followed by a Lasso regression.
estimator = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso', Lasso()),
])

In [38]:
# Search space for the registered model: same grid as the casual model except
# that the polynomial degree is 5.
param_grid = dict(
    preprocessor__num__polynomial__degree=[5],
    preprocessor__num__polynomial__interaction_only=[False, True],
    lasso__alpha=[100., 10., 1., 0.1, 0.01],
    lasso__max_iter=[5_000, 10_000, 20_000],
)

In [39]:
# Exhaustive 5-fold search over param_grid, ranked by R^2, using all CPU cores.
gscv = GridSearchCV(
    estimator,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [40]:
# Run the registered-user grid search and report its wall-clock duration.
ti = time.time()
gscv.fit(X_registered_train, y_registered_train)
tf = time.time()

print(f"time taken: {round(tf-ti,2)} sec")

Fitting 5 folds for each of 30 candidates, totalling 150 fits
time taken: 34.24 sec


In [41]:
gscv.cv_results_

{'mean_fit_time': array([0.32285895, 0.04970784, 0.21264305, 0.05395765, 0.19899092,
        0.048804  , 0.19922786, 0.0521946 , 0.20411816, 0.05539379,
        0.25428057, 0.05803604, 0.23256297, 0.05266819, 0.34210372,
        0.07625389, 0.3100596 , 0.05297265, 0.72834597, 0.07233129,
        0.60281382, 0.07347522, 0.83195648, 0.07213235, 6.32128696,
        0.10296392, 5.02907562, 0.07432656, 4.11801987, 0.07650442]),
 'std_fit_time': array([0.07517714, 0.00326753, 0.01628432, 0.00356259, 0.01293184,
        0.002934  , 0.00758182, 0.00328727, 0.00759057, 0.00513671,
        0.02892568, 0.00487005, 0.00422085, 0.00734578, 0.01903076,
        0.00705519, 0.04412153, 0.00698822, 0.09865811, 0.0015205 ,
        0.04830104, 0.00702987, 0.05629596, 0.01820275, 0.47003148,
        0.00495091, 0.4072717 , 0.01373626, 0.47468972, 0.00662259]),
 'mean_score_time': array([0.0522831 , 0.01436653, 0.03137279, 0.01474738, 0.0282423 ,
        0.01440458, 0.03118105, 0.01522913, 0.0302063 , 0.01

In [42]:
# Columns of cv_results_ worth showing.
column_list = [
    'param_preprocessor__num__polynomial__degree',
    'param_preprocessor__num__polynomial__interaction_only',
    'param_lasso__alpha',
    'param_lasso__max_iter',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]

# Build the summary table in one chain: select, shorten each column name to the
# part after its last "__", then sort so the best rank comes first.
result_df = (
    pd.DataFrame(gscv.cv_results_)[column_list]
    .rename(columns=lambda name: name.split('__')[-1])
    .sort_values(by='rank_test_score', ascending=True, ignore_index=True)
)

result_df

Unnamed: 0,degree,interaction_only,alpha,max_iter,mean_test_score,std_test_score,rank_test_score
0,5,False,0.01,10000,0.566043,0.164129,1
1,5,False,0.01,5000,0.566043,0.164129,1
2,5,False,0.01,20000,0.566043,0.164129,1
3,5,True,0.01,20000,0.410746,0.029361,4
4,5,True,0.01,10000,0.410746,0.029361,4
5,5,True,0.01,5000,0.410746,0.029361,4
6,5,True,0.1,20000,0.370522,0.07418,7
7,5,True,0.1,10000,0.370522,0.07418,7
8,5,True,0.1,5000,0.370522,0.07418,7
9,5,False,1.0,20000,-0.050894,0.316696,10


In [43]:
gscv.best_params_

{'lasso__alpha': 0.01,
 'lasso__max_iter': 5000,
 'preprocessor__num__polynomial__degree': 5,
 'preprocessor__num__polynomial__interaction_only': False}

In [44]:
round(gscv.best_score_,6)

0.566043

In [45]:
# Best pipeline found by the registered-user grid search.
# NOTE(review): this rebinds `best_model`, so the casual-user model previously
# held under the same name is no longer reachable through it.
best_model = gscv.best_estimator_
best_model

In [46]:
# NOTE(review): gscv was built without `refit`, so GridSearchCV's default
# (refit=True) has already fitted best_estimator_ on this same data; this call
# repeats that fit.
best_model.fit(X_registered_train,y_registered_train);

In [47]:
# In-sample R^2: scored on the same rows the model was fitted on.
registered_training_score = best_model.score(X_registered_train, y_registered_train)
print(f'Registered Train score: {round(registered_training_score,6)}')


Registered Train score: 0.746689


In [48]:
# Same feature columns, in the same order, as the registered training matrix.
X_registered_kaggle = kaggle_test.loc[:, features]

In [49]:
# The model was fitted on 'log_registered', so these predictions are on the
# log1p scale, not raw counts.
y_registered_kaggle = best_model.predict(X_registered_kaggle)
y_registered_kaggle

array([2.37091701, 1.66150005, 1.2943956 , ..., 5.41404146, 4.97986269,
       4.09770938])

In [50]:
# Keep the log-scale registered predictions on the test frame for the later
# back-transform.
kaggle_test["log_registered"] = y_registered_kaggle
kaggle_test

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx,log_casual,log_registered
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1,0.813018,2.370917
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,1,1,0.386583,1.661500
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,2,1,0.111924,1.294396
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,3,1,0.053746,1.361972
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,4,1,0.132718,1.734653
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,19,24,2.394014,5.619650
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,20,24,2.149482,5.593289
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012,12,21,24,1.931509,5.414041
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012,12,22,24,1.796255,4.979863


In [54]:
# Undo the log1p target transform. np.expm1 is the exact inverse of np.log1p
# and is more accurate than exp(x) - 1 near zero. The result is floored at 0
# because the linear model can produce negative log-scale values, which would
# otherwise become negative rental counts.
kaggle_test["pred_casual"] = np.expm1(kaggle_test["log_casual"]).clip(lower=0)
kaggle_test["pred_registered"] = np.expm1(kaggle_test["log_registered"]).clip(lower=0)
kaggle_test


Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx,log_casual,log_registered,pred_registered,pred_casual
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1,0.813018,2.370917,9.707206,1.254703
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,1,1,0.386583,1.661500,4.267206,0.471943
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,2,1,0.111924,1.294396,2.648790,0.118428
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,3,1,0.053746,1.361972,2.903884,0.055217
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,4,1,0.132718,1.734653,4.666959,0.141927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,19,24,2.394014,5.619650,274.792825,9.957387
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,20,24,2.149482,5.593289,267.617640,7.580411
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012,12,21,24,1.931509,5.414041,223.537214,5.899911
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012,12,22,24,1.796255,4.979863,144.454407,5.027034


In [56]:
# Total demand is the sum of the two separately modelled user groups.
kaggle_test["count"] = kaggle_test["pred_casual"].add(kaggle_test["pred_registered"])
kaggle_test

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,year,month,hour,month_idx,log_casual,log_registered,pred_registered,pred_casual,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027,2011,1,0,1,0.813018,2.370917,9.707206,1.254703,10.961909
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,1,1,0.386583,1.661500,4.267206,0.471943,4.739149
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000,2011,1,2,1,0.111924,1.294396,2.648790,0.118428,2.767218
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,3,1,0.053746,1.361972,2.903884,0.055217,2.959101
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014,2011,1,4,1,0.132718,1.734653,4.666959,0.141927,4.808887
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,19,24,2.394014,5.619650,274.792825,9.957387,284.750212
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014,2012,12,20,24,2.149482,5.593289,267.617640,7.580411,275.198051
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014,2012,12,21,24,1.931509,5.414041,223.537214,5.899911,229.437125
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981,2012,12,22,24,1.796255,4.979863,144.454407,5.027034,149.481441


In [60]:
# Submission frame: only the predicted total, with the frame's index (datetime) kept.
kaggle_bike_submission = kaggle_test[["count"]]
kaggle_bike_submission

Unnamed: 0_level_0,count
datetime,Unnamed: 1_level_1
2011-01-20 00:00:00,10.961909
2011-01-20 01:00:00,4.739149
2011-01-20 02:00:00,2.767218
2011-01-20 03:00:00,2.959101
2011-01-20 04:00:00,4.808887
...,...
2012-12-31 19:00:00,284.750212
2012-12-31 20:00:00,275.198051
2012-12-31 21:00:00,229.437125
2012-12-31 22:00:00,149.481441


In [62]:
# index=True writes the index as the first CSV column, next to "count".
kaggle_bike_submission.to_csv("kaggle_bike_submission.csv", index = True)