In [20]:
# data analysis stack
import numpy as np
import pandas as pd

# data visualization stack
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

# data pre-processing stack
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# machine learning stack
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

# miscellaneous
import scipy.stats as ss
import warnings
warnings.filterwarnings("ignore")



In [21]:
# Load the training data; the first CSV column is used as the (date-parsed) index.
bike = pd.read_csv(
    "./bike_train.csv",
    index_col=0,
    parse_dates=True,
)
bike.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [23]:
# Add log(1 + x) versions of both rental-count columns.
bike["log_casual"], bike["log_registered"] = (
    np.log1p(bike["casual"]),
    np.log1p(bike["registered"]),
)
bike

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,log_casual,log_registered
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,1.386294,2.639057
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2.197225,3.496508
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,1.791759,3.332205
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,1.386294,2.397895
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,0.000000,0.693147
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,2.079442,5.799093
2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,2.397895,5.446737
2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168,1.609438,5.105945
2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,2.564949,4.770685


In [24]:
# Break the datetime index out into separate day / month / year columns.
bike["day"], bike["month"], bike["year"] = (
    bike.index.day,
    bike.index.month,
    bike.index.year,
)
bike.tail()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,log_casual,log_registered,day,month,year
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,2.079442,5.799093,19,12,2012
2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,2.397895,5.446737,19,12,2012
2012-12-19 21:00:00,4,0,1,1,13.94,15.91,61,15.0013,4,164,168,1.609438,5.105945,19,12,2012
2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,2.564949,4.770685,19,12,2012
2012-12-19 23:00:00,4,0,1,1,13.12,16.665,66,8.9981,4,84,88,1.609438,4.442651,19,12,2012


In [29]:
def month_index(year, month, base_year=2011):
    """
    Convert a (year, month) pair into a running month counter.

    January of ``base_year`` maps to 1 and every later calendar year adds 12,
    so with the default base year of 2011 the months of 2012 map to 13..24.

    Parameters
    ----------
    year : int or array-like of int
        Calendar year(s).
    month : int or array-like of int
        Calendar month(s), 1-12.
    base_year : int, default 2011
        Year whose January is counted as month 1.

    Returns
    -------
    int or array-like
        ``(year - base_year) * 12 + month``. The arithmetic is plain
        ``-``, ``*`` and ``+``, so pandas/numpy inputs are handled
        element-wise without a row-wise ``apply``.
    """
    return (year - base_year) * 12 + month


In [30]:
# Running month counter; month_index is applied to the whole columns at once,
# so no row-wise apply is needed.
bike["month_idx"] = month_index(year=bike["year"], month=bike["month"])

bike

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,log_casual,log_registered,day,month,year,month_idx,day_of_week
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000,3,13,16,1.386294,2.639057,1,1,2011,1,6
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000,8,32,40,2.197225,3.496508,1,1,2011,1,6
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000,5,27,32,1.791759,3.332205,1,1,2011,1,6
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000,3,10,13,1.386294,2.397895,1,1,2011,1,6
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000,0,1,1,0.000000,0.693147,1,1,2011,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027,7,329,336,2.079442,5.799093,19,12,2012,24,3
2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013,10,231,241,2.397895,5.446737,19,12,2012,24,3
2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013,4,164,168,1.609438,5.105945,19,12,2012,24,3
2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032,12,117,129,2.564949,4.770685,19,12,2012,24,3


In [27]:
# pandas numbers weekdays from 0 (Monday); shift by one so the column runs 1..7.
bike["day_of_week"] = bike.index.dayofweek + 1
bike.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,log_casual,log_registered,day,month,year,month_idx,day_of_week
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16,1.386294,2.639057,1,1,2011,1,6
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2.197225,3.496508,1,1,2011,1,6
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32,1.791759,3.332205,1,1,2011,1,6
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13,1.386294,2.397895,1,1,2011,1,6
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1,0.0,0.693147,1,1,2011,1,6


In [28]:
# "temp" and "atemp" are highly correlated with each other, so only one of
# them is kept as a feature.
numerical_features = [
    'temp',
    # 'atemp',      # dropped: temp scored better than atemp
    'humidity',     # adds a lot (about 3 points)
    'windspeed',    # adds some more predictive power
    'day_of_week',  # did not noticeably change the score
                    # (must match the column created from bike.index.weekday;
                    #  the previous name 'dow' is not a column of `bike`)
]

categorical_features = [
    'season',
    'holiday',
    'workingday',
    'weather',
]

features = numerical_features + categorical_features

# Author's note: the score is around 20 for 'registered', while 'casual'
# reaches about 45 with these features (atemp excluded).
target_variable = 'registered'

In [9]:
# Feature matrix and target vector
X = bike[features]
y = bike[target_variable]

In [10]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

((8708, 8), (2178, 8))

In [11]:
class BaselineModel:
    """
    Baseline model predicting only the mean value of the target variable.

    The mean can be learned explicitly with ``fit(X, y)``. For backward
    compatibility, calling ``predict`` on an instance that was never fitted
    falls back to the mean of the notebook-level ``y_train`` variable, which
    is what the original implementation always did.
    """

    def __init__(self):
        # Mean of the target seen in fit(); None until fit() has been called.
        self.y_mean_ = None

    def fit(self, X, y):
        """
        Store the mean of ``y``.

        Parameters
        ----------
        X : any
            Ignored; accepted only so the call mirrors ``fit(X, y)`` of
            other estimators.
        y : array-like of numbers
            Target values whose mean becomes the constant prediction.

        Returns
        -------
        BaselineModel
            ``self``, so calls can be chained.
        """
        self.y_mean_ = np.mean(y)
        return self

    def predict(self, X):
        """
        Return the stored mean once per row of ``X``.

        Parameters
        ----------
        X : object with a ``shape`` attribute
            Only ``X.shape[0]`` (the number of rows) is used.

        Returns
        -------
        list
            The mean repeated ``X.shape[0]`` times.
        """
        if self.y_mean_ is not None:
            y_mean = self.y_mean_
        else:
            # Unfitted instance: keep the original behaviour and read the
            # notebook-level training target.
            y_mean = y_train.mean()
        return [y_mean] * X.shape[0]

In [12]:
# Score the mean-only baseline on the training split.
# r2_score comes from the notebook's top import cell rather than being
# imported here in the middle of the analysis.
y_baseline_train_pred = BaselineModel().predict(X_train)
r2 = r2_score(y_train, y_baseline_train_pred)

print(f'baseline model train score: {round(r2,6)}')

baseline model train score: 0.0


In [13]:
# Score the mean-only baseline on the held-out split.
y_baseline_pred = BaselineModel().predict(X_test)
r2 = r2_score(
    y_true=y_test,
    y_pred=y_baseline_pred,
)
r2

-6.703038615052392e-06

In [19]:
# Report the baseline's held-out R^2, rounded to six decimals.
print(f'baseline model test score: {round(r2, ndigits=6)}')

baseline model test score: -7e-06


In [15]:
# column transformation
transformer = ColumnTransformer([
    ('scaling', MinMaxScaler(), numerical_features),
    ('onehot', OneHotEncoder(drop='first'), categorical_features)
])

In [16]:
# pipeline
# Full model: column preprocessing followed by an ordinary linear fit.
pipeline = Pipeline(
    steps=[
        ('col_transformer', transformer),                    # step 1: column transformation
        ('lr_model', LinearRegression(fit_intercept=True)),  # step 2: linear regression
    ]
)

In [17]:
# model training
pipeline.fit(X_train,y_train)

In [18]:
# training score
training_score = pipeline.score(X_train,y_train)
print(f'training score: {round(training_score,6)}')

training score: 0.454111
