In [85]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score,mean_squared_log_error

In [86]:
# Load the training data. The first CSV column (the timestamps) becomes the
# index, and parse_dates=True asks pandas to parse that index as datetimes.
bike=pd.read_csv('bike_train.csv', parse_dates=True, index_col=0 )

In [87]:
bike.head() 

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [6]:
X=bike.copy()

In [7]:
# Feature matrix: every column except the target ('count') and the two
# columns 'casual' and 'registered' (in the rows previewed above these two
# add up to 'count', so keeping them would leak the target).
# Built from `bike` instead of mutating X in place, so re-running this cell
# no longer raises KeyError on the already-dropped columns.
X = bike.drop(columns=['casual', 'registered', 'count'])

In [8]:
#X = X.reset_index(level=0)
#X['year'] = X['datetime'].dt.year
X

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0000
2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0000
2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0000
2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0000
2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0000
...,...,...,...,...,...,...,...,...
2012-12-19 19:00:00,4,0,1,1,15.58,19.695,50,26.0027
2012-12-19 20:00:00,4,0,1,1,14.76,17.425,57,15.0013
2012-12-19 21:00:00,4,0,1,1,13.94,15.910,61,15.0013
2012-12-19 22:00:00,4,0,1,1,13.94,17.425,61,6.0032


In [9]:
# Target vector: an independent copy of the 'count' column, which keeps the
# datetime index of `bike`.
y = bike['count'].copy()
y

datetime
2011-01-01 00:00:00     16
2011-01-01 01:00:00     40
2011-01-01 02:00:00     32
2011-01-01 03:00:00     13
2011-01-01 04:00:00      1
                      ... 
2012-12-19 19:00:00    336
2012-12-19 20:00:00    241
2012-12-19 21:00:00    168
2012-12-19 22:00:00    129
2012-12-19 23:00:00     88
Name: count, Length: 10886, dtype: int64

In [10]:
# Column groups routed to the two preprocessing sub-pipelines.
categorical_features = [
    'season',
    'holiday',
    'workingday',
    'weather',
]
numerical_features = [
    'temp',
    'atemp',
    'humidity',
    'windspeed',
]

In [11]:
def day_period_dataframe(X):
    """Return a copy of ``X`` with an added ``day_period`` column.

    Every value of the existing ``hour`` column is passed through
    ``day_period`` to obtain its label. The input object is not modified.
    Raises ``KeyError`` if ``X`` has no ``hour`` column.
    """
    frame = pd.DataFrame(X).copy()
    labels = frame["hour"].apply(day_period)
    frame["day_period"] = labels
    return frame

In [12]:
def day_period(hour):
    """Map an hour of the day to a coarse period label.

    Returns "night" for hours from 22 up to (but excluding) 4, "morning" for
    4-9, "afternoon" for 10-15 and "evening" for everything else (16-21).
    """
    # Hour 0 is already covered by the `hour < 4` test.
    if hour >= 22 or hour < 4:
        return "night"
    if hour < 10:
        return "morning"
    if hour < 16:
        return "afternoon"
    return "evening"

In [13]:
def timebreakdown(X):
    """Return a copy of ``X`` with calendar columns derived from its index.

    Adds ``year``, ``month``, ``weekday`` (the day name) and ``hour``, in that
    order, read from the frame's index. The index must expose the datetime
    accessors used below; the input object is not modified.
    """
    frame = pd.DataFrame(X).copy()
    stamps = frame.index

    derived = {
        'year': stamps.year,
        'month': stamps.month,
        'weekday': stamps.day_name(),
        'hour': stamps.hour,
    }
    for column, values in derived.items():
        frame[column] = values

    return frame

In [14]:
# Wrap timebreakdown so it can be used as a step inside a scikit-learn Pipeline.
timebreakdown_step = FunctionTransformer(timebreakdown)

In [15]:
# Wrap day_period_dataframe as a Pipeline step; it reads the 'hour' column,
# so it has to run after a step that creates that column.
day_period_step = FunctionTransformer(day_period_dataframe)

In [65]:
# Steps for the categorical column group: derive calendar fields from the
# datetime index, add the day-period label, then one-hot encode every
# resulting column (categories unseen during fit are ignored at transform time).
categorical_steps = [
    ('timebreakdown', timebreakdown_step),
    ('day_period_step', day_period_step),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]

In [66]:
# Sub-pipeline 1: chain the categorical steps into a single transformer.
categorical_transformer = Pipeline(categorical_steps)
categorical_transformer

Pipeline(steps=[('timebreakdown',
                 FunctionTransformer(func=<function timebreakdown at 0x7ff4db9f9d30>)),
                ('day_period_step',
                 FunctionTransformer(func=<function day_period_dataframe at 0x7ff4db9f9550>)),
                ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [67]:
# Sub-pipeline 2: fill missing numeric values with the column median, then
# standardise the columns.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]

numeric_transformer = Pipeline(steps=numeric_steps)
numeric_transformer

Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler())])

In [68]:
# Route each column group through its own sub-pipeline and concatenate the
# results into one feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, numerical_features),
        ('categorical', categorical_transformer, categorical_features),
    ]
)


In [69]:
# Full model definition: the shared preprocessing followed by a linear regression.
final_steps = [('preprocessor', preprocessor),
     ('LinReg', LinearRegression())]

In [70]:
pipeline = Pipeline(final_steps)

In [71]:
pipeline

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['temp', 'atemp', 'humidity',
                                                   'windspeed']),
                                                 ('categorical',
                                                  Pipeline(steps=[('timebreakdown',
                                                                   FunctionTransformer(func=<function timebreakdown at 0x7ff4db9f9d30>)),
                                                                  ('day_period_step',
                                          

In [72]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [73]:
X_train.head()

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-07-06 05:00:00,3,0,1,1,27.88,31.82,83,6.0032
2012-08-04 16:00:00,3,0,0,1,36.9,40.91,39,19.9995
2012-07-11 15:00:00,3,0,1,1,32.8,35.605,41,16.9979
2011-04-10 04:00:00,2,0,0,2,14.76,18.18,93,7.0015
2011-11-19 10:00:00,4,0,0,1,13.12,15.15,45,16.9979


In [25]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['temp', 'atemp', 'humidity',
                                                   'windspeed']),
                                                 ('categorical',
                                                  Pipeline(steps=[('timebreakdown',
                                                                   FunctionTransformer(func=<function timebreakdown at 0x7ff4db9f9d30>)),
                                                                  ('day_period_step',
                                          

In [103]:
# Sanity check: shape of the encoded training matrix. Note that this call
# re-fits `preprocessor` itself, which is the same object used inside `pipeline`.
preprocessor.fit_transform(X_train).shape

(8708, 65)

In [26]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'preprocessor', 'LinReg', 'preprocessor__n_jobs', 'preprocessor__remainder', 'preprocessor__sparse_threshold', 'preprocessor__transformer_weights', 'preprocessor__transformers', 'preprocessor__verbose', 'preprocessor__verbose_feature_names_out', 'preprocessor__numeric', 'preprocessor__categorical', 'preprocessor__numeric__memory', 'preprocessor__numeric__steps', 'preprocessor__numeric__verbose', 'preprocessor__numeric__imputer', 'preprocessor__numeric__scaler', 'preprocessor__numeric__imputer__add_indicator', 'preprocessor__numeric__imputer__copy', 'preprocessor__numeric__imputer__fill_value', 'preprocessor__numeric__imputer__missing_values', 'preprocessor__numeric__imputer__strategy', 'preprocessor__numeric__imputer__verbose', 'preprocessor__numeric__scaler__copy', 'preprocessor__numeric__scaler__with_mean', 'preprocessor__numeric__scaler__with_std', 'preprocessor__categorical__memory', 'preprocessor__categorical__steps', 'preprocessor__categor

In [28]:
y_pred = pipeline.predict(X_test)
y_pred

array([229.35978279,  15.96732753, 349.40097489, ..., 286.59679322,
       -33.50005051, 266.78655827])

In [34]:
# Rental counts cannot be negative, but the linear model does predict values
# below zero; overwrite those entries with 0 (in place) before scoring.
y_pred[y_pred<0] = 0
y_pred

array([229.35978279,  15.96732753, 349.40097489, ..., 286.59679322,
         0.        , 266.78655827])

In [40]:
y_test.info()

<class 'pandas.core.series.Series'>
DatetimeIndex: 2178 entries, 2011-07-19 11:00:00 to 2012-05-15 13:00:00
Series name: count
Non-Null Count  Dtype
--------------  -----
2178 non-null   int64
dtypes: int64(1)
memory usage: 34.0 KB


In [44]:
# Wrap the predictions in a Series that shares y_test's datetime index.
# Without an explicit index the Series is labelled 0..n-1, so any
# label-aligned comparison or arithmetic with y_test would match no rows.
y_pred_series = pd.Series(y_pred, index=y_test.index)

In [123]:
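# classification_report / accuracy_score are classification metrics and do not
# apply to a continuous regression target such as `count`, so they stay disabled.
#print(classification_report(y_test, y_pred_series))
#print(accuracy_score(y_test, y_pred_series))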

In [93]:
# Comparing with Random Forest Regressor
final_steps2 = [('preprocessor', preprocessor),
     ('RFG', RandomForestRegressor())]
pipeline = Pipeline(final_steps2)
pipeline
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                    test_size = 0.2, random_state=42)
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['temp', 'atemp', 'humidity',
                                                   'windspeed']),
                                                 ('categorical',
                                                  Pipeline(steps=[('timebreakdown',
                                                                   FunctionTransformer(func=<function timebreakdown at 0x7ff4db9f9d30>)),
                                                                  ('day_period_step',
                                          

In [60]:
y_pred2 = pipeline.predict(X_test)
y_pred2

array([142.9 ,  20.88, 153.35, ..., 477.4 ,   7.09, 150.41])

In [74]:
# RMSLE of the linear model on the hold-out split.
# The `squared=False` flag of mean_squared_log_error was deprecated in
# scikit-learn 1.4 and later removed, so take the square root explicitly.
# Predictions are clipped at 0 here too, so this cell does not depend on the
# earlier in-place clipping cell having been run (clipping twice is harmless).
np.sqrt(mean_squared_log_error(y_test, np.clip(y_pred, 0, None)))

1.0912383313462386

In [64]:
# RMSLE of the random-forest model on the same hold-out split.
# Square root taken explicitly because the `squared=False` flag was
# deprecated in scikit-learn 1.4 and later removed.
np.sqrt(mean_squared_log_error(y_test, y_pred2))

0.34324372475325987

In [100]:
# Load the unlabeled test set the same way as the training data: the first
# column becomes the index and is parsed as datetimes.
final_test=pd.read_csv('test.csv', parse_dates=True, index_col=0 )
final_test

Unnamed: 0_level_0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-20 00:00:00,1,0,1,1,10.66,11.365,56,26.0027
2011-01-20 01:00:00,1,0,1,1,10.66,13.635,56,0.0000
2011-01-20 02:00:00,1,0,1,1,10.66,13.635,56,0.0000
2011-01-20 03:00:00,1,0,1,1,10.66,12.880,56,11.0014
2011-01-20 04:00:00,1,0,1,1,10.66,12.880,56,11.0014
...,...,...,...,...,...,...,...,...
2012-12-31 19:00:00,1,0,1,2,10.66,12.880,60,11.0014
2012-12-31 20:00:00,1,0,1,2,10.66,12.880,60,11.0014
2012-12-31 21:00:00,1,0,1,1,10.66,12.880,60,11.0014
2012-12-31 22:00:00,1,0,1,1,10.66,13.635,56,8.9981


In [101]:
predictions = pipeline.predict(final_test)
predictions

array([ 11.12,   4.2 ,   3.33, ..., 124.49,  95.18,  47.94])

In [106]:
# Keep every prediction attached to the timestamp it was made for and give
# the Series the target's name; a bare pd.Series(predictions) is labelled
# 0..n-1 and exports under the meaningless column header "0".
bike_predictions = pd.Series(predictions, index=final_test.index, name='count')

In [108]:
filename = 'bike_predictions.csv'

# Write timestamp + prediction pairs. Saving only the bare values loses which
# hour each prediction belongs to; the notebook later reads a separately
# formatted copy that has exactly these two columns.
# `.values` strips whatever index bike_predictions carries, so the rows are
# paired with final_test's timestamps purely by position.
submission = pd.DataFrame({'datetime': final_test.index,
                           'count': bike_predictions.values})
submission.to_csv(filename, index=False)

print('Saved file: ' + filename)

Saved file: bike_predictions.csv


In [119]:
# parse_dates=True only asks pandas to parse the index, and no index_col is
# set here, so the 'datetime' column was left as plain strings. Naming the
# column explicitly makes it a real datetime column.
bike_csv=pd.read_csv('Final Bike predictions formatted.csv', parse_dates=['datetime'], sep = ";")

In [120]:
bike_csv

Unnamed: 0,datetime,count
0,2011-01-20 00:00:00,11.12
1,2011-01-20 01:00:00,4.20
2,2011-01-20 02:00:00,3.33
3,2011-01-20 03:00:00,3.44
4,2011-01-20 04:00:00,2.51
...,...,...
6488,2012-12-31 19:00:00,203.99
6489,2012-12-31 20:00:00,169.03
6490,2012-12-31 21:00:00,124.49
6491,2012-12-31 22:00:00,95.18


In [121]:
#bike_csv = bike_csv.set_index('datetime')
#bike_csv

In [122]:
filename = 'bike_csv.csv'

# Re-export the formatted predictions; index=False leaves the integer row
# index out of the file.
bike_csv.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: bike_csv.csv
