In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error,r2_score


In [8]:
df=pd.read_csv("/content/train.csv")
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


In [16]:
df["datetime"]=pd.to_datetime(df["datetime"])
df["dayofweek"]=df["datetime"].dt.dayofweek
df["month"]=df["datetime"].dt.month
df["hour"]=df["datetime"].dt.hour
df["weekend"]=df["dayofweek"].isin([5,6]).astype(int)
df["1_console_lag"]=df["count"].shift(1)
df["24_hours_back"]=df["count"].shift(24)
df["mean_of_last_24hours"]=df["count"].rolling(24).mean()
df=df.dropna().reset_index(drop=True)
df.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,dayofweek,month,hour,weekend,1_console_lag,24_hours_back,mean_of_last_24hours
0,2011-01-02 00:00:00,1,0,0,2,18.86,22.725,88,19.9995,4,13,17,6,1,0,1,39.0,16.0,41.083333
1,2011-01-02 01:00:00,1,0,0,2,18.04,21.97,94,16.9979,1,16,17,6,1,1,1,17.0,40.0,40.125
2,2011-01-02 02:00:00,1,0,0,2,17.22,21.21,100,19.0012,1,8,9,6,1,2,1,17.0,32.0,39.166667
3,2011-01-02 03:00:00,1,0,0,2,18.86,22.725,94,12.998,2,4,6,6,1,3,1,9.0,13.0,38.875
4,2011-01-02 04:00:00,1,0,0,2,18.86,22.725,94,12.998,2,1,3,6,1,4,1,6.0,1.0,38.958333


In [43]:
x=df.drop(["datetime","count","casual","registered"],axis=1)
y=df["count"]
categorical_columns=["season","weather","dayofweek","hour","month"]
numerical_columns=[i  for i in x.columns if i not in categorical_columns]


In [44]:
preprocessor=ColumnTransformer(transformers=[
    ("cat",OneHotEncoder(drop="first",handle_unknown="ignore"),categorical_columns),
    ("numerical","passthrough",numerical_columns)

])

In [51]:
tscv=TimeSeriesSplit(n_splits=5)
rmse_scores=[]
r2_scores=[]
scores=[]

In [52]:
for fold,(idx_train,idx_test) in enumerate(tscv.split(x),1):
  x_train=x.iloc[idx_train]
  x_test=x.iloc[idx_test]
  y_train=y.iloc[idx_train]
  y_test=y.iloc[idx_test]

  x_train=preprocessor.fit_transform(x_train)
  x_test=preprocessor.transform(x_test)

  model=Ridge(alpha=1.0)
  model.fit(x_train,y_train)

  y_pred=model.predict(x_test)

  mse=mean_squared_error(y_test,y_pred)
  rmse=np.sqrt(mse)
  r2=r2_score(y_test,y_pred)

  rmse_scores.append(rmse)
  r2_scores.append(r2)
  scores.append(model.score(x_test,y_test))

  print(f"{fold} --> RMSE : {rmse} and R2_score : {r2}")



1 --> RMSE : 63.38909121975267 and R2_score : 0.8212314865126356
2 --> RMSE : 52.60517293742904 and R2_score : 0.8474193656315396
3 --> RMSE : 67.19004839093833 and R2_score : 0.8531536402393167
4 --> RMSE : 83.51546057027397 and R2_score : 0.8559549133660597
5 --> RMSE : 82.44140167297354 and R2_score : 0.8539120685120752




In [53]:
print(f"Average RMSE : {np.mean(rmse_scores)}")
print(f"Average R2_scores : {np.mean(r2_scores)}")
print(f"Average model_scores : {np.mean(scores)}")

Average RMSE : 69.8282349582735
Average R2_scores : 0.8463342948523254
Average model_scores : 0.8463342948523254
