Linear Regression Assignment

In [1]:
## Import all the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
%matplotlib inline

In [2]:
## import model related libraries

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer


In [3]:
# RMSLE function definition
#RMSLE = sqrt( (1/n) * Σ (log(pred+1) - log(actual+1))² ) 
def rmsle(y_true, y_pred):
    y_pred = np.maximum(0, y_pred)  # Clip to zero to avoid log of negative numbers
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))


In [4]:
# importing data to dataframe using pandas  
train_data = pd.read_csv("bike_train.csv")
test_data = pd.read_csv("bike_test.csv")

In [5]:
# Display the shape of the dataset
print(train_data.shape)

(10450, 12)


In [6]:
# Display the first 5 rows of the training dataset
train_data.head(5)

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,15/7/12 7:00,3,0,0,1,28.7,33.335,79,6.0032,17,30,47
1,14/8/12 15:00,3,0,1,1,33.62,37.88,46,15.0013,84,199,283
2,6/2/11 6:00,1,0,0,1,10.66,12.88,60,15.0013,0,1,1
3,6/5/12 17:00,2,0,0,2,26.42506,30.566166,61,9.512288,198,330,531
4,9/1/12 2:00,1,0,1,1,9.84,12.12,56,8.9981,2,3,5


In [7]:
print (train_data.info()) #2.4 Summary of  Training data set

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10450 entries, 0 to 10449
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   datetime    10450 non-null  object 
 1   season      10450 non-null  int64  
 2   holiday     10450 non-null  int64  
 3   workingday  10450 non-null  int64  
 4   weather     10450 non-null  int64  
 5   temp        10450 non-null  float64
 6   atemp       10450 non-null  float64
 7   humidity    10450 non-null  int64  
 8   windspeed   10450 non-null  float64
 9   casual      10450 non-null  int64  
 10  registered  10450 non-null  int64  
 11  count       10450 non-null  int64  
dtypes: float64(3), int64(8), object(1)
memory usage: 979.8+ KB
None


In [8]:
print (train_data.describe()) #2.5 Statistical Summary of all numerical attributes

             season       holiday    workingday       weather          temp  \
count  10450.000000  10450.000000  10450.000000  10450.000000  10450.000000   
mean       2.507943      0.028804      0.675694      1.413876     20.191700   
std        1.116946      0.167263      0.468137      0.632258      7.792683   
min        1.000000      0.000000      0.000000      1.000000      0.820000   
25%        2.000000      0.000000      0.000000      1.000000     13.940000   
50%        3.000000      0.000000      1.000000      1.000000     20.500000   
75%        4.000000      0.000000      1.000000      2.000000     26.240000   
max        4.000000      1.000000      1.000000      4.000000     41.000000   

              atemp      humidity     windspeed        casual    registered  \
count  10450.000000  10450.000000  10450.000000  10450.000000  10450.000000   
mean      23.605793     61.924211     12.765259     35.869091    154.511675   
std        8.478045     19.245193      8.102821    

In [9]:
#
train_data['datetime'] = pd.to_datetime(train_data['datetime'], format='%d/%m/%y %H:%M')
train_data['hour'] = train_data['datetime'].dt.hour
train_data['month'] = train_data['datetime'].dt.month
train_data['day_of_week'] = train_data['datetime'].dt.dayofweek

In [10]:
# Cyclical Encoding
def cyclical_encode(data, col, max_val):
    data[col + '_sin'] = np.sin(2 * np.pi * data[col] / max_val)
    data[col + '_cos'] = np.cos(2 * np.pi * data[col] / max_val)
    return data

train_data = cyclical_encode(train_data, 'hour', 24)
train_data = cyclical_encode(train_data, 'month', 12)
train_data = cyclical_encode(train_data, 'day_of_week', 7)




In [11]:
# Dropping unnecessary columns
train_data.drop(['datetime', 'hour', 'month', 'day_of_week'], axis=1, inplace=True)

X = train_data.drop(['count', 'casual', 'registered'], axis=1)
y = train_data['count']

In [12]:
#slpit the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
# Preprocessing Pipelines
numeric_features = ['temp', 'atemp', 'humidity', 'windspeed']
categorical_features = ['season', 'holiday', 'workingday', 'weather',
                        'hour_sin', 'hour_cos', 'month_sin', 'month_cos', 'day_of_week_sin', 'day_of_week_cos']

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [14]:
# Define models to train
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Lasso Regression': Lasso(alpha=0.1),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
}

In [18]:
results = {}

for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_val)
    
    r2 = r2_score(y_val, y_pred)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mae = mean_absolute_error(y_val, y_pred)
    rmsle_val = rmsle(y_val, y_pred)
    
    results[name] = {
        'R2': r2,
        'RMSE': rmse,
        'MAE': mae,
        'RMSLE': rmsle_val
    }

results

{'Linear Regression': {'R2': 0.6293401333921633,
  'RMSE': np.float64(110.26924683193789),
  'MAE': 79.50503102326867,
  'RMSLE': np.float64(1.0601753808741279)},
 'Ridge Regression': {'R2': 0.629326641206827,
  'RMSE': np.float64(110.2712537386879),
  'MAE': 79.5072320784514,
  'RMSLE': np.float64(1.0605145236345501)},
 'Lasso Regression': {'R2': 0.6293319141177547,
  'RMSE': np.float64(110.27046941935002),
  'MAE': 79.50668151277337,
  'RMSLE': np.float64(1.0621548420768656)},
 'Gradient Boosting': {'R2': 0.7200003679805664,
  'RMSE': np.float64(95.83968022012009),
  'MAE': 67.82731010199765,
  'RMSLE': np.float64(0.8593062167203266)},
 'Random Forest': {'R2': 0.8624644422996276,
  'RMSE': np.float64(67.16981035057444),
  'MAE': 44.904140510366815,
  'RMSLE': np.float64(0.5328749014296794)}}