In [2]:
!wget https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
!unzip bike+sharing+dataset.zip

--2024-08-27 13:09:38--  https://archive.ics.uci.edu/static/public/275/bike+sharing+dataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bike+sharing+dataset.zip’

bike+sharing+datase     [  <=>               ] 273.43K   962KB/s    in 0.3s    

2024-08-27 13:09:38 (962 KB/s) - ‘bike+sharing+dataset.zip’ saved [279992]

Archive:  bike+sharing+dataset.zip
  inflating: Readme.txt              
  inflating: day.csv                 
  inflating: hour.csv                


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
import matplotlib.pyplot as plt

In [4]:
df = pd.read_csv('hour.csv')
df

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


In [5]:
# Data preprocessing
df['day_night'] = df['hr'].apply(lambda x: 'day' if 6 <= x <= 18 else 'night')
df.drop(['instant', 'casual', 'registered'], axis=1, inplace=True)
df['dteday'] = pd.to_datetime(df.dteday)
df['season'] = df.season.astype('category')
df['holiday'] = df.holiday.astype('category')
df['weekday'] = df.weekday.astype('category')
df['weathersit'] = df.weathersit.astype('category')
df['workingday'] = df.workingday.astype('category')
df['mnth'] = df.mnth.astype('category')
df['yr'] = df.yr.astype('category')
df['hr'] = df.hr.astype('category')
df.drop(columns=['dteday'], inplace=True)

In [6]:
# Creating interaction features
df['temp_hum'] = df['temp'] * df['hum']
df['hum_windspeed'] = df['hum'] * df['windspeed']

In [7]:
# Separating features and target variable
X = df.drop(columns=['cnt'])
y = df['cnt']

In [8]:
!pip install category_encoders




In [9]:
numerical_features = ['temp', 'hum', 'windspeed', 'temp_hum', 'hum_windspeed']
categorical_features = ['season', 'weathersit', 'day_night']

# Pipelining
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler())
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_enc', TargetEncoder())
])

# Preprocessing pipeline
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features)
])

# Training the LinearRegression model
lr_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('model', LinearRegression())
])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
lr_pipeline.fit(X_train, y_train)


In [11]:
# Print the sizes and ratios
print(f"Original dataset size: {df.shape[0]} rows")
print(f"Training data size: {X_train.shape[0]} rows")
print(f"Test data size: {X_test.shape[0]} rows")
print(f"Training data ratio: {X_train.shape[0] / df.shape[0]:.2f}")
print(f"Test data ratio: {X_test.shape[0] / df.shape[0]:.2f}")


Original dataset size: 17379 rows
Training data size: 13903 rows
Test data size: 3476 rows
Training data ratio: 0.80
Test data ratio: 0.20


In [12]:
y_pred_lr = lr_pipeline.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f'LinearRegression Mean Squared Error: {mse_lr}')
print(f'LinearRegression R-squared: {r2_lr}')

LinearRegression Mean Squared Error: 19526.297769102166
LinearRegression R-squared: 0.38335562542902035


### Linear Regressor
Compare their performance using metrics like Mean Squared Error (MSE)
and R-squared. **bold text**

In [1]:
pip install category_encoders


Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression as SklearnLinearRegression

class LinearRegressionCustom:
    def __init__(self, learning_rate=0.0001, n_iters=1000, lambda_reg=0.1):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.lambda_reg = lambda_reg
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        for i in range(self.n_iters):
            y_predicted = np.dot(X, self.weights) + self.bias
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y)) + (self.lambda_reg * self.weights)
            db = (1 / n_samples) * np.sum(y_predicted - y)
            dw = np.clip(dw, -1e5, 1e5)
            db = np.clip(db, -1e5, 1e5)

            self.weights -= self.lr * dw
            self.bias -= self.lr * db

            if i % (self.n_iters // 10) == 0:
                loss = np.mean((y_predicted - y) ** 2) + (self.lambda_reg / 2) * np.sum(self.weights ** 2)
                print(f"Iteration {i}: Loss = {loss}")

    def predict(self, X):
        return np.dot(X, self.weights) + self.bias

df = pd.read_csv('hour.csv')
df['day_night'] = df['hr'].apply(lambda x: 'day' if 6 <= x <= 18 else 'night')
df.drop(['instant', 'casual', 'registered'], axis=1, inplace=True)
df['dteday'] = pd.to_datetime(df.dteday)
df['season'] = df.season.astype('category')
df['holiday'] = df.holiday.astype('category')
df['weekday'] = df.weekday.astype('category')
df['weathersit'] = df.weathersit.astype('category')
df['workingday'] = df.workingday.astype('category')
df['mnth'] = df.mnth.astype('category')
df['yr'] = df.yr.astype('category')
df['hr'] = df.hr.astype('category')
df.drop(columns=['dteday'], inplace=True)
df['temp_hum'] = df['temp'] * df['hum']
df['hum_windspeed'] = df['hum'] * df['windspeed']

X = df.drop(columns=['cnt'])
y = df['cnt']

numerical_features = ['temp', 'hum', 'windspeed', 'temp_hum', 'hum_windspeed']
categorical_features = ['season', 'weathersit', 'day_night']

numerical_pipeline = Pipeline([
    ('scaler', StandardScaler())
])

categorical_data = X[categorical_features]
numerical_data = X[numerical_features]

target_encoder = TargetEncoder()
categorical_encoded = target_encoder.fit_transform(categorical_data, y)

numerical_scaled = numerical_pipeline.fit_transform(numerical_data)
X_processed = np.hstack([numerical_scaled, categorical_encoded.values])


X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

model_custom = LinearRegressionCustom(learning_rate=0.00001, n_iters=1000, lambda_reg=0.1)
model_custom.fit(X_train, y_train)
y_pred_custom = model_custom.predict(X_test)


if np.any(np.isnan(y_pred_custom)):
    print("Warning: NaNs detected in custom model predictions.")

model_sklearn = SklearnLinearRegression()
model_sklearn.fit(X_train, y_train)
y_pred_sklearn = model_sklearn.predict(X_test)

if np.any(np.isnan(y_pred_sklearn)):
    print("Warning: NaNs detected in sklearn model predictions.")


mse_custom = mean_squared_error(y_test, y_pred_custom)
mse_sklearn = mean_squared_error(y_test, y_pred_sklearn)
r2_custom = r2_score(y_test, y_pred_custom)
r2_sklearn = r2_score(y_test, y_pred_sklearn)

print("Scratch Linear Regression Model Mean Squared Error:", mse_custom)
print("Linear Regression Model Mean Squared Error:", mse_sklearn)
print("Scratch Linear Regression Model R-squared:", r2_custom)
print("Linear Regression Model R-squared:", r2_sklearn)


Iteration 0: Loss = 69521.56772310472
Iteration 100: Loss = 25322.17862105599
Iteration 200: Loss = 25265.908527440628
Iteration 300: Loss = 25252.627617530157
Iteration 400: Loss = 25242.34679738787
Iteration 500: Loss = 25232.296597997196
Iteration 600: Loss = 25222.284752099396
Iteration 700: Loss = 25212.297803941943
Iteration 800: Loss = 25202.33474616277
Iteration 900: Loss = 25192.395449180665
Scratch Linear Regression Model Mean Squared Error: 24291.793406638746
Linear Regression Model Mean Squared Error: 19526.31625469389
Scratch Linear Regression Model R-squared: 0.23286032357105857
Linear Regression Model R-squared: 0.38335504165035006
