# Project on Seoul Bike Demand Prediction

### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns     # statistical plotting (bar plots, pair plot, heatmap) on top of matplotlib


from sklearn.model_selection import train_test_split   # splits the data into training and testing sets

from statsmodels.stats.outliers_influence import variance_inflation_factor     # VIF, used below to check multicollinearity between features

from sklearn.preprocessing import StandardScaler       # used below to standardise the feature columns


### Import Datasets

In [None]:
# Machine-specific absolute path to the CSV (raw string, so the backslashes
# are kept literally); adjust it when running the notebook elsewhere.
data_path = r"C:\Users\Asus\Desktop\College Projects\Seoul Bike Prediction Project\data\Seoul Bike Data.csv"

# NOTE(review): the "unicode_escape" codec is presumably chosen because the
# header contains non-ASCII characters (e.g. the degree sign in
# "Dew point temperature(°C)", referenced later) - confirm against the
# file's real encoding.
df_original = pd.read_csv(data_path , encoding ="unicode_escape")

In [None]:
print("Shape of the given data:" ,df_original.shape)

In [None]:
print("Top columns and rows data is :")
df_original.head()

In [None]:
print("Lower columns and rows data is:")
df_original.tail()

### Brief Informations of Datasets

In [None]:
df_original.info()

In [None]:
df_original.describe(include ="all").T

### Checking Null Values

In [None]:
df_original.isnull().any()

##### Handling datasets

Handling Date Features

In [None]:
# Parse the raw date strings before deriving calendar features.
# dayfirst=True tells pandas to read ambiguous values such as "01/12/2017" as
# 1 December rather than 12 January. Without it, day and month are swapped for
# days <= 12 (or parsing fails outright on newer pandas).
# NOTE(review): assumes the CSV stores dates as DD/MM/YYYY, as the public
# Seoul Bike dataset does - confirm against your copy of the file.
df_original["Date"] = pd.to_datetime(df_original["Date"], dayfirst=True)

# Calendar features derived from the parsed date.
df_original["Day"] = df_original["Date"].dt.day
df_original["Weekdays"] = df_original["Date"].dt.day_name()
df_original["Month"] = df_original["Date"].dt.month
df_original["Year"] = df_original["Date"].dt.year

# The raw timestamp is no longer needed once its components are extracted.
df_original.drop("Date",axis=1,inplace=True)


In [None]:
df_original.head(2)

In [None]:
df_original.info()

#### Exploratory Data Analysis (EDA)

Exploratory Data Analysis (EDA) is an approach used to analyze the data and discover trends and patterns, or to check assumptions about the data, with the help of statistical summaries and graphical representations.

In [None]:
sns.pairplot(df_original)

In [None]:
plt.figure(figsize = (6,4))
# Aggregate only the target column. A bare groupby(...).sum() also tries to
# "sum" the text columns still present in the frame (Seasons, Holiday, ...),
# which is wasted work and behaves differently across pandas versions.
monthly_rentals = df_original.groupby("Month")["Rented Bike Count"].sum().reset_index()
sns.barplot(x ="Month" , y="Rented Bike Count" , data =monthly_rentals)

In [None]:
plt.figure(figsize = (8,7))
# Total rentals per day-of-month. Only the target column is summed so the
# text columns in the frame are not dragged through the aggregation.
daily_rentals = df_original.groupby("Day")["Rented Bike Count"].sum().reset_index()
sns.barplot(x ="Day" , y="Rented Bike Count" , data =daily_rentals)

In [None]:
plt.figure(figsize = (7,6))
# Total rentals per hour of the day. Only the target column is summed so the
# text columns in the frame are not dragged through the aggregation.
hourly_rentals = df_original.groupby("Hour")["Rented Bike Count"].sum().reset_index()
sns.barplot(x ="Hour" , y="Rented Bike Count" , data =hourly_rentals)

In [None]:
plt.figure(figsize = (6,4))
sns.barplot(x ="Holiday" , y="Rented Bike Count" , data =df_original)

In [None]:
plt.figure(figsize = (6,4))
sns.barplot(x ="Seasons" , y="Rented Bike Count" , data =df_original)

In [None]:
plt.figure(figsize = (150,50))
sns.barplot(x ="Rainfall(mm)" , y="Rented Bike Count" , data =df_original)

In [None]:
# sns.displot is a figure-level function: it creates its own figure, so a
# preceding plt.figure(figsize=...) only leaves an empty canvas behind and
# its size is ignored. The plot is sized through height/aspect instead
# (width = height * aspect, in inches).
sns.displot(df_original["Rented Bike Count"], height=5, aspect=2)

##### Skewed Data

A skewed data distribution is neither symmetric nor normal, because the data values trail off more sharply on one side than on the other. If the skewness value of a feature is negative, the data is skewed towards the left (long left tail), whereas if the skewness value is positive, the data is skewed towards the right (long right tail).

Skewed data can be normalized by the following methods: Box-Cox transform, log transform, and square root transform.

In [None]:
# The frame still contains text columns (Seasons, Holiday, Functioning Day,
# Weekdays) at this point; restrict the computation to numeric columns so it
# does not fail on them. Sorted from most left-skewed to most right-skewed.
df_original.skew(numeric_only=True).sort_values(ascending=True)

##### Multicollinearity

In [None]:
# Pairwise correlation of the numeric columns only; the text columns still in
# the frame cannot be correlated and would raise on recent pandas versions.
df_original.corr(numeric_only=True)

In [None]:
plt.figure(figsize = (15,8))
# Heatmap of the numeric-column correlation matrix; numeric_only=True keeps
# the text columns still in the frame out of the computation.
sns.heatmap(df_original.corr(numeric_only=True) , annot=True , cmap ="coolwarm")

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def get_vif(df_original):
    """Return the variance inflation factor of every column in a frame.

    Parameters
    ----------
    df_original : pandas.DataFrame
        Frame whose columns are the candidate features. Its ``.values``
        array is handed to ``variance_inflation_factor`` as-is.

    Returns
    -------
    pandas.DataFrame
        One row per input column, with the column name under "Variables"
        and its VIF under "VIF".

    NOTE(review): no constant column is added before the VIF call -
    confirm that VIFs computed without an intercept are what is wanted.
    """
    feature_matrix = df_original.values
    scores = []
    for col_idx in range(df_original.shape[1]):
        scores.append(variance_inflation_factor(feature_matrix, col_idx))

    return pd.DataFrame({"Variables": df_original.columns, "VIF": scores})

In [None]:
not_for_vif = ["Day" ,"Year" , "Rented Bike Count" , "Month" ]

get_vif(df_original[[i for i in df_original.describe().columns if i not in not_for_vif]])

In [None]:
not_for_vif = ["Day" ,"Year" , "Rented Bike Count" , "Month" ,"Dew point temperature(°C)"]

get_vif(df_original[[i for i in df_original.describe().columns if i not in not_for_vif]])

In [None]:
df_original.drop("Dew point temperature(°C)",axis=1,inplace=True)

In [None]:
df_original.info()

In [None]:
df_original.head(2)

In [None]:
df_original.shape

##### Encoding

Nominal Encoding

In [None]:
df_original["Holiday"].value_counts()

In [None]:
df_original["Functioning Day"].value_counts()

In [None]:
df_original["Functioning Day"] = df_original["Functioning Day"].map({"Yes":1 ,"No":0})
df_original["Holiday"] = df_original["Holiday"].map({"Holiday":1 ,"No Holiday":0})

In [None]:
df_original.info()

One Hot Encoding

In [None]:
df_original["Seasons"].value_counts()

In [None]:
df_original["Weekdays"].value_counts()

In [None]:
df_orig = df_original.copy()

In [None]:
df_seasons = pd.get_dummies(df_orig["Seasons"] ,drop_first=False)
df_Weekdays = pd.get_dummies(df_orig["Weekdays"] ,drop_first=False)

In [None]:
df_seasons.head(2)

In [None]:
df_Weekdays.head(2)

In [None]:
df_orig.info()

In [None]:
df = pd.concat([df_orig,df_seasons ,df_Weekdays ] , axis=1)

In [None]:
df.drop(["Seasons" ,"Weekdays"],axis=1,inplace=True)

In [None]:
df.info()

In [None]:
df.shape

In [None]:
df.head(3)

In [None]:
df.columns

#### Splitting The Data for Training and Testing

In [None]:
X = df.drop("Rented Bike Count",axis=1)
Y = df["Rented Bike Count"]

from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size= 0.2 ,random_state=2023)

print("Shape of the X_train data:" , X_train.shape)
print("Shape of the y_train data:" , y_train.shape)
print("Shape of the X_test data:" , X_test.shape)
print("Shape of the y_test data:" , y_test.shape)

##### Scaling Features

Machine learning models work only on numerical values, so to avoid bias towards features with larger magnitudes we scale the features. Standard scaling rescales each numerical feature to zero mean and unit standard deviation, using the mean and standard deviation learned from the training data.

In [None]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


In [None]:
X_train

In [None]:
sc.mean_

In [None]:
sc.scale_

### Training Machine Learning Model

#### 1. Linear Regression Model

In [None]:
from sklearn.linear_model import LinearRegression
LR = LinearRegression()
LR.fit(X_train,y_train)

In [None]:
y_pred = LR.predict(X_test)
y_pred

#####  Model Evaluation 

In [None]:
from sklearn.metrics import mean_squared_error , mean_absolute_error , r2_score

In [None]:
MSE = mean_squared_error(y_test,y_pred)
RMSE = np.sqrt(MSE)
MAE = mean_absolute_error(y_test,y_pred)
R2 = r2_score(y_test,y_pred)

print("Mean squared Error of given data is :" ,MSE)
print("Root Mean squared Error of given data is :" ,RMSE)
print("Mean absolute Error of given data is :" ,MAE)
print("R2 score of given data is :" ,R2)

In [None]:
def det_metrics(y_true, y_pred, model_name):
    """Print MSE, RMSE, MAE and R2 for one model's predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned with ``y_true``.
    model_name : str
        Label inserted into each printed line.

    Returns
    -------
    None
        The metrics are only printed, rounded to three decimals.
    """
    # Score against the y_true argument rather than the notebook-level y_test,
    # so the function reports on whatever data the caller passes in.
    mse = mean_squared_error(y_true, y_pred)
    # RMSE must come from this call's own MSE. Reading the global MSE (computed
    # once for Linear Regression) made every model print the same RMSE.
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print("Mean squared Error  of",model_name, "is :", round(mse,3))
    print("Root Mean squared Error of",model_name, "is :" ,round(rmse,3))
    print("Mean absolute Error of",model_name, "is :" ,round(mae,3))
    print("R2 score of",model_name, "is :" ,round(r2,3))

In [None]:
det_metrics(y_test,y_pred, "Linear Regression")


#### 2. Training Multi Machine Learning Models

In [None]:
from sklearn.linear_model import Ridge ,Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Each model below is fitted on the scaled training data with its default
# hyper-parameters and then used to predict the scaled test data.

rid = Ridge().fit(X_train,y_train)
y_pred_Ridge =rid.predict(X_test)

las = Lasso().fit(X_train,y_train)
y_pred_Lasso = las.predict(X_test)

# Degree-2 polynomial expansion followed by ordinary least squares.
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(X_train)
# Re-use the transformer fitted on the training data; the test set must only
# be transformed, never fitted on.
X_test_poly = poly.transform(X_test)
poly_r = LinearRegression().fit(X_train_poly ,y_train)
y_pred_PolynomialFeatures = poly_r.predict(X_test_poly)

# The target is a continuous count, so the support-vector *regressor* is the
# right estimator. SVC is a classifier and would treat every distinct count
# as its own class. The variable names are kept so later cells that read
# y_pred_SVC keep working.
from sklearn.svm import SVR
svc = SVR().fit(X_train,y_train)
y_pred_SVC = svc.predict(X_test)

KNR = KNeighborsRegressor().fit(X_train,y_train)
y_pred_KNeighborsRegressor = KNR.predict(X_test)

DTR = DecisionTreeRegressor().fit(X_train,y_train)
y_pred_DecisionTreeRegressor = DTR.predict(X_test)

RFR = RandomForestRegressor().fit(X_train,y_train)
y_pred_RandomForestRegressor = RFR.predict(X_test)

XGBR = XGBRegressor().fit(X_train,y_train)
y_pred_XGBRegressor = XGBR.predict(X_test)


#### Evaluating Multi Machine Learning Models

In [None]:
det_metrics(y_test,y_pred_Ridge, "Ridge")
print("...")
det_metrics(y_test,y_pred_Lasso, "Lasso")
print("...")
det_metrics(y_test,y_pred_PolynomialFeatures, "Polynomial Features")
print("...")
det_metrics(y_test,y_pred_SVC, "SVC")
print("...")
det_metrics(y_test,y_pred_KNeighborsRegressor, "KNeighborsRegressor")
print("...")
det_metrics(y_test,y_pred_DecisionTreeRegressor, "DecisionTreeRegressor")
print("...")
det_metrics(y_test,y_pred_RandomForestRegressor, "RandomForestRegressor")
print("...")
det_metrics(y_test,y_pred_XGBRegressor, "XGBRegressor")


#### 3. Visualize The pattern of Prediction Value

In [None]:
# Create and size the figure *before* drawing. Calling plt.figure() after
# plt.show() only opens a second, empty figure and leaves this plot at the
# default size.
plt.figure(figsize =(10,5))
plt.scatter(y_test,y_pred)
plt.xlabel("Ground Truth")
plt.ylabel("Prediction")
plt.title("Linear Regression Tested VS Prediction Plot")
plt.show()

In [None]:
plt.scatter(y_test,y_pred_RandomForestRegressor)
plt.xlabel("Ground Truth")
plt.ylabel("Prediction")
plt.title("Random Forest Regressor Tested VS Prediction Plot")
plt.show()

In [None]:
plt.scatter(y_test,y_pred_XGBRegressor)
plt.xlabel("Ground Truth")
plt.ylabel("Prediction")
plt.title("XG Boost Regressor Tested VS Prediction Plot")
plt.show()

### Hyper Parameters Tuning for Random Forest Regressor Model

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in the random forest
n_estimators =[int(x) for x in np.linspace(start =200,stop=2000,num=10)]

# Number of features considered at each split.
# 1.0 means "all features", which is what "auto" used to mean for
# RandomForestRegressor. The "auto" string itself is no longer accepted by
# recent scikit-learn releases and makes every such candidate fail to fit.
max_features = [1.0,"sqrt"]

# Maximum number of levels in each decision tree
max_depth = [int(x) for x in np.linspace(10,120,num=12)]

# Minimum number of samples required to split an internal node
min_samples_split = [2,5,10]

# Minimum number of samples required to be at a leaf node
min_samples_leaf = [1,2,4]

# Whether each tree is trained on a bootstrap sample (True) or on the whole
# training set (False)
bootstrap = [True,False]

# Search space handed to RandomizedSearchCV
Random_grid = { "n_estimators": n_estimators , "max_features":max_features , "max_depth" :max_depth ,
              "min_samples_split" :min_samples_split , "min_samples_leaf" : min_samples_leaf , "bootstrap" :bootstrap}

In [None]:
import time
start_time = time.time()

RFR = RandomForestRegressor()
RFR_random = RandomizedSearchCV( estimator = RFR,
    param_distributions =Random_grid,
    n_iter=100,
    n_jobs=-1,
    cv=3,
    verbose=2,
    random_state=42)
RFR_random.fit(X_train,y_train)
y_pred_RFR_random = RFR_random.predict(X_test)

print("Time taken by the system for training the data through Randomized Search CV :" ,time.time()-start_time)

In [None]:
det_metrics(y_test,y_pred_RFR_random, "Random Forest Regressor Fine Tune")


In [None]:
RFR_random.best_params_

In [None]:
RFR_tuned = RandomForestRegressor(n_estimators = 800 ,max_depth =100,
              min_samples_split =2 ,  min_samples_leaf=1 ,bootstrap=True)
RFR_tuned.fit(X_train,y_train)
y_pred_RFR_tuned = RFR_tuned.predict(X_test)

In [None]:
det_metrics(y_test,y_pred_RFR_tuned, "Random Forest Regressor Fine Tune With Best Parameters")


In [None]:
!pip install --upgrade scikit-learn

### Hyper Parameters Tuning for XG Boost Regressor Model

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import time
start_time = time.time()

n_estimators =  [int(x) for x in np.linspace(start =200,stop=2000,num=10)]
max_depth =  [int(x) for x in np.linspace(10,120,num=12)]
colsample_bytree = np.arange(0.4,1.0,0.1)
colsample_bylevel =  np.arange(0.4,1.0,0.1)
subsample = np.arange(0.5,1.0,0.1)
learning_rate = [0.01,0.1,0.2,0.3]

params = { "n_estimators": n_estimators , "colsample_bytree":colsample_bytree , "max_depth" :max_depth ,
              "colsample_bylevel" :colsample_bylevel , "subsample" : subsample , "learning_rate" :learning_rate}

XGBR = XGBRegressor(seed=20)

RSCV = RandomizedSearchCV( estimator = XGBR,
    param_distributions =params,
    n_iter=25,
    cv=5,
    verbose=1,
    scoring = 'neg_mean_squared_error')


RSCV.fit(X_train,y_train)
y_pred_XGB_random= RSCV.predict(X_test)

print("Time taken by the system for training the XG Boost Model through Randomized Search CV :" ,time.time()-start_time)

det_metrics(y_test,y_pred_XGB_random, "XG Boost Regressor With Best Parameters")

print("Best Parameters are :", RSCV.best_params_)



In [None]:
xgbr_tune = XGBRegressor(subsample = 0.5,
                          n_estimators = 1200, max_depth =  90, learning_rate =  0.01, 
                          colsample_bytree =  0.8999999999999999, 
                          colsample_bylevel = 0.8999999999999999)
xgbr_tune.fit(X_train,y_train)
y_pred_xgbr_tune = xgbr_tune.predict(X_test)

det_metrics(y_test,y_pred_xgbr_tune, "XG Boost Regressor Fine Tune With Best Parameters")


### Save ML Model

In [None]:
import pickle
import os

# Machine-specific output directory for the trained model. Named model_dir so
# the builtin dir() is not shadowed for the rest of the notebook.
model_dir = r"C:\Users\Asus\Desktop\College Projects\Seoul Bike Prediction Project\models"
model_file_name = "XGBoost_Regressor_r2_0_929_v1.pkl"

model_file_path = os.path.join(model_dir , model_file_name)

# The context manager guarantees the file is flushed and closed even if
# pickling raises; a bare open(...) passed to pickle.dump is never closed.
with open(model_file_path , "wb") as model_file:
    pickle.dump(xgbr_tune , model_file)

In [None]:
X_test[0 ,:]

In [None]:
X_test[1,:]

In [None]:
X_test[3,:]


In [None]:
y_test

### Dumping Scaling Features Parameters

In [None]:
# Machine-specific destination for the fitted StandardScaler, so the same
# scaling can be re-applied to new inputs before prediction.
sc_dump_path = r"C:\Users\Asus\Desktop\College Projects\Seoul Bike Prediction Project\models\sc.pkl"

# The context manager guarantees the file is flushed and closed even if
# pickling raises; a bare open(...) passed to pickle.dump is never closed.
with open(sc_dump_path ,"wb") as sc_file:
    pickle.dump(sc , sc_file)

## Different Algorithm Analysis and Comparison

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Regression Models to Compare ---
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# You can add more, e.g., from xgboost import XGBRegressor, from lightgbm import LGBMRegressor

# --- Hypothetical Data Loading and Preprocessing (Replace with your actual data) ---
# This part simulates having your preprocessed data ready, similar to the end
# of your 'Seoul Bike Sharing Demand Prediction.ipynb' notebook.
# You would load your 'Seoul Bike Data.csv', perform all cleaning, feature engineering,
# and one-hot encoding, and then separate features (X) and target (y).

# For demonstration, let's create some dummy data that resembles your problem
# In a real scenario, X would have 24 features and y would be 'Rented Bike Count'
np.random.seed(42)
num_samples = 1000
num_features = 24
X = pd.DataFrame(np.random.rand(num_samples, num_features), columns=[f'feature_{i}' for i in range(num_features)])
y = pd.Series(np.random.randint(0, 3000, num_samples)) # Dummy bike counts

# If you want to use your actual preprocessed DataFrame from your first notebook:
# df_processed = pd.read_csv('your_preprocessed_data.csv') # Load your clean data
# X = df_processed.drop('Rented Bike Count', axis=1) # Adjust target column name if different
# y = df_processed['Rented Bike Count']

# Splitting data into training and testing sets
# It's important to use the same split for all models for fair comparison
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling numerical features (if not already done in your preprocessing pipeline)
# Ensure StandardScaler is fitted ONLY on training data and then transform both train/test
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame for easier inspection and plotting (optional, but good practice)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)


# --- 1. Define Algorithms to Test ---
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(random_state=42), # Added random_state for reproducibility
    "Decision Tree Regressor": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting Regressor": GradientBoostingRegressor(n_estimators=100, random_state=42),
    # "XGBoost Regressor": XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42),
    # "LightGBM Regressor": LGBMRegressor(n_estimators=100, random_state=42)
}

# --- Store Results ---
results = pd.DataFrame(columns=['Model', 'R-squared', 'MAE', 'MSE', 'RMSE'])

# --- 2. Train, Evaluate, and Visualize Each Model ---
print("--- Model Training, Evaluation, and Visualization ---")

for name, model in models.items():
    print(f"\n--- Training {name} ---")

    # Train the model
    model.fit(X_train_scaled, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test_scaled)

    # --- 3. Calculate Evaluation Metrics ---
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    print(f"{name} Performance:")
    print(f"  R-squared (R2): {r2:.4f}")
    print(f"  Mean Absolute Error (MAE): {mae:.2f}")
    print(f"  Mean Squared Error (MSE): {mse:.2f}")
    print(f"  Root Mean Squared Error (RMSE): {rmse:.2f}")

    # Store results
    results.loc[len(results)] = [name, r2, mae, mse, rmse]

    # --- 4. Error Checking Visualization ---

    # Plotting Actual vs. Predicted Values
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2) # Diagonal line
    plt.xlabel("Actual Bike Count")
    plt.ylabel("Predicted Bike Count")
    plt.title(f"{name}: Actual vs. Predicted Bike Count")
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.show()

    # Plotting Residuals (Prediction Errors)
    residuals = y_test - y_pred
    plt.figure(figsize=(10, 6))
    sns.scatterplot(x=y_pred, y=residuals, alpha=0.6)
    plt.axhline(y=0, color='r', linestyle='--', lw=2) # Zero error line
    plt.xlabel("Predicted Bike Count")
    plt.ylabel("Residuals (Actual - Predicted)")
    plt.title(f"{name}: Residuals Plot")
    plt.grid(True, linestyle='--', alpha=0.7)
    plt.show()

    # Distribution of Residuals
    plt.figure(figsize=(8, 5))
    sns.histplot(residuals, kde=True, bins=30)
    plt.xlabel("Residuals")
    plt.ylabel("Frequency")
    plt.title(f"{name}: Distribution of Residuals")
    plt.show()

print("\n--- Summary of All Models ---")
print(results.set_index('Model'))

# Optional: Visualize overall performance comparison
# R-squared is at most 1 while MAE and RMSE are expressed in target units, so
# on a shared y-axis the R-squared bars are effectively invisible. Each metric
# therefore gets its own panel with its own scale.
results_melted = results.melt(id_vars='Model', var_name='Metric', value_name='Value')
# `results` was filled row by row starting from an empty frame, so the metric
# column may be object-typed; make it numeric before plotting.
results_melted['Value'] = results_melted['Value'].astype(float)

metrics_to_plot = ['R-squared', 'MAE', 'RMSE']
fig, axes = plt.subplots(1, len(metrics_to_plot), figsize=(18, 6))
for ax, metric in zip(axes, metrics_to_plot):
    sns.barplot(x='Model', y='Value', data=results_melted[results_melted['Metric'] == metric], ax=ax)
    ax.set_title(metric)
    ax.set_xlabel('')
    ax.set_ylabel('Metric Value')
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.suptitle('Model Performance Comparison')
fig.tight_layout()
plt.show()