## 0. Importing Packages and Load Datasets

In [1]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
#data visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'jupyterlab'

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from category_encoders import TargetEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error as mse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
import joblib
from sklearn.compose import ColumnTransformer

In [4]:
#summary tools for enhanced EDA
!pip install summarytools
from summarytools import dfSummary

#ignore warnings
import warnings
warnings.simplefilter(action='ignore', category = FutureWarning)




[notice] A new release of pip is available: 24.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import sys
import os

# Add the src directory to the Python path
sys.path.append('E:/OneDrive - UTS/Sem 4 (Spring 2024)/Advanced ML Applications/Assignment 3/adv_mla_at3/src')

from model_evaluation import evaluate_model

In [6]:
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

In [7]:
#load data
folder_path = r"E:/OneDrive - UTS/Sem 4 (Spring 2024)/Advanced ML Applications/Assignment 3/adv_mla_at3/data/interim/"

# Load the datasets
df_train = pd.read_csv(f"{folder_path}df_train_farhan.csv")
df_val = pd.read_csv(f"{folder_path}df_val_farhan.csv")
df_test = pd.read_csv(f"{folder_path}df_test_farhan.csv")

In [8]:
df_train.head()

Unnamed: 0,startingAirport,destinationAirport,totalFare,Travel_distance,n_stops,cabin_Leg1,cabin_Leg2,cabin_Leg3,cabin_Leg4,month,day,hour,minute,day_of_week,week_of_year,date_diff
0,OAK,ATL,307.6,2175.0,1,coach,coach,no_stop,no_stop,4,17,20,50,6,15,1
1,SFO,BOS,493.1,2566.0,1,coach,coach,no_stop,no_stop,4,26,15,30,1,17,10
2,SFO,CLT,247.6,3111.0,1,coach,coach,no_stop,no_stop,4,27,5,15,2,17,10
3,SFO,CLT,279.6,2753.0,1,coach,coach,no_stop,no_stop,4,27,6,30,2,17,10
4,SFO,CLT,279.6,2557.0,1,coach,coach,no_stop,no_stop,4,27,6,59,2,17,10


In [9]:
# Define the target variable
target = 'totalFare'

# Separate features and target for training set
X_train = df_train.drop(columns=[target])
y_train = df_train[target]

# Separate features and target for validation set
X_val = df_val.drop(columns=[target])
y_val = df_val[target]

# Separate features and target for test set
X_test = df_test.drop(columns=[target])
y_test = df_test[target]

## 2. Create Pipeline

In [10]:
cabin_order = [
    ['no_stop', 'coach', 'premium coach', 'business', 'first'],  # for Cabin_Leg1
    ['no_stop', 'coach', 'premium coach', 'business', 'first'],  # for Cabin_Leg2
    ['no_stop', 'coach', 'premium coach', 'business', 'first'],  # for Cabin_Leg3
    ['no_stop', 'coach', 'premium coach', 'business', 'first'],  # for Cabin_Leg4
]

In [11]:
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [12]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cabins', OrdinalEncoder(categories=cabin_order), ['cabin_Leg1', 'cabin_Leg2', 'cabin_Leg3', 'cabin_Leg4']), 
        ('ohe', OneHotEncoder(), ['startingAirport', 'destinationAirport']), 
        ('standard', StandardScaler(), ['date_diff', 'month', 'day','hour','minute', 'day_of_week','week_of_year','Travel_distance'])
    ],
    remainder='drop'  
)

## 3. Build Predictive Models

### Baseline Model

In [13]:
# Calculate the mean of y_train
y_train_mean = y_train.mean()

# filling mean value of y_train
y_train_base = np.full(y_train.shape, y_train_mean)
y_val_base = np.full(y_val.shape, y_train_mean)

# Calculate RMSE for training and validation sets
rmse_train_base = np.sqrt(mse(y_train, y_train_base))
rmse_val_base = np.sqrt(mse(y_val, y_val_base))

print(f'Baseline Model RMSE - Training: {rmse_train_base}')
print(f'Baseline Model RMSE - Validation: {rmse_val_base}')

Baseline Model RMSE - Training: 190.93478568036284
Baseline Model RMSE - Validation: 209.3618702689759


### Linear regression with default params

In [14]:
linear_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor), # Step 1: preprocess features
    ('model', LinearRegression()), # Step 2: train model
])

In [15]:
linear_pipeline.fit(X_train, y_train)

In [16]:
from model_evaluation import evaluate_model

In [17]:
evaluate_model(linear_pipeline, X_train, y_train, X_val, y_val)

Training RMSE: 132.1884688783526
Validation RMSE: 135.69273091988293


(np.float64(132.1884688783526), np.float64(135.69273091988293))

We can see that the RMSE for training and validation is pretty close with slight overfitting. The model is performing better than the baseline model.

### Ridge regression with Default params

In [18]:
pred_pipe_ridge1 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', Ridge()) 
    ]
)

In [19]:
# Fit the model
pred_pipe_ridge1.fit(X_train, y_train)

In [20]:
evaluate_model(pred_pipe_ridge1, X_train, y_train, X_val, y_val)

Training RMSE: 132.18840614866411
Validation RMSE: 135.69478019719216


(np.float64(132.18840614866411), np.float64(135.69478019719216))

### Ridge regression with different values of alpha

In [21]:
#alpha=0.5
pred_pipe_ridge2 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', Ridge(alpha=0.5)) 
    ]
)

In [22]:
# Fit the model
pred_pipe_ridge2.fit(X_train, y_train)

In [23]:
evaluate_model(pred_pipe_ridge2, X_train, y_train, X_val, y_val)

Training RMSE: 132.18840568826676
Validation RMSE: 135.6944185057815


(np.float64(132.18840568826676), np.float64(135.6944185057815))

In [24]:
#alpha=1
pred_pipe_ridge3 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', Ridge(alpha=1)) 
    ]
)
# Fit the model
pred_pipe_ridge3.fit(X_train, y_train)
evaluate_model(pred_pipe_ridge3, X_train, y_train, X_val, y_val)

Training RMSE: 132.18840614866411
Validation RMSE: 135.69478019719216


(np.float64(132.18840614866411), np.float64(135.69478019719216))

It seems that tuning the hyperparams is not effective. 

### Lasso regression with different values of alpha

In [25]:
#default params
pred_pipe_lasso1 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', Lasso()) 
    ]
)

In [26]:
# Fit the model
pred_pipe_lasso1.fit(X_train, y_train)

In [27]:
evaluate_model(pred_pipe_lasso1, X_train, y_train, X_val, y_val)

Training RMSE: 133.90851969237076
Validation RMSE: 138.35249729772855


(np.float64(133.90851969237076), np.float64(138.35249729772855))

We can see that the RMSE values are higher than ridge and linear regression

In [28]:
#alpha=0.5
pred_pipe_lasso2 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', Lasso(alpha=0.5)) 
    ]
)

# Fit the model
pred_pipe_lasso2.fit(X_train, y_train)

evaluate_model(pred_pipe_lasso2, X_train, y_train, X_val, y_val)

Training RMSE: 132.84151989141756
Validation RMSE: 136.71496013079422


(np.float64(132.84151989141756), np.float64(136.71496013079422))

We can see that the linear regression models perform pretty much the same even after tuning the hyperaparams. Lets try elastic net and conclude linear regression models.

### ElasticNet with default params and tuning hyperparams

In [29]:
#default params
pred_pipe_enet1 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', ElasticNet()) 
    ]
)

# Fit the model
pred_pipe_enet1.fit(X_train, y_train)

evaluate_model(pred_pipe_enet1, X_train, y_train, X_val, y_val)

Training RMSE: 145.2233697212354
Validation RMSE: 155.33606630848067


(np.float64(145.2233697212354), np.float64(155.33606630848067))

In [30]:
#alpha = 0.5
pred_pipe_enet2 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', ElasticNet(alpha=0.5)) 
    ]
)

# Fit the model
pred_pipe_enet2.fit(X_train, y_train)

evaluate_model(pred_pipe_enet2, X_train, y_train, X_val, y_val)

Training RMSE: 140.0136612815762
Validation RMSE: 147.89648665954093


(np.float64(140.0136612815762), np.float64(147.89648665954093))

In [31]:
#alpha = 0.1
pred_pipe_enet3 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', ElasticNet(alpha=0.1)) 
    ]
)

# Fit the model
pred_pipe_enet3.fit(X_train, y_train)

evaluate_model(pred_pipe_enet3, X_train, y_train, X_val, y_val)

Training RMSE: 134.31006081918378
Validation RMSE: 139.11020460852097


(np.float64(134.31006081918378), np.float64(139.11020460852097))

In [32]:
#alpha = 0.01
pred_pipe_enet4 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', ElasticNet(alpha=0.01)) 
    ]
)

# Fit the model
pred_pipe_enet4.fit(X_train, y_train)

evaluate_model(pred_pipe_enet4, X_train, y_train, X_val, y_val)

Training RMSE: 132.39389032838437
Validation RMSE: 136.10588424799286


(np.float64(132.39389032838437), np.float64(136.10588424799286))

In [33]:
#alpha = 0
pred_pipe_enet5 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', ElasticNet(alpha=0)) 
    ]
)

# Fit the model
pred_pipe_enet5.fit(X_train, y_train)

evaluate_model(pred_pipe_enet5, X_train, y_train, X_val, y_val)

Training RMSE: 132.19868883130138
Validation RMSE: 135.79455915823684


(np.float64(132.19868883130138), np.float64(135.79455915823684))

In [34]:
#alpha = 0, l1_ratio=0.2
pred_pipe_enet6 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', ElasticNet(alpha=0, l1_ratio=0.2)) 
    ]
)

# Fit the model
pred_pipe_enet6.fit(X_train, y_train)

evaluate_model(pred_pipe_enet6, X_train, y_train, X_val, y_val)

Training RMSE: 132.19868883130138
Validation RMSE: 135.79455915823684


(np.float64(132.19868883130138), np.float64(135.79455915823684))

In [35]:
#alpha = 0, l1_ratio=0.8
pred_pipe_enet7 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', ElasticNet(alpha=0, l1_ratio=0.8)) 
    ]
)

# Fit the model
pred_pipe_enet7.fit(X_train, y_train)

evaluate_model(pred_pipe_enet7, X_train, y_train, X_val, y_val)

Training RMSE: 132.19868883130138
Validation RMSE: 135.79455915823684


(np.float64(132.19868883130138), np.float64(135.79455915823684))

### Stochastic Gradient Descent with default params

In [36]:
#default params
pred_pipe_sgd1 = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', SGDRegressor()) 
    ]
)

In [37]:
# Fit the model
pred_pipe_sgd1.fit(X_train, y_train)

In [38]:
#evaluate
evaluate_model(pred_pipe_sgd1, X_train, y_train, X_val, y_val)

Training RMSE: 132.2546766458262
Validation RMSE: 135.83575255610938


(np.float64(132.2546766458262), np.float64(135.83575255610938))

In [40]:
# Test different values of alpha
alpha_values = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1]

# Loop through each alpha value and evaluate
for alpha in alpha_values:
    print(f"Testing alpha = {alpha}")
    
    # Define pipeline with current alpha
    pred_pipe_sgd = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', SGDRegressor(alpha=alpha)) 
        ]
    )
    
    # Fit the model
    pred_pipe_sgd.fit(X_train, y_train)
    
    # Evaluate the model
    evaluate_model(pred_pipe_sgd, X_train, y_train, X_val, y_val)
    print("\n")

Testing alpha = 1e-05
Training RMSE: 132.22560709260006
Validation RMSE: 136.10956218281473


Testing alpha = 0.0001
Training RMSE: 132.21903318862215
Validation RMSE: 135.82843636968488


Testing alpha = 0.001
Training RMSE: 132.26715786470882
Validation RMSE: 135.9315517093249


Testing alpha = 0.01
Training RMSE: 132.63278430197207
Validation RMSE: 136.31444193279904


Testing alpha = 0.1
Training RMSE: 135.95008909385783
Validation RMSE: 141.6904750451238




In [41]:
#test different penalties
penalties = ['l2', 'l1', 'elasticnet']

for penalty in penalties:
    print(f"Testing penalty = {penalty}")
    
    # Define pipeline with current penalty
    pred_pipe_sgd = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', SGDRegressor(alpha=0.0001, penalty=penalty))
        ]
    )
    
    # Fit the model
    pred_pipe_sgd.fit(X_train, y_train)
    
    # Evaluate the model
    evaluate_model(pred_pipe_sgd, X_train, y_train, X_val, y_val)
    print("\n")


Testing penalty = l2
Training RMSE: 132.20275749358893
Validation RMSE: 135.63029001765915


Testing penalty = l1
Training RMSE: 132.21923314069804
Validation RMSE: 135.75968447741582


Testing penalty = elasticnet
Training RMSE: 132.21747745850473
Validation RMSE: 135.80170207203537




The values are pretty similar. L2 regularisation performs sligthly better.

In [42]:
# Test different learning rate schedules
learning_rate_options = ['constant', 'optimal', 'invscaling', 'adaptive']

# Loop through each learning rate option and evaluate
for learning_rate in learning_rate_options:
    print(f"Testing learning_rate = {learning_rate}")
    
    # Define pipeline with current learning rate
    pred_pipe_sgd = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', SGDRegressor(alpha=0.0001, penalty='l2', learning_rate=learning_rate, eta0=0.01))  
        ]
    )
    
    # Fit the model
    pred_pipe_sgd.fit(X_train, y_train)
    
    # Evaluate the model
    evaluate_model(pred_pipe_sgd, X_train, y_train, X_val, y_val)
    print("\n")


Testing learning_rate = constant
Training RMSE: 135.1396424704589
Validation RMSE: 138.4117566526487


Testing learning_rate = optimal
Training RMSE: 49917.08048167512
Validation RMSE: 49869.270245249056


Testing learning_rate = invscaling
Training RMSE: 132.23705813511145
Validation RMSE: 135.7439695312428


Testing learning_rate = adaptive
Training RMSE: 132.1926712432447
Validation RMSE: 135.74411056071548




### Decision Trees with Default Params

In [43]:
pred_pipe_dt1 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor()) 
])

In [44]:
# Fit the model
pred_pipe_dt1.fit(X_train, y_train)

In [45]:
#evaluate
evaluate_model(pred_pipe_dt1, X_train, y_train, X_val, y_val)

Training RMSE: 22.58844937409714
Validation RMSE: 78.24718696652543


(np.float64(22.58844937409714), np.float64(78.24718696652543))

We can see that RMSE is comparatively lower for both training and validation set but there is high overfitting

### Decision Trees with hyperparam tuning

In [46]:
#max depth = 10

pred_pipe_dt2 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=10)) 
])
# Fit the model
pred_pipe_dt2.fit(X_train, y_train)

#evaluate
evaluate_model(pred_pipe_dt2, X_train, y_train, X_val, y_val)

Training RMSE: 113.38474990395608
Validation RMSE: 118.68477927857009


(np.float64(113.38474990395608), np.float64(118.68477927857009))

We can see performance decreased but level of overfitting also decreased.

In [47]:
#max depth = 5

pred_pipe_dt3 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=5)) 
])
# Fit the model
pred_pipe_dt3.fit(X_train, y_train)

#evaluate
evaluate_model(pred_pipe_dt3, X_train, y_train, X_val, y_val)

Training RMSE: 130.17662327439294
Validation RMSE: 135.12532419994446


(np.float64(130.17662327439294), np.float64(135.12532419994446))

In [48]:
#max depth = 20

pred_pipe_dt4 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=20)) 
])
# Fit the model
pred_pipe_dt4.fit(X_train, y_train)

#evaluate
evaluate_model(pred_pipe_dt4, X_train, y_train, X_val, y_val)

Training RMSE: 67.60497825750893
Validation RMSE: 86.86541466962673


(np.float64(67.60497825750893), np.float64(86.86541466962673))

In [49]:
#max depth = 15

pred_pipe_dt5 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=15)) 
])
# Fit the model
pred_pipe_dt5.fit(X_train, y_train)

#evaluate
evaluate_model(pred_pipe_dt5, X_train, y_train, X_val, y_val)

Training RMSE: 92.8376065702093
Validation RMSE: 101.192805257074


(np.float64(92.8376065702093), np.float64(101.192805257074))

In [50]:
#max depth = 20, min sample split =5

pred_pipe_dt6 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=20, min_samples_split=5)) 
])
# Fit the model
pred_pipe_dt6.fit(X_train, y_train)

#evaluate
evaluate_model(pred_pipe_dt6, X_train, y_train, X_val, y_val)

Training RMSE: 67.90746036818518
Validation RMSE: 86.77095128864121


(np.float64(67.90746036818518), np.float64(86.77095128864121))

In [51]:
#max depth = 20, min sample split =3

pred_pipe_dt7 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=20, min_samples_split=3)) 
])
# Fit the model
pred_pipe_dt7.fit(X_train, y_train)

#evaluate
evaluate_model(pred_pipe_dt7, X_train, y_train, X_val, y_val)

Training RMSE: 67.68593612090274
Validation RMSE: 86.88811243789824


(np.float64(67.68593612090274), np.float64(86.88811243789824))

In [52]:
#max depth = 20, min sample leaf=5

pred_pipe_dt8 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=20, min_samples_leaf=5)) 
])
# Fit the model
pred_pipe_dt8.fit(X_train, y_train)

#evaluate
evaluate_model(pred_pipe_dt8, X_train, y_train, X_val, y_val)

Training RMSE: 69.37787258027419
Validation RMSE: 86.53108286000206


(np.float64(69.37787258027419), np.float64(86.53108286000206))

In [53]:
#max depth = 20, min sample leaf=10

pred_pipe_dt9 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(max_depth=20, min_samples_leaf=10)) 
])
# Fit the model
pred_pipe_dt9.fit(X_train, y_train)

#evaluate
evaluate_model(pred_pipe_dt9, X_train, y_train, X_val, y_val)

Training RMSE: 71.16697430378665
Validation RMSE: 86.90696467747979


(np.float64(71.16697430378665), np.float64(86.90696467747979))

In [59]:
from sklearn.model_selection import RandomizedSearchCV

# Updated hyperparameter space
param_dist = {
    'model__max_depth': np.arange(10, 20),  
    'model__min_samples_split': np.arange(3, 10), 
    'model__min_samples_leaf': np.arange(2, 10),  
    'model__max_features': ['auto', 'sqrt', 'log2', None],  
}

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    pred_pipe_dt,
    param_distributions=param_dist,
    n_iter=20,  # Increase iterations for broader search
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the model with RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get best hyperparameters
print("Best Hyperparameters: ", random_search.best_params_)

# Evaluate the model with the best parameters
best_model = random_search.best_estimator_
evaluate_model(best_model, X_train, y_train, X_val, y_val)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters:  {'model__min_samples_split': np.int64(6), 'model__min_samples_leaf': np.int64(2), 'model__max_features': None, 'model__max_depth': np.int64(16)}
Training RMSE: 88.16149186272659
Validation RMSE: 97.94693373059903


(np.float64(88.16149186272659), np.float64(97.94693373059903))

In [60]:
pred_pipe_dt10 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeRegressor(
        max_depth=16,
        min_samples_split=6,
        min_samples_leaf=2,
        max_features=None
    )) 
])


In [61]:
# Fit the model
pred_pipe_dt10.fit(X_train, y_train)

In [62]:
# Evaluate
evaluate_model(pred_pipe_dt10, X_train, y_train, X_val, y_val)

Training RMSE: 88.1613811154386
Validation RMSE: 97.87739506860783


(np.float64(88.1613811154386), np.float64(97.87739506860783))

### LightGBM with default params

In [14]:
from lightgbm import LGBMRegressor

In [65]:
# Define the pipeline for LightGBM
pred_pipe_lgb1 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor()) 
])

# Fit the model
pred_pipe_lgb1.fit(X_train, y_train)

# Evaluate
evaluate_model(pred_pipe_lgb1, X_train, y_train, X_val, y_val)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080315 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 518
[LightGBM] [Info] Number of data points in the train set: 2179090, number of used features: 32
[LightGBM] [Info] Start training from score 406.416774
Training RMSE: 99.95759812628029
Validation RMSE: 105.5008810263406


(np.float64(99.95759812628029), np.float64(105.5008810263406))

### LightGBM with hyperparam tuning

In [67]:
# Define the pipeline for max_depth = 3
pred_pipe_lgb3 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=3, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb3.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb3, X_train, y_train, X_val, y_val)

Training RMSE: 117.25057459386773
Validation RMSE: 123.77298594355135


(np.float64(117.25057459386773), np.float64(123.77298594355135))

In [68]:
# Define the pipeline for max_depth = 10
pred_pipe_lgb4 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb4.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb4, X_train, y_train, X_val, y_val)

Training RMSE: 100.00592080704253
Validation RMSE: 105.6725968630844


(np.float64(100.00592080704253), np.float64(105.6725968630844))

In [69]:
# Define the pipeline for max_depth = 20
pred_pipe_lgb5 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=20, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb5.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb5, X_train, y_train, X_val, y_val)

Training RMSE: 99.95759812628029
Validation RMSE: 105.5008810263406


(np.float64(99.95759812628029), np.float64(105.5008810263406))

In [70]:
# Define the pipeline for max_depth = 30
pred_pipe_lgb6 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=30, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb6.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb6, X_train, y_train, X_val, y_val)

Training RMSE: 99.95759812628029
Validation RMSE: 105.5008810263406


(np.float64(99.95759812628029), np.float64(105.5008810263406))

In [74]:
# Define the pipeline for max_depth = 10, num_leaves=10
pred_pipe_lgb7 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=10, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb7.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb7, X_train, y_train, X_val, y_val)

Training RMSE: 112.10079684615883
Validation RMSE: 118.38209882401932


(np.float64(112.10079684615883), np.float64(118.38209882401932))

In [75]:
# Define the pipeline for max_depth = 10, num_leaves=20
pred_pipe_lgb8 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=20, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb8.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb8, X_train, y_train, X_val, y_val)

Training RMSE: 104.44799726322395
Validation RMSE: 110.12850604828141


(np.float64(104.44799726322395), np.float64(110.12850604828141))

In [76]:
# Define the pipeline for max_depth = 10, num_leaves=40
pred_pipe_lgb9 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=40, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb9.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb9, X_train, y_train, X_val, y_val)

Training RMSE: 97.44776577452724
Validation RMSE: 103.01643202212234


(np.float64(97.44776577452724), np.float64(103.01643202212234))

In [77]:
# Define the pipeline for max_depth = 10, num_leaves=60
pred_pipe_lgb10 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=60, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb10.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb10, X_train, y_train, X_val, y_val)

Training RMSE: 93.1216276418754
Validation RMSE: 98.62673673397016


(np.float64(93.1216276418754), np.float64(98.62673673397016))

In [78]:
# Define the pipeline for max_depth = 10, num_leaves=100
pred_pipe_lgb11 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=100, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb11.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb11, X_train, y_train, X_val, y_val)

Training RMSE: 88.37984373880451
Validation RMSE: 94.3336935126416


(np.float64(88.37984373880451), np.float64(94.3336935126416))

In [15]:
# Define the pipeline for max_depth = 10, num_leaves=200
pred_pipe_lgb12 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=200, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb12.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb12, X_train, y_train, X_val, y_val)

Training RMSE: 82.55027138969388
Validation RMSE: 88.90851383094437


(np.float64(82.55027138969388), np.float64(88.90851383094437))

In [80]:
# Define the pipeline for max_depth = 10, num_leaves=500
pred_pipe_lgb13 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=500, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb13.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb13, X_train, y_train, X_val, y_val)

Training RMSE: 77.4604020324159
Validation RMSE: 85.02689765931542


(np.float64(77.4604020324159), np.float64(85.02689765931542))

In [81]:
# Define the pipeline for max_depth = 10, num_leaves=1000
pred_pipe_lgb14 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=1000, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb14.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb14, X_train, y_train, X_val, y_val)

Training RMSE: 77.16406749657247
Validation RMSE: 84.59084718191315


(np.float64(77.16406749657247), np.float64(84.59084718191315))

In [82]:
# Define the pipeline for max_depth = 10, num_leaves=500, n_estimators=50
pred_pipe_lgb15 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=500,n_estimators=50, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb15.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb15, X_train, y_train, X_val, y_val)

Training RMSE: 86.12570891552119
Validation RMSE: 93.05822915522144


(np.float64(86.12570891552119), np.float64(93.05822915522144))

In [83]:
# Define the pipeline for max_depth = 10, num_leaves=500, n_estimators=200
pred_pipe_lgb16 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=500,n_estimators=200, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb16.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb16, X_train, y_train, X_val, y_val)

Training RMSE: 69.54936413762088
Validation RMSE: 78.42171228667546


(np.float64(69.54936413762088), np.float64(78.42171228667546))

In [84]:
# Define the pipeline for max_depth = 10, num_leaves=500, n_estimators=500
pred_pipe_lgb17 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=500,n_estimators=500, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb17.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb17, X_train, y_train, X_val, y_val)

Training RMSE: 59.72277900127957
Validation RMSE: 71.98845087680142


(np.float64(59.72277900127957), np.float64(71.98845087680142))

In [85]:
# Define the pipeline for max_depth = 10, num_leaves=500, n_estimators=1000
pred_pipe_lgb18 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=500,n_estimators=1000, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb18.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb18, X_train, y_train, X_val, y_val)

Training RMSE: 53.21263625558726
Validation RMSE: 68.64855551371751


(np.float64(53.21263625558726), np.float64(68.64855551371751))

In [86]:
# Define the pipeline for max_depth = 10, num_leaves=500, n_estimators=500, learning_rate=0.5
pred_pipe_lgb19 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=500,n_estimators=500, learning_rate=0.5, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb19.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb19, X_train, y_train, X_val, y_val)

Training RMSE: 45.701484281808014
Validation RMSE: 70.81562636921109


(np.float64(45.701484281808014), np.float64(70.81562636921109))

In [87]:
# Define the pipeline for max_depth = 10, num_leaves=500, n_estimators=500, learning_rate=0.01
pred_pipe_lgb20 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=500,n_estimators=500, learning_rate=0.01, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb20.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb20, X_train, y_train, X_val, y_val)

Training RMSE: 86.72034310332103
Validation RMSE: 93.65285508051899


(np.float64(86.72034310332103), np.float64(93.65285508051899))

In [88]:
# Define the pipeline for max_depth = 10, num_leaves=500, n_estimators=500, learning_rate=0.2
pred_pipe_lgb21 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(max_depth=10,num_leaves=500,n_estimators=500, learning_rate=0.2, verbose=-1))
])

# Fit and evaluate the model
pred_pipe_lgb21.fit(X_train, y_train)
evaluate_model(pred_pipe_lgb21, X_train, y_train, X_val, y_val)

Training RMSE: 52.68777944777853
Validation RMSE: 68.84388988188884


(np.float64(52.68777944777853), np.float64(68.84388988188884))

In [89]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

# Define the hyperparameter search space with only the selected parameters
space = {
    'max_depth': hp.choice('max_depth', range(3, 21)),            # max_depth from 3 to 20
    'num_leaves': hp.choice('num_leaves', range(100, 601, 50)),   # num_leaves from 100 to 600
    'n_estimators': hp.choice('n_estimators', range(100, 501, 50)), # n_estimators from 100 to 500
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.5),      # learning_rate from 0.01 to 0.5
}

# Objective function
def objective(params):
    # Define the pipeline with LightGBM
    pred_pipe_lgb = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', LGBMRegressor(**params, verbose=-1))
    ])
    
    # Fit the model on training data
    pred_pipe_lgb.fit(X_train, y_train)
    
    # Calculate RMSE on the validation set
    val_preds = pred_pipe_lgb.predict(X_val)
    val_rmse = mean_squared_error(y_val, val_preds, squared=False)
    
    # Return the RMSE as the loss (Hyperopt minimizes the loss)
    return {'loss': val_rmse, 'status': STATUS_OK}

# Track the trials for optimization history
trials = Trials()

# Run the optimization
best = fmin(
    fn=objective,                # Objective function
    space=space,                 # Hyperparameter space
    algo=tpe.suggest,            # Optimization algorithm
    max_evals=30,                # Number of evaluations
    trials=trials,               # To store results of each trial
    rstate=np.random.default_rng(42)  # Seed for reproducibility
)

# Display the best hyperparameters found
print("Best Hyperparameters:", best)


100%|█████████████████████████████████████████████████| 30/30 [09:17<00:00, 18.58s/trial, best loss: 68.07363540897721]
Best Hyperparameters: {'learning_rate': np.float64(0.23970446078327823), 'max_depth': np.int64(16), 'n_estimators': np.int64(8), 'num_leaves': np.int64(7)}


In [90]:
# Define the pipeline with the best hyperparameters
pred_pipe_lgb22 = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(
        learning_rate=0.23970446078327823,
        max_depth=16,
        n_estimators=8,
        num_leaves=7,
        verbose=-1
    ))
])

# Fit the model
pred_pipe_lgb22.fit(X_train, y_train)

# Evaluate
evaluate_model(pred_pipe_lgb22, X_train, y_train, X_val, y_val)

Training RMSE: 133.40972937731874
Validation RMSE: 141.70672429597283


(np.float64(133.40972937731874), np.float64(141.70672429597283))

The hyperopt is giving pretty high RMSE values. lets try random search

In [91]:
from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMRegressor
import numpy as np

# Define the pipeline for LightGBM
pred_pipe_lgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LGBMRegressor(verbose=-1))
])

# Define hyperparameter search space for LightGBM
param_dist = {
    'model__max_depth': np.arange(10, 20),           # Range for max depth
    'model__num_leaves': np.arange(100, 600, 50),    # Range for num leaves
    'model__learning_rate': np.linspace(0.01, 0.5, 10),  # Learning rate from 0.01 to 0.5
    'model__n_estimators': np.arange(100, 500, 50),  # Number of estimators
    'model__min_child_samples': np.arange(5, 20),    # Minimum data in child
    'model__subsample': np.linspace(0.6, 1.0, 5),    # Subsample ratio
    'model__colsample_bytree': np.linspace(0.6, 1.0, 5),  # Feature fraction
}

# Setup RandomizedSearchCV with the LightGBM pipeline
random_search = RandomizedSearchCV(
    pred_pipe_lgb,
    param_distributions=param_dist,
    n_iter=20,                        # Number of random combinations to try
    scoring='neg_mean_squared_error',  # Scoring metric
    cv=5,                              # Cross-validation folds
    verbose=2,                         # Verbosity level for logging
    random_state=42,                   # Seed for reproducibility
    n_jobs=-1                          # Use all available processors
)

# Fit the model with RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

# Evaluate the model with the best parameters
best_model = random_search.best_estimator_
evaluate_model(best_model, X_train, y_train, X_val, y_val)


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Hyperparameters: {'model__subsample': np.float64(0.8), 'model__num_leaves': np.int64(500), 'model__n_estimators': np.int64(300), 'model__min_child_samples': np.int64(10), 'model__max_depth': np.int64(16), 'model__learning_rate': np.float64(0.2822222222222222), 'model__colsample_bytree': np.float64(0.9)}
Training RMSE: 51.599393420732504
Validation RMSE: 69.88507591054231


(np.float64(51.599393420732504), np.float64(69.88507591054231))

#### We will chose the lightgbm model 12 and try it on the test set since overfitting was less compared to other model.

In [16]:
 obs = pd.DataFrame(X_train.iloc[3]).transpose()

In [17]:
pred_pipe_lgb12.predict(obs)

array([346.82295459])

In [18]:
obs

Unnamed: 0,startingAirport,destinationAirport,Travel_distance,n_stops,cabin_Leg1,cabin_Leg2,cabin_Leg3,cabin_Leg4,month,day,hour,minute,day_of_week,week_of_year,date_diff
3,SFO,CLT,2753.0,1,coach,coach,no_stop,no_stop,4,27,6,30,2,17,10


In [19]:
y_test_pred = pred_pipe_lgb12.predict(X_test)

In [20]:
mse_test = mse(y_test, y_test_pred)

In [21]:
rmse_test = np.sqrt(mse_test)

In [22]:
print(f'Test RMSE: {rmse_test}')

Test RMSE: 100.44675542882612


In [23]:
from joblib import dump
dump(pred_pipe_lgb12, 'E:/OneDrive - UTS/Sem 4 (Spring 2024)/Advanced ML Applications/Assignment 3/adv_mla_at3/models/pred_pipe_lgb12.joblib')

['E:/OneDrive - UTS/Sem 4 (Spring 2024)/Advanced ML Applications/Assignment 3/adv_mla_at3/models/pred_pipe_lgb12.joblib']