# Advanced Modeling for MPG Forecasting
After establishing a baseline model using simple linear regression, it's essential to explore more advanced modeling techniques to enhance prediction accuracy and account for complex patterns in the data. While baseline models offer a solid starting point, they often rely on strong assumptions—such as linear relationships and absence of multicollinearity—which may not fully capture the nuances of real-world data like fuel efficiency (MPG).

In this stage, we introduce a series of increasingly sophisticated models, including:
1. Linear Regression
2. Ridge Regression
3. Random Forest
4. Gradient Boosting
5. XGBoost

The tree-based ensemble models in this list can capture non-linear relationships and interactions between variables, and they cope well with high-dimensional feature spaces, whereas Linear and Ridge Regression remain linear in the features. Regularization techniques like Ridge Regression help mitigate overfitting, while ensemble methods such as Random Forest and Gradient Boosting combine the strengths of multiple learners to improve generalization. XGBoost, a highly optimized gradient boosting algorithm, often delivers state-of-the-art performance in structured data problems like this.

By comparing the performance of these models against the baseline, we aim to identify the most reliable and interpretable approach for accurately forecasting miles per gallon, ultimately supporting more informed decisions in vehicle design and policy planning.

# Import Libraries

In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# data handling and visualization libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# model libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import shap

# Load and Prepare Data For Modeling

## Load Data

In [2]:
# read the preprocessed MPG dataset from disk
mpg_csv_path = "output/mpg_cleaned.csv"
df = pd.read_csv(mpg_csv_path)

## Split the dataset into input features (x) and target variable (y)

In [3]:
# target variable: the mpg column
y = df['mpg']

# features: every column except the target
x = df.drop(columns='mpg')

In [4]:
# preview the feature matrix (every column of df except mpg)
x

Unnamed: 0,cylinders,horsepower,weight,car_age,origin_japan,origin_usa
0,8,130.0,3504,55,False,True
1,8,165.0,3693,55,False,True
2,8,150.0,3436,55,False,True
3,8,150.0,3433,55,False,True
4,8,140.0,3449,55,False,True
...,...,...,...,...,...,...
387,4,86.0,2790,43,False,True
388,4,52.0,2130,43,False,False
389,4,84.0,2295,43,False,True
390,4,79.0,2625,43,False,True


In [5]:
# preview the target series (the mpg column of df)
y

0      18.0
1      15.0
2      18.0
3      16.0
4      17.0
       ... 
387    27.0
388    44.0
389    32.0
390    28.0
391    31.0
Name: mpg, Length: 392, dtype: float64

## Split the dataset into training and testing sets

In [6]:
# hold out 20% of the rows for testing (80% train); the fixed seed keeps the
# split the same on every run
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

# Advanced Modeling

In [7]:
# untuned estimators keyed by display name; every estimator that takes a
# random_state receives the same seed
seed = 42

models = {}
models["Linear Regression"] = LinearRegression()
models["Ridge Regression"] = Ridge(random_state=seed)
models["Random Forest"] = RandomForestRegressor(random_state=seed)
models["Gradient Boosting Regressor"] = GradientBoostingRegressor(random_state=seed)
models["XGBoost"] = XGBRegressor(random_state=seed, enable_categorical=True)

## Hyperparameter Grids and Bayesian Search Space

In [8]:
# discrete hyperparameter grids, keyed by the same names as `models`;
# used by both the grid search and the randomized search
param_grids = {}

# ordinary least squares: only the intercept is toggled
param_grids["Linear Regression"] = dict(
    fit_intercept=[True, False],
    positive=[False],
)

# ridge: nine regularization strengths from 1e-4 to 1e4, crossed with every solver
param_grids["Ridge Regression"] = dict(
    alpha=np.logspace(-4, 4, num=9),
    solver=['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
)

# random forest: forest size, tree depth and split/leaf size limits
param_grids["Random Forest"] = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# gradient boosting: number of stages, shrinkage and tree shape
param_grids["Gradient Boosting Regressor"] = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# xgboost: number of rounds, shrinkage, depth and column/row subsampling
param_grids["XGBoost"] = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    colsample_bytree=[0.6, 0.8, 1.0],
    subsample=[0.6, 0.8, 1.0],
)

In [9]:
# search spaces for the Bayesian search, keyed by the same names as `models`
bayes_spaces = {}

# ordinary least squares: only the intercept is toggled
bayes_spaces["Linear Regression"] = dict(
    fit_intercept=Categorical([True, False]),
    positive=Categorical([False]),
)

# ridge: regularization strength sampled on a log scale, any solver
bayes_spaces["Ridge Regression"] = dict(
    alpha=Real(1e-4, 1e4, prior='log-uniform'),
    solver=Categorical(['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']),
)

# random forest: integer ranges for forest size, depth and split/leaf sizes
bayes_spaces["Random Forest"] = dict(
    n_estimators=Integer(50, 150),
    max_depth=Integer(5, 30),
    min_samples_split=Integer(2, 10),
    min_samples_leaf=Integer(1, 4),
)

# gradient boosting: learning rate sampled on a log scale
bayes_spaces["Gradient Boosting Regressor"] = dict(
    n_estimators=Integer(50, 200),
    learning_rate=Real(0.01, 0.3, prior='log-uniform'),
    max_depth=Integer(3, 10),
    min_samples_split=Integer(2, 10),
    min_samples_leaf=Integer(1, 4),
)

# xgboost: learning rate on a log scale plus column/row subsampling fractions
bayes_spaces["XGBoost"] = dict(
    n_estimators=Integer(50, 150),
    learning_rate=Real(0.01, 0.3, prior='log-uniform'),
    max_depth=Integer(3, 10),
    colsample_bytree=Real(0.5, 1.0),
    subsample=Real(0.5, 1.0),
)

## Training Models with Hyperparameter Tuning

In [None]:
# exhaustive grid search: one fitted GridSearchCV per model, keyed by model name
grid_search = {}

for name, estimator in models.items():
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grids[name],
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
    )
    grid_search[name] = search

    # tune on the training split only; the test split stays untouched
    search.fit(x_train, y_train)

In [11]:
# train models using random search: one fitted RandomizedSearchCV per model,
# keyed by model name, sampling candidates from the same grids as grid search
random_search = {}

for model in models:
    random_search[model] = RandomizedSearchCV(
        models[model],
        param_grids[model],
        # NOTE: per the scikit-learn docs, a grid with fewer than n_iter
        # combinations (Linear Regression has only 2) is evaluated in full
        n_iter=20,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
        # seed the candidate sampling so reruns select the same
        # hyperparameters, matching the seed used for the split and estimators
        random_state=42,
    )

    # tune on the training split only; the test split stays untouched
    random_search[model].fit(x_train, y_train)

In [12]:
# train models using bayesian search: one fitted BayesSearchCV per model,
# keyed by model name, exploring the ranges defined in bayes_spaces
bayes_search = {}

for model in models:
    bayes_search[model] = BayesSearchCV(
        models[model],
        bayes_spaces[model],
        n_iter=30,
        scoring='neg_mean_squared_error',
        cv=5,
        n_jobs=-1,
        # seed the optimizer so reruns explore the same candidates, matching
        # the seed used for the split and estimators
        random_state=42,
    )

    # tune on the training split only; the test split stays untouched
    bayes_search[model].fit(x_train, y_train)

# Cross Validation

## Aggregate Best Estimators from Hyperparameter Tuning

In [14]:
# list of best models: the following cells append one
# {"Model": label, "Estimator": fitted estimator} dict per model and search method
best_models = []

In [15]:
# add the winning estimator of every grid search, labelled with its search method
best_models.extend([
    {
        "Model": f"{name} (Grid Search)",
        "Estimator": grid_search[name].best_estimator_,
    }
    for name in models
])

In [16]:
# add the winning estimator of every random search, labelled with its search method
best_models.extend([
    {
        "Model": f"{name} (Random Search)",
        "Estimator": random_search[name].best_estimator_,
    }
    for name in models
])

In [17]:
# add the winning estimator of every bayesian search, labelled with its search method
best_models.extend([
    {
        "Model": f"{name} (Bayesian Search)",
        "Estimator": bayes_search[name].best_estimator_,
    }
    for name in models
])

## Evaluate Best Models Using Cross-Validation

In [None]:
# one row of averaged cross-validation metrics per tuned model
evaluation_results = []

# scorers requested from cross_validate; the two error scorers are the
# negated ("neg_") variants, so their sign is flipped back below
cv_metrics = ['neg_mean_squared_error', 'neg_mean_absolute_error', 'r2']

for model in best_models:
    model_name, estimator = model["Model"], model["Estimator"]

    # 5-fold cross-validation on the training data only
    scores = cross_validate(estimator, x_train, y_train, cv=5, scoring=cv_metrics)

    # average each metric over the folds
    row = {"Model": model_name}
    row["CV MSE"] = -scores['test_neg_mean_squared_error'].mean()
    row["CV MAE"] = -scores['test_neg_mean_absolute_error'].mean()
    row["CV R2"] = scores['test_r2'].mean()
    evaluation_results.append(row)

In [19]:
# gather the per-model metric rows into a table and show it
evaluation_df = pd.DataFrame(data=evaluation_results)

evaluation_df

Unnamed: 0,Model,CV MSE,CV MAE,CV R2
0,Linear Regression (Grid Search),11.777357,2.627755,0.812156
1,Ridge Regression (Grid Search),11.744147,2.621644,0.812975
2,Random Forest (Grid Search),9.334242,2.205254,0.850436
3,Gradient Boosting Regressor (Grid Search),8.732826,2.105042,0.859043
4,XGBoost (Grid Search),8.708989,2.145665,0.859703
5,Linear Regression (Random Search),11.777357,2.627755,0.812156
6,Ridge Regression (Random Search),11.744147,2.621644,0.812975
7,Random Forest (Random Search),9.334242,2.205254,0.850436
8,Gradient Boosting Regressor (Random Search),8.732826,2.105042,0.859043
9,XGBoost (Random Search),9.085606,2.215971,0.852686
