In our previous experiments, we found that the StackingRegressor gave us pretty solid results, with a Train $R^2$ score of 91.33% and a Test $R^2$ score of 90.77% on the raw data (the scores were lower when we used the processed data with feature engineering). In this notebook, we’re going to run some new experiments to see whether changing the final estimator in the StackingRegressor can give us even better results.

## Imports

In [1]:
import os
import sys
# Put the parent of the current working directory on the import path so that
# the project-local `src` package imported further down can be resolved.
cwd_parent = os.path.join(os.getcwd(), os.pardir)
parent_dir = os.path.abspath(cwd_parent)
sys.path.append(parent_dir)

In [15]:
import warnings
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import xgboost as xgb

import mlflow

from src.utils import r2_and_adjusted_r2_score as score, root_mean_squared_error as rmse
from src.mlflow_util import setup_mlflow_experiment as setup_exp

# Ignore every warning for the rest of the session (presumably to keep the
# cell output readable).
# NOTE(review): this is a blanket filter, so any warning raised by the
# libraries while fitting the models below is hidden as well.
warnings.filterwarnings('ignore')

## Read Data

In [3]:
# Load the raw insurance dataset from <parent_dir>/data/raw/insurance.csv.
raw_data_dir = os.path.join(parent_dir, 'data', 'raw')
insurance_csv = os.path.join(raw_data_dir, 'insurance.csv')
df = pd.read_csv(insurance_csv)

## MLFlow Setup

In [4]:
# Name and free-text description of the MLflow experiment that every run in
# this notebook is logged under.
exp_name = 'Experiments Age Stratified on Raw Data'
exp_description = (
    'The goal is to predict the PremiumPrice based on the given features.\n'
    'Data is splitted using the Age feature to ensure that the distribution of Age is similar in both train and test sets.\n'
    'There are no engineered features in this experiment.\n'
)

# `setup_exp` is a project helper (src.mlflow_util); it returns an object that
# exposes `experiment_id` -- presumably it creates or fetches the experiment,
# confirm in the helper.
experiment = setup_exp(exp_name, exp_description)
experiment_id = experiment.experiment_id

# Make this experiment the active one for the runs started below.
mlflow.set_experiment(experiment_id=experiment_id)

<Experiment: artifact_location=('file:///Users/wasimmujawar/Desktop/Case '
 'Study/Insaurance/mlruns/116828091641320024'), creation_time=1735710435805, experiment_id='116828091641320024', last_update_time=1735710435805, lifecycle_stage='active', name='Experiments Age Stratified on Raw Data', tags={'mlflow.note.content': 'The goal is to predict the PremiumPrice based on the '
                        'given features.\n'
                        'Data is splitted using the Age feature to ensure that '
                        'the distribution of Age is similar in both train and '
                        'test sets.\n'
                        'There are no engineered features in this '
                        'experiment.\n'}>

## Train Test Split

In [5]:
# Bucket Age into four left-closed bands -- [18, 25), [25, 40), [40, 55) and
# [55, inf) -- and use the band as the stratification key for the split.
# NOTE(review): an Age below 18 would fall outside every band; verify the data
# has none.
age_labels = ['Young Adult', 'Adult', 'Middle Aged Adults', 'Senior']
age_bins = [18, 25, 40, 55, np.inf]
age_category = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

# PremiumPrice is the target; every other column is a feature.
features = df.drop(columns=['PremiumPrice'])
target = df['PremiumPrice']

# 80/20 split with a fixed seed, stratified on the age band.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=age_category,
    random_state=42,
)

## Preprocessing

In [6]:
# Columns that are standardised before modelling.
numeric_columns = ['Age', 'Height', 'Weight']

# Columns that are handed to the model unchanged.
# NOTE(review): 'NumberOfMajorSurgeries' is grouped with the flags here although
# its name suggests a count rather than a 0/1 value -- it is passed through
# unscaled either way.
binary_columns = [
    'Diabetes',
    'BloodPressureProblems',
    'AnyTransplants',
    'AnyChronicDiseases',
    'KnownAllergies',
    'HistoryOfCancerInFamily',
    'NumberOfMajorSurgeries',
]

# Scale the numeric columns, pass the remaining listed columns through as-is.
scale_step = ('scaler', StandardScaler(), numeric_columns)
passthrough_step = ('passthrough', 'passthrough', binary_columns)
preprocessor = ColumnTransformer(transformers=[scale_step, passthrough_step])


## Experiments

In [8]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: plain linear regression.
final_estimator = LinearRegression()

with mlflow.start_run(run_name="Stacking Regressor with Linear Regression as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [10]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: a depth-2 decision tree.
final_estimator = DecisionTreeRegressor(max_depth=2)

with mlflow.start_run(run_name="Stacking Regressor with Decision Tree as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [13]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: Ridge with its default
# settings.
final_estimator = Ridge()

with mlflow.start_run(run_name="Stacking Regressor with Ridge as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [14]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: Ridge with alpha=0.5.
final_estimator = Ridge(alpha=0.5)

with mlflow.start_run(run_name="Stacking Regressor with Ridge (alpha=0.5) as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [16]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: Lasso with its default
# settings.
final_estimator = Lasso()

with mlflow.start_run(run_name="Stacking Regressor with Lasso as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [17]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: ElasticNet with its
# default penalty settings and max_iter=5000.
final_estimator = ElasticNet(max_iter=5000)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [19]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: an XGBoost regressor
# with max_depth=4 and n_estimators=80.
final_estimator = xgb.XGBRegressor(n_estimators=80, max_depth=4)

with mlflow.start_run(run_name="Stacking Regressor with XGBRegressor (depth=4) as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [21]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: a random forest with
# max_depth=4 and n_estimators=120.
final_estimator = RandomForestRegressor(n_estimators=120, max_depth=4)

with mlflow.start_run(run_name="Stacking Regressor with RandomForestRegressor (depth=4, est=120) as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [22]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: a random forest with
# max_depth=6 and n_estimators=120.
final_estimator = RandomForestRegressor(n_estimators=120, max_depth=6)

with mlflow.start_run(run_name="Stacking Regressor with RandomForestRegressor (depth=6, est=120) as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [23]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: a random forest with
# max_depth=6 and n_estimators=150.
final_estimator = RandomForestRegressor(n_estimators=150, max_depth=6)

with mlflow.start_run(run_name="Stacking Regressor with RandomForestRegressor (depth=6, est=150) as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [24]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: ElasticNet with
# alpha=0.5, l1_ratio=0.3 and max_iter=5000.
final_estimator = ElasticNet(l1_ratio=0.3, alpha=0.5, max_iter=5000)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.5, l1_ratio=0.3) as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [25]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: ElasticNet with
# alpha=0.7, l1_ratio=0.6 and max_iter=5000.
final_estimator = ElasticNet(l1_ratio=0.6, alpha=0.7, max_iter=5000)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.7, l1_ratio=0.6) as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [26]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: ElasticNet with
# alpha=0.7, l1_ratio=0.3 and max_iter=5000.
final_estimator = ElasticNet(l1_ratio=0.3, alpha=0.7, max_iter=5000)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.7, l1_ratio=0.3) as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [27]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: ElasticNet with
# alpha=0.8, l1_ratio=0.2 and max_iter=5000.
final_estimator = ElasticNet(l1_ratio=0.2, alpha=0.8, max_iter=5000)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.8, l1_ratio=0.2) as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [29]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: ElasticNet with
# alpha=0.9, l1_ratio=0.2 and max_iter=5000.
final_estimator = ElasticNet(l1_ratio=0.2, alpha=0.9, max_iter=5000)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.9, l1_ratio=0.2) as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [30]:
# Level-0 models of the stack: three gradient-boosting variants (Huber loss),
# two random forests and one XGBoost regressor.
# NOTE(review): no random_state is set on any learner, so the logged metrics
# may differ from one execution to the next.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-model fitted on the base learners' predictions: ElasticNet with
# alpha=0.8, l1_ratio=0.3 and max_iter=5000.
final_estimator = ElasticNet(l1_ratio=0.3, alpha=0.8, max_iter=5000)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.8, l1_ratio=0.3) as Final Estimator"):
    stacker = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
    pipeline = Pipeline(steps=[('preprocess', preprocessor), ('regressor', stacker)])
    pipeline.fit(X_train, y_train)

    # Evaluate the fitted pipeline on both splits.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)
    # *X.shape expands to (row count, column count), the two sizes `score` is given.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    # Input/output schema stored together with the logged model.
    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

We found that switching the final estimator to ElasticNet with alpha=0.7 and l1_ratio=0.3 gave us slightly better results. The Train $R^2$ score went up to 91.49%, and the Test $R^2$ score was 91.10%. For RMSE, we got 1822 on the Train set and 1860 on the Test set.