In our previous experiments, the StackingRegressor gave solid results: a Train $R^2$ score of 91.33% and a Test $R^2$ score of 90.77% on the raw data (the scores were lower when we used the processed data with feature engineering). In this notebook, we run new experiments to see whether changing the final estimator in the StackingRegressor can improve these results further.

## Imports

In [55]:
import os
import sys
# Directory one level above the notebook's working directory; presumably the
# project root that holds the `src` package imported further down.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Re-running this cell must not keep stacking duplicate entries on sys.path.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [56]:
import warnings
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import xgboost as xgb

import mlflow

from src.utils import r2_and_adjusted_r2_score as score, root_mean_squared_error as rmse
from src.mlflow_util import setup_mlflow_experiment as setup_exp

# Suppress every warning category to keep the notebook output compact.
# NOTE(review): this blanket filter also hides any convergence or deprecation
# warnings the estimators below may emit — consider narrowing it.
warnings.filterwarnings('ignore')

## Read Data

In [57]:
# The untouched source data lives under <parent_dir>/data/raw.
raw_data_dir = os.path.join(parent_dir, 'data', 'raw')
insurance_csv_path = os.path.join(raw_data_dir, 'insurance.csv')

# Load the insurance table used by every cell below.
df = pd.read_csv(insurance_csv_path)

## MLFlow Setup

In [58]:
exp_name = 'Experiments Age Stratified on Raw Data'

# Free-text note attached to the experiment (three newline-terminated lines).
exp_description = (
    'The goal is to predict the PremiumPrice based on the given features.\n'
    'Data is splitted using the Age feature to ensure that the distribution of Age is similar in both train and test sets.\n'
    'There are no engineered features in this experiment.\n'
)

# The project helper returns the experiment object (presumably creating it when
# it does not exist yet); its id is then used to activate the experiment.
experiment = setup_exp(exp_name, exp_description)
experiment_id = experiment.experiment_id

mlflow.set_experiment(experiment_id=experiment_id)

<Experiment: artifact_location=('file:///Users/wasimmujawar/Desktop/Case '
 'Study/Insaurance/mlruns/116828091641320024'), creation_time=1735710435805, experiment_id='116828091641320024', last_update_time=1735710435805, lifecycle_stage='active', name='Experiments Age Stratified on Raw Data', tags={'mlflow.note.content': 'The goal is to predict the PremiumPrice based on the '
                        'given features.\n'
                        'Data is splitted using the Age feature to ensure that '
                        'the distribution of Age is similar in both train and '
                        'test sets.\n'
                        'There are no engineered features in this '
                        'experiment.\n'}>

## Train Test Split

In [59]:
# Left-closed age bands [18, 25), [25, 40), [40, 55), [55, inf) and their labels.
age_labels = ['Young Adult', 'Adult', 'Middle Aged Adults', 'Senior']
age_bins = [18, 25, 40, 55, np.inf]

# The band of each row is only used as the stratification key for the split.
age_category = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=False)

# 80/20 split that keeps the age-band proportions alike in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['PremiumPrice']),
    df['PremiumPrice'],
    test_size=0.2,
    random_state=42,
    stratify=age_category,
)

## Preprocessing

In [60]:
# Columns that are standardised before modelling.
numeric_columns = ['Age', 'Height', 'Weight']

# Columns forwarded to the model unchanged.
# NOTE(review): 'NumberOfMajorSurgeries' is grouped with the flag columns here
# although its name suggests a count — confirm this is intended.
binary_columns = [
    'Diabetes',
    'BloodPressureProblems',
    'AnyTransplants',
    'AnyChronicDiseases',
    'KnownAllergies',
    'HistoryOfCancerInFamily',
    'NumberOfMajorSurgeries',
]

# Scale the numeric block, pass the rest through; unlisted columns are not named here.
preprocessor = ColumnTransformer(transformers=[
    ('scaler', StandardScaler(), numeric_columns),
    ('passthrough', 'passthrough', binary_columns),
])


## Experiments

In [61]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: linear regression with default settings.
final_estimator = LinearRegression()

with mlflow.start_run(run_name="Stacking Regressor with Linear Regression as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [62]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: a decision tree limited to depth 2.
final_estimator = DecisionTreeRegressor(max_depth=2)

with mlflow.start_run(run_name="Stacking Regressor with Decision Tree as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [63]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: Ridge regression with default settings.
final_estimator = Ridge()

with mlflow.start_run(run_name="Stacking Regressor with Ridge as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [64]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: Ridge regression with alpha=0.5.
final_estimator = Ridge(alpha=0.5)

with mlflow.start_run(run_name="Stacking Regressor with Ridge (alpha=0.5) as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [65]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: Lasso regression with default settings.
final_estimator = Lasso()

with mlflow.start_run(run_name="Stacking Regressor with Lasso as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [66]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: ElasticNet with default penalties and
# max_iter set to 5000.
final_estimator = ElasticNet(max_iter=5000)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [67]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: XGBoost with 80 trees of depth 4.
final_estimator = xgb.XGBRegressor(n_estimators=80, max_depth=4)

with mlflow.start_run(run_name="Stacking Regressor with XGBRegressor (depth=4) as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [68]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: random forest with 120 trees of depth 4.
final_estimator = RandomForestRegressor(n_estimators=120, max_depth=4)

with mlflow.start_run(run_name="Stacking Regressor with RandomForestRegressor (depth=4, est=120) as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [69]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: random forest with 120 trees of depth 6.
final_estimator = RandomForestRegressor(n_estimators=120, max_depth=6)

with mlflow.start_run(run_name="Stacking Regressor with RandomForestRegressor (depth=6, est=120) as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [70]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: random forest with 150 trees of depth 6.
final_estimator = RandomForestRegressor(n_estimators=150, max_depth=6)

with mlflow.start_run(run_name="Stacking Regressor with RandomForestRegressor (depth=6, est=150) as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [71]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: ElasticNet with alpha=0.5, l1_ratio=0.3.
final_estimator = ElasticNet(max_iter=5000, alpha=0.5, l1_ratio=0.3)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.5, l1_ratio=0.3) as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [72]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: ElasticNet with alpha=0.7, l1_ratio=0.6.
final_estimator = ElasticNet(max_iter=5000, alpha=0.7, l1_ratio=0.6)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.7, l1_ratio=0.6) as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [73]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: ElasticNet with alpha=0.7, l1_ratio=0.3.
final_estimator = ElasticNet(max_iter=5000, alpha=0.7, l1_ratio=0.3)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.7, l1_ratio=0.3) as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [74]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: ElasticNet with alpha=0.8, l1_ratio=0.2.
final_estimator = ElasticNet(max_iter=5000, alpha=0.8, l1_ratio=0.2)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.8, l1_ratio=0.2) as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [75]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: ElasticNet with alpha=0.9, l1_ratio=0.2.
final_estimator = ElasticNet(max_iter=5000, alpha=0.9, l1_ratio=0.2)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.9, l1_ratio=0.2) as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

In [76]:
# Level-0 learners of the stack: three gradient-boosting variants, two random
# forests and one XGBoost model.
base_learner = [
    ('gb1', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=100)),
    ('gb2', GradientBoostingRegressor(loss='huber', learning_rate=0.3, max_depth=5, n_estimators=80)),
    ('gb3', GradientBoostingRegressor(loss='huber', learning_rate=0.2, max_depth=4, n_estimators=120)),
    ('rd1', RandomForestRegressor(max_depth=6, n_estimators=100, n_jobs=-1)),
    ('rd2', RandomForestRegressor(max_depth=6, n_estimators=110, n_jobs=-1)),
    ('xg', xgb.XGBRegressor(n_estimators=80, max_depth=4, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1)),
]

# Meta-learner evaluated in this run: ElasticNet with alpha=0.8, l1_ratio=0.3.
final_estimator = ElasticNet(max_iter=5000, alpha=0.8, l1_ratio=0.3)

with mlflow.start_run(run_name="Stacking Regressor with ElasticNet(alpha=0.8, l1_ratio=0.3) as Final Estimator"):
    # Preprocessing and the stacked model are fitted together as one pipeline.
    pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator)),
    ])
    pipeline.fit(X_train, y_train)

    # Predictions on both splits feed every metric below.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    # `shape` unpacks to (row count, column count), the last two arguments of `score`.
    r2_train, adj_r2_train = score(y_train, y_train_pred, *X_train.shape)
    r2_test, adj_r2_test = score(y_test, y_test_pred, *X_test.shape)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    mlflow.set_tag('Model', 'Stacking Regressor')
    for metric_name, metric_value in (
        ('Train - RMSE', rmse_train),
        ('Test - RMSE', rmse_test),
        ('Train - r2 score', r2_train),
        ('Test - r2 score', r2_test),
        ('Train - Adjusted r2 score', adj_r2_train),
        ('Test - Adjusted r2 score', adj_r2_test),
    ):
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(pipeline, 'model', signature=signature)

We found that switching the final estimator to ElasticNet with alpha=0.7 and l1_ratio=0.3 gave us slightly better results. The Train $R^2$ score went up to 91.49%, and the Test $R^2$ score was 91.10%. For RMSE, we got 1822 on the Train set and 1860 on the Test set.

## Cross Validation

In [77]:
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.linear_model import Lasso, Ridge, ElasticNet

In [78]:
# Split the full frame into the target series and the predictor columns
# used by the cross-validation cells.
target_column = 'PremiumPrice'
y_data = df[target_column]
X_data = df.drop(columns=[target_column])

In [79]:
# Baseline: 5-fold R2 of a plain linear regression on the full data set.
baseline_model = LinearRegression()
result = cross_val_score(baseline_model, X_data, y_data, scoring='r2', cv=5)

result

array([0.63344721, 0.65804126, 0.64586657, 0.74747812, 0.486833  ])

In [80]:
# NOTE: in this cell the values are used as the Lasso regularisation strength
# (alpha). The list keeps its name because the ElasticNet grid further down
# reuses it as l1_ratio values.
l1_ratios = [0.02, 0.06, 0.3, 0.58, 0.7, 0.9, 1]

for lasso_alpha in l1_ratios:
    lasso = Lasso(alpha=lasso_alpha)
    result = cross_val_score(lasso, X_data, y_data, scoring='r2', cv=5)
    print(f'Score for alpha={lasso_alpha}: {result}')

Score for alpha=0.02: [0.63344752 0.65803985 0.64587338 0.74747329 0.48683642]
Score for alpha=0.06: [0.63344812 0.65803705 0.64588699 0.7474636  0.48684326]
Score for alpha=0.3: [0.63345165 0.6580201  0.64596853 0.74740543 0.48688418]
Score for alpha=0.58: [0.63345568 0.65800007 0.64606355 0.7473376  0.48693174]
Score for alpha=0.7: [0.63345726 0.65799141 0.6461042  0.74730835 0.48695206]
Score for alpha=0.9: [0.63345977 0.65797686 0.64617184 0.74725952 0.48698583]
Score for alpha=1: [0.63346098 0.65796954 0.64620495 0.74723507 0.48700268]


In [81]:
# Regularisation strengths (alpha) swept for Ridge; the same list is reused
# as the ElasticNet alpha grid in the next cell.
l2_ratios = [0.1, 0.3, 0.5, 0.7, 0.05, 0.004, 0.08]

for ridge_alpha in l2_ratios:
    ridge = Ridge(alpha=ridge_alpha)
    result = cross_val_score(ridge, X_data, y_data, scoring='r2', cv=5)
    print(f'Score for alpha={ridge_alpha}: {result}')

Score for alpha=0.1: [0.63341916 0.65799753 0.64602229 0.74732778 0.48702006]
Score for alpha=0.3: [0.63335944 0.65790898 0.6463281  0.74702647 0.48738577]
Score for alpha=0.5: [0.63329502 0.65781903 0.64662653 0.74672437 0.48774053]
Score for alpha=0.7: [0.63322605 0.65772771 0.64691773 0.74642153 0.48808463]
Score for alpha=0.05: [0.63343334 0.65801944 0.64594467 0.74740298 0.48692688]
Score for alpha=0.004: [0.63344611 0.65803951 0.64587283 0.74747211 0.48684054]
Score for alpha=0.08: [0.63342487 0.6580063  0.6459913  0.74735786 0.48698287]


In [82]:
# Exhaustive grid over ElasticNet: l1_ratios supplies the L1/L2 mixing ratio
# and l2_ratios supplies the overall penalty strength (alpha).
for mix_ratio in l1_ratios:
    for enet_alpha in l2_ratios:
        enet = ElasticNet(alpha=enet_alpha, l1_ratio=mix_ratio)
        result = cross_val_score(enet, X_data, y_data, scoring='r2', cv=5)
        print(f'Score for l1={mix_ratio}, l2={enet_alpha}: {result}')

Score for l1=0.02, l2=0.1: [0.57609822 0.61801663 0.63983662 0.66516183 0.4753094 ]
Score for l1=0.02, l2=0.3: [0.53450044 0.58517519 0.60958008 0.61376933 0.44724569]
Score for l1=0.02, l2=0.5: [0.51916339 0.57168918 0.59615832 0.59461835 0.43629592]
Score for l1=0.02, l2=0.7: [0.51104615 0.56425359 0.58867856 0.58443202 0.43046386]
Score for l1=0.02, l2=0.05: [0.60093661 0.63531848 0.65209066 0.69607539 0.48948862]
Score for l1=0.02, l2=0.004: [0.63209708 0.65654282 0.64989444 0.7427585  0.49146758]
Score for l1=0.02, l2=0.08: [0.58466579 0.62415099 0.64468109 0.67573062 0.48054839]
Score for l1=0.06, l2=0.1: [0.57772275 0.61919439 0.6407984  0.66716249 0.47632328]
Score for l1=0.06, l2=0.3: [0.5359021  0.5863702  0.61075415 0.61551339 0.4482364 ]
Score for l1=0.06, l2=0.5: [0.52027749 0.57269364 0.59716703 0.5960132  0.437096  ]
Score for l1=0.06, l2=0.7: [0.51196403 0.56510448 0.58953559 0.58558592 0.43112295]
Score for l1=0.06, l2=0.05: [0.60221893 0.63617575 0.6525524  0.69770876

Let's use some of the tried-and-tested models with better performance from the `05-1_PremiumPrice_Prediction_RawData_StratifiedAge.ipynb` notebook.

In [83]:
# 5-fold cross-validation of the gradient boosting model, reporting R2 and
# (negated) RMSE on both the training and the held-out folds.
model = GradientBoostingRegressor(
    loss='huber',
    learning_rate=0.1,
    max_depth=5,
    n_estimators=110,
)

cv_results = cross_validate(
    estimator=model,
    X=X_data,
    y=y_data,
    scoring=('r2', 'neg_root_mean_squared_error'),
    cv=5,
    return_train_score=True,
)

cv_results

{'fit_time': array([0.30188489, 0.35745621, 0.30556989, 0.29974699, 0.27579904]),
 'score_time': array([0.00221992, 0.00156188, 0.00156879, 0.00168586, 0.0013938 ]),
 'test_r2': array([0.86348225, 0.74466964, 0.78499819, 0.91582313, 0.67049145]),
 'train_r2': array([0.90250353, 0.93931496, 0.93031811, 0.90425578, 0.94130358]),
 'test_neg_root_mean_squared_error': array([-2301.42502118, -3092.33720115, -2655.43871953, -1862.08977497,
        -3795.20144993]),
 'train_neg_root_mean_squared_error': array([-1951.10001746, -1543.41033726, -1680.26550631, -1917.55453916,
        -1487.86070263])}

In [84]:
# 5-fold cross-validation of the random forest, reporting R2 and (negated)
# RMSE on both the training and the held-out folds.
model = RandomForestRegressor(
    n_jobs=-1,
    max_depth=7,
    n_estimators=100,
)

cv_results = cross_validate(
    estimator=model,
    X=X_data,
    y=y_data,
    scoring=('r2', 'neg_root_mean_squared_error'),
    cv=5,
    return_train_score=True,
)

cv_results

{'fit_time': array([0.06442213, 0.06643605, 0.06436992, 0.11081314, 0.06693983]),
 'score_time': array([0.01362371, 0.01353192, 0.0149622 , 0.01379681, 0.01405215]),
 'test_r2': array([0.84467857, 0.76986452, 0.80831664, 0.90018582, 0.69550379]),
 'train_r2': array([0.93390069, 0.94673158, 0.93764069, 0.9258052 , 0.94541225]),
 'test_neg_root_mean_squared_error': array([-2454.81043894, -2935.80636097, -2507.30653589, -2027.68435875,
        -3648.31592184]),
 'train_neg_root_mean_squared_error': array([-1606.51039528, -1446.02395593, -1589.52959908, -1688.02226384,
        -1434.8418505 ])}

In [85]:
# 5-fold cross-validation of the XGBoost regressor, reporting R2 and
# (negated) RMSE on both the training and the held-out folds.
model = xgb.XGBRegressor(
    n_estimators=90,
    max_depth=6,
    learning_rate=0.03,
    reg_alpha=0.5,
    reg_lambda=0.7,
)

cv_results = cross_validate(
    estimator=model,
    X=X_data,
    y=y_data,
    scoring=('r2', 'neg_root_mean_squared_error'),
    cv=5,
    return_train_score=True,
)

cv_results

{'fit_time': array([0.06813812, 0.05330324, 0.0564642 , 0.05135369, 0.06240892]),
 'score_time': array([0.00274086, 0.00247192, 0.00262403, 0.00224304, 0.00277305]),
 'test_r2': array([0.80945683, 0.78494561, 0.79003209, 0.88181549, 0.66385293]),
 'train_r2': array([0.92571282, 0.93614727, 0.93406212, 0.92825586, 0.9438473 ]),
 'test_neg_root_mean_squared_error': array([-2718.93601231, -2837.9828658 , -2624.16788679, -2206.40126467,
        -3833.24103127]),
 'train_neg_root_mean_squared_error': array([-1703.10688769, -1583.17983665, -1634.50176858, -1659.91078874,
        -1455.26367136])}

In [86]:
# Stack the models cross-validated above (two variants of each family) under
# a plain linear meta-learner and evaluate on the existing train/test split.
base_learner = [
    (
        'gb1',
        GradientBoostingRegressor(n_estimators=110, max_depth=5, learning_rate=0.1, loss='huber'),
    ),
    (
        'gb2',
        GradientBoostingRegressor(n_estimators=120, max_depth=5, learning_rate=0.1, loss='huber'),
    ),
    ('rd1', RandomForestRegressor(n_estimators=100, max_depth=7, n_jobs=-1)),
    ('rd2', RandomForestRegressor(n_estimators=90, max_depth=7, n_jobs=-1)),
    (
        'xg1',
        xgb.XGBRegressor(max_depth=6, n_estimators=90, learning_rate=0.03, reg_alpha=0.5, reg_lambda=0.7),
    ),
    (
        'xg2',
        xgb.XGBRegressor(max_depth=6, n_estimators=80, learning_rate=0.04, reg_alpha=0.1, reg_lambda=0.7),
    ),
]

final_estimator = LinearRegression()

model = StackingRegressor(estimators=base_learner, final_estimator=final_estimator)
model.fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

rmse_train = rmse(y_train, y_train_pred)
rmse_test = rmse(y_test, y_test_pred)

n_train_rows, n_train_cols = X_train.shape[0], X_train.shape[1]
n_test_rows, n_test_cols = X_test.shape[0], X_test.shape[1]
r2_train, adj_r2_train = score(y_train, y_train_pred, n_train_rows, n_train_cols)
r2_test, adj_r2_test = score(y_test, y_test_pred, n_test_rows, n_test_cols)

# Print one report per split, separated by a single blank line.
reports = (
    ('Train Result', rmse_train, r2_train, adj_r2_train),
    ('Test Result', rmse_test, r2_test, adj_r2_test),
)
for position, (title, rmse_value, r2_value, adj_r2_value) in enumerate(reports):
    if position:
        print()
    print(f"{'*' * 18} {title} {'*' * 10}")
    print(f'RMSE: {rmse_value}')
    print(f'R2 Score: {r2_value}')
    print(f'Adjusted R2 Score: {adj_r2_value}')

****************** Train Result **********
RMSE: 1608.663764075114
R2 Score: 0.9336699595751619
Adjusted R2 Score: 0.9328162911012257

****************** Test Result **********
RMSE: 1877.1003073691975
R2 Score: 0.909381127827578
Adjusted R2 Score: 0.9045351988344004


- Individually, these models do not perform any better; however, the stacking regressor gave us better performance and hence can be used for final model creation.
- This can also be observed in the MLflow run.