The goal is to predict the PremiumPrice using the given features. No stratification technique is used in this experiment. Instead, feature engineering is applied to create new features, such as BMI and MajorSurgeryDone.

## Imports

In [1]:
import os
import sys

# Parent of the notebook's working directory; it is put on sys.path so the
# `src` package imported further down can be resolved (presumably the project
# root -- confirm against the repository layout).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Guarded so that re-running this cell does not keep appending duplicates.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

In [2]:
import warnings
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import xgboost as xgb

import mlflow

from src.utils import r2_and_adjusted_r2_score as score, root_mean_squared_error as rmse
from src.utils import setup_mlflow_experiment as setup_exp

# Suppress every warning for the rest of the session. No category is given,
# so this also hides deprecation/convergence notices from the libraries above.
warnings.filterwarnings('ignore')


## Read Data

In [3]:
# Processed datasets are read from <parent_dir>/data/processed.
processed_subpath = ('data', 'processed')
processed_data_dir = os.path.join(parent_dir, *processed_subpath)

In [4]:
# Load the feature-engineered dataset and preview its first rows.
feature_csv_path = os.path.join(processed_data_dir, 'insurance_feature_engineered_for_model.csv')
df = pd.read_csv(feature_csv_path)
df.head()

Unnamed: 0,Age,Diabetes,BloodPressureProblems,AnyTransplants,AnyChronicDiseases,Height,Weight,KnownAllergies,HistoryOfCancerInFamily,NumberOfMajorSurgeries,PremiumPrice,MajorSurgeryDone,BMI
0,45,0,0,0,0,155,57,0,0,0,25000,0,23.725286
1,60,1,0,0,0,180,73,0,0,0,29000,0,22.530864
2,36,1,1,0,0,158,59,0,0,1,23000,1,23.634033
3,52,1,1,0,1,183,93,0,0,2,28000,1,27.770313
4,38,0,0,0,1,166,88,0,0,1,23000,1,31.934969


## MLflow Setup

In [5]:
# Name and free-text note of the MLflow experiment that groups every run below.
exp_name = 'Premium Prediction - No stratification - Feature Engineered'
exp_description = (
    'The goal is to predict the PremiumPrice based on the given features.\n'
    'There is no stratification technique used in the experiment\n'
    'Feature engineering is done to create new features like BMI and MajorSurgeryDone\n'
)

# setup_exp is the project helper from src.utils; it returns an object that
# exposes `experiment_id` (presumably creating the experiment if it is missing
# -- confirm in src.utils).
experiment = setup_exp(exp_name, exp_description)
experiment_id = experiment.experiment_id

# Kept as the last expression so the active Experiment is echoed as cell output.
mlflow.set_experiment(experiment_id=experiment_id)


<Experiment: artifact_location=('file:///Users/wasimmujawar/Desktop/Case '
 'Study/Insaurance/mlruns/611947279263242194'), creation_time=1735669989723, experiment_id='611947279263242194', last_update_time=1735669989723, lifecycle_stage='active', name='Premium Prediction - No stratification - Feature Engineered', tags={'mlflow.note.content': 'The goal is to predict the PremiumPrice based on the '
                        'given features.\n'
                        'There is no stratification technique used in the '
                        'experiment\n'
                        'Feature engineering is done to create new features '
                        'like BMI and MajorSurgeryDone\n'}>

## Model Building

### Train Test Split

In [6]:
# Build the predictor matrix. Besides the target, drop 'Unnamed: 0': it is the
# row index that was written into the CSV, not a real feature, and leaving it in
# would feed row order to every model and inflate the predictor count used for
# adjusted R2. errors='ignore' keeps this working if the CSV is later re-exported
# without that column.
feature_frame = (
    df.drop(columns=['PremiumPrice'])
      .drop(columns=['Unnamed: 0'], errors='ignore')
)

# 80/20 hold-out split; the fixed seed keeps the partition stable across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame, df['PremiumPrice'], test_size=0.2, random_state=42
)

### Linear Regression

In [7]:
# Continuous columns: standardized before being passed to the regressor.
numeric_columns = ['Age', 'Height', 'Weight', 'BMI']
# Columns forwarded unscaled. NOTE: despite the list's name,
# 'NumberOfMajorSurgeries' is a count (values above 1 appear in the data
# preview), not a 0/1 flag.
binary_columns = ['Diabetes', 'BloodPressureProblems', 'AnyTransplants', 'AnyChronicDiseases', 'KnownAllergies', 'HistoryOfCancerInFamily', 'MajorSurgeryDone', 'NumberOfMajorSurgeries']

# The ColumnTransformer below keeps only the listed columns (its default is
# remainder='drop'), so these -- not every column of X_train -- are the
# predictors the model actually sees.
model_columns = numeric_columns + binary_columns

preprocessor = ColumnTransformer([
    ('scaler', StandardScaler(), numeric_columns),
    ('passthrough', 'passthrough', binary_columns)
])

pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', LinearRegression())
])

with mlflow.start_run(run_name="Linear Regression"):
    pipeline.fit(X_train, y_train)

    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    rmse_train = rmse(y_train, y_train_pred)
    rmse_test = rmse(y_test, y_test_pred)

    # Infer the signature from the columns the pipeline consumes, so the logged
    # model does not demand inputs that the preprocessor would discard anyway.
    signature = mlflow.models.infer_signature(X_train[model_columns], y_train_pred)

    # Adjusted R2 must be penalized by the number of predictors in the model,
    # which is len(model_columns) rather than X_train.shape[1].
    n_predictors = len(model_columns)
    r2_train, adj_r2_train = score(y_train, y_train_pred, X_train.shape[0], n_predictors)
    r2_test, adj_r2_test = score(y_test, y_test_pred, X_test.shape[0], n_predictors)

    mlflow.set_tag('Model', 'Linear Regression')

    mlflow.log_metric('Train - RMSE', rmse_train)
    mlflow.log_metric('Test - RMSE', rmse_test)
    mlflow.log_metric('Train - r2 score', r2_train)
    mlflow.log_metric('Test - r2 score', r2_test)
    mlflow.log_metric('Train - Adjusted r2 score', adj_r2_train)
    mlflow.log_metric('Test - Adjusted r2 score', adj_r2_test)
    mlflow.sklearn.log_model(pipeline, 'model',  signature=signature)

### Decision Tree

In [8]:
def run_mlflow_decision_tree(max_dept: int, random_state: int = 42) -> None:
    """Fit a depth-limited decision tree and record it as one MLflow run.

    Uses the notebook-level X_train / X_test / y_train / y_test. A
    StandardScaler -> DecisionTreeRegressor pipeline is fitted, then the
    hyperparameters, train/test RMSE, R2 and adjusted R2, and the fitted
    pipeline (with an inferred signature) are logged to the active experiment.

    Args:
        max_dept: Value passed to ``DecisionTreeRegressor(max_depth=...)``.
        random_state: Seed passed to the tree. scikit-learn permutes the
            candidate features at every split, so without a seed tied splits
            can resolve differently from run to run.
    """
    with mlflow.start_run(run_name="Decision Tree"):
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', DecisionTreeRegressor(max_depth=max_dept, random_state=random_state))
        ])

        pipeline.fit(X_train, y_train)

        y_train_pred = pipeline.predict(X_train)
        y_test_pred = pipeline.predict(X_test)

        rmse_train = rmse(y_train, y_train_pred)
        rmse_test = rmse(y_test, y_test_pred)

        signature = mlflow.models.infer_signature(X_train, y_train_pred)

        # score() is given (y_true, y_pred, n_rows, n_columns) and returns the
        # (R2, adjusted R2) pair.
        r2_train, adj_r2_train = score(y_train, y_train_pred, X_train.shape[0], X_train.shape[1])
        r2_test, adj_r2_test = score(y_test, y_test_pred, X_test.shape[0], X_test.shape[1])

        mlflow.set_tag('Model', 'Decision Tree Regressor')
        mlflow.log_param('max_depth', max_dept)
        # Logged so a run can be reproduced exactly from its recorded params.
        mlflow.log_param('random_state', random_state)

        mlflow.log_metric('Train - RMSE', rmse_train)
        mlflow.log_metric('Test - RMSE', rmse_test)
        mlflow.log_metric('Train - r2 score', r2_train)
        mlflow.log_metric('Test - r2 score', r2_test)
        mlflow.log_metric('Train - Adjusted r2 score', adj_r2_train)
        mlflow.log_metric('Test - Adjusted r2 score', adj_r2_test)
        mlflow.sklearn.log_model(pipeline, 'model',  signature=signature)

In [9]:
# One MLflow run per tree depth, from 3 up to and including 8.
for depth in (3, 4, 5, 6, 7, 8):
    run_mlflow_decision_tree(depth)

### Random Forest Regressor

In [10]:
def run_mlflow_random_forest(n_estimators: int, max_dept: int, max_feature: str = None, random_state: int = 42) -> None:
    """Fit a random forest and record it as one MLflow run.

    Uses the notebook-level X_train / X_test / y_train / y_test. A
    StandardScaler -> RandomForestRegressor pipeline is fitted, then the
    hyperparameters, train/test RMSE, R2 and adjusted R2, and the fitted
    pipeline (with an inferred signature) are logged to the active experiment.

    Args:
        n_estimators: Number of trees in the forest.
        max_dept: Value passed to ``RandomForestRegressor(max_depth=...)``.
        max_feature: Value passed to ``max_features`` (the notebook uses
            ``None``, ``'sqrt'`` and ``'log2'``).
        random_state: Seed for the forest's bootstrap sampling and per-split
            feature sampling, so the same hyperparameters always give the
            same model and metrics.
    """
    with mlflow.start_run(run_name="Random Forest"):
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', RandomForestRegressor(n_estimators=n_estimators,
                                                max_depth=max_dept,
                                                max_features=max_feature,
                                                n_jobs=-1,
                                                random_state=random_state))
        ])

        pipeline.fit(X_train, y_train)

        y_train_pred = pipeline.predict(X_train)
        y_test_pred = pipeline.predict(X_test)

        rmse_train = rmse(y_train, y_train_pred)
        rmse_test = rmse(y_test, y_test_pred)

        signature = mlflow.models.infer_signature(X_train, y_train_pred)

        # score() is given (y_true, y_pred, n_rows, n_columns) and returns the
        # (R2, adjusted R2) pair.
        r2_train, adj_r2_train = score(y_train, y_train_pred, X_train.shape[0], X_train.shape[1])
        r2_test, adj_r2_test = score(y_test, y_test_pred, X_test.shape[0], X_test.shape[1])

        mlflow.set_tag('Model', 'Random Forest Regressor')
        mlflow.log_param('n_estimators', n_estimators)
        mlflow.log_param('max_depth', max_dept)
        mlflow.log_param('max_features', max_feature)
        # Logged so a run can be reproduced exactly from its recorded params.
        mlflow.log_param('random_state', random_state)

        mlflow.log_metric('Train - RMSE', rmse_train)
        mlflow.log_metric('Test - RMSE', rmse_test)
        mlflow.log_metric('Train - r2 score', r2_train)
        mlflow.log_metric('Test - r2 score', r2_test)
        mlflow.log_metric('Train - Adjusted r2 score', adj_r2_train)
        mlflow.log_metric('Test - Adjusted r2 score', adj_r2_test)
        mlflow.sklearn.log_model(pipeline, 'model',  signature=signature)

In [11]:
# Search space for the random-forest sweep.
n_estimator = [80, 90, 100, 110, 120]
max_depth = [3, 4, 5, 6]
max_features = [None, 'sqrt', 'log2']

# Enumerate the grid up front, in the same order as nesting the three loops
# (n_estimators outermost, max_features innermost), then log one run each.
rf_grid = [
    (n_est, depth, feature)
    for n_est in n_estimator
    for depth in max_depth
    for feature in max_features
]

for n_est, depth, feature in rf_grid:
    run_mlflow_random_forest(n_est, depth, feature)

In [12]:
# Extra run just beyond the swept depth range: 90 trees, depth 7, default max_features.
run_mlflow_random_forest(n_estimators=90, max_dept=7)

In [13]:
# Extra run just beyond the swept depth range: 100 trees, depth 7, default max_features.
run_mlflow_random_forest(n_estimators=100, max_dept=7)

### Gradient Boosting Regressor

In [14]:
def run_mlflow_gbdt(loss: str, learning_rate: float, n_estimators: int, max_dept: int, random_state: int = 42) -> None:
    """Fit a gradient-boosted tree ensemble and record it as one MLflow run.

    Uses the notebook-level X_train / X_test / y_train / y_test. A
    StandardScaler -> GradientBoostingRegressor pipeline is fitted, then the
    hyperparameters, train/test RMSE, R2 and adjusted R2, and the fitted
    pipeline (with an inferred signature) are logged to the active experiment.

    Args:
        loss: Loss name passed to the regressor (the notebook sweeps
            ``'squared_error'`` and ``'huber'``).
        learning_rate: Shrinkage applied to each boosting stage.
        n_estimators: Number of boosting stages.
        max_dept: Value passed to ``GradientBoostingRegressor(max_depth=...)``.
        random_state: Seed passed to the regressor so that repeated runs with
            the same hyperparameters produce the same model and metrics.
    """
    with mlflow.start_run(run_name="GBDT"):
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('regressor', GradientBoostingRegressor(n_estimators=n_estimators,
                                                    max_depth=max_dept,
                                                    learning_rate=learning_rate,
                                                    loss=loss,
                                                    random_state=random_state))
        ])

        pipeline.fit(X_train, y_train)

        y_train_pred = pipeline.predict(X_train)
        y_test_pred = pipeline.predict(X_test)

        rmse_train = rmse(y_train, y_train_pred)
        rmse_test = rmse(y_test, y_test_pred)

        signature = mlflow.models.infer_signature(X_train, y_train_pred)

        # score() is given (y_true, y_pred, n_rows, n_columns) and returns the
        # (R2, adjusted R2) pair.
        r2_train, adj_r2_train = score(y_train, y_train_pred, X_train.shape[0], X_train.shape[1])
        r2_test, adj_r2_test = score(y_test, y_test_pred, X_test.shape[0], X_test.shape[1])

        mlflow.set_tag('Model', 'GBDT Regressor')
        mlflow.log_param('loss', loss)
        mlflow.log_param('learning_rate', learning_rate)
        mlflow.log_param('n_estimators', n_estimators)
        mlflow.log_param('max_depth', max_dept)
        # Logged so a run can be reproduced exactly from its recorded params.
        mlflow.log_param('random_state', random_state)

        mlflow.log_metric('Train - RMSE', rmse_train)
        mlflow.log_metric('Test - RMSE', rmse_test)
        mlflow.log_metric('Train - r2 score', r2_train)
        mlflow.log_metric('Test - r2 score', r2_test)
        mlflow.log_metric('Train - Adjusted r2 score', adj_r2_train)
        mlflow.log_metric('Test - Adjusted r2 score', adj_r2_test)
        mlflow.sklearn.log_model(pipeline, 'model',  signature=signature)

In [15]:
# Search space for the gradient-boosting sweep.
loss = ['squared_error', 'huber']
n_estimator = list(range(80, 121, 10))   # 80, 90, 100, 110, 120
max_depth = list(range(3, 7))            # 3, 4, 5, 6
learning_rate = [0.01, 0.1, 0.2, 0.3]

In [16]:
# Full grid for the GBDT sweep, in the same order as nesting the loops
# (loss outermost, learning rate innermost); one MLflow run per combination.
gbdt_grid = [
    (loss_name, n_est, depth, lr)
    for loss_name in loss
    for n_est in n_estimator
    for depth in max_depth
    for lr in learning_rate
]

for loss_name, n_est, depth, lr in gbdt_grid:
    run_mlflow_gbdt(loss_name, lr, n_est, depth)

### XGBoost Regression

In [17]:
def run_mlflow_xgboost(max_dept: int, n_estimators: int, learning_rate: float, reg_alpha: float, reg_lambda: float) -> None:
    """Fit an XGBoost regressor and record it as one MLflow run.

    Uses the notebook-level X_train / X_test / y_train / y_test. A
    StandardScaler -> XGBRegressor pipeline is fitted, then the five
    hyperparameters, train/test RMSE, R2 and adjusted R2, and the fitted
    pipeline (with an inferred signature) are logged to the active experiment.
    """
    with mlflow.start_run(run_name="XGBoost"):
        booster = xgb.XGBRegressor(
            max_depth=max_dept,
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            n_jobs=-1,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
        )
        model = Pipeline([('scaler', StandardScaler()), ('regressor', booster)])
        model.fit(X_train, y_train)

        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        train_rmse = rmse(y_train, train_pred)
        test_rmse = rmse(y_test, test_pred)

        model_signature = mlflow.models.infer_signature(X_train, train_pred)

        # score() is given (y_true, y_pred, n_rows, n_columns) and returns the
        # (R2, adjusted R2) pair.
        train_r2, train_adj_r2 = score(y_train, train_pred, X_train.shape[0], X_train.shape[1])
        test_r2, test_adj_r2 = score(y_test, test_pred, X_test.shape[0], X_test.shape[1])

        mlflow.set_tag('Model', 'XGBoost Regressor')

        # Dicts keep insertion order, so params and metrics are logged in the
        # sequence written here.
        run_params = {
            'max_depth': max_dept,
            'n_estimators': n_estimators,
            'learning_rate': learning_rate,
            'reg_alpha': reg_alpha,
            'reg_lambda': reg_lambda,
        }
        for param_name, param_value in run_params.items():
            mlflow.log_param(param_name, param_value)

        run_metrics = {
            'Train - RMSE': train_rmse,
            'Test - RMSE': test_rmse,
            'Train - r2 score': train_r2,
            'Test - r2 score': test_r2,
            'Train - Adjusted r2 score': train_adj_r2,
            'Test - Adjusted r2 score': test_adj_r2,
        }
        for metric_name, metric_value in run_metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        mlflow.sklearn.log_model(model, 'model',  signature=model_signature)

In [18]:
# Search space for the XGBoost sweep.
n_estimator = [80, 83, 87]
reg_alpha = [0.1, 0.2, 0.7]
reg_lambda = [0.1, 0.3, 0.7]
max_depth = [3, 4]
learning_rate = [0.02, 0.05, 0.7]

# Full grid, in the same order as nesting the loops (n_estimators outermost,
# reg_lambda innermost); one MLflow run per combination.
xgb_grid = [
    (n_est, depth, lr, alpha, lam)
    for n_est in n_estimator
    for depth in max_depth
    for lr in learning_rate
    for alpha in reg_alpha
    for lam in reg_lambda
]

for n_est, depth, lr, alpha, lam in xgb_grid:
    run_mlflow_xgboost(depth, n_est, lr, alpha, lam)

### Stacking Regression

In [19]:
# Base learners shared by the stacking run below and the voting run that
# follows. Each one is seeded explicitly: without a seed the forest's bootstrap
# sampling (and the boosted trees' split tie-breaking) differ between
# executions, so neither ensemble's logged metrics could be reproduced.
base_learner = [
    ('gb', GradientBoostingRegressor(n_estimators=100, max_depth=4, learning_rate=0.2, loss='huber', random_state=42)),
    ('rd', RandomForestRegressor(n_estimators=100, max_depth=6, n_jobs=-1, random_state=42)),
    ('xg', xgb.XGBRegressor(max_depth=4, n_estimators=80, learning_rate=0.05, reg_alpha=0.7, reg_lambda=0.1, random_state=42))
]

# Meta-model that combines the base learners' predictions.
final_estimator = LinearRegression()

with mlflow.start_run(run_name="Stacking Regressor"):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', StackingRegressor(estimators=base_learner, final_estimator=final_estimator))
    ])

    pipeline.fit(X_train, y_train)

    y_train_pred = pipeline.predict(X_train)
    y_test_pred = pipeline.predict(X_test)

    rmse_train = rmse(y_train, y_train_pred)
    rmse_test = rmse(y_test, y_test_pred)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    # score() is given (y_true, y_pred, n_rows, n_columns) and returns the
    # (R2, adjusted R2) pair.
    r2_train, adj_r2_train = score(y_train, y_train_pred, X_train.shape[0], X_train.shape[1])
    r2_test, adj_r2_test = score(y_test, y_test_pred, X_test.shape[0], X_test.shape[1])

    mlflow.set_tag('Model', 'Stacking Regressor')

    mlflow.log_metric('Train - RMSE', rmse_train)
    mlflow.log_metric('Test - RMSE', rmse_test)
    mlflow.log_metric('Train - r2 score', r2_train)
    mlflow.log_metric('Test - r2 score', r2_test)
    mlflow.log_metric('Train - Adjusted r2 score', adj_r2_train)
    mlflow.log_metric('Test - Adjusted r2 score', adj_r2_test)
    mlflow.sklearn.log_model(pipeline, 'model',  signature=signature)

### Voting Regressor

In [20]:
# Voting ensemble over the `base_learner` list defined in the stacking cell.
# Notebook-level names (pipeline, y_*_pred, rmse_*, r2_*, ...) are kept so any
# later cell that reads them still sees the voting model's results.
with mlflow.start_run(run_name='Voting Regressor'):
    voter = VotingRegressor(estimators=base_learner)
    pipeline = Pipeline([('scaler', StandardScaler()), ('regressor', voter)])
    pipeline.fit(X_train, y_train)

    # Train first, then test, for both predictions and errors.
    y_train_pred, y_test_pred = pipeline.predict(X_train), pipeline.predict(X_test)
    rmse_train, rmse_test = rmse(y_train, y_train_pred), rmse(y_test, y_test_pred)

    signature = mlflow.models.infer_signature(X_train, y_train_pred)

    # score() is given (y_true, y_pred, n_rows, n_columns) and returns the
    # (R2, adjusted R2) pair.
    r2_train, adj_r2_train = score(y_train, y_train_pred, X_train.shape[0], X_train.shape[1])
    r2_test, adj_r2_test = score(y_test, y_test_pred, X_test.shape[0], X_test.shape[1])

    mlflow.set_tag('Model', 'Voting Regressor')

    # Dicts keep insertion order, so metrics are logged in the sequence written here.
    voting_metrics = {
        'Train - RMSE': rmse_train,
        'Test - RMSE': rmse_test,
        'Train - r2 score': r2_train,
        'Test - r2 score': r2_test,
        'Train - Adjusted r2 score': adj_r2_train,
        'Test - Adjusted r2 score': adj_r2_test,
    }
    for metric_name, metric_value in voting_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    mlflow.sklearn.log_model(pipeline, 'model',  signature=signature)