In [4]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

import seaborn as sns

import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

import mlflow 
import mlflow.sklearn

In [5]:
DATA_PATH = '../data/insurance_60k.csv'
MODEL_DIR = 'artifacts/best_model'

In [11]:
df = df.iloc[0:10000]
df.to_csv('insurance_10k.csv')

In [6]:
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,54.907277,female,24.043568,1.995771,no,southwest,9222.721214
1,36.945607,male,23.992936,-0.051169,yes,southwest,28348.59309
2,58.382876,male,22.053382,-0.031955,no,northwest,10438.340609
3,39.20734,female,26.307519,-0.02681,no,northeast,8596.622439
4,18.163581,male,24.950164,0.992198,yes,southwest,24771.84457


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       60000 non-null  float64
 1   sex       60000 non-null  object 
 2   bmi       60000 non-null  float64
 3   children  60000 non-null  float64
 4   smoker    60000 non-null  object 
 5   region    60000 non-null  object 
 6   charges   60000 non-null  float64
dtypes: float64(4), object(3)
memory usage: 3.2+ MB


In [5]:
X, y = df.drop(columns=["charges"]), df['charges']

In [14]:
numeric = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
categorical = X.select_dtypes(include=["object"]).columns.tolist()
categorical

['sex', 'smoker', 'region']

In [18]:
numeric_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler())
    ])
categorical_pipe = Pipeline(
    steps=[
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])

In [27]:
ColumnTransformer(
    transformers=[
        ("num", numeric_pipe, numeric),
        ("cat", categorical_pipe, categorical)
    ]
)

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [28]:
def build_preprocessor(X):
    numeric = X.select_dtypes(include=["int64", "float64"]).columns.tolist()
    categorical = X.select_dtypes(include=["object"]).columns.tolist()

    return ColumnTransformer(
        transformers=[
            ("num", numeric_pipe, numeric),
            ("cat", categorical_pipe, categorical)
        ]
    )
    

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
len(X_train), len(X_test), len(y_train), len(y_test)

(48000, 12000, 48000, 12000)

In [35]:
preprocessor = build_preprocessor(X_train)
preprocessor

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [38]:
models = {
    "lr": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "ElasticNet": ElasticNet(),
    "RandomForest": RandomForestRegressor(random_state=42),
    "XGBoost": xgb.XGBRFRegressor(
        objective="reg:squarederror",
        eval_metric="rmse",
        random_state=42
    )
}
models

{'lr': LinearRegression(),
 'Ridge': Ridge(),
 'Lasso': Lasso(),
 'ElasticNet': ElasticNet(),
 'RandomForest': RandomForestRegressor(random_state=42),
 'XGBoost': XGBRFRegressor(base_score=None, booster=None, callbacks=None,
                colsample_bylevel=None, colsample_bytree=None, device=None,
                early_stopping_rounds=None, enable_categorical=False,
                eval_metric='rmse', feature_types=None, feature_weights=None,
                gamma=None, grow_policy=None, importance_type=None,
                interaction_constraints=None, max_bin=None,
                max_cat_threshold=None, max_cat_to_onehot=None,
                max_delta_step=None, max_depth=None, max_leaves=None,
                min_child_weight=None, missing=nan, monotone_constraints=None,
                multi_strategy=None, n_estimators=None, n_jobs=None,
                num_parallel_tree=None, objective='reg:squarederror',
                random_state=42, ...)}

In [39]:
param_grids = {
    "Ridge": {"model__alpha": [0.1, 1, 10]},
    "Lasso": {"model__alpha": [0.01, 0.1, 1]},
    "ElasticNet": {"model__alpha": [0.01, 0.1, 1],
                   "model__l1_ratio": [0.2, 0.5, 0.8]},
    "RandomForest": {"model__n_estimators": [100, 200],
                     "model__max_depth": [None, 5, 10]},
    "XGBoost": {"model__n_estimators": [100, 200],
                "model__max_depth": [3, 5],
                "model__learning_rate": [0.05, 0.1]},
}

In [44]:
for name, model in models.items():
    pipe = Pipeline(
        steps=[
            ("prep", preprocessor),
            ("model", model)
        ])
    if name in param_grids:
        grid = GridSearchCV(
            pipe,
            param_grids[name],
            cv=5,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1
        )
        grid.fit(X_train, y_train)
        pipe = grid.best_estimator_
        print(pipe)

Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'bmi', 'children']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['sex', 'smoker',
                                                   'region'])])),
                ('model', Ridge(alpha=0.1))])
Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
   

KeyboardInterrupt: 