In [None]:

!pip install ydata-profiling
!pip install fastparquet



In [None]:
import pandas as pd
import numpy as np
from ydata_profiling import ProfileReport
from sklearn.model_selection import train_test_split
from pyarrow.parquet import ParquetDataset
import fastparquet
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Input CSV and the Parquet copy that is written from it
src_file = 'insurance_ct1.csv'
srcpqt = 'insurance.parquet'

# Fixed category sets for the categorical columns
tsx = pd.CategoricalDtype(categories=['male', 'female'], ordered=False)
tsmkr = pd.CategoricalDtype(categories=['yes', 'no'], ordered=False)
t_rgn = pd.CategoricalDtype(categories=['southwest', 'southeast', 'northwest', 'northeast'], ordered=False)

# File metadata: column names, per-column dtypes and per-column NA sentinels
column_names = ['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges']
data_types = {'age': 'int64', 'sex': tsx, 'bmi': 'float64', 'children': 'int64',
              'smoker': tsmkr, 'region': t_rgn, 'charges': 'float64'}
null_values = {'age': -1, 'sex': 'NULL', 'bmi': -1, 'children': -1,
               'smoker': 'NULL', 'region': 'NULL', 'charges': -1}
# NOTE(review): 'age' and 'children' are read as plain int64, which cannot hold
# NaN. If a -1 sentinel really occurs in those columns the read is expected to
# fail -- confirm against the data, or switch them to a nullable integer dtype.

# header=0 discards the file's own header row in favour of column_names
src_df = pd.read_csv(src_file, header=0, names=column_names,
                     dtype=data_types, na_values=null_values)

print(' ----  Reading CSV file ----')
print(src_df.head())

# Save the frame as Parquet
src_df.to_parquet(srcpqt)

# Read the same Parquet file back to confirm it was written properly
print('')
print(' ----  Reading PARQUET file ----')

df_prqt = pd.read_parquet(srcpqt)

print(df_prqt.head())

 ----  Reading CSV file ----
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520

 ----  Reading PARQUET file ----
   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520


In [None]:
# Profile the raw insurance frame with ydata-profiling and show the report
# through to_notebook_iframe().
profile = ProfileReport(
    src_df,
    title="Insurance Data",
)
profile.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Split the dataset into Train, Test and Production partitions.
# Target proportions are 60-20-20 respectively: 20% of the rows are held out for
# Test first, then 25% of the remaining 80% (= 20% of the total) for Production.
# Both splits are plain random splits; no `stratify` argument is passed.

features = src_df.drop('charges', axis=1)
targets = src_df['charges']

# Step 1 - hold out the Test partition
features_train, features_test, targets_train, targets_test = train_test_split(
    features, targets, test_size=0.2, random_state=42)

# Step 2 - split the remainder into Training and Production
features_train, features_prod, targets_train, targets_prod = train_test_split(
    features_train, targets_train, test_size=0.25, random_state=42)

# The three partitions must account for every source row exactly once
assert len(features_train) + len(features_test) + len(features_prod) == len(features)
assert len(targets_train) + len(targets_test) + len(targets_prod) == len(targets)

# Show the head of every partition (features first, then targets)
for partition in (features_train, features_test, features_prod,
                  targets_train, targets_test, targets_prod):
    print(partition.head())

# Output paths for the saved partitions
ftrain_dspath = 'features_train.parquet'
ftest_dspath = 'features_test.parquet'
fprod_dspath = 'features_prod.parquet'
ytrain_dspath = 'target_train.parquet'
ytest_dspath = 'target_test.parquet'
yprod_dspath = 'target_prod.parquet'

features_train.to_parquet(ftrain_dspath)
features_test.to_parquet(ftest_dspath)
features_prod.to_parquet(fprod_dspath)
# The targets are Series; wrap each in a one-column frame before writing
targets_train.to_frame().to_parquet(ytrain_dspath)
targets_test.to_frame().to_parquet(ytest_dspath)
targets_prod.to_frame().to_parquet(yprod_dspath)

## ML Pipeline with Scikit-Learn

In [None]:
# Locations of the training and test Parquet files on GitHub
ftr_trn_url = 'https://raw.githubusercontent.com/swarnkarnitin/MLOps/Main/Datasets/features_train.parquet'
ftr_tst_url = 'https://raw.githubusercontent.com/swarnkarnitin/MLOps/Main/Datasets/features_test.parquet'
tgt_trn_url = 'https://raw.githubusercontent.com/swarnkarnitin/MLOps/Main/Datasets/target_train.parquet'
tgt_tst_url = 'https://raw.githubusercontent.com/swarnkarnitin/MLOps/Main/Datasets/target_test.parquet'

# Load the four frames; the Parquet engine is left to pandas ('auto')
ftr_trn_df = pd.read_parquet(ftr_trn_url, engine='auto')
ftr_tst_df = pd.read_parquet(ftr_tst_url, engine='auto')
tgt_trn_df = pd.read_parquet(tgt_trn_url, engine='auto')
tgt_tst_df = pd.read_parquet(tgt_tst_url, engine='auto')

# Print a banner, the head of the frame and a blank line for every dataset,
# to verify that each read succeeded
for banner, frame in (
    ('----- Feature Training dataset-------', ftr_trn_df),
    ('----- Feature Test dataset-------', ftr_tst_df),
    ('----- Target Training dataset-------', tgt_trn_df),
    ('----- Target Test dataset-------', tgt_tst_df),
):
    print(banner)
    print(frame.head())
    print('')


In [None]:
# Categorical feature columns (one-hot encoded below)
cat_features = ['sex', 'smoker', 'region']

# Numerical feature columns (standard-scaled below)
num_features = ['age', 'bmi', 'children']

# Numeric transformer: standard scaling only
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical transformer: one-hot encoding; categories unseen during fit are
# ignored at transform time instead of raising
categorical_transformer = Pipeline(steps=[('onehot',
                                           OneHotEncoder(handle_unknown='ignore'))])

# Column transformer routing each column group to its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_features),
        ('cat', categorical_transformer, cat_features),
    ])

# Hyper-parameters for scikit-learn's GradientBoostingRegressor.
# random_state pins the model so that re-running the cell reproduces it.
params = {"n_estimators": 400,
          "max_depth": 4,
          "random_state": 42}

# NOTE: this is scikit-learn gradient boosting, not XGBoost; the variable name
# is kept unchanged for any code that already refers to it.
xgb_regressor = GradientBoostingRegressor(**params)

# Full pipeline: preprocessing followed by the regressor
reg = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', xgb_regressor)])

# The regressor expects a 1-D target; flatten the target frame
# (presumably a single 'charges' column -- TODO confirm against the Parquet file).
reg.fit(ftr_trn_df, tgt_trn_df.to_numpy().ravel())

# Predict once on the test set and reuse the predictions for both metrics
test_predictions = reg.predict(ftr_tst_df)
rmse = np.sqrt(mean_squared_error(tgt_tst_df, test_predictions))
r2 = r2_score(tgt_tst_df, test_predictions)

print(f"Gradient boosting - test RMSE: {rmse:.2f}, test R2: {r2:.4f}")

In [None]:
reg

In [None]:
from joblib import dump

In [None]:
# Serialise the fitted preprocessing + gradient-boosting pipeline with joblib
dump(reg, "insurance_model_v1.pkl")

## 7. ML Experimentation

In [None]:
#!pip install wandb

In [None]:
import wandb
import os
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [None]:
# SECURITY: never hard-code the Weights & Biases API key in the notebook.
# A key was previously committed on this line; treat it as compromised and
# revoke it in the W&B account settings.
# Preferred: export WANDB_API_KEY before starting the kernel. Otherwise the
# key is asked for interactively, without echoing it or storing it in the file.
if not os.environ.get("WANDB_API_KEY"):
    from getpass import getpass

    os.environ["WANDB_API_KEY"] = getpass("Weights & Biases API key: ")

## Linear Regression

In [None]:
# Baseline model: linear regression behind the existing `preprocessor`
linear_reg = LinearRegression()

linear_model = Pipeline(steps=[('preprocessor', preprocessor),
                               ('linear_model', linear_reg)])

linear_model.fit(ftr_trn_df, tgt_trn_df)

# Predict once on the test set and reuse the predictions for both metrics
linear_test_pred = linear_model.predict(ftr_tst_df)
rmse = np.sqrt(mean_squared_error(tgt_tst_df, linear_test_pred))
r2 = r2_score(tgt_tst_df, linear_test_pred)

# Using the run as a context manager finishes it even if logging raises.
# Only the two test metrics are logged for this run.
with wandb.init(project='insurance_v1',
                name="LinearModel",
                config=None,
                tags=['Linear Model', 'baseline', 'OHE Encoding']) as run:
    run.log({"rmse": rmse,
             "r2": r2})

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mronitsalvi[0m ([33mronitsalvi-isb[0m). Use [1m`wandb login --relogin`[0m to force relogin




0,1
r2,▁
rmse,▁

0,1
r2,0.78258
rmse,5809.79605


### Predict on Test Set

In [None]:
y_pred = linear_model.predict(ftr_tst_df)

## K-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 10-fold cross-validated R2 of the linear pipeline on the training data.
# NOTE(review): the saved output of `scores` lists 20 values although cv is 10
# here -- the output looks stale; re-run the cells to refresh it.
scores = cross_val_score(
    estimator=linear_model,
    X=ftr_trn_df,
    y=tgt_trn_df,
    scoring='r2',
    cv=10,
)

In [None]:
scores

array([0.81103455, 0.87420576, 0.772997  , 0.91381431, 0.86247998,
       0.80490268, 0.89043332, 0.86502201, 0.93503074, 0.97769048,
       0.60317031, 0.82756849, 0.87785935, 0.82069477, 0.82997802,
       0.84811795, 0.86479946, 0.90534212, 0.84381763, 0.91698772])

In [None]:
scores.mean()

0.8522973321902245

In [None]:
scores.std()

0.07445142661374396

# Decision Tree

## Decision Tree

When max_depth = 10

In [None]:
 params = {"max_depth": 10}

dtree = DecisionTreeRegressor(**params)

dtree_model = Pipeline(steps=[('preprocessor', preprocessor),
                               ('dt_model', dtree)])


dtree_model.fit(ftr_trn_df, tgt_trn_df)

wandb.init(project='insurance_v1', config=params, tags = ['Decision Tree',
                                                           'OHE Encoding'])
wandb.run.name = "DecisionTree"
rmse = np.sqrt(mean_squared_error(tgt_tst_df, dtree_model.predict(ftr_tst_df)))
r2 = dtree_model.score(ftr_tst_df, tgt_tst_df)

wandb.log( {"rmse" : rmse,
            "r2": r2} )

wandb.Artifact("DecisionTree",
               type = 'model',
               description = params)

wandb.save()
wandb.finish()

0,1
r2,▁
rmse,▁

0,1
r2,0.73527
rmse,6410.85581


## Decision Tree

When max_depth = 5

In [None]:
 params = {"max_depth": 5}

dtree = DecisionTreeRegressor(**params)

dtree_model = Pipeline(steps=[('preprocessor', preprocessor),
                               ('dt_model', dtree)])


dtree_model.fit(ftr_trn_df, tgt_trn_df)

wandb.init(project='insurance_v1', config=params, tags = ['Decision Tree',
                                                           'OHE Encoding'])
wandb.run.name = "DecisionTree"
rmse = np.sqrt(mean_squared_error(tgt_tst_df, dtree_model.predict(ftr_tst_df)))
r2 = dtree_model.score(ftr_tst_df, tgt_tst_df)

wandb.log( {"rmse" : rmse,
            "r2": r2} )

wandb.Artifact("DecisionTree",
               type = 'model',
               description = params)

wandb.save()
wandb.finish()

0,1
r2,▁
rmse,▁

0,1
r2,0.85795
rmse,4696.08326


## Sweep Function

Using a W&B sweep to track how the decision tree's test metrics change as max_depth is varied

In [None]:
def train_decision_tree(config=None):
    """Train and evaluate one decision-tree pipeline inside a W&B run.

    Passed as the ``function`` callback to ``wandb.agent``. Builds a pipeline
    from the notebook-level ``preprocessor`` and a ``DecisionTreeRegressor``,
    fits it on ``ftr_trn_df`` / ``tgt_trn_df``, scores it on ``ftr_tst_df`` /
    ``tgt_tst_df`` and logs ``rmse``, ``r2`` and ``max_depth`` to the run.

    Args:
        config: Optional configuration forwarded to ``wandb.init``. The run's
            resulting configuration must expose ``max_depth``.
    """
    # The context manager finishes the run when the block exits
    with wandb.init(config=config) as run:
        run_config = run.config

        # random_state pins tie-breaking between equally good splits so that
        # runs with the same max_depth are reproducible
        dtree = DecisionTreeRegressor(max_depth=run_config.max_depth,
                                      random_state=42)

        dtree_model = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('dt_model', dtree)])
        dtree_model.fit(ftr_trn_df, tgt_trn_df)

        # Predict once on the test set and reuse the predictions for both metrics
        test_pred = dtree_model.predict(ftr_tst_df)
        rmse = np.sqrt(mean_squared_error(tgt_tst_df, test_pred))
        r2 = r2_score(tgt_tst_df, test_pred)

        # Log the metrics together with the depth that produced them
        run.log({"rmse": rmse,
                 "r2": r2,
                 "max_depth": run_config.max_depth})


In [None]:
# Sweep definition: try every listed max_depth and rank the runs by the
# logged "r2" metric (higher is better).
sweep_config = dict(
    method="grid",  # Can be 'grid', 'random', or 'bayes'
    metric=dict(name="r2", goal="maximize"),
    parameters=dict(
        max_depth=dict(values=[4, 6, 8, 12]),  # Depths to evaluate
    ),
)

In [None]:
# Create the sweep in the insurance_v1 project; the returned id is what
# wandb.agent needs to run it
sweep_id = wandb.sweep(sweep_config, project="insurance_v1")

Create sweep with ID: jnr6ryna
Sweep URL: https://wandb.ai/ronitsalvi-isb/insurance_v1/sweeps/jnr6ryna


In [None]:
# Start an agent for the sweep; it calls train_decision_tree for each
# configuration it receives (one run per max_depth in the saved output)
wandb.agent(sweep_id,
            function=train_decision_tree)  # Run all experiments

[34m[1mwandb[0m: Agent Starting Run: r3egnste with config:
[34m[1mwandb[0m: 	max_depth: 4


0,1
max_depth,▁
r2,▁
rmse,▁

0,1
max_depth,4.0
r2,0.86374
rmse,4599.39958


[34m[1mwandb[0m: Agent Starting Run: uo0mu72l with config:
[34m[1mwandb[0m: 	max_depth: 6


0,1
max_depth,▁
r2,▁
rmse,▁

0,1
max_depth,6.0
r2,0.82276
rmse,5245.6595


[34m[1mwandb[0m: Agent Starting Run: 953jmmkr with config:
[34m[1mwandb[0m: 	max_depth: 8


0,1
max_depth,▁
r2,▁
rmse,▁

0,1
max_depth,8.0
r2,0.775
rmse,5910.29103


[34m[1mwandb[0m: Agent Starting Run: pivobwz8 with config:
[34m[1mwandb[0m: 	max_depth: 12


0,1
max_depth,▁
r2,▁
rmse,▁

0,1
max_depth,12.0
r2,0.72629
rmse,6518.6281


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


## Manual Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate depths 2..9; the "dt_model__" prefix addresses the pipeline step
# named 'dt_model'
params = {"dt_model__max_depth": range(2, 10)}

# random_state pins tie-breaking between equally good splits, so the selected
# best_params_ is reproducible across re-runs
dtree = DecisionTreeRegressor(random_state=42)

dtree_model = Pipeline(steps=[('preprocessor', preprocessor),
                              ('dt_model', dtree)])

# 10-fold cross-validated grid search, ranked by R2
dt_grid = GridSearchCV(dtree_model,
                       param_grid=params,
                       cv=10,
                       scoring='r2')

dt_grid.fit(ftr_trn_df, tgt_trn_df)

In [None]:
dt_grid.best_params_

{'dt_model__max_depth': 4}

Thus we should use the value of max_depth = 4 while building a Decision Tree

## Storing the Model

In [None]:
from joblib import dump

# Directory that holds the serialised model (later added to the W&B artifact)
MODEL_DIR = "./insurance_v1_save"

# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError once the directory exists
os.makedirs(MODEL_DIR, exist_ok=True)

# Serialise the fitted linear pipeline; dump returns the list of written files
dump(linear_model, os.path.join(MODEL_DIR, 'insurance_v1_save1.pkl'))

['./insurance_v1_save/insurance_v1_save1.pkl']

## Logging the model artifact in Weights and Biases

In [None]:
# Open the run that will carry the final model artifact, then give it a
# readable display name
final_run_settings = dict(project='insurance_v1',
                          config=None,
                          tags=['Final Model'])
wandb.init(**final_run_settings)
wandb.run.name = "FinalModel"



In [None]:
# Describe the model artifact; files are attached to it in the next cell
model_artifact = wandb.Artifact(
    name="Linear_Model_insurance_v1",
    type='model',
    description='Linear Model for insurance',
)

In [None]:
# Add the contents of MODEL_DIR (the saved model directory) to the artifact
model_artifact.add_dir(MODEL_DIR)

# Log the artifact against the currently active run.
# NOTE(review): no wandb.finish() is visible after this cell -- confirm the
# run is closed somewhere below.
wandb.run.log_artifact(model_artifact)


[34m[1mwandb[0m: Adding directory to artifact (./insurance_v1_save)... Done. 0.0s


<Artifact Linear_Model_insurance_v1>