In [51]:
# import libraries needed for exploratory data analysis (eda) and feature engineering (fe)
import os
import time
import datetime
import pandas as pd
pd.set_option('display.max_columns',None)
import warnings
warnings.filterwarnings('ignore')

# import libraries needed for model training
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from urllib.parse import urlparse

import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

In [52]:
df = pd.read_csv('../data/insurance.csv')

In [53]:
df.head(5) #display head (top 5 rows)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [54]:
print(f"Shape: ",df.shape) #get total shape of dataset, total rows and columns
print("Number of Columns:", df.shape[1])
print("Number of Rows:", df.shape[0])

Shape:  (1338, 7)
Number of Columns: 7
Number of Rows: 1338


In [55]:
df.info() #quick info about data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [56]:
df.describe().transpose() #statistics for numerical datatypes

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [57]:
df.isna().sum() #number of missing values per column

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [58]:
df.dropna() #drop rows with any NA values

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [59]:
print("Number of Duplicates: ", df.duplicated().sum())

Number of Duplicates:  1


In [60]:
df.nunique() #number of unique values in each column

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

In [61]:
df.columns #show all cloumns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [62]:
numerical_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']

print('Numerical Features : {} : {}'.format(len(numerical_features), numerical_features))
print('Categorical Features : {} : {}'.format(len(categorical_features), categorical_features))

Numerical Features : 4 : ['age', 'bmi', 'children', 'charges']
Categorical Features : 3 : ['sex', 'smoker', 'region']


In [63]:
#get unique values in categorical columns
for column in categorical_features:
    unique_values = df[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")

Unique values in column 'sex': ['female' 'male']
Unique values in column 'smoker': ['yes' 'no']
Unique values in column 'region': ['southwest' 'southeast' 'northwest' 'northeast']


In [64]:
x = df.drop(columns=['charges'],axis=1) #dataframe contains all cloumns which shold be used to predicted
y=df['charges'] #series contains to be predicted

In [65]:
print(x.head())
print(type(x)) #datatype of x

   age     sex     bmi  children smoker     region
0   19  female  27.900         0    yes  southwest
1   18    male  33.770         1     no  southeast
2   28    male  33.000         3     no  southeast
3   33    male  22.705         0     no  northwest
4   32    male  28.880         0     no  northwest
<class 'pandas.core.frame.DataFrame'>


In [66]:
print(y.head())
print(type(y)) #datatype of y

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64
<class 'pandas.core.series.Series'>


In [67]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numerical_features = x.select_dtypes(exclude="object").columns
categorical_features = x.select_dtypes(include="object").columns

numerical_transformer = StandardScaler()
ohe_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", ohe_transformer, categorical_features),
         ("StandardScaler", numerical_transformer, numerical_features),
    ]
)
X = preprocessor.fit_transform(x)   #pre-processing source data x data and saving in X 
print(f"Shape of original data (x): {x.shape}")
print(f"Shape of transformed data (X): {X.shape}")

Shape of original data (x): (1338, 6)
Shape of transformed data (X): (1338, 11)


In [68]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=None) #using 20% to test and 80% for training.
print(f"Shape of training data : {X_train.shape}")
print(f"Shape of test data : {X_test.shape}")

Shape of training data : (1070, 11)
Shape of test data : (268, 11)


In [69]:
#Initialise dataframe for Regression Performace Metrics
performance_metrics={
    'Model Name':[], 
    'MAE':[] ,
    'MSE':[] ,
    'RMSE':[] ,
    'R2 Score':[],
    'Adjusted R2 Score':[] 
    }
df_ModelPerformance=pd.DataFrame(performance_metrics)
print(type(df_ModelPerformance))
df_ModelPerformance.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Model Name,MAE,MSE,RMSE,R2 Score,Adjusted R2 Score


In [70]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

#Define a function to evaluate model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = root_mean_squared_error(true, predicted)
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In [71]:
#Define Models

models = {
    "Random Forest": RandomForestRegressor()
}

for key, value in models.items():
    model_name = key
    model = value
    test_performance_metrics = {}

    print('-'*80)
    
    t1=time.time()
    print(f'{datetime.datetime.fromtimestamp(t1).strftime("%Y-%m-%d %H:%M:%S")} - {model_name} - performing training')
    model.fit(X_train, y_train) # Training the Model with training dataset

    # Predicting Values of test dataset
    t2=time.time()
    print(f'{datetime.datetime.fromtimestamp(t2).strftime("%Y-%m-%d %H:%M:%S")} - {model_name} - predecting test dataset')
    y_test_pred = model.predict(X_test)
    
    # Evaluating Model Performance
    t3=time.time()
    print(f'{datetime.datetime.fromtimestamp(t3).strftime("%Y-%m-%d %H:%M:%S")} - {model_name} - evaluating performance of test dataset')
    model_test_mae ,model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)    
    

    model_test_adjusted_r2 = (1 - (1-model_test_r2)*(len(y)-1)/(len(y)-x.shape[1]-1))
    model_test_mae = round(model_test_mae,2)
    model_test_mse = round(model_test_mse,2)
    model_test_rmse = round(model_test_rmse,2)
    model_test_r2 = round(model_test_r2,2)
    model_test_adjusted_r2 = round(model_test_adjusted_r2,2)


    test_performance_metrics=pd.DataFrame({'Model Name':f'{model_name} (Test)', 
                                        'MAE':[model_test_mae] ,
                                        'MSE':[model_test_mse] ,
                                        'RMSE':[model_test_rmse] ,
                                        'R2 Score':[model_test_r2],
                                        'Adjusted R2 Score':[model_test_adjusted_r2]
                                        })
    df_ModelPerformance = pd.concat([test_performance_metrics,df_ModelPerformance], ignore_index=True)
print('-'*80)

--------------------------------------------------------------------------------
2024-10-23 23:00:20 - Random Forest - performing training
2024-10-23 23:00:21 - Random Forest - predecting test dataset
2024-10-23 23:00:21 - Random Forest - evaluating performance of test dataset
--------------------------------------------------------------------------------


In [72]:
df_ModelPerformance

Unnamed: 0,Model Name,MAE,MSE,RMSE,R2 Score,Adjusted R2 Score
0,Random Forest (Test),2724.26,21912515.06,4681.08,0.83,0.83


In [73]:
def hyperparameter_tuning(X_train,y_train,model_params):
    rf=RandomForestRegressor()
    grid_search=GridSearchCV(estimator=rf,param_grid=model_params,cv=3,n_jobs=-1,verbose=2,
                             scoring="neg_mean_squared_error")
    grid_search.fit(X_train,y_train)
    return grid_search

In [74]:

## Split data into training and test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20)

from mlflow.models import infer_signature
signature=infer_signature(X_train,y_train)

model_params = {
    'n_estimators': [100, 200],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

## start the MLFLOW Experiments

with mlflow.start_run():
    ## Perform hyperparameter tuning
    grid_search=hyperparameter_tuning(X_train,y_train,model_params)

    ## Get the best model
    best_model=grid_search.best_estimator_

    ## Evaluate the best model
    y_pred=best_model.predict(X_test)
    mse=mean_squared_error(y_test,y_pred)

    ## Log best parameters and metrics
    mlflow.log_param("best_n_estimators",grid_search.best_params_['n_estimators'])
    mlflow.log_param("best_max_depth", grid_search.best_params_['max_depth'])
    mlflow.log_param("best_min_samples_split", grid_search.best_params_['min_samples_split'])
    mlflow.log_param("best_min_samples_leaf", grid_search.best_params_['min_samples_leaf'])
    mlflow.log_metric("mse",mse)


 ## Tracking url

    mlflow.set_tracking_uri(uri="http://127.0.0.1:5000")
    tracking_url_type_store=urlparse(mlflow.get_tracking_uri()).scheme

    if tracking_url_type_store !='file':
        mlflow.sklearn.log_model(best_model,"model",registered_model_name="Best Randomforest Model")
    else:
        mlflow.sklearn.log_model(best_model,"model",signature=signature)

    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Mean Squared Error: {mse}")

Fitting 3 folds for each of 24 candidates, totalling 72 fits


Successfully registered model 'Best Randomforest Model'.
2024/10/23 23:00:40 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Best Randomforest Model, version 1
Created version '1' of model 'Best Randomforest Model'.
2024/10/23 23:00:40 INFO mlflow.tracking._tracking_service.client: 🏃 View run honorable-dove-907 at: http://127.0.0.1:5000/#/experiments/0/runs/a628104677774b91aef35546bee1a8a0.
2024/10/23 23:00:40 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0.


Best Hyperparameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Mean Squared Error: 18162667.5138712
