In [2]:
# import libraries needed for exploratory data analysis (eda) and feature engineering (fe)
import os
import time
import datetime
import pandas as pd
pd.set_option('display.max_columns',None)
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# import libraries needed for model training
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor, ExtraTreesRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [3]:
df = pd.read_csv('../data/insurance.csv')

In [4]:
df.head(5) #display head (top 5 rows)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
print(f"Shape: ",df.shape) #get total shape of dataset, total rows and columns
print("Number of Columns:", df.shape[1])
print("Number of Rows:", df.shape[0])

Shape:  (1338, 7)
Number of Columns: 7
Number of Rows: 1338


In [6]:
df.info() #quick info about data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
df.describe().transpose() #statistics for numerical datatypes

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
bmi,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
children,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
charges,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [8]:
df.isna().sum() #number of missing values per column

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [9]:
df.dropna() #drop rows with any NA values

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [10]:
print("Number of Duplicates: ", df.duplicated().sum())

Number of Duplicates:  1


In [11]:
df.nunique() #number of unique values in each column

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64

In [12]:
df.columns #show all cloumns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [13]:
numerical_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']

print('Numerical Features : {} : {}'.format(len(numerical_features), numerical_features))
print('Categorical Features : {} : {}'.format(len(categorical_features), categorical_features))

Numerical Features : 4 : ['age', 'bmi', 'children', 'charges']
Categorical Features : 3 : ['sex', 'smoker', 'region']


In [14]:
#get unique values in categorical columns
for column in categorical_features:
    unique_values = df[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")

Unique values in column 'sex': ['female' 'male']
Unique values in column 'smoker': ['yes' 'no']
Unique values in column 'region': ['southwest' 'southeast' 'northwest' 'northeast']


In [15]:
x = df.drop(columns=['charges'],axis=1) #dataframe contains all cloumns which shold be used to predicted
y=df['charges'] #series contains to be predicted

In [16]:
print(x.head())
print(type(x)) #datatype of x

   age     sex     bmi  children smoker     region
0   19  female  27.900         0    yes  southwest
1   18    male  33.770         1     no  southeast
2   28    male  33.000         3     no  southeast
3   33    male  22.705         0     no  northwest
4   32    male  28.880         0     no  northwest
<class 'pandas.core.frame.DataFrame'>


In [17]:
print(y.head())
print(type(y)) #datatype of y

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64
<class 'pandas.core.series.Series'>


In [18]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numerical_features = x.select_dtypes(exclude="object").columns
categorical_features = x.select_dtypes(include="object").columns

numerical_transformer = StandardScaler()
ohe_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", ohe_transformer, categorical_features),
         ("StandardScaler", numerical_transformer, numerical_features),
    ]
)
X = preprocessor.fit_transform(x)   #pre-processing source data x data and saving in X 
print(f"Shape of original data (x): {x.shape}")
print(f"Shape of transformed data (X): {X.shape}")

Shape of original data (x): (1338, 6)
Shape of transformed data (X): (1338, 11)


In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=None) #using 20% to test and 80% for training.
print(f"Shape of training data : {X_train.shape}")
print(f"Shape of test data : {X_test.shape}")

Shape of training data : (1070, 11)
Shape of test data : (268, 11)


In [20]:
#Initialise dataframe for Regression Performace Metrics
performance_metrics={
    'Model Name':[], 
    'MAE':[] ,
    'MSE':[] ,
    'RMSE':[] ,
    'R2 Score':[],
    'Adjusted R2 Score':[] 
    }
df_ModelPerformance=pd.DataFrame(performance_metrics)
print(type(df_ModelPerformance))
df_ModelPerformance.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Model Name,MAE,MSE,RMSE,R2 Score,Adjusted R2 Score


In [21]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error

#Define a function to evaluate model
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = root_mean_squared_error(true, predicted)
    r2_square = r2_score(true, predicted)
    return mae, mse, rmse, r2_square

In [22]:
#Define Models

models = {
    "Linear": LinearRegression(),
    "Lasso": Lasso(alpha=0.1),
    "Ridge": Ridge(),
    "Bagging": BaggingRegressor(),
    "ExtraTrees": ExtraTreesRegressor(),
    "SVR": SVR(),
    "K-Neighbors": KNeighborsRegressor(n_neighbors=5),
    "Random Forest": RandomForestRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "Gradient Boost": GradientBoostingRegressor(),
    "CatBoosting": CatBoostRegressor(verbose=False),
    "AdaBoost": AdaBoostRegressor()
}

for key, value in models.items():
    model_name = key
    model = value
    test_performance_metrics = {}

    print('-'*80)
    
    t1=time.time()
    print(f'{datetime.datetime.fromtimestamp(t1).strftime("%Y-%m-%d %H:%M:%S")} - {model_name} - performing training')
    model.fit(X_train, y_train) # Training the Model with training dataset

    # Predicting Values of test dataset
    t2=time.time()
    print(f'{datetime.datetime.fromtimestamp(t2).strftime("%Y-%m-%d %H:%M:%S")} - {model_name} - predecting test dataset')
    y_test_pred = model.predict(X_test)
    
    # Evaluating Model Performance
    t3=time.time()
    print(f'{datetime.datetime.fromtimestamp(t3).strftime("%Y-%m-%d %H:%M:%S")} - {model_name} - evaluating performance of test dataset')
    model_test_mae ,model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)    
    

    model_test_adjusted_r2 = (1 - (1-model_test_r2)*(len(y)-1)/(len(y)-x.shape[1]-1))
    model_test_mae = round(model_test_mae,2)
    model_test_mse = round(model_test_mse,2)
    model_test_rmse = round(model_test_rmse,2)
    model_test_r2 = round(model_test_r2,2)
    model_test_adjusted_r2 = round(model_test_adjusted_r2,2)


    test_performance_metrics=pd.DataFrame({'Model Name':f'{model_name} (Test)', 
                                        'MAE':[model_test_mae] ,
                                        'MSE':[model_test_mse] ,
                                        'RMSE':[model_test_rmse] ,
                                        'R2 Score':[model_test_r2],
                                        'Adjusted R2 Score':[model_test_adjusted_r2]
                                        })
    df_ModelPerformance = pd.concat([test_performance_metrics,df_ModelPerformance], ignore_index=True)
print('-'*80)

--------------------------------------------------------------------------------
2024-10-22 22:46:51 - Linear - performing training
2024-10-22 22:46:51 - Linear - predecting test dataset
2024-10-22 22:46:51 - Linear - evaluating performance of test dataset
--------------------------------------------------------------------------------
2024-10-22 22:46:51 - Lasso - performing training
2024-10-22 22:46:51 - Lasso - predecting test dataset
2024-10-22 22:46:51 - Lasso - evaluating performance of test dataset
--------------------------------------------------------------------------------
2024-10-22 22:46:51 - Ridge - performing training
2024-10-22 22:46:51 - Ridge - predecting test dataset
2024-10-22 22:46:51 - Ridge - evaluating performance of test dataset
--------------------------------------------------------------------------------
2024-10-22 22:46:51 - Bagging - performing training
2024-10-22 22:46:51 - Bagging - predecting test dataset
2024-10-22 22:46:51 - Bagging - evaluating per

In [23]:
pd.set_option('display.max_columns',None)

filepath = f'../outputs/{time.strftime("%Y%m%d_%H%M%S")}_ModelPerformance.csv'
df_ModelPerformance.to_csv(filepath)  
df_ModelPerformance.sort_values(by=['Adjusted R2 Score'], ascending=False)

Unnamed: 0,Model Name,MAE,MSE,RMSE,R2 Score,Adjusted R2 Score
2,Gradient Boost (Test),2351.76,16567420.0,4070.31,0.89,0.89
1,CatBoosting (Test),2486.04,18287200.0,4276.35,0.88,0.87
5,Random Forest (Test),2521.96,18883420.0,4345.51,0.87,0.87
0,AdaBoost (Test),3607.74,20446220.0,4521.75,0.86,0.86
9,Bagging (Test),2721.29,20915780.0,4573.38,0.86,0.86
8,ExtraTrees (Test),2384.9,21895550.0,4679.27,0.85,0.85
3,XGBRegressor (Test),3143.63,25521980.0,5051.93,0.83,0.83
6,K-Neighbors (Test),3611.36,33269020.0,5767.93,0.77,0.77
10,Ridge (Test),4084.12,33132380.0,5756.07,0.77,0.77
11,Lasso (Test),4080.7,33144950.0,5757.16,0.77,0.77
