#### Insurance  liner dataset

## Data reading

In [1]:
import pandas  as  pd
import  numpy as  np
import matplotlib.pyplot as plt
import seaborn as  sns

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
path  =  r"C:\Users\admin\Downloads\insuranceLiner.xlsx"

In [4]:
df  =  pd.read_excel(path)

In [5]:
##  basic  checks

In [6]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [7]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [9]:
# check  duplicates
df.duplicated().sum()

np.int64(1)

In [10]:
# drop duplicate
df.drop_duplicates(keep = 'first' ,
                     inplace  =  True  , 
                     ignore_index=True) 

In [11]:
df.shape

(1337, 7)

In [12]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [13]:
##  seperate  x and y
x  =  df.drop('charges' , axis  = 1 )
y  =  df['charges'] 

#### seperation of  cat and  num data

In [14]:
cat  =  x.columns[x.dtypes == 'object']
num  =  x.columns[x.dtypes != 'object']

In [15]:
print(cat)
print()
print(num)

Index(['sex', 'smoker', 'region'], dtype='object')

Index(['age', 'bmi', 'children'], dtype='object')


### create pipeline

In [16]:
from  sklearn.pipeline  import Pipeline
from  sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose  import ColumnTransformer

In [17]:
num_pipeline =  Pipeline(steps  =  [('imputer' , SimpleImputer(strategy  = 'mean' )),
                                     ('scaler' ,  StandardScaler())])

cat_pipeline  = Pipeline(steps  =  [('imputer', SimpleImputer(strategy='most_frequent'))  ,  ('encoder' , OneHotEncoder(handle_unknown='ignore' ))]) 

In [20]:
##  compose  these  pipeline 
pre  =  ColumnTransformer([('num_pipeline'  ,  num_pipeline ,  num) ,  
                            ('cat_pipeline' , cat_pipeline , cat)])

In [21]:
pre

0,1,2
,transformers,"[('num_pipeline', ...), ('cat_pipeline', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'


In [22]:
## apply to  data
x_scaled  =  pre.fit_transform(x)


In [23]:
x_scaled

array([[-1.44041773, -0.45315959, -0.90923416, ...,  0.        ,
         0.        ,  1.        ],
       [-1.51164747,  0.50942165, -0.07944162, ...,  0.        ,
         1.        ,  0.        ],
       [-0.79935006,  0.3831546 ,  1.58014347, ...,  0.        ,
         1.        ,  0.        ],
       ...,
       [-1.51164747,  1.01448983, -0.90923416, ...,  0.        ,
         1.        ,  0.        ],
       [-1.29795825, -0.79752426, -0.90923416, ...,  0.        ,
         0.        ,  1.        ],
       [ 1.55123139, -0.26129928, -0.90923416, ...,  1.        ,
         0.        ,  0.        ]], shape=(1337, 11))

In [24]:
# convert in dataframe
x_scaled =  pd.DataFrame(x_scaled ,columns  = pre.get_feature_names_out())

In [25]:
x_scaled

Unnamed: 0,num_pipeline__age,num_pipeline__bmi,num_pipeline__children,cat_pipeline__sex_female,cat_pipeline__sex_male,cat_pipeline__smoker_no,cat_pipeline__smoker_yes,cat_pipeline__region_northeast,cat_pipeline__region_northwest,cat_pipeline__region_southeast,cat_pipeline__region_southwest
0,-1.440418,-0.453160,-0.909234,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,-1.511647,0.509422,-0.079442,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-0.799350,0.383155,1.580143,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
3,-0.443201,-1.305052,-0.909234,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
4,-0.514431,-0.292456,-0.909234,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1332,0.767704,0.050269,1.580143,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
1333,-1.511647,0.206053,-0.909234,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
1334,-1.511647,1.014490,-0.909234,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1335,-1.297958,-0.797524,-0.909234,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


#### train test  split

In [26]:
from sklearn.model_selection import train_test_split
x_train ,  x_test  ,  y_train  ,  y_test  =  train_test_split(x_scaled ,  y  ,  
                                                            test_size  =  0.2  , 
                                                            random_state= 4)

In [27]:
print(x_train.shape  ,  y_train.shape)
print(x_test.shape , y_test.shape)

(1069, 11) (1069,)
(268, 11) (268,)


In [28]:
x_train.head()

Unnamed: 0,num_pipeline__age,num_pipeline__bmi,num_pipeline__children,cat_pipeline__sex_female,cat_pipeline__sex_male,cat_pipeline__smoker_no,cat_pipeline__smoker_yes,cat_pipeline__region_northeast,cat_pipeline__region_northwest,cat_pipeline__region_southeast,cat_pipeline__region_southwest
127,0.910164,1.104681,-0.909234,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
578,0.910164,-0.075998,-0.079442,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
385,-1.440418,0.612731,-0.909234,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1203,-1.511647,-0.554829,1.580143,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
413,-1.013039,-1.109092,3.239729,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0


In [29]:
y_train.head()

127      9634.5380
578      9724.5300
385      1261.8590
1203    18223.4512
413      5080.0960
Name: charges, dtype: float64

In [30]:
x_test.head()

Unnamed: 0,num_pipeline__age,num_pipeline__bmi,num_pipeline__children,cat_pipeline__sex_female,cat_pipeline__sex_male,cat_pipeline__smoker_no,cat_pipeline__smoker_yes,cat_pipeline__region_northeast,cat_pipeline__region_northwest,cat_pipeline__region_southeast,cat_pipeline__region_southwest
726,-0.72812,-1.460836,-0.079442,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
260,1.337542,-0.895914,-0.909234,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
902,0.696474,1.01449,-0.909234,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
566,-0.087053,1.623687,-0.079442,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
776,0.411556,1.49906,-0.909234,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0


In [31]:
y_test.head()

726    16657.71745
260    11837.16000
902     8125.78450
566     6373.55735
776     7448.40395
Name: charges, dtype: float64

In [None]:
## model  fitting
from sklearn.neighbors import KNeighborsRegressor
KNN  =  KNeighborsRegressor(n_neighbors= 5   , )

In [33]:
KNN.fit(x_train  , y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [35]:
KNN.score(x_train  ,  y_train)

0.8440921336968048

In [36]:
KNN.score(x_test ,  y_test)

0.7866671795207615

###  grid  search cv

In [40]:
from  sklearn.model_selection import GridSearchCV
model  =  KNeighborsRegressor()
para  =  {'n_neighbors'  :  range(3 , 15), 
          'metric' :  ['euclidean' , 'minkwoski' , 'manhattan']}

In [41]:
KNNGrid  = GridSearchCV( estimator  =  model  ,  
                        cv  = 5  ,
                        n_jobs  =  -1  ,
                        param_grid= para)

In [42]:
KNNGrid.fit(x_train  ,  y_train)

0,1,2
,estimator,KNeighborsRegressor()
,param_grid,"{'metric': ['euclidean', 'minkwoski', ...], 'n_neighbors': range(3, 15)}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_neighbors,3
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'manhattan'
,metric_params,
,n_jobs,


In [43]:
KNNGrid.best_params_

{'metric': 'manhattan', 'n_neighbors': 3}

In [44]:
KNNGrid.score(x_train  ,  y_train)

0.890228678988307

In [45]:
KNNGrid.score(x_test  ,  y_test)

0.7772031174797138

In [46]:
##   model  performance
from sklearn.metrics import (mean_squared_error, 
                            root_mean_squared_error , 
                            mean_absolute_error , 
                            mean_absolute_percentage_error ,  
                            r2_score)
def  evaluate(model  , x ,  y):
    pred  =  model.predict(x)
    MSE  =  mean_squared_error(y  ,  pred)
    RMSE  =  root_mean_squared_error(y  ,  pred)
    MAE  = mean_absolute_error(y  , pred)
    MAPE  =  mean_absolute_percentage_error(y ,  pred)
    r2  =  r2_score(y ,  pred)

    print(f'MSE --- >  {MSE}')
    print(f'RMSE --- >  {RMSE}')
    print(f'MAE --- >  {MAE}')
    print(f'MAPE --- >  {MAPE}')
    print(f'R2 --- >  {r2}')

In [47]:
#  check evaluation  on grid  KNN  training 
evaluate(KNN ,  x_train ,  y_train)

MSE --- >  23612819.52463542
RMSE --- >  4859.302370159262
MAE --- >  3010.221478503648
MAPE --- >  0.30888893044076426
R2 --- >  0.8440921336968048


In [48]:
evaluate(KNNGrid ,  x_train ,  y_train)

MSE --- >  16625270.125814069
RMSE --- >  4077.409732393112
MAE --- >  2358.3555415104465
MAPE --- >  0.2294062900700521
R2 --- >  0.890228678988307


In [49]:
#  testing evaluation 
evaluate(KNN  ,  x_test  ,  y_test)

MSE --- >  26790572.219807845
RMSE --- >  5175.960994811287
MAE --- >  2968.402209985821
MAPE --- >  0.3233049286702859
R2 --- >  0.7866671795207615


In [50]:
evaluate(KNNGrid ,  x_test  , y_test)

MSE --- >  27979079.628249973
RMSE --- >  5289.525463427695
MAE --- >  2912.6748794776113
MAPE --- >  0.3221584929911413
R2 --- >  0.7772031174797138


##### Decision tree regression

In [51]:
from  sklearn.tree  import DecisionTreeRegressor
DT  =  DecisionTreeRegressor(max_depth  =  4)


In [52]:
DT.fit(x_train ,  y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [54]:
DT.score(x_train , y_train)

0.8690997332563295

In [55]:
DT.score(x_test  ,  y_test)

0.8556702334034949

In [56]:
evaluate(DT  , x_train ,  y_train)

MSE --- >  19825326.634475157
RMSE --- >  4452.564051698208
MAE --- >  2619.8771619819527
MAPE --- >  0.27944667170543563
R2 --- >  0.8690997332563295


In [57]:
evaluate(DT  , x_test  , y_test)

MSE --- >  18125092.17655979
RMSE --- >  4257.357417055771
MAE --- >  2419.3700863829035
MAPE --- >  0.2788777747823843
R2 --- >  0.8556702334034949


In [58]:
## grid  search 
DTmodel  = DecisionTreeRegressor()
para  =  {'max_depth' :  range(4  , 10) }
GridDT  =  GridSearchCV(DTmodel  , 
                        param_grid  =  para  ,
                         cv =  5  ,
                          n_jobs = -1 )
 

In [59]:
GridDT.fit(x_train  ,  y_train)

0,1,2
,estimator,DecisionTreeRegressor()
,param_grid,"{'max_depth': range(4, 10)}"
,scoring,
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [60]:
evaluate(GridDT ,  x_train  ,  y_train)

MSE --- >  19825326.634475157
RMSE --- >  4452.564051698208
MAE --- >  2619.8771619819527
MAPE --- >  0.27944667170543563
R2 --- >  0.8690997332563295


In [61]:
evaluate(GridDT ,  x_test ,  y_test)

MSE --- >  18125092.176559787
RMSE --- >  4257.357417055771
MAE --- >  2419.370086382903
MAPE --- >  0.2788777747823843
R2 --- >  0.8556702334034949


### Linear  regression  

In [62]:
from  sklearn.linear_model import LinearRegression
LR  =  LinearRegression()

In [63]:
LR.fit(x_train , y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [64]:
evaluate(LR  ,  x_train ,  y_train)

MSE --- >  37130688.14480461
RMSE --- >  6093.495560415598
MAE --- >  4250.192416235064
MAPE --- >  0.42542139436770426
R2 --- >  0.7548379871795419


In [65]:
evaluate(LR  , x_test  , y_test)

MSE --- >  34847537.91248279
RMSE --- >  5903.180321867425
MAE --- >  4126.465266241464
MAPE --- >  0.4508414750922211
R2 --- >  0.722509713916052


##### check  for  RFR ,  GradientBossingRegressor  ,  XGB

###### 

##### from  above  models  DT  giving us  best performance  which  we have to  use for  future  prediction