In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings

from sklearn.preprocessing import (OneHotEncoder,LabelEncoder,OrdinalEncoder,StandardScaler, MinMaxScaler,RobustScaler)
from sklearn.pipeline import Pipeline

from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import(RandomForestRegressor, 
                            GradientBoostingRegressor, 
                            AdaBoostRegressor)
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score,root_mean_squared_error


from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV
from sklearn.compose import ColumnTransformer

warnings.filterwarnings("ignore")


In [2]:
df = pd.read_csv('/Users/suryasaikadali/Downloads/pw_skills/kaggle/insurance_prediction/artifacts/insurance.csv')

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [4]:
x = df.drop('expenses',axis = 1)
y = df[['expenses']]

In [5]:
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest


In [6]:
y.head()

Unnamed: 0,expenses
0,16884.92
1,1725.55
2,4449.46
3,21984.47
4,3866.86


In [7]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 42)

In [8]:
x_train.shape,x_test.shape

((1070, 6), (268, 6))

In [9]:
numerical_cols = [i for i in x.columns if x[i].dtype in ('int','float')]

In [10]:
numerical_cols

['age', 'bmi', 'children']

In [11]:
categorical_cols = [i for i in x.columns if x[i].dtype == 'O' and i!='smoker']

In [12]:
categorical_cols

['sex', 'region']

In [13]:
ord_categorical_cols = [i for i in x.columns if x[i].dtype == 'O' and i=='smoker']

In [14]:
ord_categorical_cols

['smoker']

In [15]:
smoker_categories = [
    'no',
    'yes'
]

In [16]:
sc = StandardScaler()

In [17]:
# 1st experiment of preprocessing class 
class Preprocessor1:
    def __init__(self,x_train,x_test):
        self.x_train = x_train
        self.x_test = x_test
        
    def cat_pipeline(self):
        ohe = OneHotEncoder(drop='first', sparse_output=False)
        
        cat_encoded_train = pd.DataFrame(ohe.fit_transform(self.x_train[categorical_cols]),
                                         columns = ohe.get_feature_names_out(categorical_cols))
        
        
        cat_encoded_test = pd.DataFrame(ohe.transform(self.x_test[categorical_cols]),
                                        columns = ohe.get_feature_names_out(categorical_cols))
        
        
        return(cat_encoded_train,cat_encoded_test)

    def ord_pipeline(self):
        ode = OrdinalEncoder(categories=[smoker_categories])
        
        ord_encoded_train = pd.DataFrame(ode.fit_transform(self.x_train[ord_categorical_cols]),
                                         columns = ode.get_feature_names_out(ord_categorical_cols))
        ord_encoded_train = pd.DataFrame(sc.fit_transform(ord_encoded_train),
                                         columns = ode.get_feature_names_out(ord_categorical_cols))
        
        ord_encoded_test = pd.DataFrame(ode.transform(self.x_test[ord_categorical_cols]),
                                         columns = ode.get_feature_names_out(ord_categorical_cols))
        ord_encoded_test = pd.DataFrame(sc.transform(ord_encoded_test),
                                        columns = ode.get_feature_names_out(ord_categorical_cols))
        
        return(ord_encoded_train,ord_encoded_test)
        
    def num_pipeline(self):
        num_encoded_train = pd.DataFrame(sc.fit_transform(self.x_train[numerical_cols]),
                                         columns = numerical_cols)
        num_encoded_test = pd.DataFrame(sc.transform(self.x_test[numerical_cols]),
                                        columns = numerical_cols)
        
        return(num_encoded_train,num_encoded_test)

    def preprocessor(self):
        
        x_train_1,x_test_1 = self.cat_pipeline()
        x_train_2,x_test_2 = self.ord_pipeline()
        x_train_3,x_test_3 = self.num_pipeline()

        self.x_train = pd.concat([x_train_1,x_train_2, x_train_3],
                                  axis = 1)
        self.x_test = pd.concat([x_test_1,x_test_2, x_test_3],
                                 axis = 1)
        
        return(self.x_train,self.x_test)
        
        

In [18]:
preprocessor1 = Preprocessor1(x_train,x_test)

In [19]:
new_train_1,new_test_1 = preprocessor1.preprocessor()

In [20]:
new_train_1

Unnamed: 0,sex_male,region_northwest,region_southeast,region_southwest,smoker,age,bmi,children
0,0.0,1.0,0.0,0.0,-0.508747,0.472227,-1.748572,0.734336
1,0.0,0.0,0.0,0.0,-0.508747,0.543313,-1.036704,-0.911192
2,0.0,0.0,1.0,0.0,-0.508747,0.898745,-0.937373,-0.911192
3,0.0,0.0,1.0,0.0,-0.508747,-0.025379,0.618804,3.202629
4,0.0,1.0,0.0,0.0,-0.508747,1.040918,-1.500246,1.557100
...,...,...,...,...,...,...,...,...
1065,0.0,0.0,0.0,0.0,-0.508747,-1.518194,0.138707,2.379865
1066,0.0,0.0,1.0,0.0,-0.508747,-0.025379,-1.102924,3.202629
1067,1.0,0.0,0.0,0.0,-0.508747,1.325264,-0.887708,-0.911192
1068,0.0,0.0,0.0,1.0,1.965613,-0.167551,2.820630,0.734336


In [21]:
new_test_1

Unnamed: 0,sex_male,region_northwest,region_southeast,region_southwest,smoker,age,bmi,children
0,0.0,0.0,0.0,0.0,-0.508747,0.401140,-0.887708,0.734336
1,0.0,1.0,0.0,0.0,-0.508747,-0.238638,-0.093064,-0.911192
2,0.0,1.0,0.0,0.0,1.965613,1.751782,-0.606272,-0.911192
3,1.0,1.0,0.0,0.0,-0.508747,0.472227,-0.804933,1.557100
4,1.0,1.0,0.0,0.0,1.965613,-1.447107,0.221482,-0.911192
...,...,...,...,...,...,...,...,...
263,1.0,0.0,1.0,0.0,1.965613,1.680696,0.751245,-0.911192
264,0.0,1.0,0.0,0.0,-0.508747,1.325264,-0.556607,-0.911192
265,1.0,0.0,0.0,0.0,-0.508747,-0.096465,-0.424166,-0.088428
266,0.0,0.0,1.0,0.0,1.965613,1.040918,2.787520,-0.911192


In [22]:
# 2nd experiment of preprocessing class with labelencoder
class Preprocessor2:
    def __init__(self,x_train,x_test):
        self.x_train = x_train
        self.x_test = x_test
        
    def cat_pipeline(self):
        lb = LabelEncoder()
        lb_train = pd.DataFrame()
        lb_test = pd.DataFrame()
        
        for i in categorical_cols:
            lb_train[i] = lb.fit_transform(self.x_train[i])  
            lb_test[i] = lb.transform(self.x_test[i])
        
        return(lb_train,lb_test)

    def ord_pipeline(self):
        ode = OrdinalEncoder(categories=[smoker_categories])
        
        ord_encoded_train = pd.DataFrame(ode.fit_transform(self.x_train[ord_categorical_cols]),
                                         columns = ode.get_feature_names_out(ord_categorical_cols))
        ord_encoded_train = pd.DataFrame(sc.fit_transform(ord_encoded_train),
                                         columns = ode.get_feature_names_out(ord_categorical_cols))
        
        ord_encoded_test = pd.DataFrame(ode.transform(self.x_test[ord_categorical_cols]),
                                         columns = ode.get_feature_names_out(ord_categorical_cols))
        ord_encoded_test = pd.DataFrame(sc.transform(ord_encoded_test),
                                        columns = ode.get_feature_names_out(ord_categorical_cols))
        
        return(ord_encoded_train,ord_encoded_test)
        
    def num_pipeline(self):
        num_encoded_train = pd.DataFrame(sc.fit_transform(self.x_train[numerical_cols]),
                                         columns = numerical_cols)
        num_encoded_test = pd.DataFrame(sc.transform(self.x_test[numerical_cols]),
                                        columns = numerical_cols)
        
        return(num_encoded_train,num_encoded_test)

    def preprocessor(self):
        
        x_train_1,x_test_1 = self.cat_pipeline()
        x_train_2,x_test_2 = self.ord_pipeline()
        x_train_3,x_test_3 = self.num_pipeline()

        self.x_train = pd.concat([x_train_1,x_train_2, x_train_3],
                                  axis = 1)
        self.x_test = pd.concat([x_test_1,x_test_2, x_test_3],
                                 axis = 1)
        
        return(self.x_train,self.x_test)
        
        

In [23]:
preprocessor2 = Preprocessor2(x_train,x_test)

In [24]:
new_train_2,new_test_2 = preprocessor2.preprocessor()

In [25]:
models = {
    'xgb':XGBRegressor(),
    'random_forest':RandomForestRegressor(),
    'KNNregressor':KNeighborsRegressor(),
    'gradient_boost':GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor()
    
}

In [26]:
def model_evaluation(x_train,x_test,y_train,y_test):
    total_scores_test = {}
    total_errors = {}
    for key,value in models.items():
        model = value.fit(x_train,y_train)
        
        y_pred = model.predict(x_test)
        
        score = r2_score(y_test,y_pred)
        
        error = root_mean_squared_error(y_test,y_pred)
        
        total_scores_test.update({key:score})
        total_errors.update({key:error})
        
    print(total_scores_test)
    print('%%%%%%%%%%%')
    print('%%%%%%%%%%%')
    print(total_errors)
    

In [27]:
model_evaluation(new_train_1,new_test_1,y_train,y_test)

{'xgb': 0.812964677810669, 'random_forest': 0.8630462752926327, 'KNNregressor': 0.8415310014946985, 'gradient_boost': 0.8797924649325901, 'adaboost': 0.8402215159278391}
%%%%%%%%%%%
%%%%%%%%%%%
{'xgb': 5388.598572745468, 'random_forest': 4611.061953608586, 'KNNregressor': 4960.05154740713, 'gradient_boost': 4319.961528030177, 'adaboost': 4980.502718503885}


In [28]:
model_evaluation(new_train_2,new_test_2,y_train,y_test)

{'xgb': 0.8433202505111694, 'random_forest': 0.8684705631954367, 'KNNregressor': 0.8396291104283747, 'gradient_boost': 0.87575740018233, 'adaboost': 0.829473570440119}
%%%%%%%%%%%
%%%%%%%%%%%
{'xgb': 4931.970497956028, 'random_forest': 4518.8249038589165, 'KNNregressor': 4989.727200225415, 'gradient_boost': 4391.8682011924275, 'adaboost': 5145.2902985312785}


In [29]:
# 3rd experiment of preprocessing class with labelencoder and robust scaler
class Preprocessor3:
    def __init__(self,x_train,x_test):
        self.x_train = x_train
        self.x_test = x_test
        
    def cat_pipeline(self):
        lb = LabelEncoder()
        lb_train = pd.DataFrame()
        lb_test = pd.DataFrame()
        
        for i in categorical_cols:
            lb_train[i] = lb.fit_transform(self.x_train[i])  
            lb_test[i] = lb.transform(self.x_test[i])
        
        return(lb_train,lb_test)

    def ord_pipeline(self):
        ode = OrdinalEncoder(categories=[smoker_categories])
        
        ord_encoded_train = pd.DataFrame(ode.fit_transform(self.x_train[ord_categorical_cols]),
                                         columns = ode.get_feature_names_out(ord_categorical_cols))
        ord_encoded_train = pd.DataFrame(sc.fit_transform(ord_encoded_train),
                                         columns = ode.get_feature_names_out(ord_categorical_cols))
        
        ord_encoded_test = pd.DataFrame(ode.transform(self.x_test[ord_categorical_cols]),
                                         columns = ode.get_feature_names_out(ord_categorical_cols))
        ord_encoded_test = pd.DataFrame(sc.transform(ord_encoded_test),
                                        columns = ode.get_feature_names_out(ord_categorical_cols))
        
        return(ord_encoded_train,ord_encoded_test)
        
    def num_pipeline(self):
        rs = RobustScaler()
        num_encoded_train = pd.DataFrame(rs.fit_transform(self.x_train[numerical_cols]),
                                         columns = numerical_cols)
        num_encoded_test = pd.DataFrame(rs.transform(self.x_test[numerical_cols]),
                                        columns = numerical_cols)
        
        return(num_encoded_train,num_encoded_test)

    def preprocessor(self):
        
        x_train_1,x_test_1 = self.cat_pipeline()
        x_train_2,x_test_2 = self.ord_pipeline()
        x_train_3,x_test_3 = self.num_pipeline()

        self.x_train = pd.concat([x_train_1,x_train_2, x_train_3],
                                  axis = 1)
        self.x_test = pd.concat([x_test_1,x_test_2, x_test_3],
                                 axis = 1)
        
        return(self.x_train,self.x_test)
        
        

In [30]:
preprocessor3 = Preprocessor3(x_train,x_test)

In [31]:
new_train_3,new_test_3 = preprocessor3.preprocessor()

In [32]:
model_evaluation(new_train_3,new_test_3,y_train,y_test)

{'xgb': 0.8433202505111694, 'random_forest': 0.8608720813887732, 'KNNregressor': 0.8118389898170828, 'gradient_boost': 0.8753148077034604, 'adaboost': 0.8247093301484981}
%%%%%%%%%%%
%%%%%%%%%%%
{'xgb': 4931.970497956028, 'random_forest': 4647.519036052223, 'KNNregressor': 5404.790335220541, 'gradient_boost': 4399.683877115129, 'adaboost': 5216.6708280861885}


In [39]:
list(np.arange(1,8,2))

[1, 3, 5, 7]

In [40]:
m1 = RandomForestRegressor(oob_score = True)

In [42]:
params = {
    'n_estimators': list(np.arange(100,301,50)),
    'max_depth': list(np.arange(1,8,2)),
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
        
}

In [46]:
gr_rand_for = GridSearchCV(estimator = m1,
                           param_grid = params, 
                           cv = 5,
                           scoring = 'r2',
                           verbose = 3)

In [48]:
gr_rand_for.fit(new_train_1,y_train)

Fitting 5 folds for each of 80 candidates, totalling 400 fits
[CV 1/5] END criterion=squared_error, max_depth=1, n_estimators=100;, score=0.616 total time=   0.1s
[CV 2/5] END criterion=squared_error, max_depth=1, n_estimators=100;, score=0.648 total time=   0.1s
[CV 3/5] END criterion=squared_error, max_depth=1, n_estimators=100;, score=0.589 total time=   0.1s
[CV 4/5] END criterion=squared_error, max_depth=1, n_estimators=100;, score=0.524 total time=   0.1s
[CV 5/5] END criterion=squared_error, max_depth=1, n_estimators=100;, score=0.637 total time=   0.1s
[CV 1/5] END criterion=squared_error, max_depth=1, n_estimators=150;, score=0.616 total time=   0.1s
[CV 2/5] END criterion=squared_error, max_depth=1, n_estimators=150;, score=0.648 total time=   0.1s
[CV 3/5] END criterion=squared_error, max_depth=1, n_estimators=150;, score=0.589 total time=   0.1s
[CV 4/5] END criterion=squared_error, max_depth=1, n_estimators=150;, score=0.524 total time=   0.1s
[CV 5/5] END criterion=square

In [51]:
best_params = gr_rand_for.best_params_

In [52]:
best_params

{'criterion': 'poisson', 'max_depth': 5, 'n_estimators': 250}

In [50]:
gr_rand_for.best_score_

0.8457630592145204

In [53]:
best_rf = RandomForestRegressor(**best_params,oob_score = True)

In [57]:
best_rf.fit(new_train_1,y_train)

In [58]:
y_pred = best_rf.predict(new_test_1)

In [59]:
r2_score(y_test,y_pred)

0.8738402336485553

In [63]:
best_rf.fit(new_train_3,y_train)

In [64]:
y_pred = best_rf.predict(new_test_3)

In [65]:
r2_score(y_test,y_pred)

0.8735676794139597

In [86]:
best_rf.feature_importances_

array([0.00145959, 0.00705105, 0.67504804, 0.19277806, 0.10394345,
       0.01971981])

In [87]:
best_rf.feature_names_in_

array(['sex', 'region', 'smoker', 'age', 'bmi', 'children'], dtype=object)

In [66]:
m2 = GradientBoostingRegressor()

In [67]:
params = {
    'loss':['squared_error', 'absolute_error','huber'],
    'learning_rate':[0.05,0.1,0.15],
    'n_estimators': list(np.arange(100,301,50)),
    'max_depth': list(np.arange(1,8,2)),
    'criterion': ['squared_error', 'friedman_mse']
        
}

In [68]:
gr_boost = GridSearchCV(estimator = m2,
                        param_grid = params, 
                        cv = 5,
                        scoring = 'r2',
                        verbose = 3)

In [69]:
gr_boost.fit(new_train_2,y_train)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
[CV 1/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=100;, score=0.703 total time=   0.1s
[CV 2/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=100;, score=0.774 total time=   0.0s
[CV 3/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=100;, score=0.720 total time=   0.0s
[CV 4/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=100;, score=0.667 total time=   0.0s
[CV 5/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=100;, score=0.726 total time=   0.0s
[CV 1/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=150;, score=0.714 total time=   0.0s
[CV 2/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1,

In [70]:
best_params = gr_boost.best_params_

In [71]:
best_params

{'criterion': 'friedman_mse',
 'learning_rate': 0.05,
 'loss': 'huber',
 'max_depth': 3,
 'n_estimators': 100}

In [72]:
gr_boost.best_score_

0.8483564048785996

In [73]:
best_gr = GradientBoostingRegressor(**best_params)

In [74]:
best_gr.fit(new_train_2,y_train)

In [75]:
y_pred = best_gr.predict(new_test_2)

In [76]:
score = r2_score(y_test,y_pred)

In [77]:
score

0.8765254617524345

In [82]:
best_gr.fit(new_train_1,y_train)

In [83]:
y_pred = best_gr.predict(new_test_1)

In [84]:
score = r2_score(y_test,y_pred)

In [85]:
score

0.8759793474117812

In [78]:
gr_boost.fit(new_train_1,y_train)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
[CV 1/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=100;, score=0.703 total time=   0.1s
[CV 2/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=100;, score=0.774 total time=   0.0s
[CV 3/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=100;, score=0.720 total time=   0.0s
[CV 4/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=100;, score=0.667 total time=   0.0s
[CV 5/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=100;, score=0.726 total time=   0.0s
[CV 1/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1, n_estimators=150;, score=0.714 total time=   0.0s
[CV 2/5] END criterion=squared_error, learning_rate=0.05, loss=squared_error, max_depth=1,

In [79]:
best_params = gr_boost.best_params_

In [80]:
best_params

{'criterion': 'friedman_mse',
 'learning_rate': 0.05,
 'loss': 'huber',
 'max_depth': 3,
 'n_estimators': 100}

In [81]:
gr_boost.best_score_

0.8481167855826215