In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score,root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (OneHotEncoder,OrdinalEncoder,StandardScaler, MinMaxScaler,RobustScaler)
from sklearn.svm import SVR
from xgboost import XGBRegressor

warnings.filterwarnings("ignore")


In [2]:
# Location of the insurance dataset. Set the INSURANCE_CSV environment variable
# to point at a local copy of insurance.csv; when it is unset, the original
# author's absolute path is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'INSURANCE_CSV',
    '/Users/suryasaikadali/Downloads/pw_skills/kaggle/insurance_prediction/artifacts/insurance.csv',
)
df = pd.read_csv(DATA_PATH)

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86


In [4]:
# Features: every column except the target.
x = df.drop(columns=['expenses'])
# Target, kept as a single-column DataFrame (not a Series).
y = df.loc[:, ['expenses']]

In [5]:
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest


In [6]:
y.head()

Unnamed: 0,expenses
0,16884.92
1,1725.55
2,4449.46
3,21984.47
4,3866.86


In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
x_train.shape,x_test.shape

((1070, 6), (268, 6))

In [9]:
# Numeric feature columns, selected by dtype family rather than by comparing
# against the strings 'int'/'float': those only match the platform default
# integer and float64, so int32/float32/nullable columns would be missed.
numerical_cols = x.select_dtypes(include='number').columns.tolist()

In [10]:
numerical_cols

['age', 'bmi', 'children']

In [11]:
# Object-dtype feature columns, excluding 'smoker'.
categorical_cols = [
    col
    for col in x.columns
    if col != 'smoker' and x[col].dtype == 'O'
]

In [12]:
categorical_cols

['sex', 'region']

In [13]:
# The 'smoker' column (if present and object-dtype), kept in its own list.
ord_categorical_cols = [
    col
    for col in x.columns
    if col == 'smoker' and x[col].dtype == 'O'
]

In [14]:
ord_categorical_cols

['smoker']

In [15]:
# Explicit category order for the smoker column: 'no' first, then 'yes'.
smoker_categories = ['no', 'yes']

In [16]:

# Module-level StandardScaler instance, created with default settings.
sc = StandardScaler()

In [20]:
class OnehotPreprocessor:
    """Manually encode and scale a train/test feature split.

    Three column groups are handled, each taken from a notebook-level name:

    * ``categorical_cols``     -> one-hot encoded (first level dropped)
    * ``ord_categorical_cols`` -> ordinal encoded with ``smoker_categories``,
      then standardised
    * ``numerical_cols``       -> standardised

    Every transformer is fitted on the training frame only and then applied
    to the test frame. Returned frames are built from plain arrays, so they
    carry a fresh default index rather than the input index.
    """

    def __init__(self,x_train,x_test):
        """Store the train and test feature frames to be transformed."""
        self.x_train = x_train
        self.x_test = x_test

    def cat_pipeline(self):
        """One-hot encode ``categorical_cols``.

        Returns:
            tuple: ``(train_frame, test_frame)`` of encoded columns.
        """
        ohe = OneHotEncoder(drop='first', sparse_output=False)

        cat_encoded_train = pd.DataFrame(
            ohe.fit_transform(self.x_train[categorical_cols]),
            columns=ohe.get_feature_names_out(categorical_cols),
        )
        cat_encoded_test = pd.DataFrame(
            ohe.transform(self.x_test[categorical_cols]),
            columns=ohe.get_feature_names_out(categorical_cols),
        )

        return(cat_encoded_train,cat_encoded_test)

    def ord_pipeline(self):
        """Ordinal-encode then standardise ``ord_categorical_cols``.

        Returns:
            tuple: ``(train_frame, test_frame)`` of encoded, scaled columns.
        """
        ode = OrdinalEncoder(categories=[smoker_categories])
        # Dedicated scaler: sharing one scaler object with num_pipeline would
        # let whichever method ran last overwrite the fitted statistics.
        scaler = StandardScaler()

        ord_train_codes = ode.fit_transform(self.x_train[ord_categorical_cols])
        ord_columns = ode.get_feature_names_out(ord_categorical_cols)
        ord_encoded_train = pd.DataFrame(scaler.fit_transform(ord_train_codes),
                                         columns=ord_columns)

        ord_test_codes = ode.transform(self.x_test[ord_categorical_cols])
        ord_encoded_test = pd.DataFrame(scaler.transform(ord_test_codes),
                                        columns=ord_columns)

        return(ord_encoded_train,ord_encoded_test)

    def num_pipeline(self):
        """Standardise ``numerical_cols``.

        Returns:
            tuple: ``(train_frame, test_frame)`` of scaled columns.
        """
        # Dedicated scaler, independent of the one used in ord_pipeline.
        scaler = StandardScaler()

        num_encoded_train = pd.DataFrame(scaler.fit_transform(self.x_train[numerical_cols]),
                                         columns = numerical_cols)
        num_encoded_test = pd.DataFrame(scaler.transform(self.x_test[numerical_cols]),
                                        columns = numerical_cols)

        return(num_encoded_train,num_encoded_test)

    def preprocessor(self):
        """Run all three pipelines and join their outputs column-wise.

        Returns:
            tuple: ``(new_train, new_test)`` with columns ordered as
            categorical, ordinal, numerical.
        """
        x_train_1,x_test_1 = self.cat_pipeline()
        x_train_2,x_test_2 = self.ord_pipeline()
        x_train_3,x_test_3 = self.num_pipeline()

        new_train = pd.concat([x_train_1,x_train_2, x_train_3],
                              axis = 1)
        # The test frame must be assembled from the *test* pieces; it was
        # previously concatenated from the training pieces.
        new_test = pd.concat([x_test_1,x_test_2, x_test_3],
                              axis = 1)

        return(new_train,new_test)
        
        

In [21]:
# Build the manual preprocessor from the train/test feature frames.
ohe_preprocessor = OnehotPreprocessor(x_train,x_test)

In [24]:
# Unpack the two frames returned by preprocessor().
new_train,new_test = ohe_preprocessor.preprocessor()

In [25]:
new_train

Unnamed: 0,sex_male,region_northwest,region_southeast,region_southwest,smoker,age,bmi,children
0,0.0,1.0,0.0,0.0,-0.508747,0.472227,-1.748572,0.734336
1,0.0,0.0,0.0,0.0,-0.508747,0.543313,-1.036704,-0.911192
2,0.0,0.0,1.0,0.0,-0.508747,0.898745,-0.937373,-0.911192
3,0.0,0.0,1.0,0.0,-0.508747,-0.025379,0.618804,3.202629
4,0.0,1.0,0.0,0.0,-0.508747,1.040918,-1.500246,1.557100
...,...,...,...,...,...,...,...,...
1065,0.0,0.0,0.0,0.0,-0.508747,-1.518194,0.138707,2.379865
1066,0.0,0.0,1.0,0.0,-0.508747,-0.025379,-1.102924,3.202629
1067,1.0,0.0,0.0,0.0,-0.508747,1.325264,-0.887708,-0.911192
1068,0.0,0.0,0.0,1.0,1.965613,-0.167551,2.820630,0.734336


In [21]:
x.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.8,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.7,0,no,northwest
4,32,male,28.9,0,no,northwest


In [22]:
# Numeric columns: a single standardisation step.
num_pipeline = Pipeline([('standard_scaler', StandardScaler())])

In [23]:
# One-hot encode the nominal columns. sparse_output=False keeps the encoder's
# output dense, so the ColumnTransformer result stays a dense array that can
# be wrapped in a pandas DataFrame no matter how many categories there are.
cat_pipeline = Pipeline(
    steps = [
        ('one_hot',OneHotEncoder(sparse_output=False))
    ]
)

In [24]:
# Smoker column: ordinal-encode using the explicit category order, then standardise.
ord_cat_pipeline = Pipeline([
    ('ordinal_encoder', OrdinalEncoder(categories=[smoker_categories])),
    ('standard_scaler', StandardScaler()),
])

In [25]:
# Route each column group to its own pipeline; output columns follow this order.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical_cols', num_pipeline, numerical_cols),
        ('cat_pipeline', cat_pipeline, categorical_cols),
        ('ord_cat_pipeline', ord_cat_pipeline, ord_categorical_cols),
    ]
)

In [26]:
# Fit the preprocessor on the training features and replace x_train with the
# transformed frame.
# NOTE(review): this overwrites the raw x_train, so the cell cannot be re-run
# without first re-running the train/test split.
x_train = pd.DataFrame(
    preprocessor.fit_transform(x_train),
    columns=preprocessor.get_feature_names_out(),
)


In [27]:
x_train.head()

Unnamed: 0,numerical_cols__age,numerical_cols__bmi,numerical_cols__children,cat_pipeline__sex_female,cat_pipeline__sex_male,cat_pipeline__region_northeast,cat_pipeline__region_northwest,cat_pipeline__region_southeast,cat_pipeline__region_southwest,ord_cat_pipeline__smoker
0,0.472227,-1.748572,0.734336,1.0,0.0,0.0,1.0,0.0,0.0,-0.508747
1,0.543313,-1.036704,-0.911192,1.0,0.0,1.0,0.0,0.0,0.0,-0.508747
2,0.898745,-0.937373,-0.911192,1.0,0.0,0.0,0.0,1.0,0.0,-0.508747
3,-0.025379,0.618804,3.202629,1.0,0.0,0.0,0.0,1.0,0.0,-0.508747
4,1.040918,-1.500246,1.5571,1.0,0.0,0.0,1.0,0.0,0.0,-0.508747


In [28]:
# Apply the already-fitted preprocessor to the test features (no refitting).
x_test = pd.DataFrame(
    preprocessor.transform(x_test),
    columns=preprocessor.get_feature_names_out(),
)

In [29]:
x_test.head()

Unnamed: 0,numerical_cols__age,numerical_cols__bmi,numerical_cols__children,cat_pipeline__sex_female,cat_pipeline__sex_male,cat_pipeline__region_northeast,cat_pipeline__region_northwest,cat_pipeline__region_southeast,cat_pipeline__region_southwest,ord_cat_pipeline__smoker
0,0.40114,-0.887708,0.734336,1.0,0.0,1.0,0.0,0.0,0.0,-0.508747
1,-0.238638,-0.093064,-0.911192,1.0,0.0,0.0,1.0,0.0,0.0,-0.508747
2,1.751782,-0.606272,-0.911192,1.0,0.0,0.0,1.0,0.0,0.0,1.965613
3,0.472227,-0.804933,1.5571,0.0,1.0,0.0,1.0,0.0,0.0,-0.508747
4,-1.447107,0.221482,-0.911192,0.0,1.0,0.0,1.0,0.0,0.0,1.965613


In [30]:
# Candidate regressors, keyed by the label used in the printed results.
# The randomised learners get a fixed seed (the same value as the train/test
# split) so that a fresh Restart & Run All reproduces the same scores.
models = {
    'xgb':XGBRegressor(random_state=42),
    'random_forest':RandomForestRegressor(random_state=42),
    'SVR':SVR(),
    'KNNregressor':KNeighborsRegressor(),
    'linear_regression':LinearRegression()
}

In [31]:
def model_evaluation(x_train,x_test,y_train,y_test,estimators=None):
    """Fit each regressor and print its test-set R^2 and RMSE.

    Args:
        x_train: Training feature matrix.
        x_test: Test feature matrix.
        y_train: Training target. A 2-D single-column target is flattened
            to 1-D before fitting; any other shape is passed through as is.
        y_test: Test target, compared against each model's predictions.
        estimators: Optional mapping of label -> estimator exposing
            ``fit``/``predict``. Defaults to the notebook-level ``models``.

    Returns:
        None. Prints the R^2 dict, a separator line, then the RMSE dict.
    """
    if estimators is None:
        estimators = models

    # Estimators expecting a 1-D target otherwise receive a column vector and
    # only work because they flatten it internally (with a warning).
    y_fit = y_train
    if getattr(y_train, 'ndim', None) == 2 and y_train.shape[1] == 1:
        y_fit = np.ravel(y_train)

    total_scores = {}
    total_errors = {}
    for key,value in estimators.items():
        model = value.fit(x_train,y_fit)
        y_pred = model.predict(x_test)
        total_scores[key] = r2_score(y_test,y_pred)
        total_errors[key] = root_mean_squared_error(y_test,y_pred)
    print(total_scores)
    print('%%%%%%%%%%%')
    print(total_errors)
    

In [32]:
# Fit and score every candidate model on the train/test split.
model_evaluation(x_train,x_test,y_train,y_test)

{'xgb': 0.8418595194816589, 'random_forest': 0.8619314041501206, 'SVR': -0.06753288283480363, 'KNNregressor': 0.8165469811726096, 'linear_regression': 0.7818112514518168}
%%%%%%%%%%%
{'xgb': 4954.907574696352, 'random_forest': 4629.792076666123, 'SVR': 12873.7379292904, 'KNNregressor': 5336.745157055011, 'linear_regression': 5820.096814107729}
