In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('data/medical_insurance.csv')

In [3]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
from sklearn.model_selection import train_test_split
X=df.drop('charges',axis=1)
y=df['charges']

In [6]:

num_col=[col for col in X.columns if X[col].dtype=='int' or X[col].dtype=='float']

In [7]:
num_col

['age', 'bmi', 'children']

In [8]:
X['sex'].unique()

array(['female', 'male'], dtype=object)

In [9]:
X['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [10]:
X['sex']= X['sex'].map({'male':1,'female':0})
X['smoker']=X['smoker'].map({'yes':1,'no':0})

In [11]:
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,southwest
1,18,1,33.770,1,0,southeast
2,28,1,33.000,3,0,southeast
3,33,1,22.705,0,0,northwest
4,32,1,28.880,0,0,northwest
...,...,...,...,...,...,...
2767,47,0,45.320,1,0,southeast
2768,21,0,34.600,0,0,southwest
2769,19,1,26.030,1,1,northwest
2770,23,1,18.715,0,0,northwest


In [13]:
from sklearn.preprocessing import OneHotEncoder

In [14]:
oh=OneHotEncoder(drop='first',sparse_output=False)
region_encoded=oh.fit_transform(X[['region']])

region_columns=oh.get_feature_names_out()

In [15]:
df_region=pd.DataFrame(region_encoded,columns=region_columns)

In [16]:
df_region.shape,X.shape

((2772, 3), (2772, 6))

In [17]:
type(X)

pandas.core.frame.DataFrame

In [18]:
type(df_region)

pandas.core.frame.DataFrame

In [19]:
X=pd.concat([X.drop('region',axis=1),df_region],axis=1)

In [20]:
num_col=[col for col in X.columns if X[col].dtypes =='int' or 'float64']

In [21]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()

In [22]:
X[['age', 'bmi', 'children']]=scaler.fit_transform(X[['age', 'bmi', 'children']])

In [23]:
X

Unnamed: 0,age,sex,bmi,children,smoker,region_northwest,region_southeast,region_southwest
0,-1.428353,0,-0.457114,-0.907084,1,0.0,0.0,1.0
1,-1.499381,1,0.500731,-0.083758,0,0.0,1.0,0.0
2,-0.789099,1,0.375085,1.562893,0,0.0,1.0,0.0
3,-0.433959,1,-1.304814,-0.907084,0,1.0,0.0,0.0
4,-0.504987,1,-0.297201,-0.907084,0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...
2767,0.560436,0,2.385417,-0.083758,0,0.0,1.0,0.0
2768,-1.286297,0,0.636167,-0.907084,0,0.0,0.0,1.0
2769,-1.428353,1,-0.762253,-0.083758,1,1.0,0.0,0.0
2770,-1.144240,1,-1.955887,-0.907084,0,1.0,0.0,0.0


In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test =train_test_split(X,y,random_state=37, test_size=0.28)

In [25]:
X_train.shape,y_train.shape,X_test.shape,y_test.shape

((1995, 8), (1995,), (777, 8), (777,))

## Model Training And Model Selection

In [29]:
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [27]:

def evaluate_model(true,predicted):
    mae=mean_absolute_error(true,predicted)
    r_2=r2_score(true,predicted)
    mse=mean_squared_error(true,predicted)
    return mae,mse,r_2

In [46]:
models={
'Linear Regression': LinearRegression(),
'Ridge':Ridge(),
'Lasso':Ridge(),
'K-Neighbors Regressor': KNeighborsRegressor(),
}

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)


    mae_score,mse_score,r_2_score=evaluate_model(y_test,y_pred)

    print(f'---{list(models.keys())[i]}----')

    print('Mean Absolute Error : {:.4f} '.format(mae_score))
    print('Mean Squred Error namaste : {:.4f}'.format(mse_score))
    print('R2 Score : {:.4f}'.format(r_2_score))

    print('-------------------------------')

---Linear Regression----
Mean Absolute Error : 4340.3189 
Mean Squred Error namaste : 39929115.4972
R2 Score : 0.7296
-------------------------------
---Ridge----
Mean Absolute Error : 4343.9990 
Mean Squred Error namaste : 39909399.1607
R2 Score : 0.7297
-------------------------------
---Lasso----
Mean Absolute Error : 4343.9990 
Mean Squred Error namaste : 39909399.1607
R2 Score : 0.7297
-------------------------------
---K-Neighbors Regressor----
Mean Absolute Error : 2975.1867 
Mean Squred Error namaste : 31119795.8213
R2 Score : 0.7892
-------------------------------
