In [None]:
!pip install xgboost

In [24]:

# Build an insurance price prediction model using RF and XGBoost. Check which model gives the best results.

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [4]:
# Load the insurance dataset.
# The path is a raw string: in an ordinary literal the backslashes of this
# Windows path form invalid escape sequences (\P, \d, \i), which Python
# reports with a warning and plans to reject in a future version.
data = pd.read_csv(r'D:\Python\datasets\insurance.csv')
data.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
# Column dtypes and non-null counts of the freshly loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
# Show the column labels of the loaded frame.
data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [45]:
## Encode the categorical columns as numeric features.

In [7]:
# Encode the two binary categorical columns as 0/1 integers.
# The cell is safe to re-run: calling Series.map on an already encoded column
# would find no matching keys and silently turn every value into NaN.
binary_maps = {
    'sex': {'male': 1, 'female': 0},
    'smoker': {'yes': 1, 'no': 0},
}
for col, mapping in binary_maps.items():
    if data[col].isin(list(mapping.values())).all():
        continue  # already encoded by an earlier run of this cell
    encoded = data[col].map(mapping)
    if encoded.isna().any():
        # Fail loudly instead of silently passing NaN on to the models.
        unexpected = sorted(set(data.loc[encoded.isna(), col].astype(str)))
        raise ValueError(f"Unexpected values in '{col}': {unexpected}")
    data[col] = encoded

In [8]:
# Preview the first rows to check the encoded 'sex' and 'smoker' columns.
data.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462


In [9]:
# One-hot encode 'region': one integer 0/1 indicator column per region label.

region_labels = data['region']
data_cat = pd.get_dummies(region_labels, dtype=int)
data_cat.head(4)

Unnamed: 0,northeast,northwest,southeast,southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0


In [10]:
# Re-check column dtypes after the binary encoding step.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(4), object(1)
memory usage: 73.3+ KB


In [11]:
# Swap the raw 'region' column for its one-hot indicator columns.
# Guarded on 'region' so that re-running the cell neither appends the dummy
# columns a second time nor fails while dropping an already-removed column.
if 'region' in data.columns:
    data = pd.concat([data.drop(columns='region'), data_cat], axis=1)
data.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,charges,northeast,northwest,southeast,southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


In [13]:
# Target is the 'charges' column; every remaining column is a model feature.
target_col = 'charges'
y = data[target_col]
x = data.drop(columns=[target_col])

In [31]:
## Train/test split.

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(x, y, test_size=0.2, random_state=40)
x_train, x_test, y_train, y_test = split_parts
x_train.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,northeast,northwest,southeast,southwest
737,26,1,23.7,2,0,0,0,0,1
369,18,1,30.4,3,0,1,0,0,0
284,52,0,31.2,0,0,0,0,0,1
1302,25,0,20.8,1,0,0,0,0,1
958,43,1,34.96,1,1,1,0,0,0


In [32]:
# Preview the first rows of the held-out feature set.
x_test.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,northeast,northwest,southeast,southwest
1099,25,0,33.99,1,0,0,0,1,0
759,18,1,38.17,0,1,0,0,1,0
215,41,0,37.1,2,0,0,0,0,1


In [36]:
# Report the dimensions of the full feature matrix, then of the target vector.
for frame in (x, y):
    print(frame.shape)

(1338, 9)
(1338,)


In [37]:
# Feature scaling
# (StandardScaler is imported in the notebook's top import cell.)

# Fit the scaler on the training split only and reuse those statistics for the
# test split, so nothing from the test rows leaks into the preprocessing step.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [39]:
# Row/column count of the scaled training features.
print (x_train_scaled.shape)

(1070, 9)


In [40]:
# Number of training targets, for comparison with the feature row count.
print (y_train.shape)

(1070,)


In [34]:
# Random Forest: 5-fold cross-validated R² on the training split,
# followed by an evaluation on the held-out test split.

rf = RandomForestRegressor(n_estimators=100, random_state=40)
rf_scores = cross_val_score(rf, x_train_scaled, y_train, cv=5, scoring='r2')

# Train on the whole training split, then predict the unseen test rows.
rf.fit(x_train_scaled, y_train)
rf_pred = rf.predict(x_test_scaled)

# Hold-out metrics: R² and root-mean-squared error (same units as the target).
rf_r2 = r2_score(y_test, rf_pred)
rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse = np.sqrt(rf_mse)

rf_cv_mean, rf_cv_std = rf_scores.mean(), rf_scores.std()
print("Random Forest - CV R²: {:.3f} ± {:.3f}".format(rf_cv_mean, rf_cv_std))
print("Random Forest Test - R²: {:.3f}, RMSE: {:,.0f}".format(rf_r2, rf_rmse))


Random Forest - CV R²: 0.833 ± 0.040
Random Forest Test - R²: 0.846, RMSE: 4,798


In [47]:
# XGBoost: 5-fold cross-validated R² on the training split,
# followed by an evaluation on the held-out test split.

xgb_model = XGBRegressor(n_estimators=100, random_state=40)
xgb_score = cross_val_score(xgb_model, x_train_scaled, y_train, cv=5, scoring='r2')

# Train on the whole training split, then predict the unseen test rows.
xgb_model.fit(x_train_scaled, y_train)
xgb_pred = xgb_model.predict(x_test_scaled)

# Hold-out metrics: R² and root-mean-squared error (same units as the target).
xgb_r2 = r2_score(y_test, xgb_pred)
xgb_mse = mean_squared_error(y_test, xgb_pred)
xgb_rmse = np.sqrt(xgb_mse)

xgb_cv_mean, xgb_cv_std = xgb_score.mean(), xgb_score.std()
print("\nXGBoost - CV R²: {:.3f} ± {:.3f}".format(xgb_cv_mean, xgb_cv_std))
print("XGBoost Test - R²: {:.3f}, RMSE: {:,.0f}".format(xgb_r2, xgb_rmse))


XGBoost - CV R²: 0.795 ± 0.052
XGBoost Test - R²: 0.802, RMSE: 5,439


In [53]:
# Side-by-side summary of the hold-out and cross-validation metrics of both models.
banner = "=" * 50
print("\n" + banner)
print("BEST MODEL COMPARISON")
print(banner)

# One record per model; the key order fixes the column order of the table.
summary_rows = [
    {
        'Model': 'Random Forest',
        'Test R²': rf_r2,
        'Test RMSE': rf_rmse,
        'CV R² Mean': rf_scores.mean(),
    },
    {
        'Model': 'XGBoost',
        'Test R²': xgb_r2,
        'Test RMSE': xgb_rmse,
        'CV R² Mean': xgb_score.mean(),
    },
]
models = pd.DataFrame(summary_rows)
print(models)


BEST MODEL COMPARISON
           Model   Test R²    Test RMSE  CV R² Mean
0  Random Forest  0.846007  4798.032526    0.833059
1        XGBoost  0.802106  5439.131161    0.795351
