In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, MinMaxScaler 
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_squared_log_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

In [28]:
try:
    df = pd.read_csv("data/insurance.csv")
except:
    df = pd.read_csv("https://raw.githubusercontent.com/ysntrkc/ai-summer-camp-proje-1/main/data/insurance.csv?token=GHSAT0AAAAAABWZWGRVHO76I7S3ZQSMCHSKYYAYSNQ")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [30]:
ohe_cols = ["sex", "smoker", "region"]

In [31]:
df = pd.get_dummies(df, columns=ohe_cols)
df.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,1,0,0,0,1,0
2,28,33.0,3,4449.462,0,1,1,0,0,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0,1,0,0


In [32]:
y = df[["charges"]]
X = df.drop(columns=["charges"])

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [34]:
scale_cols = ["age", "bmi", "children"]

In [35]:
scaler = MinMaxScaler()
train = scaler.fit_transform(X_train[scale_cols])
test = scaler.transform(X_test[scale_cols])

In [36]:
X_train[scale_cols] = train
X_test[scale_cols] = test

In [37]:
x = scaler.fit_transform(X[scale_cols])
X[scale_cols] = x

In [38]:
xgb = XGBRegressor(objective="reg:squarederror")
xgb.fit(X_train, y_train)
pred = xgb.predict(X_test)
print(f"XGBRegressor \nR2 Score: {r2_score(y_test, pred)} \nMSE: {(mean_squared_error(y_test, pred)) ** 0.5}")

XGBRegressor 
R2 Score: 0.8208627272043386 
MSE: 5125.024745472466


In [39]:
lin = LinearRegression()
lin.fit(X_train, y_train)
pred = lin.predict(X_test)
print(f"LinearRegressor \nR2 Score: {r2_score(y_test, pred)} \nMSE: {(mean_squared_error(y_test, pred)) ** 0.5}")

LinearRegressor 
R2 Score: 0.7696118054369011 
MSE: 5812.100272258871


In [40]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train.values.ravel())
pred = rfr.predict(X_test)
print(f"RandomForestRegressor \nR2 Score: {r2_score(y_test, pred)} \nMSE: {(mean_squared_error(y_test, pred)) ** 0.5}")

RandomForestRegressor 
R2 Score: 0.8503973057492238 
MSE: 4683.523406967945


In [41]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
print(f"KNeighborsRegressor \nR2 Score: {r2_score(y_test, pred)} \nMSE: {(mean_squared_error(y_test, pred)) ** 0.5}")

KNeighborsRegressor 
R2 Score: 0.7786539147491962 
MSE: 5696.904120219449


In [42]:
lgbm = LGBMRegressor()
lgbm.fit(X_train, y_train)
pred = lgbm.predict(X_test)
print(f"KNeighborsRegressor \nR2 Score: {r2_score(y_test, pred)} \nMSE: {(mean_squared_error(y_test, pred)) ** 0.5}")

KNeighborsRegressor 
R2 Score: 0.8555977221914436 
MSE: 4601.400226237108


In [43]:
score_xgb = cross_val_score(xgb, X, y, cv=5)
score_xgb

array([0.81833719, 0.72379012, 0.85597004, 0.81797256, 0.8403727 ])

In [44]:
score_lin = cross_val_score(lin, X, y, cv=5)
score_lin

array([0.76148179, 0.70649339, 0.77955201, 0.73269475, 0.75557475])

In [45]:
score_rfr = cross_val_score(rfr, X, y.values.ravel(), cv=5)
score_rfr

array([0.85316602, 0.77614021, 0.87013605, 0.82774469, 0.85613741])

In [46]:
score_knn = cross_val_score(knn, X, y, cv=5)
score_knn

array([0.77129318, 0.73189372, 0.79422354, 0.74102327, 0.75827224])

In [47]:
score_lgbm = cross_val_score(lgbm, X, y, cv=5)
score_lgbm

array([0.86139644, 0.78390086, 0.87701539, 0.83409616, 0.85210798])

In [48]:
parameters = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['reg:squarederror'],
              'learning_rate': [.02], #so called `eta` value
              'max_depth': [3],
              'min_child_weight': [4],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [275],
              'reg_alpha': [1.15],
              'reg_lambda': [1.35],}
xgb_grid = GridSearchCV(xgb, 
                        param_grid=parameters,
                        cv=3,
                        n_jobs=5,
                        verbose=True)
xgb_grid.fit(X_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    callbacks=None, colsample_bylevel=1,
                                    colsample_bynode=1, colsample_bytree=1,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=0, gpu_id=-1, grow_policy='depthwise',
                                    importance_type=None,
                                    interaction_constraints='',
                                    learning_rate=0.300000012, max_bin=256,
                                    max_cat...
                                    n_jobs=0, num_parallel_tree=1,
                                    predictor='auto', random_state=0,
                                    reg_alpha=0, reg_lambda=1, ...),
             n_jobs=5,
             param_grid={'colsample_bytree': [0.7], 'learning_r

In [49]:
print(xgb_grid.best_score_)

0.8495952149287026


In [50]:
xgb_grid.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.02,
 'max_depth': 3,
 'min_child_weight': 4,
 'n_estimators': 275,
 'nthread': 4,
 'objective': 'reg:squarederror',
 'reg_alpha': 1.15,
 'reg_lambda': 1.35,
 'silent': 1,
 'subsample': 0.7}

In [51]:
pred = xgb_grid.predict(X_test)
print(f"XGBRegressor \nR2 Score: {r2_score(y_test, pred)} \nRMSE: {(mean_squared_error(y_test, pred)) ** 0.5}")

XGBRegressor 
R2 Score: 0.8731460137786737 
RMSE: 4312.757578778684


In [52]:
# Evaluation Metrics
print(f"""
    XGBRegressor
    R2 Score: {r2_score(y_test, pred)}
    MSE: {mean_squared_error(y_test, pred)}
    RMSE: {(mean_squared_error(y_test, pred)) ** 0.5}
    MAE: {mean_absolute_error(y_test, pred)}
    RMAE: {(mean_absolute_error(y_test, pred)) ** 0.5}
    MSLE: {mean_squared_log_error(y_test, pred)}
    RMSLE: {(mean_squared_log_error(y_test, pred)) ** 0.5}
""")


    XGBRegressor
    R2 Score: 0.8731460137786737
    MSE: 18599877.93331297
    RMSE: 4312.757578778684
    MAE: 2421.712807287916
    RMAE: 49.21090130538066
    MSLE: 0.17061437748329528
    RMSLE: 0.41305493276717475

