# Import dataset

In [55]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

# Read the CSV (path is relative to the notebook's working directory)
# into the DataFrame used by the rest of the notebook.
csv_path = 'insurance.csv'
df = pd.read_csv(csv_path)

In [56]:
# Show the first five rows as a quick sanity check of the loaded data.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Label encoding categorical features

In [57]:
from sklearn.preprocessing import LabelEncoder

# Use the public select_dtypes API instead of the private
# DataFrame._get_numeric_data(); 'bool' is included so the same columns
# count as numeric as before.
numeric_features = df.select_dtypes(include=['number', 'bool']).columns

# Keep the DataFrame's own column order (a set difference would make the
# iteration order arbitrary from run to run).
cat_features = [col for col in df.columns if col not in numeric_features]

# One fitted encoder per column, so the integer codes can be mapped back to
# the original labels later via encoders[col].inverse_transform(...).
encoders = {}
for col in cat_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

### Split data into train and test sets

In [58]:
from sklearn.model_selection import train_test_split

# The target is the 'charges' column; all remaining columns are the features.
y = df['charges']
X = df.drop('charges', axis=1)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Use GridSearchCV to find the best hyperparameters for XGBRegressor

In [59]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values; every combination is cross-validated.
param_grid = {
    "colsample_bytree": [1.0],
    "min_child_weight": [1.0, 1.2],
    'max_depth': [3, 4, 6],
    'n_estimators': [500, 1000],
}

gbm = XGBRegressor()
reg_cv = GridSearchCV(gbm, param_grid, verbose=1)

# Fit on the training split only; the fitted search object is the cell output.
reg_cv.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:   13.2s finished


GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
      

### Print the best parameters and apply them

In [60]:
print(reg_cv.best_params_)

# reg_cv was built with the default refit=True, so it has already retrained
# an XGBRegressor with the best parameters on all of X_train/y_train.
# Reuse that fitted model instead of constructing and fitting a second one.
gbm = reg_cv.best_estimator_

# Display the fitted model as the cell output.
gbm

{'colsample_bytree': 1.0, 'max_depth': 6, 'min_child_weight': 1.0, 'n_estimators': 500}


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1.0, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1.0, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

### Use the model to predict the value of column 'charges'

In [61]:
# Predict the target for the held-out X_test rows using the fitted gbm model.
predictions = gbm.predict(X_test)

### Print the mean squared error between the predicted and the actual charges

In [65]:
from sklearn.metrics import mean_squared_error

# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric
# so the value is unaffected, but the correct order keeps this cell right if
# the metric is later swapped for an asymmetric one.
mse = mean_squared_error(y_test, predictions)
print(mse)

29878447.69251136
