# Import dataset

In [1]:
# Import libraries and load the data set
import pandas as pd
# Plotting imports are currently disabled; no cell shown in this part of the
# notebook draws a figure.
# import matplotlib.pyplot as plt
# import seaborn as sb
# %matplotlib inline

# Read the insurance data; the relative path is resolved against the
# notebook's working directory.
df = pd.read_csv('insurance.csv')

In [2]:
# Preview the first five rows (the default for head()) to sanity-check the
# column names and value formats
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Count the missing (NaN) values in each column
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
# Summarise the frame: row count, per-column non-null counts and dtypes,
# and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [5]:
# Separate the target column (charges) from the predictor columns
y = df['charges']
X = df.drop(columns=['charges'])

In [6]:
# Numeric feature columns, selected through the public dtype API instead of
# the private DataFrame._get_numeric_data() helper.
numeric_features = X.select_dtypes(include='number').columns

# Every remaining column is treated as categorical. The list is built with an
# order-preserving comprehension rather than a set difference: iteration order
# of a set of strings can change between interpreter sessions, which would
# reorder the dummy columns created from this list and make downstream
# results differ from run to run.
categorical_features = [col for col in X.columns if col not in numeric_features]

In [7]:
# One-hot encode the categorical columns with pandas. drop_first=True omits
# the first level of each feature, so the remaining dummy columns are not
# redundant with one another.
X = pd.get_dummies(
    data=X,
    columns=categorical_features,
    drop_first=True,
)

In [8]:
# Standardise every feature column, then rebuild a DataFrame so the original
# row index and column labels are preserved.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics feed into the scaling -- confirm this is acceptable.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(X)
X_scaled = sc.transform(X)
X = pd.DataFrame(data=X_scaled, columns=X.columns, index=X.index)

### Split data into train and test sets

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

### Use XGBoost + GridSearch to predict y

In [10]:
# NOTE: the XGBoost experiment in this cell is disabled (commented out). The
# only live statement is the GridSearchCV import, which the Random Forest cell
# further down relies on.
# from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# NOTE(review): before re-enabling this grid:
#   * 'subsample' appears twice in the dict; Python keeps only the last
#     entry, so [0.6, 0.8, 1.0] would be silently discarded.
#   * 'reg:linear' and 'silent' are presumably deprecated in recent XGBoost
#     releases (replacements: 'reg:squarederror' and 'verbosity') -- confirm
#     against the installed version.
# parameters = {'objective':['reg:linear'],
#               'learning_rate': [.01, 0.05, 0.1, 0.2], #so called `eta` value
#               'gamma': [0.5, 1, 1.5, 2, 5],
#               'subsample': [0.6, 0.8, 1.0],
#               'min_child_weight': [1, 5, 10],
#               'silent': [1],
#               'n_estimators': [500, 600, 700],
#                 'colsample_bytree': [0.6, 0.8, 1.0],
#                 'max_depth': [3, 4, 5],
#                 'reg_alpha': [1.1, 1.2, 1.3],
#                 'reg_lambda': [1.1, 1.2, 1.3],
#                 'subsample': [0.7, 0.8, 0.9]
#              }

# grid_search = GridSearchCV(XGBRegressor(),
#                         parameters,
#                         cv = 2,
#                         n_jobs = 2,
#                         verbose=True)

# # Try fitting the training set with every parameter combination
# grid_search.fit(X_train,y_train)

# # Print the best parameters
# print(grid_search.best_params_)

# # Fit the training set using the best parameters
# best_grid = XGBRegressor(**grid_search.best_params_)
# best_grid.fit(X_train,y_train)

# # Get the predicted y
# predictions = best_grid.predict(X_test)

# # Print the mean squared error between the real and the predicted charges
# from sklearn.metrics import mean_squared_error

# # Arguments follow the documented (y_true, y_pred) order
# mse = mean_squared_error(y_test, predictions)
# print(mse)

### Use Random Forest + GridSearchCV to predict y

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Seed for the forest's bootstrap and feature sampling, so that repeated runs
# select the same hyper-parameters and report the same error. Uses the same
# value as the train/test split.
RANDOM_STATE = 1

# Hyper-parameter grid searched exhaustively: 1 * 4 * 2 * 3 * 3 * 4 = 288
# candidate combinations.
# NOTE(review): the original comment said these ranges were based on the
# results of a random search; that search is not visible in this notebook.
parameters = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Base model whose hyper-parameters are tuned
rf = RandomForestRegressor(random_state=RANDOM_STATE)

# 3-fold cross-validated grid search on all available cores. No `scoring` is
# passed, so candidates are ranked by the estimator's default score.
grid_search = GridSearchCV(estimator=rf, param_grid=parameters,
                           cv=3, n_jobs=-1, verbose=2)

# Evaluate every parameter combination on the training set
grid_search.fit(X_train, y_train)

# Print the best parameters
print(grid_search.best_params_)

# GridSearchCV refits the winning configuration on the whole training set by
# default (refit=True), so reuse that fitted model instead of training an
# identical forest a second time.
best_grid = grid_search.best_estimator_

# Predict charges for the held-out test rows
predictions = best_grid.predict(X_test)

# Mean squared error between the real and the predicted charges; arguments
# follow the documented (y_true, y_pred) order.
mse = mean_squared_error(y_test, predictions)
print(mse)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.9s
