<a href="https://colab.research.google.com/github/tecatanka/SWB_Mod_Opt/blob/main/Optuna_Insurance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [97]:
#!pip install optuna

In [98]:
#!pip install xgboost

In [99]:
#Google colab
from google.colab import files

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost
import optuna

In [100]:
#uploaded=files.upload()

In [101]:
raw = pd.read_csv('insurance.csv')
raw.head().T

Unnamed: 0,0,1,2,3,4
age,19,18,28,33,32
sex,female,male,male,male,male
bmi,27.9,33.77,33.0,22.705,28.88
children,0,1,3,0,0
smoker,yes,no,no,no,no
region,southwest,southeast,southeast,northwest,northwest
charges,16884.924,1725.5523,4449.462,21984.47061,3866.8552


In [102]:
raw.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [103]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [104]:
df=raw.copy()
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [105]:
df = pd.get_dummies(df, columns=['sex', 'smoker', 'region']).astype(int)
df.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27,0,16884,1,0,0,1,0,0,0,1
1,18,33,1,1725,0,1,1,0,0,0,1,0
2,28,33,3,4449,0,1,1,0,0,0,1,0
3,33,22,0,21984,0,1,1,0,0,1,0,0
4,32,28,0,3866,0,1,1,0,0,1,0,0


In [106]:
y = df['charges']
X = df.drop(['charges'], axis=1)
X.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27,0,1,0,0,1,0,0,0,1
1,18,33,1,0,1,1,0,0,0,1,0
2,28,33,3,0,1,1,0,0,0,1,0
3,33,22,0,0,1,1,0,0,1,0,0
4,32,28,0,0,1,1,0,0,1,0,0


In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


In [108]:
# Scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [109]:
# Define the model
model = xgboost.XGBRegressor()

# Fit the model
model.fit(X_train, y_train)

In [110]:
# Make predictions with sxgboost model
y_pred = model.predict(X_test)

In [111]:
# Evaluate the model
print('MSE: ', mean_squared_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))

MSE:  25422006.601427507
RMSE:  5042.024057997692


## Use Optuna to tune XGBRegresor

In [112]:
def objective(trial):
    param = {
        'max_depth': trial.suggest_int('max_depth', 1, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 0.01, 1.0),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 1.0),
        'random_state': trial.suggest_int('random_state', 1, 1000)
    }
    model = xgboost.XGBRegressor(**param)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return mean_squared_error(y_test, y_pred)

## Create Optuna Study

In [113]:
%%time
#optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction='minimize', study_name='regression')
study.optimize(objective, n_trials=100)

CPU times: user 59.8 s, sys: 979 ms, total: 1min
Wall time: 36.3 s


In [114]:
optuna.visualization.plot_optimization_history(study)

In [115]:
# Print the best parameters
print('Best parameters', study.best_params)

Best parameters {'max_depth': 2, 'learning_rate': 0.01802259932212122, 'n_estimators': 596, 'min_child_weight': 9, 'gamma': 0.7328835723044077, 'subsample': 0.36440661316924133, 'colsample_bytree': 0.942678985384224, 'reg_alpha': 0.37794968663954254, 'reg_lambda': 0.8922766806004259, 'random_state': 527}


In [116]:
# Print the best value
print('Best value', study.best_value)

Best value 17812885.064121194


In [117]:
# Print the best trial
print('Best trial', study.best_trial)

Best trial FrozenTrial(number=17, state=TrialState.COMPLETE, values=[17812885.064121194], datetime_start=datetime.datetime(2023, 8, 7, 20, 27, 54, 547230), datetime_complete=datetime.datetime(2023, 8, 7, 20, 27, 54, 887180), params={'max_depth': 2, 'learning_rate': 0.01802259932212122, 'n_estimators': 596, 'min_child_weight': 9, 'gamma': 0.7328835723044077, 'subsample': 0.36440661316924133, 'colsample_bytree': 0.942678985384224, 'reg_alpha': 0.37794968663954254, 'reg_lambda': 0.8922766806004259, 'random_state': 527}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'max_depth': IntDistribution(high=10, log=False, low=1, step=1), 'learning_rate': FloatDistribution(high=1.0, log=False, low=0.01, step=None), 'n_estimators': IntDistribution(high=1000, log=False, low=50, step=1), 'min_child_weight': IntDistribution(high=10, log=False, low=1, step=1), 'gamma': FloatDistribution(high=1.0, log=False, low=0.01, step=None), 'subsample': FloatDistribution(high=1.0, log=False

## Train model with best hyperparameters


In [118]:
model = xgboost.XGBRegressor(**study.best_params)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print('MSE: ', mean_squared_error(y_test, y_pred))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred)))

MSE:  17812885.064121194
RMSE:  4220.5313722470055
