### Reading and Preprocessing Data

In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso # for lasso regression
from sklearn.linear_model import LinearRegression # for linear regression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
# hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures # for adding polynomial features
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR # for support vector regression


In [2]:
# Load the raw insurance table from disk and preview its first rows.
df = pd.read_csv(filepath_or_buffer="data/insurance.csv")
df.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### Encoding

In [3]:
# One-hot encoder for the smoker column; drop='first' leaves a single indicator column.
ohe_smoker = OneHotEncoder(drop="first")
ohe_smoker.fit(df[["smoker"]].values)  # double brackets give the (n, 1) array the encoder expects
ohe_smoker.get_feature_names(["smoker"])

array(['smoker_yes'], dtype=object)

In [4]:
# Swap the raw smoker column for its integer indicator column(s), appended on the right.
df = pd.concat(
    [
        df.drop(columns="smoker"),
        pd.DataFrame(
            ohe_smoker.transform(df[["smoker"]].values).toarray(),
            columns=ohe_smoker.get_feature_names(["smoker"]),
        ).astype(int),
    ],
    axis=1,
)

In [5]:
# One-hot encoder for the sex column; drop='first' leaves a single indicator column.
ohe_sex = OneHotEncoder(drop="first")
ohe_sex.fit(df[["sex"]].values)  # (n, 1) array of category labels
ohe_sex.get_feature_names(["sex"])

array(['sex_male'], dtype=object)

In [6]:
# Swap the raw sex column for its integer indicator column(s), appended on the right.
df = pd.concat(
    [
        df.drop(columns="sex"),
        pd.DataFrame(
            ohe_sex.transform(df[["sex"]].values).toarray(),
            columns=ohe_sex.get_feature_names(["sex"]),
        ).astype(int),
    ],
    axis=1,
)

In [7]:
# One-hot encoder for the region column; drop='first' omits the first category's indicator.
ohe_region = OneHotEncoder(drop="first")
ohe_region.fit(df[["region"]].values)  # (n, 1) array of category labels
ohe_region.get_feature_names(["region"])

array(['region_northwest', 'region_southeast', 'region_southwest'],
      dtype=object)

In [8]:
# Swap the raw region column for its integer indicator columns, appended on the right.
df = pd.concat(
    [
        df.drop(columns="region"),
        pd.DataFrame(
            ohe_region.transform(df[["region"]].values).toarray(),
            columns=ohe_region.get_feature_names(["region"]),
        ).astype(int),
    ],
    axis=1,
)

In [9]:
# Preview the frame after encoding: smoker/sex/region are now integer indicator columns.
df.head()

Unnamed: 0,age,bmi,children,charges,smoker_yes,sex_male,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,1,0,0,0,1
1,18,33.77,1,1725.5523,0,1,0,1,0
2,28,33.0,3,4449.462,0,1,0,1,0
3,33,22.705,0,21984.47061,0,1,1,0,0
4,32,28.88,0,3866.8552,0,1,1,0,0


In [30]:
# Features: every column except the target. Target: 'charges' kept as an (n, 1) column.
X = df.drop(columns="charges").values
y = df[["charges"]].values

In [31]:
# Hold out 25% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [32]:
# Standardise features and target using statistics learned from the training split only.
# The fitted scalers are kept so later cells can transform new inputs and undo the target scaling.
sc_X = StandardScaler().fit(X_train)
sc_y = StandardScaler().fit(y_train.reshape(-1, 1))
X_tr_scaled = sc_X.transform(X_train)
y_tr_scaled = sc_y.transform(y_train.reshape(-1, 1))

### Random Forest Regression Model

In [41]:
# Random forest regressor created with scikit-learn's default hyperparameters.
regressor_rf = RandomForestRegressor()

In [42]:
# Cross-validated fit (10 folds) of a random forest over a grid that contains a
# single, fixed hyper-parameter combination.
# NOTE(review): criterion="mse" and max_features="auto" are spellings used by older
# scikit-learn releases — confirm against the installed version before upgrading.
parameters = {
    "n_estimators": [1200],
    "max_features": ["auto"],
    "max_depth": [50],
    "min_samples_split": [7],
    "min_samples_leaf": [10],
    "bootstrap": [True],
    "criterion": ["mse"],
    "random_state": [42],
}

# The search is built around a fresh estimator instead of the current value of
# `regressor_rf`: that name is rebound to the fitted search below, so re-running
# this cell would otherwise wrap a GridSearchCV inside another GridSearchCV.
regressor_rf = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=parameters,
    cv=10,
    n_jobs=-1,
)
# Train on the standardised features/target; ravel() flattens the (n, 1) target to 1-D.
regressor_rf = regressor_rf.fit(X_tr_scaled, y_tr_scaled.ravel())

print(regressor_rf.best_params_)
print(regressor_rf.best_score_)

{'bootstrap': True, 'criterion': 'mse', 'max_depth': 50, 'max_features': 'auto', 'min_samples_leaf': 10, 'min_samples_split': 7, 'n_estimators': 1200, 'random_state': 42}
0.8477045099894148


In [48]:
# Evaluate the fitted random-forest search. The model was trained on standardised
# data, so inputs pass through sc_X and predictions are mapped back to the original
# target units with sc_y before scoring.

# Mean cross-validated score of the best candidate (already a scalar).
cv_rf = regressor_rf.best_score_

# R2 on the training set. sc_y was fitted on an (n, 1) column, so predictions are
# reshaped to a column before inverse_transform and flattened again afterwards.
y_pred_rf_train = sc_y.inverse_transform(
    regressor_rf.predict(sc_X.transform(X_train)).reshape(-1, 1)
).ravel()
r2_score_rf_train = r2_score(y_train, y_pred_rf_train)

# R2 on the held-out test set.
y_pred_rf_test = sc_y.inverse_transform(
    regressor_rf.predict(sc_X.transform(X_test)).reshape(-1, 1)
).ravel()
r2_score_rf_test = r2_score(y_test, y_pred_rf_test)

# RMSE on the held-out test set, in the original target units.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf_test))

print('CV: ', cv_rf)
print('R2_score (train): ', r2_score_rf_train)
print('R2_score (test): ', r2_score_rf_test)
print("RMSE: ", rmse_rf)

CV:  0.8477045099894148
R2_score (train):  0.8893935137474492
R2_score (test):  0.8640567204643813
RMSE:  4529.087809295125


In [59]:
# Persist the fitted random-forest search object. The with-block guarantees the
# file handle is flushed and closed once the pickle has been written.
filename = 'data/random_forest_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regressor_rf, model_file)

### Lasso

In [18]:
# Lasso pipeline: standardise the inputs, expand them to degree-2 polynomial
# features, then fit a Lasso model on the expanded features.
steps = [
    ("scalar", StandardScaler()),
    ("poly", PolynomialFeatures(degree=2)),
    ("model", Lasso()),
]
lasso_pipe = Pipeline(steps=steps)

In [43]:
# 10-fold cross-validated fit of the Lasso pipeline over a single-candidate grid.
# step 1: alpha:np.arange(0.01, 1, 0.005) -> 0.9949
# The pipeline does its own scaling, so it is fitted on the unscaled training data.
parameters = dict(
    model__alpha=[0.9949],
    model__fit_intercept=[True],
    model__tol=[0.0001],
    model__max_iter=[5000],
    model__random_state=[42],
)
regressor_lasso = GridSearchCV(
    estimator=lasso_pipe,
    param_grid=parameters,
    iid=False,
    cv=10,
    n_jobs=-1,
    verbose=4,
)
regressor_lasso = regressor_lasso.fit(X_train, y_train.ravel())

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    0.2s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished
  positive)


In [46]:
# Evaluate the fitted Lasso search. The pipeline was trained on unscaled data, so
# it is fed raw features and its predictions are already in the original target units.

# Mean cross-validated score of the best candidate (already a scalar).
cv_lasso = regressor_lasso.best_score_

# R2 on the training set.
y_pred_lasso_train = regressor_lasso.predict(X_train)
r2_score_lasso_train = r2_score(y_train, y_pred_lasso_train)

# R2 on the held-out test set.
y_pred_lasso_test = regressor_lasso.predict(X_test)
r2_score_lasso_test = r2_score(y_test, y_pred_lasso_test)

# RMSE on the held-out test set.
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso_test))

print('CV: ', cv_lasso)
print('R2_score (train): ', r2_score_lasso_train)
print('R2_score (test): ', r2_score_lasso_test)
print("RMSE: ", rmse_lasso)

CV:  0.8245976695306669
R2_score (train):  0.844011538567293
R2_score (test):  0.8543140316524854
RMSE:  4688.573536046928


In [58]:
# Persist the fitted Lasso search object. The with-block guarantees the file
# handle is flushed and closed once the pickle has been written.
filename = 'data/lasso_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regressor_lasso, model_file)

### SVR

In [53]:
# Creating the SVR regressor
# Support vector regressor created with scikit-learn's default hyperparameters.
regressor_svr = SVR()

In [54]:
# 10-fold cross-validated grid search for the SVR:
# 2 kernels x 5 gamma values x 1 tolerance x 6 C values = 60 candidates.
parameters = {
    "kernel": ["rbf", "sigmoid"],
    "gamma": [0.001, 0.01, 0.1, 1, "scale"],
    "tol": [0.0001],
    "C": [0.001, 0.01, 0.1, 1, 10, 100],
}

# The search is built around a fresh estimator instead of the current value of
# `regressor_svr`: that name is rebound to the fitted search below, so re-running
# this cell would otherwise wrap a GridSearchCV inside another GridSearchCV.
# NOTE(review): `iid` is only accepted by older scikit-learn releases — confirm
# against the installed version before upgrading.
regressor_svr = GridSearchCV(
    estimator=SVR(),
    param_grid=parameters,
    cv=10,
    verbose=4,
    iid=True,
    n_jobs=-1,
)
# Train on the standardised features/target; ravel() flattens the (n, 1) target to 1-D.
regressor_svr = regressor_svr.fit(X_tr_scaled, y_tr_scaled.ravel())

Fitting 10 folds for each of 60 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:    5.0s
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed:   19.8s finished


In [55]:
# Evaluate the fitted SVR search. The model was trained on standardised data, so
# inputs pass through sc_X and predictions are mapped back to the original target
# units with sc_y before scoring.

# Mean cross-validated score of the best candidate (already a scalar).
cv_svr = regressor_svr.best_score_

# R2 on the training set. sc_y was fitted on an (n, 1) column, so predictions are
# reshaped to a column before inverse_transform and flattened again afterwards.
y_pred_svr_train = sc_y.inverse_transform(
    regressor_svr.predict(sc_X.transform(X_train)).reshape(-1, 1)
).ravel()
r2_score_svr_train = r2_score(y_train, y_pred_svr_train)

# R2 on the held-out test set.
y_pred_svr_test = sc_y.inverse_transform(
    regressor_svr.predict(sc_X.transform(X_test)).reshape(-1, 1)
).ravel()
r2_score_svr_test = r2_score(y_test, y_pred_svr_test)

# RMSE on the held-out test set, in the original target units.
rmse_svr = np.sqrt(mean_squared_error(y_test, y_pred_svr_test))

print('CV: ', cv_svr)
print('R2_score (train): ', r2_score_svr_train)
print('R2_score (test): ', r2_score_svr_test)
print("RMSE: ", rmse_svr)

CV:  0.8298459991778422
R2_score (train):  0.8521031819464114
R2_score (test):  0.8546297788530866
RMSE:  4683.4899752174515


In [57]:
# Persist the fitted SVR search object. The with-block guarantees the file handle
# is flushed and closed once the pickle has been written.
filename = 'data/svr_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(regressor_svr, model_file)

### Test for application

In [67]:
# One raw (not yet encoded) record, shaped as a single-row frame with the raw column layout.
sample = pd.DataFrame(
    data=[[19, "female", 27.900, 0, "no", "southwest"]],
    columns=["age", "sex", "bmi", "children", "smoker", "region"],
)

In [68]:
# Encode the sample's smoker value with the encoder fitted earlier on the full data,
# so the sample gets the same indicator column(s).
sample = pd.concat(
    [
        sample.drop(columns="smoker"),
        pd.DataFrame(
            ohe_smoker.transform(sample[["smoker"]].values).toarray(),
            columns=ohe_smoker.get_feature_names(["smoker"]),
        ).astype(int),
    ],
    axis=1,
)

In [69]:
# Encode the sample's sex value with the encoder fitted earlier on the full data.
sample = pd.concat(
    [
        sample.drop(columns="sex"),
        pd.DataFrame(
            ohe_sex.transform(sample[["sex"]].values).toarray(),
            columns=ohe_sex.get_feature_names(["sex"]),
        ).astype(int),
    ],
    axis=1,
)

In [70]:
# Encode the sample's region value with the encoder fitted earlier on the full data.
sample = pd.concat(
    [
        sample.drop(columns="region"),
        pd.DataFrame(
            ohe_region.transform(sample[["region"]].values).toarray(),
            columns=ohe_region.get_feature_names(["region"]),
        ).astype(int),
    ],
    axis=1,
)

In [71]:
# Preview the encoded sample; its columns should line up with the feature columns used for training.
sample.head()

Unnamed: 0,age,bmi,children,smoker_yes,sex_male,region_northwest,region_southeast,region_southwest
0,19,27.9,0,0,0,0,0,1


In [73]:
# Reload a persisted model and price the encoded sample.
# The path is pinned here rather than reusing whatever `filename` was last assigned
# by a save cell: only models trained on standardised data (random forest, SVR) may
# be combined with sc_X / sc_y below — the Lasso pipeline was fitted on unscaled data.
# 'data/svr_model.sav' is the value `filename` holds when the notebook runs top to bottom.
filename = 'data/svr_model.sav'
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Scale the features with the training-set scaler, predict in scaled target units,
# then map back to the original units. sc_y was fitted on an (n, 1) column, so the
# 1-D prediction is reshaped to a column first and flattened again afterwards.
scaled_prediction = loaded_model.predict(sc_X.transform(sample.values))
result = sc_y.inverse_transform(scaled_prediction.reshape(-1, 1)).ravel()
print("$", result[0])

$ 2273.615190201772
