#PREVENDO O CUSTO PARA UMA SEGURADORA

O código abaixo tem como objetivo criar um modelo de predição de custo de um seguro a partir da base de dados *insurance*. Para isso, será feito a limpeza e tratamento dos dados, teste de varios tipos de modelo para identificar o que melhor performa dentro das métricas estabelecidas e, finalmente, o modelo escolhido será otimizado com o tunning de seus hiperparâmetros.

In [1]:
#Importando bibliotecas
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_excel("insurance.xlsx")
df.head()

Unnamed: 0,idade,sexo,imc,quantidade_filhos,fumante,regiao,custos_seguro
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.56,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Data Cleaning e Preparation

In [3]:
# Checando qual o tipo dos dados e a existencia de valores nulos
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1341 entries, 0 to 1340
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   idade              1341 non-null   int64  
 1   sexo               1338 non-null   object 
 2   imc                1341 non-null   float64
 3   quantidade_filhos  1341 non-null   int64  
 4   fumante            1341 non-null   object 
 5   regiao             1341 non-null   object 
 6   custos_seguro      1341 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.5+ KB


In [4]:
# Excluindo os três valores nulos
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1338 entries, 0 to 1340
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   idade              1338 non-null   int64  
 1   sexo               1338 non-null   object 
 2   imc                1338 non-null   float64
 3   quantidade_filhos  1338 non-null   int64  
 4   fumante            1338 non-null   object 
 5   regiao             1338 non-null   object 
 6   custos_seguro      1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 83.6+ KB


In [5]:
df.describe()

Unnamed: 0,idade,imc,quantidade_filhos,custos_seguro
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663815,1.094918,13270.422265
std,14.04996,6.098367,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [6]:
# Tratando variáveis categóricas

# Utilizando o LabelEncoder() para as categorias binárias (sexo e fumante)
le = LabelEncoder()

le.fit(df.sexo)
df.sexo = le.transform(df.sexo)

le.fit(df.fumante)
df.fumante = le.transform(df.fumante)

# Utilizando o método get_dummies para a variável regiao, visto que ela tem mais de dois valores distintos e não queremos assumir nenhuma relação ordinal entre os números que representam as regiões
df = pd.get_dummies(df)

In [7]:
df

Unnamed: 0,idade,sexo,imc,quantidade_filhos,fumante,custos_seguro,regiao_northeast,regiao_northwest,regiao_southeast,regiao_southwest
0,19,0,27.900,0,1,16884.92400,0,0,0,1
1,18,1,33.770,1,0,1725.55230,0,0,1,0
2,28,1,33.560,3,0,4449.46200,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1336,50,1,30.970,3,0,10600.54830,0,1,0,0
1337,18,0,31.920,0,0,2205.98080,1,0,0,0
1338,18,0,36.850,0,0,1629.83350,0,0,1,0
1339,21,0,25.800,0,0,2007.94500,0,0,0,1


In [8]:
# Normalizando valores numéricos
scaler = MinMaxScaler()

df_normalizado = pd.DataFrame(scaler.fit_transform(df), index = df.index, columns = df.columns)
df_normalizado

Unnamed: 0,idade,sexo,imc,quantidade_filhos,fumante,custos_seguro,regiao_northeast,regiao_northwest,regiao_southeast,regiao_southwest
0,0.021739,0.0,0.321227,0.0,1.0,0.251611,0.0,0.0,0.0,1.0
1,0.000000,1.0,0.479150,0.2,0.0,0.009636,0.0,0.0,1.0,0.0
2,0.217391,1.0,0.473500,0.6,0.0,0.053115,0.0,0.0,1.0,0.0
3,0.326087,1.0,0.181464,0.0,0.0,0.333010,0.0,1.0,0.0,0.0
4,0.304348,1.0,0.347592,0.0,0.0,0.043816,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1336,0.695652,1.0,0.403820,0.6,0.0,0.151299,0.0,1.0,0.0,0.0
1337,0.000000,0.0,0.429379,0.0,0.0,0.017305,1.0,0.0,0.0,0.0
1338,0.000000,0.0,0.562012,0.0,0.0,0.008108,0.0,0.0,1.0,0.0
1339,0.065217,0.0,0.264730,0.0,0.0,0.014144,0.0,0.0,0.0,1.0


## Feature Selection

In [9]:
# Checando p-value das variáveis e excluindo algumas variáveis

function = "custos_seguro~idade+sexo+imc+quantidade_filhos+fumante+regiao_northeast+regiao_northwest+regiao_southeast+regiao_southwest"
model = smf.ols(formula=function, data=df_normalizado).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          custos_seguro   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     500.8
Date:                Thu, 22 Feb 2024   Prob (F-statistic):               0.00
Time:                        22:14:10   Log-Likelihood:                 1230.8
No. Observations:                1338   AIC:                            -2444.
Df Residuals:                    1329   BIC:                            -2397.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -0.0461      0.00

In [10]:
function = "custos_seguro~idade+imc+quantidade_filhos+fumante+regiao_southeast+regiao_southwest"
model = smf.ols(formula=function, data=df_normalizado).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          custos_seguro   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.750
Method:                 Least Squares   F-statistic:                     668.3
Date:                Thu, 22 Feb 2024   Prob (F-statistic):               0.00
Time:                        22:14:11   Log-Likelihood:                 1230.5
No. Observations:                1338   AIC:                            -2447.
Df Residuals:                    1331   BIC:                            -2411.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -0.0520      0.00

In [11]:
function = "custos_seguro~idade+imc+quantidade_filhos+fumante+regiao_southeast"
model = smf.ols(formula=function, data=df_normalizado).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          custos_seguro   R-squared:                       0.750
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     799.7
Date:                Thu, 22 Feb 2024   Prob (F-statistic):               0.00
Time:                        22:14:11   Log-Likelihood:                 1228.7
No. Observations:                1338   AIC:                            -2445.
Df Residuals:                    1332   BIC:                            -2414.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -0.0550      0.00

In [12]:
function = "custos_seguro~idade+imc+quantidade_filhos+fumante"
model = smf.ols(formula=function, data=df_normalizado).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          custos_seguro   R-squared:                       0.750
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     998.1
Date:                Thu, 22 Feb 2024   Prob (F-statistic):               0.00
Time:                        22:14:11   Log-Likelihood:                 1227.6
No. Observations:                1338   AIC:                            -2445.
Df Residuals:                    1333   BIC:                            -2419.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept            -0.0550      0.00

## Modelling

In [13]:
# Separando variável dependente das independentes
x = df_normalizado[['idade', 'imc', 'quantidade_filhos', 'fumante']]
y = df_normalizado[['custos_seguro']]

In [14]:
# Separando os dados em base de treino e teste
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### PRIMEIRO MODELO - Regressão Linear

In [15]:
lr = LinearRegression()

lr.fit(x_train, y_train)

In [16]:
# Métricas do modelo

r_sq_lr = lr.score(x,y)
y_pred_train = lr.predict(x_train)
y_pred_test = lr.predict(x_test)

print("R-quadrado do modelo de Regressão Linear é:", r_sq_lr)
print("MAE da base treino:", metrics.mean_absolute_error(y_train, y_pred_train))
print("MAE da base teste:", metrics.mean_absolute_error(y_test, y_pred_test))

R-quadrado do modelo de Regressão Linear é: 0.7496174987090598
MAE da base treino: 0.06721131086435903
MAE da base teste: 0.06725832329337633


### SEGUNDO MODELO - Random Forest Regressor

In [17]:
rf = RandomForestRegressor()
rf.fit(x_train, y_train)

  rf.fit(x_train, y_train)


In [18]:
# Métricas do modelo

r_sq_rf = rf.score(x,y)
y_pred_train = rf.predict(x_train)
y_pred_test = rf.predict(x_test)

print("R-quadrado do modelo de Random Forest Regressor é:", r_sq_rf)
print("MAE da base treino:", metrics.mean_absolute_error(y_train, y_pred_train))
print("MAE da base teste:", metrics.mean_absolute_error(y_test, y_pred_test))

R-quadrado do modelo de Random Forest Regressor é: 0.9491167219178652
MAE da base treino: 0.016839486910664454
MAE da base teste: 0.04112601816603124


### TERCEIRO MODELO - ADA Boost Regressor

In [19]:
ada = AdaBoostRegressor()
ada.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [20]:
# Métricas do modelo

r_sq_ada = ada.score(x,y)
y_pred_train = ada.predict(x_train)
y_pred_test = ada.predict(x_test)

print("R-quadrado do modelo ADA Boost Regressor é:", r_sq_ada)
print("MAE da base treino:", metrics.mean_absolute_error(y_train, y_pred_train))
print("MAE da base teste:", metrics.mean_absolute_error(y_test, y_pred_test))

R-quadrado do modelo ADA Boost Regressor é: 0.8336166798128379
MAE da base treino: 0.06149370613165277
MAE da base teste: 0.06560174834412218


### QUARTO MODELO - Gradient Boosting Regressor

In [21]:
grb = GradientBoostingRegressor()
grb.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [22]:
# Métricas do modelo

r_sq_grb = ada.score(x,y)
y_pred_train = grb.predict(x_train)
y_pred_test = grb.predict(x_test)

print("R-quadrado do modelo Gradient Boosting Regresson é:", r_sq_grb)
print("MAE da base treino:", metrics.mean_absolute_error(y_train, y_pred_train))
print("MAE da base teste:", metrics.mean_absolute_error(y_test, y_pred_test))

R-quadrado do modelo Gradient Boosting Regresson é: 0.8336166798128379
MAE da base treino: 0.033769571376751906
MAE da base teste: 0.038348593595500816


## Tunning dos hiperparâmetros

Apesar do modelo de Random Forest Regressor ter apresentado o maior valor de R-quadrado, o Erro Absoluto Médio da base de teste foi bem divergente da base de treino, por isso foi escolhido o quarto modelo (Gradient Boosting Regressor).

In [23]:
# Definindo os parametros do modelo e rodando o GridSearchCV

parameters = {"max_depth": [5],
              "min_samples_leaf": [4],
              "min_samples_split": [2],
              "n_estimators": [200]
              }

grid_search = GridSearchCV(grb, parameters, scoring="r2", cv=5, n_jobs=-1)

In [24]:
grid_search.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [25]:
# Printando os melhores estimadores e parâmetros

print(grid_search.best_estimator_)
print(grid_search.best_params_)

GradientBoostingRegressor(max_depth=5, min_samples_leaf=4, n_estimators=200)
{'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}


In [26]:
best_model = grid_search.best_estimator_

In [27]:
best_model.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'squared_error',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 4,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 200,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [28]:
grb_tunnado = GradientBoostingRegressor(alpha= 0.9,
 ccp_alpha= 0.0,
 criterion= 'friedman_mse',
 init= None,
 learning_rate= 0.1,
 loss= 'squared_error',
 max_depth= 5,
 max_features= None,
 max_leaf_nodes= None,
 min_impurity_decrease= 0.0,
 min_samples_leaf= 4,
 min_samples_split= 2,
 min_weight_fraction_leaf= 0.0,
 n_estimators= 200,
 n_iter_no_change= None,
 random_state= None,
 subsample= 1.0,
 tol= 0.0001,
 validation_fraction= 0.1,
 verbose= 0,
 warm_start= False)

In [29]:
grb_tunnado.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


In [30]:
# Métricas do modelo pós tunning

r_sq_gbr_tunnado = grb_tunnado.score(x,y)
y_pred_train = grb_tunnado.predict(x_train)
y_pred_test = grb_tunnado.predict(x_test)

print("R-quadrado do modelo GRB após o tunning é:", r_sq_gbr_tunnado)
print("MAE da base treino:", metrics.mean_absolute_error(y_train, y_pred_train))
print("MAE da base teste:", metrics.mean_absolute_error(y_test, y_pred_test))

R-quadrado do modelo GRB após o tunning é: 0.9268249888228324
MAE da base treino: 0.025213319694021914
MAE da base teste: 0.04243946333717105
