# Previsão de Seguro Médico - Plano de Seguro

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings(action='once')

In [2]:
# Medical Cost Personal Datasets
url = 'seguros.csv'
df = pd.read_csv(url)
print(len(df))
df.head()

1338


Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,Feminino,27.9,0,sim,sul,16884.924
1,18,Masculino,33.77,1,nao,sudeste,1725.5523
2,28,Masculino,33.0,3,nao,sudeste,4449.462
3,33,Masculino,22.705,0,nao,Norte,21984.47061
4,32,Masculino,28.88,0,nao,Norte,3866.8552


# Análise Descritiva

In [3]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
idade,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
IMC,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
num_filhos,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
valor_seguro,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [4]:
df.sexo.value_counts()

Masculino    676
Feminino     662
Name: sexo, dtype: int64

In [5]:
df.fumante.value_counts()

nao    1064
sim     274
Name: fumante, dtype: int64

In [6]:
df.regiao.value_counts()

sudeste     364
sul         325
Norte       325
Nordeste    324
Name: regiao, dtype: int64

In [7]:
df.corr()

  df.corr()


Unnamed: 0,idade,IMC,num_filhos,valor_seguro
idade,1.0,0.109272,0.042469,0.299008
IMC,0.109272,1.0,0.012759,0.198341
num_filhos,0.042469,0.012759,1.0,0.067998
valor_seguro,0.299008,0.198341,0.067998,1.0


# Transformacão de Dados

In [8]:
dict_categorias = {}

dict_categorias['sexo'] = {
        'Feminino': 0,
        'Masculino': 1
}
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1}}

In [9]:
dict_categorias['sexo']

{'Feminino': 0, 'Masculino': 1}

In [10]:
dict_categorias['sexo']['Masculino']

1

In [11]:
df['sexo'] = df.sexo.map(dict_categorias['sexo'])
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,sim,sul,16884.924
1,18,1,33.77,1,nao,sudeste,1725.5523
2,28,1,33.0,3,nao,sudeste,4449.462
3,33,1,22.705,0,nao,Norte,21984.47061
4,32,1,28.88,0,nao,Norte,3866.8552


In [12]:
dict_categorias['fumante'] = {
        'nao': 0,
        'sim': 1
}
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1}, 'fumante': {'nao': 0, 'sim': 1}}

In [13]:
df['fumante'] = df.fumante.map(dict_categorias['fumante'])
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,1,sul,16884.924
1,18,1,33.77,1,0,sudeste,1725.5523
2,28,1,33.0,3,0,sudeste,4449.462
3,33,1,22.705,0,0,Norte,21984.47061
4,32,1,28.88,0,0,Norte,3866.8552


In [14]:
df.regiao.unique()

array(['sul', 'sudeste', 'Norte', 'Nordeste'], dtype=object)

In [15]:
dict_categorias['regiao'] = {
        'sul' : 0, 
        'sudeste': 1, 
        'Norte': 2, 
        'Nordeste': 3
}
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1},
 'fumante': {'nao': 0, 'sim': 1},
 'regiao': {'sul': 0, 'sudeste': 1, 'Norte': 2, 'Nordeste': 3}}

In [16]:
df['regiao'] = df.regiao.map(dict_categorias['regiao'])
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


# construção do modelo

In [17]:
from sklearn import linear_model
modelo = linear_model.LinearRegression()

In [18]:
len(df)

1338

# X e y

In [19]:
X = df[ ['idade', 'sexo', 'IMC', 'num_filhos', 'fumante', 'regiao'] ]
X.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao
0,19,0,27.9,0,1,0
1,18,1,33.77,1,0,1
2,28,1,33.0,3,0,1
3,33,1,22.705,0,0,2
4,32,1,28.88,0,0,2


In [20]:
y = df['valor_seguro']
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: valor_seguro, dtype: float64

In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
len(X_train), len(X_test)

(1003, 335)

# Fit do modelo - Treinamento

In [41]:
modelo.fit(X_train, y_train)

# ## Métrica para avaliar o modelo
### R² - coeficiente de determinação.
É uma métrica que mede os o quanto dos futuros examplos são previstos corretamente. <br />
Varia entre 0 e 1. Quanto mais o R² se aproximar de 1, melhor a previsão. <br />
Um R² próximo de 0, não reflete o modelo.

In [42]:
X_train.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao
693,24,1,23.655,0,0,2
1297,28,0,26.51,2,0,1
634,51,1,39.7,1,0,0
1022,47,1,36.08,1,1,1
178,46,0,28.9,2,0,0


In [43]:
y_train.head()

693      2352.96845
1297     4340.44090
634      9391.34600
1022    42211.13820
178      8823.27900
Name: valor_seguro, dtype: float64

In [44]:
# Predict values


Xm = X_train.copy()
Xm['valor_seguro'] = y_train.values

Xm.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
693,24,1,23.655,0,0,2,2352.96845
1297,28,0,26.51,2,0,1,4340.4409
634,51,1,39.7,1,0,0,9391.346
1022,47,1,36.08,1,1,1,42211.1382
178,46,0,28.9,2,0,0,8823.279


In [45]:
# Calcula a hipótese - h(x)

y_pred = modelo.predict(X_train.values)

Xm['valor_seguro_previsto'] = y_pred

Xm.head()



Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro,valor_seguro_previsto
693,24,1,23.655,0,0,2,2352.96845,1841.975416
1297,28,0,26.51,2,0,1,4340.4409,4351.617965
634,51,1,39.7,1,0,0,9391.346,14096.029552
1022,47,1,36.08,1,1,1,42211.1382,35757.942474
178,46,0,28.9,2,0,0,8823.279,9535.819733


In [46]:
import numpy as np

In [47]:
Xm['Erro'] =  Xm.valor_seguro - Xm.valor_seguro_previsto
Xm['Erro'] = np.mean(Xm['Erro'])
Xm.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro,valor_seguro_previsto,Erro
693,24,1,23.655,0,0,2,2352.96845,1841.975416,3.554556e-13
1297,28,0,26.51,2,0,1,4340.4409,4351.617965,3.554556e-13
634,51,1,39.7,1,0,0,9391.346,14096.029552,3.554556e-13
1022,47,1,36.08,1,1,1,42211.1382,35757.942474,3.554556e-13
178,46,0,28.9,2,0,0,8823.279,9535.819733,3.554556e-13


In [48]:
Xm.Erro.sum()

3.5652192309498787e-10

In [None]:
# 2.5756889954209324e-09
# 1.837179297581315e-09

In [50]:
from sklearn.metrics import r2_score

r2 = r2_score(y_train, y_pred)
print('r2 = ', r2 )

r2 =  0.7449087316606229


In [51]:
X_train['Valor_Seguro_Previsto'] = y_pred
X_train['Valor_Seguro'] =  y_train
X_train['Erro'] = X_train['Valor_Seguro'] - X_train['Valor_Seguro_Previsto']

X_train.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,Valor_Seguro_Previsto,Valor_Seguro,Erro
693,24,1,23.655,0,0,2,1841.975416,2352.96845,510.993034
1297,28,0,26.51,2,0,1,4351.617965,4340.4409,-11.177065
634,51,1,39.7,1,0,0,14096.029552,9391.346,-4704.683552
1022,47,1,36.08,1,1,1,35757.942474,42211.1382,6453.195726
178,46,0,28.9,2,0,0,9535.819733,8823.279,-712.540733


## Mean Squared Error - Erro Médio Quadrado
https://en.wikipedia.org/wiki/Mean_squared_error
Quanto menor -> Melhor

![title](https://wikimedia.org/api/rest_v1/media/math/render/svg/b4647a2cc4c8f9a4c90b628faad2dcf80c4aae84)

In [55]:
from sklearn.metrics import mean_squared_error

y_test_pred = modelo.predict(X_test.values)

mean_squared_error(y_test, y_test_pred)



35174149.327053055

In [56]:
from sklearn.metrics import r2_score

r2_score(y_test, y_test_pred)

0.7668905583460909

# Deploy do modelo

In [57]:
type(modelo)

sklearn.linear_model._base.LinearRegression

In [58]:
modelo

# Salva o Modelo - DEPLOY

In [59]:
import pickle 

# salvar
with open('modelo.pkl', 'wb') as f:
    pickle.dump(modelo, f)
    
with open('dict_categorias.pkl', 'wb') as f:
    pickle.dump(dict_categorias, f)
        
# Carregar 
with open('modelo.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
    
print(loaded_model)

LinearRegression()


In [61]:
!head modelo.pkl

��t      �sklearn.linear_model._base��LinearRegression���)��}�(�fit_intercept���copy_X���n_jobs�N�positive���feature_names_in_��numpy.core.multiarray��_reconstruct����numpy��ndarray���K ��Cb���R�(KK��h�dtype����O8�����R�(K�|�NNNJ����J����K?t�b�]�(�idade��sexo��IMC��
num_filhos��fumante��regiao�et�b�n_features_in_�K�coef_�hhK ��h��R�(KK��h�f8�����R�(K�<�NNNJ����J����K t�b�C0s�Z�=p@f��k�H@�4�u@�͆�_�z@_��<��@�n�K��r@�t�b�rank_�K�	singular_�hhK ��h��R�(KK��h,�C0��"�U�{@�x��ٟg@�^��~C@�k3D�YA@ˊ�D��/@��F;t)@�t�b�
intercept_�h
�scalar���h,C���x$k�����R��_sklearn_version��1.3.0�ub.

In [62]:
!cat modelo.pkl

��t      �sklearn.linear_model._base��LinearRegression���)��}�(�fit_intercept���copy_X���n_jobs�N�positive���feature_names_in_��numpy.core.multiarray��_reconstruct����numpy��ndarray���K ��Cb���R�(KK��h�dtype����O8�����R�(K�|�NNNJ����J����K?t�b�]�(�idade��sexo��IMC��
num_filhos��fumante��regiao�et�b�n_features_in_�K�coef_�hhK ��h��R�(KK��h�f8�����R�(K�<�NNNJ����J����K t�b�C0s�Z�=p@f��k�H@�4�u@�͆�_�z@_��<��@�n�K��r@�t�b�rank_�K�	singular_�hhK ��h��R�(KK��h,�C0��"�U�{@�x��ٟg@�^��~C@�k3D�YA@ˊ�D��/@��F;t)@�t�b�
intercept_�h
�scalar���h,C���x$k�����R��_sklearn_version��1.3.0�ub.

In [60]:
y_test_pred = loaded_model.predict(X_test.values)
print(mean_squared_error(y_test.values, y_test_pred))
print(r2_score(y_test.values, y_test_pred))

35174149.327053055
0.7668905583460909




In [63]:
#  idade       sexo     IMC   num_filhos   fumante   regiao  valor_seguro
# 32            1     28.880  0            0          2      3866.85520
x1 = np.array([35,     1,   28.880, 0,        0,     2 ])
modelo.predict(x1.reshape(1, -1))[0]



6461.687126017314

In [64]:
x1

array([35.  ,  1.  , 28.88,  0.  ,  0.  ,  2.  ])

In [65]:
x1.reshape(1, -1)

array([[35.  ,  1.  , 28.88,  0.  ,  0.  ,  2.  ]])

In [66]:
modelo.predict(x1.reshape(1, -1))



array([6461.68712602])

In [67]:
modelo.predict(x1.reshape(1, -1))[0]



6461.687126017314

In [68]:
df.columns

Index(['idade', 'sexo', 'IMC', 'num_filhos', 'fumante', 'regiao',
       'valor_seguro'],
      dtype='object')

In [69]:
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


In [70]:
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1},
 'fumante': {'nao': 0, 'sim': 1},
 'regiao': {'sul': 0, 'sudeste': 1, 'Norte': 2, 'Nordeste': 3}}

In [71]:
!ls -lah

total 488K
drwxr-xr-x 5 valencar valencar 4,0K nov 30 12:20 .
drwxr-xr-x 3 valencar valencar 4,0K mai 29  2023 ..
-rw-r--r-- 1 valencar valencar 3,1K mai 29  2023 app_seguros_cloud.py
-rw-r--r-- 1 valencar valencar 2,6K mai 29  2023 app_seguros.py
-rw-r--r-- 1 valencar valencar  140 nov 30 12:18 dict_categorias.pkl
-rw-r--r-- 1 valencar valencar  55K mai 28  2023 insurance.csv
drwxr-xr-x 2 valencar valencar 4,0K mai 30  2023 .ipynb_checkpoints
-rw-r--r-- 1 valencar valencar  639 nov 30 12:18 modelo.pkl
-rw-r--r-- 1 valencar valencar   13 mai 29  2023 packages.txt
-rw-r--r-- 1 valencar valencar  76K mai 28  2023 previsao_seguros-Dummies-var-v02.ipynb
-rw-r--r-- 1 valencar valencar  66K mai 28  2023 previsao_seguros.ipynb
-rw-r--r-- 1 valencar valencar  78K nov 30 12:20 previsao_seguros-v03-02.ipynb
-rw-r--r-- 1 valencar valencar   90 mai 29  2023 requirements.txt
-rw-r--r-- 1 valencar valencar 108K mai 28  2023 seguro.jpeg
-rw-r--r-- 1 valencar valencar  55K mai 28  2023 

# Seleção de Atributos

In [72]:
list(df.columns)

['idade', 'sexo', 'IMC', 'num_filhos', 'fumante', 'regiao', 'valor_seguro']

In [74]:
%%time 

# aplicando a RFE
from sklearn.feature_selection import RFE
from sklearn import linear_model

num_atributos_relevantes = 4
estimator = modelo = linear_model.LinearRegression() #normalize = False, fit_intercept = True)
selector = RFE(estimator, n_features_to_select=num_atributos_relevantes, step=1)
selector = selector.fit(X, y)

print("Num Features: ", selector.n_features_)

print(X.columns)
print("Selected Features: ", selector.support_)
print("Feature Ranking: ", selector.ranking_)

Num Features:  4
Index(['idade', 'sexo', 'IMC', 'num_filhos', 'fumante', 'regiao'], dtype='object')
Selected Features:  [False False  True  True  True  True]
Feature Ranking:  [2 3 1 1 1 1]
CPU times: user 13.1 ms, sys: 130 µs, total: 13.3 ms
Wall time: 8.99 ms


In [75]:
dfatributos = pd.DataFrame( {'Atributo': X.columns,
                              'Importancia': selector.ranking_ })
                             
dfatributos = dfatributos.sort_values(by='Importancia', ascending=True)
dfatributos

Unnamed: 0,Atributo,Importancia
2,IMC,1
3,num_filhos,1
4,fumante,1
5,regiao,1
0,idade,2
1,sexo,3
