# Previsão de Seguro Médico - Plano de Seguro

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [2]:
# Medical Cost Personal Dataset, read from a CSV file in the working directory.
url = 'seguros.csv'
df = pd.read_csv(url)
print(df.shape[0])  # number of rows loaded
df.head()

1338


Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,Feminino,27.9,0,sim,sul,16884.924
1,18,Masculino,33.77,1,nao,sudeste,1725.5523
2,28,Masculino,33.0,3,nao,sudeste,4449.462
3,33,Masculino,22.705,0,nao,Norte,21984.47061
4,32,Masculino,28.88,0,nao,Norte,3866.8552


# Análise Descritiva

In [3]:
# Summary statistics of the numeric columns, one row per column.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
idade,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
IMC,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
num_filhos,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
valor_seguro,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [4]:
# Class balance of the 'sexo' column.
df['sexo'].value_counts()

Masculino    676
Feminino     662
Name: sexo, dtype: int64

In [5]:
# Class balance of the 'fumante' column.
df['fumante'].value_counts()

nao    1064
sim     274
Name: fumante, dtype: int64

In [6]:
# Number of rows per region.
df['regiao'].value_counts()

sudeste     364
sul         325
Norte       325
Nordeste    324
Name: regiao, dtype: int64

In [7]:
# Pairwise Pearson correlation between the numeric columns.
# 'sexo', 'fumante' and 'regiao' are still strings at this point; recent pandas
# versions raise when corr() meets non-numeric columns instead of dropping
# them silently, so restrict the frame to numeric dtypes explicitly.
df.select_dtypes(include='number').corr()

Unnamed: 0,idade,IMC,num_filhos,valor_seguro
idade,1.0,0.109272,0.042469,0.299008
IMC,0.109272,1.0,0.012759,0.198341
num_filhos,0.042469,0.012759,1.0,0.067998
valor_seguro,0.299008,0.198341,0.067998,1.0


# Transformação de Dados

In [8]:
# Registry of label -> integer code mappings, keyed by column name.
# It starts with the encoding for 'sexo'.
dict_categorias = {'sexo': {'Feminino': 0, 'Masculino': 1}}
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1}}

In [9]:
# Show the label -> code mapping registered for the 'sexo' column.
dict_categorias['sexo']

{'Feminino': 0, 'Masculino': 1}

In [10]:
# Look up the integer code of a single label.
dict_categorias['sexo']['Masculino']

1

In [11]:
# Encode 'sexo' with the integer codes stored in dict_categorias.
# Series.map silently returns NaN for any value that is not a key of the
# mapping -- which is also what happens if this cell is executed a second time,
# because the column then already holds codes instead of the original labels.
# Validate the result before overwriting the column so the frame is never
# left full of NaN.
sexo_codes = df['sexo'].map(dict_categorias['sexo'])
if sexo_codes.isna().any():
    raise ValueError(
        "Column 'sexo' has values outside dict_categorias['sexo'] "
        "(was this cell already executed?)"
    )
df['sexo'] = sexo_codes
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,sim,sul,16884.924
1,18,1,33.77,1,nao,sudeste,1725.5523
2,28,1,33.0,3,nao,sudeste,4449.462
3,33,1,22.705,0,nao,Norte,21984.47061
4,32,1,28.88,0,nao,Norte,3866.8552


In [12]:
# Register the encoding for the 'fumante' column: 'nao' -> 0, 'sim' -> 1.
dict_categorias['fumante'] = dict(nao=0, sim=1)
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1}, 'fumante': {'nao': 0, 'sim': 1}}

In [13]:
# Encode 'fumante' with the integer codes stored in dict_categorias.
# Series.map silently returns NaN for values missing from the mapping (and on a
# second run of this cell, when the column already holds codes), so check the
# mapped values before overwriting the column.
fumante_codes = df['fumante'].map(dict_categorias['fumante'])
if fumante_codes.isna().any():
    raise ValueError(
        "Column 'fumante' has values outside dict_categorias['fumante'] "
        "(was this cell already executed?)"
    )
df['fumante'] = fumante_codes
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,1,sul,16884.924
1,18,1,33.77,1,0,sudeste,1725.5523
2,28,1,33.0,3,0,sudeste,4449.462
3,33,1,22.705,0,0,Norte,21984.47061
4,32,1,28.88,0,0,Norte,3866.8552


In [14]:
# Distinct region labels, in order of first appearance.
df['regiao'].unique()

array(['sul', 'sudeste', 'Norte', 'Nordeste'], dtype=object)

In [15]:
# Register the encoding for the 'regiao' column: each label gets the integer
# matching its position in the list below (0..3).
dict_categorias['regiao'] = dict(
    zip(['sul', 'sudeste', 'Norte', 'Nordeste'], range(4))
)
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1},
 'fumante': {'nao': 0, 'sim': 1},
 'regiao': {'sul': 0, 'sudeste': 1, 'Norte': 2, 'Nordeste': 3}}

In [16]:
# Encode 'regiao' with the integer codes stored in dict_categorias.
# Series.map silently returns NaN for values missing from the mapping (and on a
# second run of this cell, when the column already holds codes), so check the
# mapped values before overwriting the column.
# NOTE(review): the codes 0..3 give the regions a numeric order; a linear model
# will treat that order as meaningful -- consider one-hot encoding.
regiao_codes = df['regiao'].map(dict_categorias['regiao'])
if regiao_codes.isna().any():
    raise ValueError(
        "Column 'regiao' has values outside dict_categorias['regiao'] "
        "(was this cell already executed?)"
    )
df['regiao'] = regiao_codes
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


# Construção do modelo

In [18]:
# Create an (unfitted) scikit-learn LinearRegression estimator with default
# settings; it is trained further down with modelo.fit(...).
from sklearn import linear_model
modelo = linear_model.LinearRegression()

In [19]:
# Row count after the encoding steps.
df.shape[0]

1338

# X e y

In [20]:
# Feature matrix: every column except the target, in the order the model
# will expect at prediction time.
X = df.loc[:, ['idade', 'sexo', 'IMC', 'num_filhos', 'fumante', 'regiao']]
X.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao
0,19,0,27.9,0,1,0
1,18,1,33.77,1,0,1
2,28,1,33.0,3,0,1
3,33,1,22.705,0,0,2
4,32,1,28.88,0,0,2


In [21]:
# Target vector: the 'valor_seguro' column.
y = df.loc[:, 'valor_seguro']
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: valor_seguro, dtype: float64

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)
X_train.shape[0], X_test.shape[0]

(1070, 268)

## Converte em matrizes

In [24]:
# Training features as a plain NumPy array (column labels are dropped).
X_train_mat = np.asarray(X_train)
X_train_mat

array([[46.   ,  0.   , 19.95 ,  2.   ,  0.   ,  2.   ],
       [47.   ,  0.   , 24.32 ,  0.   ,  0.   ,  3.   ],
       [52.   ,  0.   , 24.86 ,  0.   ,  0.   ,  1.   ],
       ...,
       [58.   ,  1.   , 25.175,  0.   ,  0.   ,  3.   ],
       [37.   ,  0.   , 47.6  ,  2.   ,  1.   ,  0.   ],
       [55.   ,  1.   , 29.9  ,  0.   ,  0.   ,  0.   ]])

In [25]:
# Training target as a plain NumPy array; preview the first ten values.
y_train_mat = np.asarray(y_train)
y_train_mat[:10]

array([ 9193.8385 ,  8534.6718 , 27117.99378,  8596.8278 , 12475.3513 ,
       13405.3903 ,  2150.469  , 13747.87235,  6610.1097 , 39047.285  ])

In [26]:
# Fit do modelo - Treinamento

In [27]:
# Train the regressor on the training arrays. The model is fitted on plain
# arrays, so prediction inputs must use the same column order as X.
modelo.fit(X_train_mat, y_train_mat)

In [28]:
# Avaliação do modelo - Teste

In [29]:
# Predict on the held-out rows and preview the first ten predictions.
X_test_mat = X_test.to_numpy()
y_test_pred = modelo.predict(X_test_mat)
y_test_pred[:10]

array([ 8924.40724442,  7116.29501758, 36909.01352144,  9507.87469118,
       27013.3500079 , 10790.77956153,   226.29844571, 16942.71599941,
        1056.63079407, 11267.91997309])

In [30]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the test split, in squared units of 'valor_seguro';
# take the square root to read it on the same scale as the target.
mean_squared_error(y_test.to_numpy(), y_test_pred)

33635210.431178406

In [31]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) on the test split.
r2_score(y_test.to_numpy(), y_test_pred)

0.7833463107364539

# Deploy do modelo

In [32]:
type(modelo)

sklearn.linear_model._base.LinearRegression

In [33]:
modelo

In [34]:
import pickle

# Persist both deployment artifacts: the fitted regressor and the category
# encodings used to build its inputs.
for nome_arquivo, artefato in (
    ('modelo.pkl', modelo),
    ('dict_categorias.pkl', dict_categorias),
):
    with open(nome_arquivo, 'wb') as f:
        pickle.dump(artefato, f)

# Read the model back from disk to confirm the file round-trips.
with open('modelo.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

print(loaded_model)

LinearRegression()


In [35]:
# Re-score the test split with the model restored from disk; the printed MSE
# and R^2 should match the values obtained with the in-memory model.
y_test_pred = loaded_model.predict(X_test.to_numpy())
for metrica in (mean_squared_error, r2_score):
    print(metrica(y_test.to_numpy(), y_test_pred))

33635210.431178406
0.7833463107364539


In [36]:
# Reference row from the dataset (columns in training order):
#   idade  sexo  IMC     num_filhos  fumante  regiao  valor_seguro
#   32     1     28.880  0           0        2       3866.85520
# Predict for the same profile but with idade = 35.
x1 = np.array([35, 1, 28.880, 0, 0, 2])
modelo.predict(x1[np.newaxis, :])[0]

6457.656397499495

In [37]:
# Column names of the encoded frame (features followed by the target).
df.columns

Index(['idade', 'sexo', 'IMC', 'num_filhos', 'fumante', 'regiao',
       'valor_seguro'],
      dtype='object')

In [38]:
# Preview of the encoded frame.
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


In [39]:
# Final label -> code mappings, as pickled to dict_categorias.pkl above.
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1},
 'fumante': {'nao': 0, 'sim': 1},
 'regiao': {'sul': 0, 'sudeste': 1, 'Norte': 2, 'Nordeste': 3}}

In [40]:
# List the working directory to confirm the .pkl artifacts were written.
# NOTE(review): the saved output exposes the local user name and file layout;
# consider clearing it before sharing the notebook.
!ls -lah

total 856
drwxr-xr-x@ 13 valencar  staff   416B May 28 19:31 .
drwxr-xr-x@  6 valencar  staff   192B May 28 11:55 ..
drwxr-xr-x@  5 valencar  staff   160B May 28 19:28 .ipynb_checkpoints
drwxr-xr-x@  3 valencar  staff    96B May 28 11:32 .streamlit
-rw-r--r--@  1 valencar  staff   2.5K May 28 18:54 app_seguros.py
-rw-r--r--@  1 valencar  staff   140B May 28 19:32 dict_categorias.pkl
-rw-r--r--@  1 valencar  staff    54K May 28 10:09 insurance.csv
-rw-r--r--@  1 valencar  staff   499B May 28 19:32 modelo.pkl
-rw-r--r--@  1 valencar  staff    76K May 28 19:27 previsao_seguros-Dummies-var-v02.ipynb
-rw-r--r--@  1 valencar  staff    49K May 28 19:31 previsao_seguros-v03.ipynb
-rw-r--r--@  1 valencar  staff    66K May 28 10:16 previsao_seguros.ipynb
-rw-r--r--@  1 valencar  staff   108K May 28 11:40 seguro.jpeg
-rw-r--r--@  1 valencar  staff    55K May 28 10:14 seguros.csv
