In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
# Load the insurance-cost data (Medical Cost Personal Datasets) from the local CSV.
url = 'seguros.csv'
df = pd.read_csv(url)
print(df.shape[0])  # number of rows read
df.head()

1338


Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,Feminino,27.9,0,sim,sul,16884.924
1,18,Masculino,33.77,1,nao,sudeste,1725.5523
2,28,Masculino,33.0,3,nao,sudeste,4449.462
3,33,Masculino,22.705,0,nao,Norte,21984.47061
4,32,Masculino,28.88,0,nao,Norte,3866.8552


# Análise Descritiva

In [5]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
idade,1338.0,39.207025,14.04996,18.0,27.0,39.0,51.0,64.0
IMC,1338.0,30.663397,6.098187,15.96,26.29625,30.4,34.69375,53.13
num_filhos,1338.0,1.094918,1.205493,0.0,0.0,1.0,2.0,5.0
valor_seguro,1338.0,13270.422265,12110.011237,1121.8739,4740.28715,9382.033,16639.912515,63770.42801


In [9]:
# Frequency of each label in the `sexo` column.
df['sexo'].value_counts()

Masculino    676
Feminino     662
Name: sexo, dtype: int64

In [10]:
# Frequency of each label in the `fumante` column.
df['fumante'].value_counts()

nao    1064
sim     274
Name: fumante, dtype: int64

In [12]:
# Frequency of each label in the `regiao` column.
df['regiao'].value_counts()

sudeste     364
sul         325
Norte       325
Nordeste    324
Name: regiao, dtype: int64

In [13]:
# Pairwise correlation between the numeric columns.
# At this point sexo / fumante / regiao still hold text labels, so they are
# excluded explicitly with numeric_only=True. Relying on the implicit default
# emits a FutureWarning on pandas 1.5 and raises ValueError on pandas >= 2.0.
df.corr(numeric_only=True)

  df.corr()


Unnamed: 0,idade,IMC,num_filhos,valor_seguro
idade,1.0,0.109272,0.042469,0.299008
IMC,0.109272,1.0,0.012759,0.198341
num_filhos,0.042469,0.012759,1.0,0.067998
valor_seguro,0.299008,0.198341,0.067998,1.0


# Transformação de Dados

In [14]:
# Registry of category-label -> integer-code mappings, keyed by column name.
# It starts with the encoding for `sexo`; later cells add more columns.
dict_categorias = {'sexo': {'Feminino': 0, 'Masculino': 1}}
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1}}

In [15]:
# Look up the mapping registered for a single column.
dict_categorias['sexo']

{'Feminino': 0, 'Masculino': 1}

In [16]:
# Look up the integer code of a single category label.
dict_categorias['sexo']['Masculino']

1

In [17]:
# Replace the `sexo` labels with the integer codes from dict_categorias['sexo'].
# Series.map yields NaN for any value that is not a key of the mapping, so:
#   * the guard skips the cell when the column already holds only codes
#     (re-running the original cell would have turned every value into NaN);
#   * the mapped result is validated before it overwrites the column, so an
#     unknown label fails here instead of propagating NaN into the model.
sexo_map = dict_categorias['sexo']
if not df['sexo'].isin(list(sexo_map.values())).all():
    sexo_codes = df['sexo'].map(sexo_map)
    if sexo_codes.isna().any():
        unknown = df.loc[sexo_codes.isna(), 'sexo'].unique().tolist()
        raise ValueError(f"Unmapped values in column 'sexo': {unknown}")
    df['sexo'] = sexo_codes
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,sim,sul,16884.924
1,18,1,33.77,1,nao,sudeste,1725.5523
2,28,1,33.0,3,nao,sudeste,4449.462
3,33,1,22.705,0,nao,Norte,21984.47061
4,32,1,28.88,0,nao,Norte,3866.8552


In [18]:
# Register the encoding for the smoker flag: 'nao' -> 0, 'sim' -> 1.
dict_categorias['fumante'] = dict(nao=0, sim=1)
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1}, 'fumante': {'nao': 0, 'sim': 1}}

In [19]:
# Replace the `fumante` labels with the integer codes from dict_categorias['fumante'].
# Series.map yields NaN for any value that is not a key of the mapping, so:
#   * the guard skips the cell when the column already holds only codes
#     (re-running the original cell would have turned every value into NaN);
#   * the mapped result is validated before it overwrites the column, so an
#     unknown label fails here instead of propagating NaN into the model.
fumante_map = dict_categorias['fumante']
if not df['fumante'].isin(list(fumante_map.values())).all():
    fumante_codes = df['fumante'].map(fumante_map)
    if fumante_codes.isna().any():
        unknown = df.loc[fumante_codes.isna(), 'fumante'].unique().tolist()
        raise ValueError(f"Unmapped values in column 'fumante': {unknown}")
    df['fumante'] = fumante_codes
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,1,sul,16884.924
1,18,1,33.77,1,0,sudeste,1725.5523
2,28,1,33.0,3,0,sudeste,4449.462
3,33,1,22.705,0,0,Norte,21984.47061
4,32,1,28.88,0,0,Norte,3866.8552


In [20]:
# Distinct labels present in the `regiao` column.
df['regiao'].unique()

array(['sul', 'sudeste', 'Norte', 'Nordeste'], dtype=object)

In [21]:
# Register integer codes for the regions, numbered 0..3 in the order listed.
# NOTE(review): the codes are arbitrary labels, but a linear model fitted on
# this single column treats them as an ordered quantity.
dict_categorias['regiao'] = {
    nome: codigo
    for codigo, nome in enumerate(['sul', 'sudeste', 'Norte', 'Nordeste'])
}
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1},
 'fumante': {'nao': 0, 'sim': 1},
 'regiao': {'sul': 0, 'sudeste': 1, 'Norte': 2, 'Nordeste': 3}}

In [22]:
# Replace the `regiao` labels with the integer codes from dict_categorias['regiao'].
# Series.map yields NaN for any value that is not a key of the mapping, so:
#   * the guard skips the cell when the column already holds only codes
#     (re-running the original cell would have turned every value into NaN);
#   * the mapped result is validated before it overwrites the column, so an
#     unknown label fails here instead of propagating NaN into the model.
regiao_map = dict_categorias['regiao']
if not df['regiao'].isin(list(regiao_map.values())).all():
    regiao_codes = df['regiao'].map(regiao_map)
    if regiao_codes.isna().any():
        unknown = df.loc[regiao_codes.isna(), 'regiao'].unique().tolist()
        raise ValueError(f"Unmapped values in column 'regiao': {unknown}")
    df['regiao'] = regiao_codes
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


In [50]:
# One-hot encode the three categorical columns, producing indicator columns
# named <column>_<code> (e.g. sexo_0, sexo_1).
# dtype=int keeps the indicators as numeric 0/1 on every pandas version: since
# pandas 2.0 the default indicator dtype is bool, which would leave dfd with a
# mix of bool and numeric columns.
# NOTE(review): every level is kept, so each indicator group always sums to 1
# (perfectly collinear features); consider drop_first=True if the regression
# coefficients need to be interpreted.
dfd = pd.get_dummies(df, columns=['sexo', 'fumante', 'regiao'], dtype=int)
dfd.head()

Unnamed: 0,idade,IMC,num_filhos,valor_seguro,sexo_0,sexo_1,fumante_0,fumante_1,regiao_0,regiao_1,regiao_2,regiao_3
0,19,27.9,0,16884.924,1,0,0,1,1,0,0,0
1,18,33.77,1,1725.5523,0,1,1,0,0,1,0,0
2,28,33.0,3,4449.462,0,1,1,0,0,1,0,0
3,33,22.705,0,21984.47061,0,1,1,0,0,0,1,0
4,32,28.88,0,3866.8552,0,1,1,0,0,0,1,0


# Construção do modelo

In [51]:
from sklearn import linear_model
# Unfitted linear-regression estimator; later cells call .fit() on this same instance.
modelo = linear_model.LinearRegression()

In [24]:
# Number of rows in the working frame.
len(df)

1338

In [57]:
# Separate the one-hot encoded features (X) from the regression target (y).
X = dfd.drop(columns='valor_seguro')
y = dfd['valor_seguro']
X.head()

Unnamed: 0,idade,IMC,num_filhos,sexo_0,sexo_1,fumante_0,fumante_1,regiao_0,regiao_1,regiao_2,regiao_3
0,19,27.9,0,1,0,0,1,1,0,0,0
1,18,33.77,1,0,1,1,0,0,1,0,0
2,28,33.0,3,0,1,1,0,0,1,0,0
3,33,22.705,0,0,1,1,0,0,0,1,0
4,32,28.88,0,0,1,1,0,0,0,1,0


In [58]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
X_train.shape[0], X_test.shape[0]

(1003, 335)

In [59]:
# Training features as a plain NumPy array; preview the first two rows.
X_train_mat = X_train.to_numpy()
X_train_mat[:2]

array([[24.   , 23.655,  0.   ,  0.   ,  1.   ,  1.   ,  0.   ,  0.   ,
         0.   ,  1.   ,  0.   ],
       [28.   , 26.51 ,  2.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,
         1.   ,  0.   ,  0.   ]])

In [60]:
# Training target as a plain NumPy array; preview the first ten values.
y_train_mat = y_train.to_numpy()
y_train_mat[:10]

array([ 2352.96845,  4340.4409 ,  9391.346  , 42211.1382 ,  8823.279  ,
       14256.1928 ,  7133.9025 ,  5312.16985,  3906.127  ,  2203.47185])

In [61]:
# Fit the estimator on the training arrays built from the one-hot encoded frame.
modelo.fit(X_train_mat, y_train_mat)

In [62]:
# Predict on the held-out rows; preview the first ten predictions.
y_test_pred = modelo.predict(X_test.to_numpy() )
y_test_pred[:10]

array([ 8952.20855705,  7053.79716786, 36888.36911226,  9522.49758674,
       26961.84638217, 10878.24576086,    89.95864742, 17048.24399155,
        1006.23867932, 11302.09430243])

In [63]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the predictions on the held-out rows.
mean_squared_error(y_true=y_test.to_numpy(), y_pred=y_test_pred)

35117755.73613632

In [64]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions on the held-out rows.
r2_score(y_true=y_test.to_numpy(), y_pred=y_test_pred)

0.7672642952734356

# X e y

In [25]:
# Feature matrix taken from the label-encoded frame: one column per variable,
# in the order the model will later expect them.
X = df[[
    'idade',
    'sexo',
    'IMC',
    'num_filhos',
    'fumante',
    'regiao',
]]
X.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao
0,19,0,27.9,0,1,0
1,18,1,33.77,1,0,1
2,28,1,33.0,3,0,1
3,33,1,22.705,0,0,2
4,32,1,28.88,0,0,2


In [26]:
# Regression target: the insurance charge column.
y = df.loc[:, 'valor_seguro']
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: valor_seguro, dtype: float64

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
X_train.shape[0], X_test.shape[0]

(1003, 335)

## converte em matrizes

In [29]:
# Training features as a plain NumPy array (six columns, in the order selected for X).
X_train_mat = X_train.to_numpy()
X_train_mat

array([[24.   ,  1.   , 23.655,  0.   ,  0.   ,  2.   ],
       [28.   ,  0.   , 26.51 ,  2.   ,  0.   ,  1.   ],
       [51.   ,  1.   , 39.7  ,  1.   ,  0.   ,  0.   ],
       ...,
       [58.   ,  1.   , 25.175,  0.   ,  0.   ,  3.   ],
       [37.   ,  0.   , 47.6  ,  2.   ,  1.   ,  0.   ],
       [55.   ,  1.   , 29.9  ,  0.   ,  0.   ,  0.   ]])

In [34]:
# Training target as a plain NumPy array; preview the first ten values.
y_train_mat = y_train.to_numpy()
y_train_mat[:10]

array([ 2352.96845,  4340.4409 ,  9391.346  , 42211.1382 ,  8823.279  ,
       14256.1928 ,  7133.9025 ,  5312.16985,  3906.127  ,  2203.47185])

In [35]:
# Fit do modelo

In [36]:
# Fit `modelo` on the label-encoded training arrays. This is the same estimator
# object that the one-hot section also fits, so whichever fit runs last wins.
modelo.fit(X_train_mat, y_train_mat)

In [37]:
# Avaliação do modelo

In [38]:
# Predict on the held-out rows; preview the first ten predictions.
y_test_pred = modelo.predict(X_test.to_numpy() )
y_test_pred[:10]

array([ 8917.54106359,  7057.659731  , 36899.8708097 ,  9546.15877323,
       26950.91414625, 10756.81129984,   110.22869716, 17039.89853046,
         986.98094539, 11318.40928727])

In [39]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the predictions on the held-out rows.
mean_squared_error(y_true=y_test.to_numpy(), y_pred=y_test_pred)

35174149.32705306

In [40]:
from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the predictions on the held-out rows.
r2_score(y_true=y_test.to_numpy(), y_pred=y_test_pred)

0.7668905583460908

# Deploy do modelo

In [41]:
# Inspect the estimator's concrete class.
type(modelo)

sklearn.linear_model._base.LinearRegression

In [42]:
# Display the estimator object.
modelo

In [43]:
import pickle

# Persist the estimator and the category mappings side by side, so that a
# consumer can encode raw inputs with the same codes the model was fitted on.
for path, artefact in (('modelo.pkl', modelo), ('dict_categorias.pkl', dict_categorias)):
    with open(path, 'wb') as f:
        pickle.dump(artefact, f)

# Read the estimator back to check that the file round-trips.
with open('modelo.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

print(loaded_model)

LinearRegression()


In [44]:
# Score the unpickled estimator on the held-out rows: MSE first, then R^2.
y_test_pred = loaded_model.predict(X_test.to_numpy())
for metric in (mean_squared_error, r2_score):
    print(metric(y_test.to_numpy(), y_test_pred))

35174149.32705306
0.7668905583460908


In [45]:
# Single-record prediction. Values are given in the column order:
#   idade, sexo, IMC, num_filhos, fumante, regiao
# Reference row from df: 32, 1, 28.880, 0, 0, 2 -> valor_seguro 3866.85520
# NOTE(review): the array below uses idade=35, not the reference row's 32 —
# confirm this difference is intentional.
x1 = np.array([35, 1, 28.880, 0, 0, 2])
# predict() expects a 2-D (n_samples, n_features) input, hence the added leading axis.
modelo.predict(x1[np.newaxis, :])[0]

6461.687126017325

In [46]:
# Column names of the working frame.
df.columns

Index(['idade', 'sexo', 'IMC', 'num_filhos', 'fumante', 'regiao',
       'valor_seguro'],
      dtype='object')

In [47]:
# Preview the working frame after the categorical columns were encoded.
df.head()

Unnamed: 0,idade,sexo,IMC,num_filhos,fumante,regiao,valor_seguro
0,19,0,27.9,0,1,0,16884.924
1,18,1,33.77,1,0,1,1725.5523
2,28,1,33.0,3,0,1,4449.462
3,33,1,22.705,0,0,2,21984.47061
4,32,1,28.88,0,0,2,3866.8552


In [48]:
# Display the full registry of category -> code mappings.
dict_categorias

{'sexo': {'Feminino': 0, 'Masculino': 1},
 'fumante': {'nao': 0, 'sim': 1},
 'regiao': {'sul': 0, 'sudeste': 1, 'Norte': 2, 'Nordeste': 3}}

In [49]:
# List the working directory (long format, including hidden files).
!ls -lah

total 720
drwxr-xr-x@ 12 valencar  staff   384B May 28 13:32 [34m.[m[m
drwxr-xr-x@  6 valencar  staff   192B May 28 11:55 [34m..[m[m
drwxr-xr-x@  4 valencar  staff   128B May 28 13:13 [34m.ipynb_checkpoints[m[m
drwxr-xr-x@  3 valencar  staff    96B May 28 11:32 [34m.streamlit[m[m
-rw-r--r--@  1 valencar  staff   2.7K May 28 11:54 app_seguros.py
-rw-r--r--@  1 valencar  staff   140B May 28 13:30 dict_categorias.pkl
-rw-r--r--@  1 valencar  staff    54K May 28 10:09 insurance.csv
-rw-r--r--@  1 valencar  staff   499B May 28 13:30 modelo.pkl
-rw-r--r--@  1 valencar  staff    59K May 28 13:32 previsao_seguros-v02.ipynb
-rw-r--r--@  1 valencar  staff    66K May 28 10:16 previsao_seguros.ipynb
-rw-r--r--@  1 valencar  staff   108K May 28 11:40 seguro.jpeg
-rw-r--r--@  1 valencar  staff    55K May 28 10:14 seguros.csv


In [None]:
# Disabled experiment: encoding the `sexo` labels with scikit-learn's
# LabelEncoder (fit, transform, inverse_transform). Kept for reference only.
# from sklearn import preprocessing
# sexo = preprocessing.LabelEncoder()
# sexo.fit(['Masculino', 'Feminino', 'Masculino'])
# print(sexo.classes_)
# print(sexo.transform(['Masculino', 'Feminino', 'Masculino']))
# print(sexo.inverse_transform([0, 0, 1, 0, 1]))

In [None]:
# Disabled experiment: encoding `sexo` with pandas' factorize(), which returns
# the integer codes together with the index of unique labels. Kept for reference only.
# sexo , mapping_index = pd.Series(df.sexo).factorize()
# sexo
# mapping_index[0]