In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the original data set from the CSV file into a DataFrame and display it
insurances = pd.read_csv('insurance_migue2.csv')
insurances

Unnamed: 0,age,bmi,children,charges
0,19,27.900,0,16884.92400
1,18,33.770,1,1725.55230
2,28,33.000,3,4449.46200
3,33,22.705,0,21984.47061
4,32,28.880,0,3866.85520
5,31,25.740,0,3756.62160
6,46,33.440,1,8240.58960
7,37,27.740,3,7281.50560
8,37,29.830,2,6406.41070
9,60,25.840,0,28923.13692


In [3]:
# A bit more information about the data: summary statistics
# (count, mean, std, min, quartiles, max) for each numeric column
insurances.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [4]:
# Pairwise correlation coefficients between columns. The closer a value is to
# 1.0 or -1.0, the stronger the linear relationship, so columns that correlate
# strongly with the target are the more promising features.
insurances.corr()

Unnamed: 0,age,bmi,children,charges
age,1.0,0.109272,0.042469,0.299008
bmi,0.109272,1.0,0.012759,0.198341
children,0.042469,0.012759,1.0,0.067998
charges,0.299008,0.198341,0.067998,1.0


In [5]:
# Feature columns used for training.
# Only columns that actually exist in the loaded frame are listed: the frame
# shown above contains age, bmi, children and charges. Asking for 'sex',
# 'smoker' or 'region' here would make insurances[cols] raise a KeyError.
cols = ['age', 'bmi', 'children']
# Column to predict (kept as a list so the target stays a 2-D DataFrame)
cols_target = ['charges']

regression = linear_model.LinearRegression()
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    insurances[cols], insurances[cols_target], test_size=0.2, random_state=4)

In [6]:
# Inspect the feature rows assigned to the training split
x_train

Unnamed: 0,age,bmi,children
127,52,37.400,0
578,52,30.200,1
385,19,34.400,0
1203,51,32.300,1
413,25,23.900,5
431,29,20.235,2
707,49,28.690,3
427,18,29.165,0
574,57,34.295,2
435,60,33.110,3


In [7]:
# Inspect the feature rows held out for testing
x_test

Unnamed: 0,age,bmi,children
726,41,28.405,1
260,58,25.200,0
902,26,27.265,3
566,38,40.565,1
776,40,32.300,2
270,18,29.370,1
13,56,39.820,0
280,40,28.120,1
885,32,28.930,1
854,49,23.845,3


In [8]:
# Inspect the target values that go with the training split
y_train

Unnamed: 0,charges
127,9634.538000
578,9724.530000
385,1261.859000
1203,9964.060000
413,5080.096000
431,4906.409650
707,10264.442100
427,7323.734819
574,13224.057050
435,13919.822900


In [9]:
# Inspect the target values held out for testing
y_test

Unnamed: 0,charges
726,6664.68595
260,11837.16000
902,4661.28635
566,6373.55735
776,6986.69700
270,1719.43630
13,11090.71780
280,22331.56680
885,19719.69470
854,24106.91255


In [10]:
# Fit the model on the training split, then predict the held-out test rows
# that took no part in training. fit() returns the estimator itself, so the
# two steps can be chained.
output = regression.fit(x_train, y_train).predict(x_test)
output


array([[12926.72359055],
       [15263.02732702],
       [ 9981.02561102],
       [16613.35162369],
       [14588.87742694],
       [ 7885.35271332],
       [20072.02614293],
       [12589.50558821],
       [11007.22000242],
       [14136.16656827],
       [ 9370.99006843],
       [14907.90288778],
       [ 7836.16859437],
       [18209.01141114],
       [21687.70180549],
       [20494.14034501],
       [15254.00258242],
       [10207.21685843],
       [18640.0495648 ],
       [ 8975.74598074],
       [11949.40333894],
       [15117.35641437],
       [18580.52344296],
       [13856.1714916 ],
       [ 5693.52983219],
       [18900.5745676 ],
       [18562.26983434],
       [14296.67982123],
       [19943.73425064],
       [13455.57615763],
       [ 9046.94214833],
       [ 8001.07356365],
       [ 8246.85406265],
       [ 9084.78500854],
       [14205.92080999],
       [ 6990.30216815],
       [19677.93492001],
       [18066.11300061],
       [10468.6270061 ],
       [15004.32312447],


In [11]:
# Check prediction quality with the R^2 score on the held-out rows:
# 1.0 is a perfect fit, while a value near 0 means the model does little
# better than always predicting the mean charge.
score = r2_score(y_true=y_test, y_pred=output)
score

0.1250589316215438

In [12]:
# Use the fitted model to predict the charge for a new, unseen record.
# (The variable name is historical; the row describes an insurance customer.)
new_record = {'age': 19, 'sex': 0, 'bmi': 27.900, 'children': 0, 'smoker': 1, 'region': 0}

# Keep only the columns the model was trained on, in the training order.
# Passing extra or differently ordered columns makes predict() fail with a
# feature mismatch, so the selection is driven by x_train itself.
df_new_house_info = pd.DataFrame([new_record])[list(x_train.columns)]

output2 = regression.predict(df_new_house_info)
output2

array([[7712.28173636]])