In [183]:
import pandas as pd
import numpy as np 
import plotly.express as px
import plotly.graph_objects as go 


from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

In [184]:
# Load the dataset from a path relative to the notebook's working directory.
df_costs = pd.read_csv('./datasets/datareg.csv')

In [185]:
# Preview the first 10 rows.
df_costs.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [186]:
# Summarise column dtypes and non-null counts.
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   object 
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


### Feature engineering

In [187]:
# Report categorical (object-dtype) columns that hold a single distinct value.
# Matching columns are only printed; nothing is dropped from df_costs.
for column in df_costs.select_dtypes(include='object').columns:
    if df_costs[column].nunique() != 1:
        continue
    print(f'Coluna {column} possui somente um valor possivel: {df_costs[column].unique()}')

In [188]:
# Show the distinct values of every categorical (object-dtype) column.
# The message describes the printed list as the column's possible values,
# since these columns may hold more than one distinct value.
for column in df_costs.select_dtypes(include=['object']).columns:
    print(f'Coluna {column} possui os seguintes valores possiveis: {df_costs[column].unique()}')

Coluna sex possui somente um valor possivel: ['female' 'male']
Coluna smoker possui somente um valor possivel: ['yes' 'no']
Coluna region possui somente um valor possivel: ['southwest' 'southeast' 'northwest' 'northeast']


In [189]:
# Percentage of missing entries in each categorical (object-dtype) column.
for column in df_costs.select_dtypes(include='object'):
    contagem_nulas = df_costs[column].isna().sum()
    print(f'{column}: {contagem_nulas/len(df_costs)*100:.2f}%')

sex: 0.00%
smoker: 0.00%
region: 0.00%


In [190]:
# Show descriptive statistics for the numeric columns.
df_costs.describe()

Unnamed: 0,age,bmi,children,medical charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [191]:
# Report numeric columns that hold a single distinct value.
# Matching columns are only printed; nothing is removed from df_costs.
for column in df_costs.select_dtypes(include=['number']).columns:
    if df_costs[column].nunique() == 1:
        print(f'Coluna {column} possui somente um valor possivel: {df_costs[column].unique()}')

In [192]:
# Percentage of missing entries in each numeric column.
for column in df_costs.select_dtypes(include='number'):
    contagem_nulas = df_costs[column].isna().sum()
    print(f'{column}: {contagem_nulas/len(df_costs)*100:.2f}%')

age: 0.00%
bmi: 0.00%
children: 0.00%
medical charges: 0.00%


In [193]:
# Encode yes/no text columns as 1/0 integers, overwriting them in df_costs.
# Only columns whose distinct values all lie in {'yes', 'no'} are touched.
for column in df_costs.select_dtypes(include='object').columns:
    valores_unicos = df_costs[column].unique()
    if not set(valores_unicos) <= {'yes', 'no'}:
        continue
    df_costs[column] = df_costs[column].map({'yes': 1, 'no': 0})

### EDA

In [194]:
# Distribution of medical charges (30 bins).
fig = px.histogram(data_frame=df_costs, x='medical charges', nbins=30)
fig.show()

In [195]:
# Distribution of age (30 bins).
fig = px.histogram(data_frame=df_costs, x='age', nbins=30)
fig.show()

In [196]:
# Distribution of the number of children.
fig = px.histogram(data_frame=df_costs, x='children')
fig.show()

In [197]:
# Distribution of BMI.
fig = px.histogram(data_frame=df_costs, x='bmi')
fig.show()

In [198]:
# Row counts per value of the 'sex' column.
fig = px.bar(data_frame=df_costs['sex'].value_counts())
fig.show()

In [199]:
# Row counts per value of the 'smoker' column.
fig = px.bar(data_frame=df_costs['smoker'].value_counts())
fig.show()

In [200]:
# Row counts per value of the 'region' column.
fig = px.bar(data_frame=df_costs['region'].value_counts())
fig.show()

In [201]:
# Medical charges broken down by age.
fig = px.box(data_frame=df_costs, y='medical charges', x='age')
fig.show()

In [202]:
# Medical charges broken down by sex.
fig = px.box(data_frame=df_costs, y='medical charges', x='sex')
fig.show()

In [203]:
# Medical charges broken down by the smoker column.
fig = px.box(data_frame=df_costs, y='medical charges', x='smoker')
fig.show()

In [204]:
# Medical charges broken down by region.
fig = px.box(data_frame=df_costs, y='medical charges', x='region')
fig.show()

In [205]:
# Pairwise correlation between the numeric columns (computed here, plotted in a later cell).
corr_matrix = df_costs.select_dtypes(include='number').corr()

In [206]:
# Display the correlation matrix.
corr_matrix

Unnamed: 0,age,bmi,children,smoker,medical charges
age,1.0,0.109272,0.042469,-0.025019,0.299008
bmi,0.109272,1.0,0.012759,0.00375,0.198341
children,0.042469,0.012759,1.0,0.007673,0.067998
smoker,-0.025019,0.00375,0.007673,1.0,0.787251
medical charges,0.299008,0.198341,0.067998,0.787251,1.0


In [207]:
# Heatmap of the correlation matrix: each cell is labelled with its coefficient
# (3 decimals) and the colour scale is pinned to the range [-1, 1].
fig = go.Figure(
    data=go.Heatmap(
        z=corr_matrix.to_numpy(),
        x=corr_matrix.columns,
        y=corr_matrix.index,
        text=corr_matrix.values,
        texttemplate='%{text:.3f}',
        zmin=-1,
        zmax=1,
    )
)

fig.show()

### Preparar dados

In [208]:
# Separate the target column ('medical charges') from the predictors.
y = df_costs['medical charges']
X = df_costs.drop('medical charges', axis=1)

In [209]:
# Split predictors by dtype: numeric columns are standardised, object columns
# are one-hot encoded (categories unseen at fit time are ignored at transform time).
# NOTE(review): the encoder keeps every category level, so the dummy columns of a
# feature sum to 1 — confirm this is intended for the linear base estimator.
categorical_features = X.select_dtypes(include=['object']).columns
numeric_features = X.select_dtypes(include=['number']).columns

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [210]:
# Hold out 20% of the rows as a test set; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=51,
    test_size=0.2,
)

In [211]:
# Fit the preprocessor on the training rows only, then transform both splits.
# The raw rows are looked up in X through the index labels of y_train / y_test
# instead of being read from X_train / X_test: this cell overwrites those two
# names with transformed arrays, so reading them back would break a second run
# of the cell (the column-name selectors need a DataFrame input).
train_rows = X.loc[y_train.index]
test_rows = X.loc[y_test.index]
X_train = preprocessor.fit_transform(train_rows)
X_test = preprocessor.transform(test_rows)

In [212]:
# Print the shapes of the transformed train and test feature matrices.
print(f'Dados treinamento {X_train.shape}')
print(f'Dados teste : {X_test.shape}')

Dados treinamento (1070, 10)
Dados teste : (268, 10)


### Treinar modelo

In [213]:
# Bagging ensemble of 10 linear regressions, seeded for reproducibility.
bagging_model = BaggingRegressor(random_state=51, n_estimators=10, estimator=LinearRegression())

In [214]:
# Train the ensemble on the preprocessed training data.
bagging_model.fit(X_train, y_train)

0,1,2
,estimator,LinearRegression()
,n_estimators,10
,max_samples,1.0
,max_features,1.0
,bootstrap,True
,bootstrap_features,False
,oob_score,False
,warm_start,False
,n_jobs,
,random_state,51

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Análise dos resultados

In [215]:
# Predict on the test set with the trained model.
y_pred = bagging_model.predict(X_test)

In [216]:
# Display the test-set predictions.
y_pred

array([ 8867.59078872, 36805.27813018,  2786.29194648, 11178.02788322,
       34023.55812248, 11612.68697707, 11555.24008631, 14975.75609284,
        5348.73841345, 10644.35007315,  9542.26478237, 12182.41562307,
        9963.18077502,  4197.02159732,  5495.5510617 , 12669.2259207 ,
        5654.27949942,  4898.19412627, 25738.02597205, 28755.29531981,
       10316.69436656,  8508.57070827, 32483.07329171, 13179.23439342,
        6165.46369588, 16089.92921178,  9917.10593579,  2575.88579687,
       23361.22140021,  8252.34064   ,  3894.4856253 , 30294.22573616,
        5737.14627997,  4710.06736037,  7789.91871347, 11150.60813685,
       13288.03609118,  2140.55532123, 12153.81159574,  7783.63881065,
        9844.16523243,   822.46724682,  5950.84135369,  2147.76049849,
        4277.11067328, 15121.58202627, 15339.42995097, 35018.97097488,
        8205.65604559, 12731.46150312,  5643.31124535, 30716.57617381,
        6984.39166773, 39973.66994033,  4436.59198834, 27572.45121014,
      

In [217]:
# Score the test-set predictions: R² and root mean squared error.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)

In [218]:
# Print the evaluation metrics.
print(f'rmse = {rmse}')
print(f'R2 = {r2}')

rmse = 6613.208775044855
R2 = 0.7484823756086139


In [219]:
# Stack the coefficient vector of every fitted base estimator: one row per estimator.
coefs = np.vstack([fitted.coef_ for fitted in bagging_model.estimators_])

In [220]:
# Display the per-estimator coefficients.
coefs

array([[ 3278.7325218 ,  2119.78428956,   264.06285745,  9961.81618111,
          285.92290426,  -285.92290426,  1130.93843057,  -100.06818715,
         -370.33276789,  -660.53747553],
       [ 3449.54081044,  2166.85764966,   551.38730585,  9409.05442415,
         -238.53484065,   238.53484065,  1108.07484778,    13.36910861,
         -260.17776898,  -861.26618741],
       [ 3595.62012901,  1775.55778489,   286.44025787,  9718.18192687,
          492.9084037 ,  -492.9084037 ,   938.90201857,   490.24189854,
         -498.12023032,  -931.02368678],
       [ 3348.48448627,  2442.5271385 ,   393.83084163,  9197.04170813,
          275.61177616,  -275.61177616,   366.9682129 ,   854.11514161,
         -659.68638038,  -561.39697413],
       [ 3396.25144842,  2167.43317023,   577.50322017,  9321.62011371,
           78.31591414,   -78.31591414,   748.8897391 ,   379.2427695 ,
        -1009.85959948,  -118.27290912],
       [ 3417.84448469,  1995.01608087,   591.75389738,  9802.87484253,
   

In [221]:
# Importance proxy: mean absolute coefficient per feature across the estimators...
feature_importance = np.abs(coefs).mean(axis=0)

# ...rescaled so that the importances sum to 1.
feature_importance = feature_importance / feature_importance.sum()

In [222]:
# Display the normalised importances.
feature_importance

array([0.18826013, 0.1133771 , 0.0296979 , 0.52434395, 0.0141138 ,
       0.0141138 , 0.03750999, 0.02108574, 0.03199751, 0.02550008])

In [223]:
# Get the output feature names from the fitted preprocessor.
feature_names = preprocessor.get_feature_names_out()

In [224]:
# Display the feature names.
feature_names

array(['num__age', 'num__bmi', 'num__children', 'num__smoker',
       'cat__sex_female', 'cat__sex_male', 'cat__region_northeast',
       'cat__region_northwest', 'cat__region_southeast',
       'cat__region_southwest'], dtype=object)

In [225]:
# Build a table pairing each feature name with its importance.
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})

# Sort ascending by importance and assign the result back to importance_df,
# so the sorted order is the one later cells read from this name.
importance_df = importance_df.sort_values('importance', ascending=True)

In [226]:
# Horizontal bar chart of the feature importances.
fig = px.bar(data_frame=importance_df, y='feature', x='importance', orientation='h')
fig.show()

### Verificar propriedades do modelo

In [227]:
# Display the training-row indices drawn for each base estimator.
bagging_model.estimators_samples_

[array([503, 347, 592, ..., 894, 379, 343]),
 array([914,  70, 844, ..., 436, 320, 822]),
 array([151, 735, 546, ..., 669, 536, 198]),
 array([405, 317,  32, ..., 592, 790, 440]),
 array([776, 345, 478, ..., 102, 934, 750]),
 array([ 514, 1037,  824, ...,  827,  842,  876]),
 array([ 350,  900, 1045, ...,  167,  341,  985]),
 array([649, 979, 314, ..., 376, 597, 985]),
 array([875, 670, 998, ..., 114, 230, 555]),
 array([ 962, 1069,  677, ..., 1051,  745,  898])]

In [228]:
# Display the feature indices used by each base estimator.
bagging_model.estimators_features_

[array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])]

In [229]:
import joblib

In [231]:
# Write df_costs to CSV without the index column. By this point the yes/no
# columns have already been overwritten with 1/0 by the conversion cell above.
df_costs.to_csv('./datasets/costs_cleaned.csv', index=False)

In [232]:
# Persist the fitted preprocessor to disk.
# NOTE(review): no visible cell saves bagging_model — confirm whether the model
# should be persisted alongside the preprocessor.
joblib.dump(preprocessor, './preprocessor_costs.pkl')

['./preprocessor_costs.pkl']