In [2]:
import joblib
import numpy as np
import pandas as pd
import plotly.express as px

from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor

### Carga de dados

In [3]:
# Load the costs dataset from a CSV path relative to the working directory.
df_costs = pd.read_csv('./datasets/costs_cleaned.csv')

In [4]:
# Print the column names, non-null counts, dtypes and memory usage of the frame.
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   int64  
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


### Preparar dados

In [5]:
# Separate the regression target from the predictor columns.
target_col = 'medical charges'
y = df_costs[target_col]
X = df_costs.drop(columns=target_col)

In [6]:
# Load the serialized preprocessing object from disk.
# `joblib` is imported together with the other dependencies in the first cell.
# SECURITY: joblib.load unpickles the file, and unpickling can execute arbitrary
# code — only load artifacts that come from a trusted source.
preprocessor = joblib.load('./preprocessor_costs.pkl')

In [7]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)
X_train, X_test, y_train, y_test = split_parts

In [8]:
# Fit the preprocessor on the training split only, then apply that fitted
# transformation to the test split, so the test rows never take part in the fit.
# NOTE(review): both names are rebound to the transformed outputs, so running
# this cell a second time feeds already-transformed data back into the
# preprocessor — re-run from the train/test split cell instead.
# NOTE(review): fit_transform refits the object loaded from disk; presumably any
# previously fitted state in the pickle is replaced here — confirm that is intended.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [10]:
# Sanity check: shape of the transformed training matrix.
X_train.shape

(1070, 10)

In [11]:
# Sanity check: shape of the transformed test matrix.
X_test.shape

(268, 10)

### Treinar modelo

In [20]:
# Base regressors combined by the ensemble; seeded where the estimator accepts a seed.
lr_model = LinearRegression()
elastic_model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=51)
tree_model = DecisionTreeRegressor(random_state=51)

# (name, estimator) pairs; the names are the keys used later to look up each
# fitted base model through `named_estimators_`.
base_estimators = [
    ('linear regression', lr_model),
    ('elastic', elastic_model),
    ('decision', tree_model),
]

voting_model = VotingRegressor(estimators=base_estimators)

In [21]:
# Fit the voting ensemble on the transformed training data. As the last
# expression of the cell, the call's return value is rendered as the cell output.
voting_model.fit(X_train, y_train)

0,1,2
,estimators,"[('linear regression', ...), ('elastic', ...), ...]"
,weights,
,n_jobs,
,verbose,False

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False

0,1,2
,alpha,1.0
,l1_ratio,0.5
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,51

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,51
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Análise dos resultados

In [22]:
# Predict the target for the transformed test split with the fitted ensemble.
y_pred = voting_model.predict(X_test)

In [23]:
# Evaluate the ensemble on the held-out split: RMSE and R² of the test predictions.
rmse, r2 = (
    root_mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

In [16]:
# Report both metrics, one per line.
print(
    f'rmse:{rmse}',
    f'r2: {r2}',
    sep='\n',
)

rmse:6240.904728783782
r2: 0.7760046197139258


In [26]:
# Collect one importance vector per fitted base estimator.
#
# Estimators exposing `coef_` contribute absolute coefficients, which are
# expressed in target units, while estimators exposing `feature_importances_`
# contribute relative shares. Averaging those raw values lets the coefficient
# magnitudes dominate the result, so every vector is rescaled to sum to 1
# before it is collected. The averaged result is then a share of importance
# per feature, comparable across estimator types.
importances = []

for estimador in voting_model.estimators_:
    if hasattr(estimador, 'coef_'):
        raw_importance = np.abs(estimador.coef_)
    elif hasattr(estimador, 'feature_importances_'):
        raw_importance = np.asarray(estimador.feature_importances_, dtype=float)
    else:
        # Estimator exposes neither attribute: report it and leave it out.
        print(f'Nao foi possivel carregar {type(estimador).__name__}')
        continue

    total = raw_importance.sum()
    # Guard against an all-zero vector (e.g. every coefficient shrunk to zero),
    # which would otherwise turn the whole average into NaN.
    importances.append(raw_importance / total if total > 0 else raw_importance)

In [27]:
# Average the per-estimator importance vectors feature by feature.
importancia_media = np.asarray(importances).mean(axis=0)

In [28]:
# Output feature names reported by the preprocessor.
# NOTE(review): presumably aligned one-to-one with the columns of the
# transformed matrices — verify against the preprocessor definition.
feature_names = preprocessor.get_feature_names_out()

In [29]:
# Pair every feature name with its averaged importance in a two-column frame.
importance_columns = {'feature': feature_names, 'importance': importancia_media}
importance_df = pd.DataFrame(importance_columns)

In [30]:
# Order rows from least to most important (ascending is the default sort order).
importance_df = importance_df.sort_values('importance')

In [31]:
# Horizontal bar chart of the averaged feature importances. The title and axis
# labels let the figure be read on its own when the notebook is skimmed.
fig = px.bar(
    importance_df,
    x='importance',
    y='feature',
    orientation='h',
    title='Importância média das features (VotingRegressor)',
    labels={'importance': 'Importância média', 'feature': 'Feature'},
)
fig.show()

### Evidências

In [32]:
# Evidence that the ensemble averages its base regressors' predictions
# (a regressor has no hard/soft vote; the cells below compare the plain mean
# of the individual predictions with the ensemble output).

# Pick one transformed test row, kept two-dimensional (1 x n_features) so it
# can be passed straight to `predict`.
X_sample = X_test[7:8]

In [33]:
# Predict the sample with each fitted base estimator, looked up by the name it
# was registered under in the ensemble.
linear_pred, elastic_pred, tree_pred = (
    voting_model.named_estimators_[estimator_name].predict(X_sample)
    for estimator_name in ('linear regression', 'elastic', 'decision')
)


In [34]:
# Ensemble prediction for the same single-row sample.
voting_pred = voting_model.predict(X_sample)

In [35]:
# Plain arithmetic mean of the three individual predictions for the sample.
individual_preds = [pred[0] for pred in (linear_pred, elastic_pred, tree_pred)]
mean_pred = np.mean(individual_preds)

In [38]:
# Show each base prediction, then the manual mean next to the ensemble output
# so the two can be compared directly.
print(
    f'linear_pred: {linear_pred[0]}',
    f'elastic_pred: {elastic_pred[0]}',
    f'tree_pred: {tree_pred[0]}',
    f'mean_pred: {mean_pred}',
    f'voting_pred: {voting_pred[0]}',
    sep='\n',
)

linear_pred: 14793.194288532948
elastic_pred: 13782.690606341423
tree_pred: 11856.4115
mean_pred: 13477.43213162479
voting_pred: 13477.43213162479
