In [1]:
import pandas as pd
import plotly.express as px
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, ElasticNet, HuberRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.svm import SVC
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor

### Carregar dados

In [3]:
df_costs = pd.read_csv('./datasets/costs_cleaned.csv')

In [4]:
df_costs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   int64  
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


### Preparação dos dados


In [5]:
#preparar dados
X = df_costs.drop(columns=['medical charges'])
y = df_costs['medical charges']

In [6]:
#carregar preprocessor
import joblib
preprocessor = joblib.load('./preprocessor_costs.pkl')

In [7]:
#dividir o dataset entre treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [8]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [9]:
X_train.shape

(1070, 10)

In [10]:
X_test.shape

(268, 10)

### Treiando modelo

In [13]:
#criar modelo regressor
#algoritmos base
lr_model = LinearRegression()
elastic_model = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=51)
tree_model = DecisionTreeRegressor(random_state=51)

#meta modelo ou meta learn
huber_model = HuberRegressor()

stacking_model = StackingRegressor(
    estimators= [
        ('liner regression', lr_model),
        ('elastic', elastic_model),
        ('decision', tree_model)
    ],
    final_estimator=huber_model,
    passthrough=False
)

In [14]:
#treinar o modelo
stacking_model.fit(X_train, y_train)

0,1,2
,estimators,"[('liner regression', ...), ('elastic', ...), ...]"
,final_estimator,HuberRegressor()
,cv,
,n_jobs,
,passthrough,False
,verbose,0

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False

0,1,2
,alpha,1.0
,l1_ratio,0.5
,fit_intercept,True
,precompute,False
,max_iter,1000
,copy_X,True
,tol,0.0001
,warm_start,False
,positive,False
,random_state,51

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,51
,max_leaf_nodes,
,min_impurity_decrease,0.0

0,1,2
,epsilon,1.35
,max_iter,100
,alpha,0.0001
,warm_start,False
,fit_intercept,True
,tol,1e-05


### Analise dos resultados

In [15]:
# Fazer as predições com base no modelo treinado
y_pred = stacking_model.predict(X_test)

In [16]:
y_pred

array([ 7695.79959724, 43370.93405492, 20538.53371989, 23306.79628201,
       38542.87992825,  9712.19066584,  7864.68467422, 12191.71869977,
        5530.59343289,  9405.74504414,  8707.54848159, 11508.65098615,
        7327.30156712,  2463.04849912,  4553.46990532, 12495.95535168,
        3100.39761182,  6935.23743426, 18344.04697192, 20101.55089448,
        4888.24901415,  6829.38484691, 53567.50549203, 10875.11501712,
        5634.50271863, 15015.90063324, 11475.01694231,  1573.00910166,
       30832.39007449, 19316.03404105,  1437.97164139, 23004.37525715,
        2486.96709226,  2806.40056811,  7060.57244398, 24988.87749613,
        7348.17486445,  1418.96078897, 11657.23837064,  7247.37658503,
       11211.77968941,  1234.95208951,  3676.24582513,  1443.33461361,
       12248.44036693, 12752.260629  , 11556.60219094, 40223.96058083,
        8302.20504107, 12560.89421782,  4744.64392107, 37488.43146823,
        8832.32557457, 46791.81649497, 18839.48431266, 33244.72604585,
      

In [17]:
#avaliar metricas do modelo
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [18]:
print(f'rmse = {rmse}')
print(f'r2 = {r2}')

rmse = 6641.236668309968
r2 = 0.7463459096735768


In [19]:
#calcular a importancia considerando o modelo de regressao
importances = []

for estimador in stacking_model.estimators_:
    if hasattr(estimador, 'coef_'):
        importances.append(np.abs(estimador.coef_))
    elif hasattr(estimador, 'feature_importances_'):
        importances.append(estimador.feature_importances_)
    else:
        print(f'Não foi possivel carregar {type(estimador).__name__}')

In [21]:
#calcular a média das importancias
importancia_media = np.mean(importances, axis=0)

In [23]:
#normalizar
feature_importance = importancia_media / np.sum(importancia_media)

In [22]:
#obter os nomes
feature_names = preprocessor.get_feature_names_out()

In [24]:
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})

In [25]:
#ordenar
importance_df = importance_df.sort_values(by='importance', ascending=True)

In [26]:
fig = px.bar(importance_df,
             x='importance',
             y= 'feature',
             orientation='h')


fig.show()

### propriedades do modelo

In [29]:
#mostrar a evidencia do stacking regressor
#selecionar uma amostra
X_sample = X_test[7].reshape(1,-1)

#predições individuais dos estimadores
linear_pred = stacking_model.named_estimators_['liner regression'].predict(X_sample)
elastic_pred = stacking_model.named_estimators_['elastic'].predict(X_sample)
tree_pred = stacking_model.named_estimators_['decision'].predict(X_sample)


stacking_pred = stacking_model.predict(X_sample)



In [30]:
print(f'Linear  = {linear_pred}')
print(f'elastic  = {elastic_pred}')
print(f'tree  = {tree_pred}')
print(f'stacking  = {stacking_pred}')

Linear  = [14793.19428853]
elastic  = [13782.69060634]
tree  = [11856.4115]
stacking  = [12191.71869977]
