In [1]:
import pandas as pd
import numpy as np 
import plotly.express as px


from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

### Carga de dados

In [2]:
df_costs = pd.read_csv('./datasets/costs_cleaned.csv')

In [3]:
df_costs.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              1338 non-null   int64  
 1   sex              1338 non-null   object 
 2   bmi              1338 non-null   float64
 3   children         1338 non-null   int64  
 4   smoker           1338 non-null   int64  
 5   region           1338 non-null   object 
 6   medical charges  1338 non-null   float64
dtypes: float64(2), int64(3), object(2)
memory usage: 73.3+ KB


In [4]:
df_costs.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,medical charges
0,19,female,27.9,0,1,southwest,16884.924
1,18,male,33.77,1,0,southeast,1725.5523
2,28,male,33.0,3,0,southeast,4449.462
3,33,male,22.705,0,0,northwest,21984.47061
4,32,male,28.88,0,0,northwest,3866.8552
5,31,female,25.74,0,0,southeast,3756.6216
6,46,female,33.44,1,0,southeast,8240.5896
7,37,female,27.74,3,0,northwest,7281.5056
8,37,male,29.83,2,0,northeast,6406.4107
9,60,female,25.84,0,0,northwest,28923.13692


### Preparar dados

In [5]:
X = df_costs.drop(columns=['medical charges'])
y = df_costs['medical charges']

In [6]:
import joblib
preprocessor = joblib.load('./preprocessor_costs.pkl')

In [7]:
#dividir em treinamento e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=51)

In [8]:
#aplicar preprocessor nos dados de treinamento
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [9]:
X_train.shape

(1070, 10)

In [10]:
X_test.shape

(268, 10)

### Treinamento do modelo

In [11]:
#criar o modelo de Adaboost reg
boosting_model = AdaBoostRegressor(
    estimator=LinearRegression(),
    n_estimators=50,
    learning_rate=1.0,
    random_state=51
)

In [12]:
boosting_model.fit(X_train, y_train)

0,1,2
,estimator,LinearRegression()
,n_estimators,50
,learning_rate,1.0
,loss,'linear'
,random_state,51

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Análise dos resultados

In [14]:
#fazer predições no conj testes
y_pred = boosting_model.predict(X_test)

In [15]:
y_pred

array([10800.23473216, 37769.24929587,  4867.09652517, 12469.08381743,
       34632.11135672, 13355.96825399, 13599.5756217 , 16852.48066042,
        7174.49363363, 12593.14262104, 11480.13432879, 13947.57940409,
       11823.6915029 ,  6277.67668536,  6741.95125161, 14022.58925865,
        7752.15162778,  7468.44027107, 25796.78129254, 29483.60833705,
       13163.23155702, 10050.09798002, 33222.29917596, 14456.95033626,
        7577.71063807, 17255.15612167, 11355.73094706,  5160.65555925,
       24146.62216307,  9416.59054111,  6744.43508419, 31286.21532125,
        7963.07493631,  6655.44071248,  9094.83966498, 12800.99289499,
       15496.18299594,  4081.74934738, 13692.20982359, 10036.76978533,
       11882.87945656,  2504.06956197,  7416.8049379 ,  4226.77720349,
        5593.98909358, 16485.70336567, 17024.12355557, 35421.64221568,
        8559.92288549, 14404.95568638,  7736.66291608, 30869.01138139,
        8178.63878845, 41246.85098578,  5236.49665571, 27785.72031111,
      

In [16]:
#avaliar emtricas
rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [17]:
print(f'rmse {rmse}')
print(f'r2 {r2}')

rmse 6926.93661541702
r2 0.724052551316343


In [18]:
# calcular a importancia das features usando os coeficientes
#obter os coef de cada estimador
coefs = np.array([estimator.coef_ for estimator in boosting_model.estimators_])

In [21]:
importances = np.mean(np.abs(coefs), axis=0)

In [22]:
importances = importances / np.sum(importances)

In [23]:
#obter os nomes da fetures
features_names = preprocessor.get_feature_names_out()

In [24]:
features_names

array(['num__age', 'num__bmi', 'num__children', 'num__smoker',
       'cat__sex_female', 'cat__sex_male', 'cat__region_northeast',
       'cat__region_northwest', 'cat__region_southeast',
       'cat__region_southwest'], dtype=object)

In [25]:
#criar dataframe com features names e imortances
importance_df = pd.DataFrame({'feature': features_names, 'importance': importances})

In [26]:
#ordenar 
importance_df = importance_df.sort_values('importance', ascending=True)

In [27]:
#criar grafico de barras com a importancia
fig = px.bar(importance_df,
             x='importance',
             y='feature',
             orientation='h')
fig.show()

### Propriedades do modelo

In [28]:
#erros 
boosting_model.estimator_errors_

array([0.1325525 , 0.20039444, 0.26369109, 0.35272733, 0.42748131,
       0.41440901, 0.4657024 , 0.48312978, 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ])

In [29]:
boosting_model.estimator_weights_

array([1.87857623, 1.3838309 , 1.02687142, 0.60707232, 0.2921348 ,
       0.34576816, 0.1374062 , 0.06750651, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])