In [73]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from category_encoders import TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [74]:
SEED = 42

# Load

In [75]:
# criando o dataframe
DATA_DIR = 'data/insurance.csv'
df = pd.read_csv(DATA_DIR)

# visualizando o cabeçalho e printando algumas informações
print('Linhas: ', df.shape[0])
print('Colunas: ', df.shape[0])

df.head()

Linhas:  1338
Colunas:  1338


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Split Dataset

In [76]:
X = df.drop(columns=['charges'], axis=1)
y = df['charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SEED)

print("Training set shape:", X_train.shape, y_train.shape)
# print("Validation set shape:", X_val.shape, y_val.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (936, 6) (936,)
Test set shape: (402, 6) (402,)


# Pipeline

In [77]:
# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=['number']).columns.tolist()

print("Categorical columns:", categorical_cols)
print("Numerical columns:", numerical_cols)

Categorical columns: ['sex', 'smoker', 'region']
Numerical columns: ['age', 'bmi', 'children']


In [78]:
# Create the column transformer
encoder = TargetEncoder(min_samples_leaf=1)

preprocessor = ColumnTransformer(
    transformers=[
        ('target_encoder', TargetEncoder(), categorical_cols),
        ('scaler', StandardScaler(), numerical_cols)
    ])

# Training

## Dummy Model

In [79]:
from sklearn.dummy import DummyRegressor

In [80]:
dummy = DummyRegressor(strategy = "mean")

dummy_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', dummy)
])

# train it
dummy_pipeline.fit(X_train, y_train) 

In [99]:
y_test_dummy_pred = dummy_pipeline.predict(X_test)

mae = mean_absolute_error(y_test, y_test_dummy_pred)
mse = mean_squared_error(y_test, y_test_dummy_pred)
rmse = mean_squared_error(y_test, y_test_dummy_pred, squared=False)  # RMSE requires squared=False
explained_variance = explained_variance_score(y_test, y_test_dummy_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"Explained Variance Score: {explained_variance}")

MAE: 9167.957047790684
MSE: 146755286.14031094
RMSE: 12114.259619981362
Explained Variance Score: 0.0


# Grid Search

In [83]:
param_grid = {
    'model__n_estimators': [50, 100, 150, 200],
    'model__max_depth': [3, 5, 7],
    'model__learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
    'model__subsample': [0.8, 0.9, 1.0],
}

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_seed=SEED))
])
# Inicializa o GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# # Treina o modelo
grid_search.fit(X_train, y_train)


Parameters: { "random_seed" } are not used.



In [93]:
# # Obtém os melhores hiperparâmetros
best_params = grid_search.best_params_
best_params = {key.replace('model__', ''): value for key, value in best_params.items()}
best_params

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}

In [94]:
# Cria um novo modelo XGBRegressor com os melhores hiperparâmetros
model = XGBRegressor(random_state=SEED, **best_params)

# Create the full pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])


# # Define the cross-validation strategy
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# # Perform cross-validation
scores = cross_val_score(pipeline, X_train, y_train, cv=kf, scoring='r2')

# Print the cross-validation scores
print("Cross-Validation R2 scores:", scores)
print("Mean R2:", np.mean(scores))

Cross-Validation R2 scores: [0.84750056 0.82253682 0.8386977  0.88918403 0.86636039]
Mean R2: 0.8528559014436496


In [98]:
# Train the model on the entire training set
pipeline.fit(X_train, y_train)

# Predict on the validation set
y_test_pred = pipeline.predict(X_test)

# Calculate the metrics
mae = mean_absolute_error(y_test, y_test_pred)
mse = mean_squared_error(y_test, y_test_pred)
rmse = mean_squared_error(y_test, y_test_pred, squared=False)  # RMSE requires squared=False
explained_variance = explained_variance_score(y_test, y_test_pred)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"Explained Variance Score: {explained_variance}")

MAE: 2427.755551660292
MSE: 18856412.81789763
RMSE: 4342.397128073114
Explained Variance Score: 0.8718090802749328


In [100]:
import plotly.graph_objects as go
# Create a scatter plot
fig = go.Figure()

# Add the predicted values
fig.add_trace(go.Scatter(x=y_test, y=y_test_pred, mode='markers', 
                         marker=dict(color='red', size=8), name='Predicted Values'))

# Add a line for perfect prediction
fig.add_trace(go.Scatter(x=y_test.sort_values(), y=y_test.sort_values(), mode='lines', 
                         line=dict(color='blue'), name='Actual Values'))

# Update the layout
fig.update_layout(xaxis_title='Actual Values', yaxis_title='Predicted Values', title='Actual vs Predicted Values',
                  width=1000, height=600)

# Show the plot
fig.show()


In [101]:
standardized_residuals = (y_test - y_test_pred) / np.std(y_test - y_test_pred)
mean_residuals = np.mean(standardized_residuals)

fig = go.Figure()

fig.add_trace(go.Scatter(x=y_test_pred, y=standardized_residuals, mode='markers', 
                         marker=dict(color='green'), name='Resíduos Padronizados'))

fig.add_shape(type='line', x0=min(y_test_pred), y0=mean_residuals, 
              x1=max(y_test_pred), y1=mean_residuals,
              line=dict(color='red', width=2, dash='dash'), 
              name='Média dos Resíduos Padronizados')

fig.update_layout(xaxis_title='y_pred', yaxis_title='Resíduos Padronizados', 
                  title='Gráfico de Resíduos Padronizados vs. Valores Previstos')

fig.show()