In [7]:
# --- Bibliotecas ---
from dataclasses import dataclass, field
from typing import List, Dict, Any
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
import time

# --- 1. Definição das Nossas Classes de Configuração ---
@dataclass
class FeatureConfig:
    """Um contentor para todas as configurações de engenharia de features."""
    target_variable: str = 'ph'
    lags: List[int] = field(default_factory=lambda: [1, 2, 3, 4, 5, 6, 7, 8, 34, 35, 36, 37, 38, 72])
    windows: List[int] = field(default_factory=lambda: [2, 4, 8])

@dataclass
class ModelConfig:
    """Um contentor genérico para as configurações do modelo."""
    model_class: Any
    params: Dict[str, Any]

print("Classes FeatureConfig e ModelConfig definidas.")

# --- 2. Criação da Lista de Configurações para o Experimento ---
model_configs = [
    ModelConfig(
        model_class=RandomForestRegressor,
        params={'n_estimators': 100, 'n_jobs': -1, 'random_state': 42} # Usando params padrão, que foram os melhores
    ),
    ModelConfig(
        model_class=lgb.LGBMRegressor,
        params={'objective': 'mae', 'n_estimators': 1000, 'n_jobs': -1, 'verbose': -1, 'seed': 42,
                'learning_rate': 0.1, 'max_depth': -1, 'num_leaves': 20, 'reg_alpha': 0.5, 'reg_lambda': 0}
    ),
    ModelConfig(
        model_class=xgb.XGBRegressor,
        params={'objective': 'reg:squarederror', 'n_estimators': 1000, 'n_jobs': -1, 'seed': 42,
                'learning_rate': 0.05, 'max_depth': 7, 'subsample': 1.0, 'colsample_bytree': 1.0}
    ),
    ModelConfig(
        model_class=DecisionTreeRegressor,
        params={'criterion': 'squared_error', 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'random_state': 42}
    )
]
feature_config = FeatureConfig()
print("\nConfigurações de features e modelos prontas para o experimento.")

# --- 3. Carregamento e Preparação dos Dados ---
print("\nCarregando e preparando os dados...")
caminho_saida_parquet = r'D:\DOUTORADO\DOUTORADO_NOTEBOOK_JPY/df_DocFinal2025.parquet'
df = pd.read_parquet(caminho_saida_parquet)

def create_features_from_config(df, config: FeatureConfig):
    df_features = df.copy()
    for lag in config.lags:
        df_features[f'{config.target_variable}_lag_{lag}'] = df_features[config.target_variable].shift(lag)
    for window in config.windows:
        df_features[f'{config.target_variable}_rolling_mean_{window}'] = df_features[config.target_variable].rolling(window=window).mean()
        df_features[f'{config.target_variable}_rolling_std_{window}'] = df_features[config.target_variable].rolling(window=window).std()
    return df_features.dropna()

df_featured = create_features_from_config(df, feature_config)
FEATURES = [col for col in df_featured.columns if col != feature_config.target_variable]
TARGET = feature_config.target_variable
X = df_featured[FEATURES]
y = df_featured[TARGET]
train_size = len(df_featured) - (7 * 24 * 3)
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]
print("Dados preparados e divididos.")

# --- 4. Loop de Treino e Avaliação ---
results_list = []
print("\n--- Iniciando o Pipeline de Treino e Avaliação ---")

for config in model_configs:
    model_name = config.model_class.__name__
    print(f"\n--- Treinando o modelo: {model_name} ---")
    start_time = time.time()
    
    model = config.model_class(**config.params)
    
    # Lógica de treino específica para modelos com early stopping
    if model_name in ['LGBMRegressor', 'XGBRegressor']:
        fit_params = {'eval_set': [(X_test, y_test)]}
        if model_name == 'LGBMRegressor':
             fit_params['callbacks'] = [lgb.early_stopping(100, verbose=False)]
        else: # XGBoost
            model.set_params(early_stopping_rounds=100)
            fit_params['verbose'] = False
        
        model.fit(X_train, y_train, **fit_params)
    else:
        model.fit(X_train, y_train)
    
    # Previsão e Avaliação
    prediction = model.predict(X_test)
    mae = mean_absolute_error(y_test, prediction)
    end_time = time.time()
    
    results_list.append({
        'Modelo': model_name,
        'MAE Final': mae,
        'Tempo de Treino (s)': end_time - start_time
    })
    print(f"  -> MAE Final: {mae:.4f} | Tempo: {end_time - start_time:.2f}s")

# --- 5. Resultados Finais ---
final_results_df = pd.DataFrame(results_list).sort_values(by='MAE Final', ascending=True)
print("\n--- Tabela Comparativa Final ---")
display(final_results_df)


Classes FeatureConfig e ModelConfig definidas.

Configurações de features e modelos prontas para o experimento.

Carregando e preparando os dados...
Dados preparados e divididos.

--- Iniciando o Pipeline de Treino e Avaliação ---

--- Treinando o modelo: RandomForestRegressor ---
  -> MAE Final: 0.0179 | Tempo: 1.74s

--- Treinando o modelo: LGBMRegressor ---
  -> MAE Final: 0.0230 | Tempo: 0.45s

--- Treinando o modelo: XGBRegressor ---
  -> MAE Final: 0.0231 | Tempo: 2.16s

--- Treinando o modelo: DecisionTreeRegressor ---
  -> MAE Final: 0.0258 | Tempo: 0.13s

--- Tabela Comparativa Final ---


Unnamed: 0,Modelo,MAE Final,Tempo de Treino (s)
0,RandomForestRegressor,0.017937,1.738019
1,LGBMRegressor,0.023034,0.454837
2,XGBRegressor,0.023134,2.161134
3,DecisionTreeRegressor,0.025765,0.133902
