# Best Lags, grid search hyperparams

In [53]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import time

# --- 1. Data loading ---
# Machine-specific absolute location of the prepared dataset (Parquet file).
caminho_saida_parquet = r'D:\DOUTORADO\DOUTORADO_NOTEBOOK_JPY/df_DocFinal2025.parquet'
# Load the full table; the following steps read its 'ph' column as the target.
df = pd.read_parquet(path=caminho_saida_parquet)

# --- 2. Função de Engenharia de Features (com a melhor combinação encontrada) ---
def create_champion_features(df, target_variable='ph', lags=None, windows=None):
    """Add lag and rolling-window features of the target column.

    For every lag ``k`` a column ``{target}_lag_{k}`` holding the target value
    ``k`` rows earlier is added. For every window ``w`` the columns
    ``{target}_rolling_mean_{w}`` and ``{target}_rolling_std_{w}`` are added.

    The rolling statistics are computed over the target shifted by one row,
    so the window for row ``t`` covers rows ``t-w .. t-1`` only. Rolling over
    the unshifted series would include the value at ``t`` itself, i.e. the
    quantity being predicted, and leak the target into the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing ``target_variable``. It is not modified.
    target_variable : str, default 'ph'
        Name of the column the features are derived from.
    lags : iterable of int, optional
        Lags to generate. Defaults to
        ``[1, 2, 3, 4, 5, 6, 7, 8, 34, 35, 36, 37, 38, 72]``.
    windows : iterable of int, optional
        Rolling window sizes. Defaults to ``[2, 4, 8]``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new columns appended; every row containing a
        NaN in any column (including the warm-up rows created by shifting and
        rolling) is dropped.
    """
    if lags is None:
        lags = [1, 2, 3, 4, 5, 6, 7, 8, 34, 35, 36, 37, 38, 72]
    if windows is None:
        windows = [2, 4, 8]

    df_features = df.copy()
    target = df_features[target_variable]

    for lag in lags:
        df_features[f'{target_variable}_lag_{lag}'] = target.shift(lag)

    # Only values strictly before the current row may feed the rolling stats.
    past_target = target.shift(1)
    for window in windows:
        rolling = past_target.rolling(window=window)
        df_features[f'{target_variable}_rolling_mean_{window}'] = rolling.mean()
        df_features[f'{target_variable}_rolling_std_{window}'] = rolling.std()

    return df_features.dropna()

# --- 3. Data preparation ---
# Build the lag / rolling feature table (rows containing NaN are dropped by the helper).
df_featured = create_champion_features(df, target_variable='ph')

# Every column other than the target is used as a model input.
TARGET = 'ph'
FEATURES = [name for name in df_featured.columns if name != 'ph']
y = df_featured[TARGET]
X = df_featured[FEATURES]

# Ordered split: the last 7 * 24 * 3 (= 504) rows are the holdout, the rest is
# used for training. Assumes the rows are in time order — TODO confirm.
train_size = len(df_featured) - (7 * 24 * 3)
train_rows = slice(None, train_size)
test_rows = slice(train_size, None)
X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

# --- 4. Grid search configuration ---
print("Configurando o Grid Search para o LightGBM...")

# Base estimator: MAE objective, up to 1000 boosting rounds, all cores, silent.
lgb_model = lgb.LGBMRegressor(
    objective='mae',
    n_estimators=1000,
    n_jobs=-1,
    verbose=-1,
    seed=42,
)

# 3 values for each of 5 hyperparameters -> 243 candidate combinations.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    num_leaves=[20, 31, 40],
    max_depth=[-1, 10, 20],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[0, 0.1, 0.5],
)

# Five-fold time-series cross-validation splitter.
tscv = TimeSeriesSplit(n_splits=5)

# Candidates are ranked by negated MAE (higher is better for scikit-learn).
grid_search = GridSearchCV(
    lgb_model,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    verbose=2,
)

# --- 5. Grid search execution ---
print("Iniciando a busca pelos melhores hiperparâmetros... Isso pode demorar.")

# Early stopping needs a monitoring set, but it must NOT be the final holdout:
# GridSearchCV forwards `eval_set` unchanged to every fold fit and to the final
# refit, so monitoring (X_test, y_test) would tune the number of boosting rounds
# on the very data used for the final score. Instead, the tail of the training
# data (same length as the holdout) is set aside as a validation block and the
# search runs on the rows before it.
# NOTE: the same validation block is used for every fold; GridSearchCV has no
# built-in way to give each fold its own early-stopping set.
n_valid = len(X_test)
if n_valid == 0 or len(X_train) <= n_valid:
    raise ValueError(
        "Not enough training rows to set aside an early-stopping validation block."
    )
X_fit, y_fit = X_train.iloc[:-n_valid], y_train.iloc[:-n_valid]
X_valid, y_valid = X_train.iloc[-n_valid:], y_train.iloc[-n_valid:]

fit_params = {
    "eval_set": [(X_valid, y_valid)],
    "callbacks": [lgb.early_stopping(100, verbose=False)]
}

start_time = time.time()
grid_search.fit(X_fit, y_fit, **fit_params)
end_time = time.time()
print(f"Grid Search concluído em {end_time - start_time:.2f} segundos.")

# --- 6. Results ---
print("\n--- Melhores Hiperparâmetros Encontrados ---")
print(grid_search.best_params_)

# The scorer is negated MAE, so flip the sign to report a plain MAE.
best_score = -grid_search.best_score_
print(f"\nMelhor MAE na Validação Cruzada: {best_score:.4f}")

# --- 7. Final evaluation on the holdout rows ---
print("\nAvaliando o melhor modelo no conjunto de teste final...")
# best_estimator_ is the model refit by GridSearchCV with the winning parameters.
best_model = grid_search.best_estimator_
final_forecast = best_model.predict(X_test)
final_mae = mean_absolute_error(y_true=y_test, y_pred=final_forecast)

print(f"\nMAE Final no Conjunto de Teste: {final_mae:.4f}")

Configurando o Grid Search para o LightGBM...
Iniciando a busca pelos melhores hiperparâmetros... Isso pode demorar.
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
[CV] END learning_rate=0.01, max_depth=-1, num_leaves=20, reg_alpha=0, reg_lambda=0; total time=   2.2s
[CV] END learning_rate=0.01, max_depth=-1, num_leaves=20, reg_alpha=0, reg_lambda=0; total time=   2.6s
[CV] END learning_rate=0.01, max_depth=-1, num_leaves=20, reg_alpha=0, reg_lambda=0; total time=   3.0s
[CV] END learning_rate=0.01, max_depth=-1, num_leaves=20, reg_alpha=0, reg_lambda=0; total time=   2.6s
[CV] END learning_rate=0.01, max_depth=-1, num_leaves=20, reg_alpha=0, reg_lambda=0; total time=   3.2s
[CV] END learning_rate=0.01, max_depth=-1, num_leaves=20, reg_alpha=0, reg_lambda=0.1; total time=   2.0s
[CV] END learning_rate=0.01, max_depth=-1, num_leaves=20, reg_alpha=0, reg_lambda=0.1; total time=   2.6s
[CV] END learning_rate=0.01, max_depth=-1, num_leaves=20, reg_alpha=0, reg_lambda=0.1; 

# Grid Search XGBoost


In [56]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import time

# --- 1. Data loading ---
# Machine-specific absolute location of the prepared dataset (Parquet file).
caminho_saida_parquet = r'D:\DOUTORADO\DOUTORADO_NOTEBOOK_JPY/df_DocFinal2025.parquet'
# Load the full table; the following steps read its 'ph' column as the target.
df = pd.read_parquet(path=caminho_saida_parquet)

# --- 2. Função de Engenharia de Features (com a melhor combinação encontrada) ---
def create_champion_features(df, target_variable='ph', lags=None, windows=None):
    """Add lag and rolling-window features of the target column.

    For every lag ``k`` a column ``{target}_lag_{k}`` holding the target value
    ``k`` rows earlier is added. For every window ``w`` the columns
    ``{target}_rolling_mean_{w}`` and ``{target}_rolling_std_{w}`` are added.

    The rolling statistics are computed over the target shifted by one row,
    so the window for row ``t`` covers rows ``t-w .. t-1`` only. Rolling over
    the unshifted series would include the value at ``t`` itself, i.e. the
    quantity being predicted, and leak the target into the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing ``target_variable``. It is not modified.
    target_variable : str, default 'ph'
        Name of the column the features are derived from.
    lags : iterable of int, optional
        Lags to generate. Defaults to
        ``[1, 2, 3, 4, 5, 6, 7, 8, 34, 35, 36, 37, 38, 72]``.
    windows : iterable of int, optional
        Rolling window sizes. Defaults to ``[2, 4, 8]``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new columns appended; every row containing a
        NaN in any column (including the warm-up rows created by shifting and
        rolling) is dropped.
    """
    if lags is None:
        lags = [1, 2, 3, 4, 5, 6, 7, 8, 34, 35, 36, 37, 38, 72]
    if windows is None:
        windows = [2, 4, 8]

    df_features = df.copy()
    target = df_features[target_variable]

    for lag in lags:
        df_features[f'{target_variable}_lag_{lag}'] = target.shift(lag)

    # Only values strictly before the current row may feed the rolling stats.
    past_target = target.shift(1)
    for window in windows:
        rolling = past_target.rolling(window=window)
        df_features[f'{target_variable}_rolling_mean_{window}'] = rolling.mean()
        df_features[f'{target_variable}_rolling_std_{window}'] = rolling.std()

    return df_features.dropna()

# --- 3. Data preparation ---
# Build the lag / rolling feature table (rows containing NaN are dropped by the helper).
df_featured = create_champion_features(df, target_variable='ph')

# Every column other than the target is used as a model input.
TARGET = 'ph'
FEATURES = [name for name in df_featured.columns if name != 'ph']
y = df_featured[TARGET]
X = df_featured[FEATURES]

# Ordered split: the last 7 * 24 * 3 (= 504) rows are the holdout, the rest is
# used for training. Assumes the rows are in time order — TODO confirm.
train_size = len(df_featured) - (7 * 24 * 3)
train_rows = slice(None, train_size)
test_rows = slice(train_size, None)
X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

# --- 4. Grid search configuration for XGBoost ---
print("Configurando o Grid Search para o XGBoost...")

# Base estimator: squared-error objective, up to 1000 rounds, stops after
# 100 rounds without improvement on the eval_set supplied at fit time.
xgb_base_params = dict(
    objective='reg:squarederror',
    n_estimators=1000,
    n_jobs=-1,
    seed=42,
    early_stopping_rounds=100,
)
xgb_model = xgb.XGBRegressor(**xgb_base_params)

# 2 x 3 x 2 x 2 = 24 candidate combinations.
param_grid_xgb = dict(
    learning_rate=[0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.7, 1.0],
    colsample_bytree=[0.7, 1.0],
)

# Five-fold time-series cross-validation splitter.
tscv = TimeSeriesSplit(n_splits=5)

# Candidates are ranked by negated MAE (higher is better for scikit-learn).
grid_search_xgb = GridSearchCV(
    xgb_model,
    param_grid_xgb,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    verbose=2,
)

# --- 5. Grid search execution ---
print("Iniciando a busca pelos melhores hiperparâmetros para o XGBoost... Isso pode demorar.")

# XGBoost's early stopping requires an eval_set in .fit(). GridSearchCV does
# NOT split it per fold: the same eval_set is forwarded unchanged to every fold
# fit and to the final refit. Monitoring (X_test, y_test) would therefore tune
# the number of boosting rounds on the data used for the final score. Instead,
# the tail of the training data (same length as the holdout) is set aside as a
# validation block and the search runs on the rows before it.
n_valid = len(X_test)
if n_valid == 0 or len(X_train) <= n_valid:
    raise ValueError(
        "Not enough training rows to set aside an early-stopping validation block."
    )
X_fit, y_fit = X_train.iloc[:-n_valid], y_train.iloc[:-n_valid]
X_valid, y_valid = X_train.iloc[-n_valid:], y_train.iloc[-n_valid:]

start_time = time.time()
grid_search_xgb.fit(X_fit, y_fit, eval_set=[(X_valid, y_valid)], verbose=False)
end_time = time.time()
print(f"Grid Search concluído em {end_time - start_time:.2f} segundos.")

# --- 6. Results ---
print("\n--- Melhores Hiperparâmetros Encontrados para o XGBoost ---")
print(grid_search_xgb.best_params_)

# The scorer is negated MAE, so flip the sign to report a plain MAE.
best_score_xgb = -grid_search_xgb.best_score_
print(f"\nMelhor MAE na Validação Cruzada: {best_score_xgb:.4f}")

# --- 7. Final evaluation on the holdout rows ---
print("\nAvaliando o melhor modelo XGBoost no conjunto de teste final...")
# best_estimator_ is the model refit by GridSearchCV with the winning parameters.
best_model_xgb = grid_search_xgb.best_estimator_
final_forecast_xgb = best_model_xgb.predict(X_test)
final_mae_xgb = mean_absolute_error(y_true=y_test, y_pred=final_forecast_xgb)

print(f"\nMAE Final no Conjunto de Teste (XGBoost Otimizado): {final_mae_xgb:.4f}")

Configurando o Grid Search para o XGBoost...
Iniciando a busca pelos melhores hiperparâmetros para o XGBoost... Isso pode demorar.
Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, subsample=0.7; total time=   0.6s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, subsample=0.7; total time=   1.0s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, subsample=0.7; total time=   0.4s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, subsample=0.7; total time=   0.2s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, subsample=1.0; total time=   0.3s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, subsample=1.0; total time=   0.4s
[CV] END colsample_bytree=0.7, learning_rate=0.05, max_depth=3, subsample=1.0; total time=   0.3s
[CV] END colsample_bytr

# Grid Search Decision Tree

In [57]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
import time

# --- 1. Data loading ---
# Machine-specific absolute location of the prepared dataset (Parquet file).
caminho_saida_parquet = r'D:\DOUTORADO\DOUTORADO_NOTEBOOK_JPY/df_DocFinal2025.parquet'
# Load the full table; the following steps read its 'ph' column as the target.
df = pd.read_parquet(path=caminho_saida_parquet)

# --- 2. Função de Engenharia de Features (com a melhor combinação encontrada) ---
def create_champion_features(df, target_variable='ph', lags=None, windows=None):
    """Add lag and rolling-window features of the target column.

    For every lag ``k`` a column ``{target}_lag_{k}`` holding the target value
    ``k`` rows earlier is added. For every window ``w`` the columns
    ``{target}_rolling_mean_{w}`` and ``{target}_rolling_std_{w}`` are added.

    The rolling statistics are computed over the target shifted by one row,
    so the window for row ``t`` covers rows ``t-w .. t-1`` only. Rolling over
    the unshifted series would include the value at ``t`` itself, i.e. the
    quantity being predicted, and leak the target into the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing ``target_variable``. It is not modified.
    target_variable : str, default 'ph'
        Name of the column the features are derived from.
    lags : iterable of int, optional
        Lags to generate. Defaults to
        ``[1, 2, 3, 4, 5, 6, 7, 8, 34, 35, 36, 37, 38, 72]``.
    windows : iterable of int, optional
        Rolling window sizes. Defaults to ``[2, 4, 8]``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new columns appended; every row containing a
        NaN in any column (including the warm-up rows created by shifting and
        rolling) is dropped.
    """
    if lags is None:
        lags = [1, 2, 3, 4, 5, 6, 7, 8, 34, 35, 36, 37, 38, 72]
    if windows is None:
        windows = [2, 4, 8]

    df_features = df.copy()
    target = df_features[target_variable]

    for lag in lags:
        df_features[f'{target_variable}_lag_{lag}'] = target.shift(lag)

    # Only values strictly before the current row may feed the rolling stats.
    past_target = target.shift(1)
    for window in windows:
        rolling = past_target.rolling(window=window)
        df_features[f'{target_variable}_rolling_mean_{window}'] = rolling.mean()
        df_features[f'{target_variable}_rolling_std_{window}'] = rolling.std()

    return df_features.dropna()

# --- 3. Data preparation ---
# Build the lag / rolling feature table (rows containing NaN are dropped by the helper).
df_featured = create_champion_features(df, target_variable='ph')

# Every column other than the target is used as a model input.
TARGET = 'ph'
FEATURES = [name for name in df_featured.columns if name != 'ph']
y = df_featured[TARGET]
X = df_featured[FEATURES]

# Ordered split: the last 7 * 24 * 3 (= 504) rows are the holdout, the rest is
# used for training. Assumes the rows are in time order — TODO confirm.
train_size = len(df_featured) - (7 * 24 * 3)
train_rows = slice(None, train_size)
test_rows = slice(train_size, None)
X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

# --- 4. Grid search configuration for the Decision Tree ---
print("Configurando o Grid Search para a Decision Tree...")

# Base estimator with a fixed random state.
dt_model = DecisionTreeRegressor(random_state=42)

# 3 x 3 x 3 x 2 = 54 candidate combinations.
param_grid_dt = dict(
    max_depth=[10, 20, None],  # None means no depth limit
    min_samples_split=[2, 10, 20],
    min_samples_leaf=[1, 5, 10],
    criterion=['squared_error', 'absolute_error'],
)

# Five-fold time-series cross-validation splitter.
tscv = TimeSeriesSplit(n_splits=5)

# Candidates are ranked by negated MAE (higher is better for scikit-learn).
grid_search_dt = GridSearchCV(
    dt_model,
    param_grid_dt,
    scoring='neg_mean_absolute_error',
    cv=tscv,
    verbose=2,
)

# --- 5. Grid search execution ---
print("Iniciando a busca pelos melhores hiperparâmetros para a Decision Tree... Isso pode demorar.")

# Wall-clock timing around the whole search (all candidates and folds, plus the refit).
start_time = time.time()
grid_search_dt.fit(X=X_train, y=y_train)
end_time = time.time()

print(f"Grid Search concluído em {end_time - start_time:.2f} segundos.")

# --- 6. Results ---
print("\n--- Melhores Hiperparâmetros Encontrados para a Decision Tree ---")
print(grid_search_dt.best_params_)

# The scorer is negated MAE, so flip the sign to report a plain MAE.
best_score_dt = -grid_search_dt.best_score_
print(f"\nMelhor MAE na Validação Cruzada: {best_score_dt:.4f}")

# --- 7. Final evaluation on the holdout rows ---
print("\nAvaliando a melhor Decision Tree no conjunto de teste final...")
# best_estimator_ is the model refit by GridSearchCV with the winning parameters.
best_model_dt = grid_search_dt.best_estimator_
final_forecast_dt = best_model_dt.predict(X_test)
final_mae_dt = mean_absolute_error(y_true=y_test, y_pred=final_forecast_dt)

print(f"\nMAE Final no Conjunto de Teste (Decision Tree Otimizada): {final_mae_dt:.4f}")

Configurando o Grid Search para a Decision Tree...
Iniciando a busca pelos melhores hiperparâmetros para a Decision Tree... Isso pode demorar.
Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END criterion=squared_error, max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=squared_error, max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=squared_error, max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=squared_error, max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=squared_error, max_depth=10, min_samples_leaf=1, min_samples_split=2; total time=   0.0s
[CV] END criterion=squared_error, max_depth=10, min_samples_leaf=1, min_samples_split=10; total time=   0.0s
[CV] END criterion=squared_error, max_depth=10, min_samples_leaf=1, min_samples_split=10; total time=   0.0s
[CV] END criterion=squared_error, max