## Depend√™ncias

In [22]:
# Checagem/instala√ß√£o leve de depend√™ncias
print("Verificando depend√™ncias (pyarrow para Parquet)...")

try:
    import pyarrow as pa
    print(f"PyArrow dispon√≠vel: {pa.__version__}")
except Exception:
    print("Instalando pyarrow...")
    !pip install --upgrade "pyarrow>=18" --quiet
    import importlib
    importlib.invalidate_caches()
    import pyarrow as pa
    print(f"PyArrow instalado: {pa.__version__}")

# fastparquet √© opcional
try:
    import fastparquet  # noqa: F401
    print("fastparquet dispon√≠vel (opcional)")
except Exception:
    pass

# Outras bibliotecas sob demanda
for lib in [
    "numpy", "python-dotenv", "pandas", "matplotlib", "seaborn",
    "scikit-learn", "tensorflow", "keras", "lxml", "pytz", "requests", "optuna"
]:
    try:
        __import__(lib)
    except ImportError:
        print(f"Instalando {lib}...")
        !pip install {lib} --quiet

print("Depend√™ncias prontas")

Verificando depend√™ncias (pyarrow para Parquet)...
PyArrow dispon√≠vel: 21.0.0
Instalando python-dotenv...
Instalando scikit-learn...
Instalando scikit-learn...



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Depend√™ncias prontas



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## VARI√ÅVEIS NECESS√ÅRIAS

In [23]:
# Imports para a API e utilidades
import os
import requests
import pandas
from dotenv import load_dotenv
from datetime import datetime, timedelta, date
import pytz
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import mixed_precision
import tensorflow as tf

# ==============================================
# GPU CONFIGURATION
# ==============================================
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    mixed_precision.set_global_policy('mixed_float16')
    print(f"‚úÖ GPU detected ({gpus[0].name}) - using mixed precision.")
else:
    print("‚ö†Ô∏è No GPU detected, running on CPU.")

# Carregar vari√°veis de ambiente do .env
load_dotenv()
# ---------------- CONFIG ---------------- #
COUNTRY_DOMAINS = {
    "FR": {"domain": "10YFR-RTE------C"},
    "ES": {"domain": "10YES-REE------0"},
    "PT": {"domain": "10YPT-REN------W"}
}

DATA_ITEMS = [
    {'key': 'load_total', 'documentType': 'A65', 'processType': 'A16', 'domainParam': 'outBiddingZone_Domain', 'parser': 'load'},
    {'key': 'market_prices', 'documentType': 'A44', 'processType': 'A07', 'domainParamIn': 'in_Domain', 'domainParamOut': 'out_Domain', 'parser': 'price'}
]

ENTSOE_TOKEN = os.environ.get("ENTSOE_SECURITY_TOKEN")
BASE_URL = "https://web-api.tp.entsoe.eu/api"
MAX_WORKERS = 100
RAW_DIR = os.path.join("data", "raw")
PARQUET_COMPRESSION = "zstd"
os.makedirs(RAW_DIR, exist_ok=True)

‚ö†Ô∏è No GPU detected, running on CPU.


# Cap√≠tulo 1: Pr√©processamento de dados

Etapa de contru√ß√£o da pipelines de pre-processamento de dados


## Classe geral de preprocessamento

In [24]:
import os
from typing import Optional, List, Tuple
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import numpy as np
import time

class Preprocessor:
    """Pr√©-processador base.

    - lag/lead como inteiros s√£o expandidos para ranges [1..N] quando apropriado.
    - feature_cols/target_cols definem bases permitidas e servem como sele√ß√£o no export.
    - Nenhuma coluna √© removida dos dados; sele√ß√£o ocorre apenas na exporta√ß√£o.
    """
    def __init__(
        self,
        lag: int,
        lead: int,
        country_list: Optional[List[str]] = None,
        *,
        model_name: str = "linear",
        data_dir: str = "data/processed",
        feature_cols: Optional[List[str]] = None,
        target_cols: Optional[List[str]] = None,
    ):
        self.lag = lag
        self.lead = lead
        self.country_list = country_list
        self.model_name = model_name
        self.data_dir = data_dir
        self.save_dir = os.path.join(self.data_dir, self.model_name)
        os.makedirs(self.save_dir, exist_ok=True)

        self.feature_cols: List[str] = list(feature_cols) if feature_cols else []
        self.target_cols: List[str] = list(target_cols) if target_cols else []

        self.norm_objects = {}
        self.encod_objects = {}
        self.df_base = pd.DataFrame()

    def _expand_steps(self, steps, default_max: Optional[int]) -> List[int]:
        """Normaliza passos: int‚Üí[1..N], None‚Üí[1..default_max], lista‚Üícomo est√°."""
        if isinstance(steps, int):
            return list(range(1, steps + 1)) if steps > 0 else [1]
        if steps is None and isinstance(default_max, int) and default_max > 0:
            return list(range(1, default_max + 1))
        if isinstance(steps, (list, tuple)):
            return list(steps)
        return [1]

    def load_data(self, raw_dir: Optional[str] = None) -> pd.DataFrame:
        """Carrega Parquet unificado em data/raw (ou raw_dir) e atualiza self.df_base."""
        base_raw = raw_dir or os.path.join('data', 'raw')
        unified_path = os.path.join(base_raw, f'raw_dataset.parquet')
        if not os.path.exists(unified_path):
            raise FileNotFoundError(f"Arquivo unificado n√£o encontrado: {unified_path}. Execute a coleta primeiro.")
        df = pd.read_parquet(unified_path, engine='pyarrow')
        if 'datetime' in df.columns:
            df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
        if self.country_list and 'country' in df.columns:
            df = df[df['country'].isin(self.country_list)].copy()
        sort_cols = [c for c in ['country', 'datetime'] if c in df.columns]
        if sort_cols:
            df = df.sort_values(sort_cols).reset_index(drop=True)
            
        # Filtrando Colunas apenas para as necess√°rias
        cols = list(set([c for c in self.feature_cols + self.target_cols if c in df.columns]))
        df = df.loc[:, ~df.columns.duplicated()]  # optional: remove duplicates
        df = df[cols]

        self.df_base = df
        return self.df_base

    def encode(self, encode_cols: str = 'datetime', encode_method: str = 'label') -> pd.DataFrame:
        """Codifica de forma n√£o destrutiva e atualiza self.df_base.

        - label: usa LabelEncoder com suporte a NaN via placeholder interno que √© revertido no decode.
        - time_cycle: adiciona features de calend√°rio e c√≠clicas sem remover datetime.
        """
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Chame load_data() primeiro.")
            return self.df_base
        df = self.df_base.copy()
        if encode_method == 'label':
            le = LabelEncoder()
            s = df[encode_cols].astype(object)
            le.fit(s)
            df[encode_cols] = le.transform(s)
            # salva metadados incluindo o code do NaN
            self.encod_objects['label'] = {
                'encode_cols': encode_cols,
                'label_encoder': le,
            }
        elif encode_method == 'time_cycle':
            if encode_cols not in df.columns:
                print(f"Coluna {encode_cols} n√£o encontrada para time_cycle.")
                self.df_base = df
                return df
            dt = pd.to_datetime(df[encode_cols], utc=True)
            # Mant√©m a coluna original e adiciona componentes discretos e c√≠clicos
            df['year'] = dt.dt.year
            df['month'] = dt.dt.month
            df['day'] = dt.dt.day
            df['hour'] = dt.dt.hour
            df['minute'] = dt.dt.minute
            current_year = time.localtime().tm_year
            df['year_sin'] = np.sin(2 * np.pi * df['year'] / max(current_year, 1))
            df['year_cos'] = np.cos(2 * np.pi * df['year'] / max(current_year, 1))
            df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
            df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
            df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)
            df['day_cos'] = np.cos(2 * np.pi * df['day'] / 31)
            df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
            df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
            df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
            df['minute_cos'] = np.cos(2 * np.pi * df['minute'] / 60)
            self.encod_objects['time_cycle'] = {'encode_cols': encode_cols}
            self.feature_cols.extend(["year_sin", "year_cos",
                                                     "month_sin", "month_cos",
                                                     "day_sin", "day_cos",
                                                     "hour_sin", "hour_cos",
                                                     "minute_sin", "minute_cos"])
        else:
            print(f"encode_method '{encode_method}' n√£o suportado.")
        self.df_base = df
        return self.df_base

    def decode(self, encode_method: str = 'label', target_col: Optional[str] = None) -> pd.DataFrame:
        """Reverte codifica√ß√µes suportadas (label, time_cycle)."""
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Nada para decodificar.")
            return self.df_base
        df = self.df_base.copy()
        if encode_method == 'label':
            info = self.encod_objects.get('label')
            if not info:
                print("Nenhuma informa√ß√£o de label encoding salva.")
                return self.df_base
            col = info['encode_cols']
            le: LabelEncoder = info['label_encoder']
            placeholder = info.get('na_placeholder', '__NA__')
            try:
                inv = le.inverse_transform(df[col].astype(int))
                # mapeia placeholder de volta para NaN
                inv = pd.Series(inv).replace(placeholder, np.nan).values
                df[col] = inv
            except Exception as e:
                print(f"Falha ao decodificar label para coluna {col}: {e}")
        elif encode_method == 'time_cycle':
            if 'year' not in df.columns:
                print("Componentes de tempo ausentes para reconstru√ß√£o.")
                return self.df_base
            tgt = target_col or 'decoded_datetime'
            def _recover_component(sin_col, cos_col, period, offset):
                if sin_col not in df.columns or cos_col not in df.columns:
                    return pd.Series([np.nan] * len(df))
                ang = np.arctan2(df[sin_col], df[cos_col])
                ang = (ang + 2 * np.pi) % (2 * np.pi)
                idx = np.round((ang / (2 * np.pi)) * period).astype('Int64') % period
                return idx + offset
            month = _recover_component('month_sin', 'month_cos', 12, 1)
            day = _recover_component('day_sin', 'day_cos', 31, 1)
            hour = _recover_component('hour_sin', 'hour_cos', 24, 0)
            minute = _recover_component('minute_sin', 'minute_cos', 60, 0)
            year = df['year'] if 'year' in df.columns else pd.Series([np.nan] * len(df))
            dt = pd.to_datetime({
                'year': year.astype('Int64'),
                'month': month.astype('Int64'),
                'day': day.astype('Int64'),
                'hour': hour.astype('Int64'),
                'minute': minute.astype('Int64'),
            }, errors='coerce', utc=True)
            df[tgt] = dt
        else:
            print(f"encode_method '{encode_method}' n√£o suportado para decode.")
        self.df_base = df
        return self.df_base

    def normalize(self, value_cols: List[str], normalization_method: str = 'minmax') -> pd.DataFrame:
        """Normaliza colunas e atualiza self.df_base."""
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Chame load_data() primeiro.")
            return self.df_base
        df = self.df_base.copy()
        scaler = MinMaxScaler() if normalization_method == 'minmax' else (
            StandardScaler() if normalization_method == 'standard' else None)
        if scaler is None:
            raise ValueError("normalization_method deve ser 'minmax' ou 'standard'")
        df[value_cols] = scaler.fit_transform(df[value_cols])
        self.norm_objects[normalization_method] = {'value_cols': value_cols, 'scaler': scaler}
        self.df_base = df
        return self.df_base

    def normalize_splits(self, value_cols: List[str], normalization_method: str = 'minmax') -> dict:
        """Normaliza os conjuntos de treino, valida√ß√£o e teste."""
        if not self.splits:
            print("Nenhum conjunto dividido encontrado.")
            return {}
        normalized_splits = {}
        for split_name, split_df in self.splits.items():
            self.df_base = split_df
            normalized_df = self.normalize(value_cols=value_cols, normalization_method=normalization_method)
            normalized_splits[split_name] = normalized_df
        self.splits = normalized_splits
        return normalized_splits

    def denormalize(self, normalization_method: str = 'minmax') -> pd.DataFrame:
        """Reverte normaliza√ß√£o usando metadados salvos."""
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Nada para denormalizar.")
            return self.df_base
        info = self.norm_objects.get(normalization_method)
        if not info:
            print(f"Nenhum scaler salvo para o m√©todo '{normalization_method}'.")
            return self.df_base
        cols: List[str] = info['value_cols']
        scaler = info['scaler']
        df = self.df_base.copy()
        try:
            df[cols] = scaler.inverse_transform(df[cols])
        except Exception as e:
            print(f"Falha ao denormalizar colunas {cols}: {e}")
            return self.df_base
        self.df_base = df
        return self.df_base

    def save_df_base(self, filename: Optional[str] = None, compression: Optional[str] = None, partition_by: Optional[List[str]] = None) -> Optional[str]:
        """Salva self.df_base em Parquet dentro de data_dir/{model_name}."""
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Nada para salvar.")
            return None
        comp = compression
        if comp is None:
            try:
                comp = PARQUET_COMPRESSION
            except NameError:
                comp = 'zstd'
        filename = "raw_dataset.parquet"
        out_path = os.path.join(self.save_dir, filename)
        df = self.df_base.copy()
        if 'datetime' in df.columns:
            df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
        try:
            if partition_by:
                df.to_parquet(out_path, engine='pyarrow', compression=comp, index=False, partition_cols=partition_by)
            else:
                df.to_parquet(out_path, engine='pyarrow', compression=comp, index=False)
            print(f"[SALVO] df_base: {len(df):,} linhas ‚Üí {out_path}")
            return out_path
        except Exception as e:
            print(f"Falha ao salvar df_base em {out_path}: {e}")
            return None
    
    def split_train_val_test(self, train_size: float = 0.7, val_size: float = 0.15, test_size: float = 0.15, time_col: str = 'datetime') -> Optional[dict]:
        """Divide df_base em conjuntos de treino, valida√ß√£o e teste com base em time_col."""
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Nada para dividir.")
            return None
        if not np.isclose(train_size + val_size + test_size, 1.0):
            print("train_size, val_size e test_size devem somar 1.0")
            return None
        df = self.df_base.copy()
        if time_col not in df.columns:
            print(f"Coluna de tempo '{time_col}' n√£o encontrada em df_base.")
            return None
        df = df.sort_values(time_col).reset_index(drop=True)
        n = len(df)
        train_end = int(n * train_size)
        val_end = train_end + int(n * val_size)
        splits = {
            'train': df.iloc[:train_end].reset_index(drop=True),
            'val': df.iloc[train_end:val_end].reset_index(drop=True),
            'test': df.iloc[val_end:].reset_index(drop=True),
        }
        for split_name, split_df in splits.items():
            print(f"[DIVIDIDO] {split_name}: {len(split_df):,} linhas")
        self.splits = splits
        return splits

## Preprocessamento do Modelo Linear

Esse modelo deve ser√° contruido a partir de lags e leads passados como par√¢metros na fun√ß√£o, resultando na contru√ß√£o de novas colunas lead lag, assim gerando uma flat matrix 2D que ser√° usada no modelo linear

Observa√ß√£o importante: lag e lead s√£o inteiros e representam o m√°ximo de passos; o pipeline expande para intervalos 1..N automaticamente. Por exemplo, lag=96 gera features com defasagens de 1 a 96; lead=96 gera alvos de 1 a 96.

Os arquivos do modelo ser√£o salvos em TFrecords j√° que o modelo linear ser√° contru√≠do usando tensor flow

In [25]:

import os, json
import numpy as np
import pandas as pd
import tensorflow as tf
from typing import List, Optional, Dict, Any

class LSTMPreprocessor(Preprocessor):
    """Pr√©-processador sequencial para LSTM: gera janelas 3D (N, seq_len, features)."""

    def build_sequence_matrix(
        self,
        value_cols: Optional[List[str]] = None,
        target_cols: Optional[List[str]] = None,
        seq_len: Optional[int] = None,
        lead: Optional[int] = None,
        group_cols: Optional[List[str]] = None,
        time_col: str = "datetime",
        drop_last_incomplete: bool = True,
    ) -> Dict[str, np.ndarray]:
        """
        Constr√≥i tensores X (entradas) e Y (alvos) para modelo LSTM.
        Cada grupo (ex.: pa√≠s) √© processado separadamente e concatenado.
        """
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Chame load_data() primeiro.")
            return {}

        df = self.df_base.copy()
        feats = value_cols or self.feature_cols
        tgts = target_cols or self.target_cols
        if not feats:
            raise ValueError("Nenhuma coluna de feature informada.")
        if not tgts:
            raise ValueError("Nenhum target informado.")

        if time_col not in df.columns:
            raise ValueError(f"Coluna temporal '{time_col}' n√£o encontrada.")

        group_cols = group_cols or [c for c in ["country"] if c in df.columns]
        sort_cols = (group_cols or []) + [time_col]
        df = df.sort_values(sort_cols).reset_index(drop=True)

        if group_cols:
            df["_group_id"] = df[group_cols].astype(str).agg("_".join, axis=1)
        else:
            df["_group_id"] = "global"

        seq_len = seq_len or getattr(self, "seq_len", 24)
        lead = lead or getattr(self, "lead", 1)

        X_list, Y_list = [], []
        for gid, g in df.groupby("_group_id", sort=False):
            g = g.reset_index(drop=True)
            if len(g) < seq_len + lead:
                continue
            X_src = g[feats].to_numpy(np.float32)
            Y_src = g[tgts].to_numpy(np.float32)
            for i in range(len(g) - seq_len - lead + 1):
                x_win = X_src[i:i+seq_len]
                y_val = Y_src[i+seq_len+lead-1]
                X_list.append(x_win)
                Y_list.append(y_val)

        if not X_list:
            print("[WARN] Nenhuma janela gerada.")
            return {}

        X = np.stack(X_list)
        Y = np.stack(Y_list)
        print(f"[JANELAS] X={X.shape}, Y={Y.shape}")
        self._seq_data = dict(X=X, Y=Y, seq_len=seq_len, x_dim=X.shape[-1], y_dim=Y.shape[-1])
        return self._seq_data

    def save_sequence_tfrecords(
        self,
        output_basename: str = 'lstm_dataset',
        shard_size: int = 50_000,
        compression: str = 'GZIP',
    ) -> Optional[List[str]]:
        """Salva janelas (X,Y) como TFRecords comprimidos."""
        if not hasattr(self, "_seq_data"):
            print("Nenhum dado sequencial encontrado. Execute build_sequence_matrix() antes.")
            return None

        X, Y = self._seq_data["X"], self._seq_data["Y"]
        seq_len, x_dim, y_dim = self._seq_data["seq_len"], self._seq_data["x_dim"], self._seq_data["y_dim"]
        n = len(X)
        os.makedirs(self.save_dir, exist_ok=True)

        options = tf.io.TFRecordOptions(compression_type=compression)
        paths = []

        def _bytes_feature(arr: np.ndarray) -> tf.train.Feature:
            return tf.train.Feature(bytes_list=tf.train.BytesList(value=[arr.tobytes()]))

        for shard_idx, start in enumerate(range(0, n, shard_size)):
            end = min(start + shard_size, n)
            shard_path = os.path.join(self.save_dir, f"{output_basename}_{shard_idx}.tfrecord")
            with tf.io.TFRecordWriter(shard_path, options=options) as w:
                for i in range(start, end):
                    ex = tf.train.Example(features=tf.train.Features(feature={
                        'x_raw': _bytes_feature(X[i]),
                        'y_raw': _bytes_feature(Y[i]),
                    }))
                    w.write(ex.SerializeToString())
            paths.append(shard_path)

        meta = {
            'seq_len': seq_len,
            'x_dim': x_dim,
            'y_dim': y_dim,
            'compression': compression,
            'count': int(n),
            'basename': output_basename,
        }
        with open(os.path.join(self.save_dir, f"{output_basename}.meta.json"), 'w') as f:
            json.dump(meta, f, indent=2)

        print(f"[‚úÖ] TFRecords salvos ({len(paths)} shards) em {self.save_dir}")
        return paths

    @staticmethod
    def parse_tfrecord(example_proto, seq_len:int, x_dim:int, y_dim:int):
        """Fun√ß√£o para leitura dos TFRecords salvos."""
        features = {
            'x_raw': tf.io.FixedLenFeature([], tf.string),
            'y_raw': tf.io.FixedLenFeature([], tf.string),
        }
        parsed = tf.io.parse_single_example(example_proto, features)
        x = tf.io.decode_raw(parsed['x_raw'], tf.float32)
        y = tf.io.decode_raw(parsed['y_raw'], tf.float32)
        x = tf.reshape(x, [seq_len, x_dim])
        y = tf.reshape(y, [y_dim])
        return x, y

    @staticmethod
    def load_sequence_dataset(
        path_pattern: str,
        seq_len: int,
        x_dim: int,
        y_dim: int,
        batch_size: int = 256,
        compression: str = 'GZIP'
    ) -> tf.data.Dataset:
        """Carrega os TFRecords como dataset pronto para treino."""
        files = tf.io.gfile.glob(path_pattern)
        ds = tf.data.TFRecordDataset(files, compression_type=compression)
        ds = ds.map(lambda ex: LSTMPreprocessor.parse_tfrecord(ex, seq_len, x_dim, y_dim),
                    num_parallel_calls=tf.data.AUTOTUNE)
        ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
        print(f"[DATASET] {len(files)} shards carregados ‚Üí batch_size={batch_size}")
        return ds


# Cap√≠tulo 2 ‚Äî Constru√ß√£o dos Modelos

A seguir, definimos construtores simples e eficientes para cada modelo (Linear, LSTM, TFT e TimesFM),
prontos para uso em rotinas de otimiza√ß√£o de hiperpar√¢metros (por exemplo, Optuna). Cada construtor
recebe um dicion√°rio de par√¢metros (`params`) e retorna um modelo compilado.

## Constru√ß√£o do Modelo Linear

Objetivo: um regressor denso simples (MLP) para prever `target_cols` a partir de `feature_cols`.

Contrato r√°pido:
- Entrada: vetor de tamanho `x_dim` (n√∫mero de features)
- Sa√≠da: vetor de tamanho `y_dim` (n√∫mero de targets)
- Par√¢metros (exemplos): hidden_units, activation, dropout, lr, l2

In [26]:
from typing import Dict, Any
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


def build_lstm_model(seq_len: int, x_dim: int, y_dim: int, params: Dict[str, Any]) -> keras.Model:
    """
    LSTM para regress√£o multivariada temporal com suporte a m√°scara para valores nulos/padding.

    - As entradas devem conter `NaN` ou um valor sentinel (ex.: 0.0) para timesteps a mascarar.
    - Camadas LSTM automaticamente ignoram esses timesteps durante o treinamento.
    """

    lstm_units = params.get('lstm_units', [128, 64])
    dense_units = params.get('dense_units', [128])
    dropout = float(params.get('dropout', 0.1))
    rec_dropout = float(params.get('rec_dropout', 0.0))
    act = params.get('act', 'relu')
    lr = float(params.get('lr', 1e-3))
    l2 = float(params.get('l2', 0.0))
    layer_norm = bool(params.get('layer_norm', True))
    mask_value = float(params.get('mask_value', 0.0))  # sentinel for masking

    # --- Inputs & Mask ---
    inputs = keras.Input(shape=(seq_len, x_dim), name='sequence_input')
    # Replace NaNs with mask_value before masking
    x = layers.Lambda(lambda v: tf.where(tf.math.is_nan(v), tf.fill(tf.shape(v), mask_value), v))(inputs)
    x = layers.Masking(mask_value=mask_value, name='masking')(x)

    # --- LSTM stack ---
    for i, units in enumerate(lstm_units):
        return_seq = i < len(lstm_units) - 1
        x = layers.LSTM(
            units,
            return_sequences=return_seq,
            dropout=dropout,
            recurrent_dropout=rec_dropout,
            kernel_regularizer=keras.regularizers.l2(l2),
            name=f'lstm_{i}'
        )(x)
        if layer_norm:
            x = layers.LayerNormalization(name=f'ln_{i}')(x)

    # --- Dense layers ---
    for i, units in enumerate(dense_units):
        x = layers.Dense(units, activation=act, name=f'dense_{i}')(x)
        if dropout > 0:
            x = layers.Dropout(dropout, name=f'dropout_{i}')(x)

    outputs = layers.Dense(y_dim, name='output')(x)

    # --- Compile ---
    model = keras.Model(inputs, outputs, name='lstm_regressor')
    optimizer = keras.optimizers.Adam(learning_rate=lr, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

    return model
def parse_seq_tfrecord(example_proto, seq_len, x_dim, y_dim):
    """
    Faz o parsing de TFRecords com dados 3D salvos em bytes.
    Espera features:
        'x_raw': sequ√™ncia de entrada (float32 bytes)
        'y_raw': target (float32 bytes)
    """
    feature_description = {
        'x_raw': tf.io.FixedLenFeature([], tf.string),
        'y_raw': tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example_proto, feature_description)

    x = tf.io.decode_raw(parsed['x_raw'], tf.float32)
    y = tf.io.decode_raw(parsed['y_raw'], tf.float32)

    x = tf.reshape(x, [seq_len, x_dim])
    y = tf.reshape(y, [y_dim])

    return x, y


def load_seq_tfrecord_dataset(path_pattern, seq_len, x_dim, y_dim, batch_size=64, compression='GZIP'):
    """
    Carrega TFRecords sequenciais e retorna um tf.data.Dataset pronto para treino.

    Cada exemplo cont√©m:
        X.shape = (seq_len, x_dim)
        Y.shape = (y_dim,)
    """
    files = tf.io.gfile.glob(path_pattern)
    if not files:
        raise FileNotFoundError(f"Nenhum arquivo TFRecord encontrado em {path_pattern}")

    ds = tf.data.TFRecordDataset(files, compression_type=compression)
    ds = ds.map(
        lambda ex: parse_seq_tfrecord(ex, seq_len, x_dim, y_dim),
        num_parallel_calls=tf.data.AUTOTUNE
    )
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    print(f"[DATASET] {len(files)} shards carregados | batch_size={batch_size}")
    return ds


# Cap√≠tulo 3 - Contru√ß√£o da Pipelines de dados dos modelos

Contruir o fluxo de dados, incluindo a o preprocessamento e treinamento dos modelos

## Pipeline dos Modelos Lineares

Ser√° gerada uma pipeline completa para cada n√≠vel de pergunta

Cada fun√ß√£o ir√° processar os dados para cad problema e fazer o treinamento do modelo

Seus outputs ser√£o os modelos treinados, onde os valores ser√£o comparados ao final do modelo

In [27]:
import os
import optuna
import matplotlib.pyplot as plt
from typing import List, Tuple
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


def lstm_pipeline(
    data_dir: str,
    country_list: List[str],
    feature_cols: List[str],
    target_cols: List[str],
    seq_len: int,
    lead: int,
    value_cols: List[str],
    use_optuna: bool = True,
    n_trials: int = 20,
    batch_size: int = 128
) -> Tuple[LSTMPreprocessor, keras.Model]:
    """
    Pipeline completa de pr√©-processamento, tuning e treinamento de um modelo LSTM.
    """

    # ----------------------------
    # Pr√©-processamento
    # ----------------------------
    preproc = LSTMPreprocessor(
        data_dir=data_dir,
        model_name='lstm_model',
        feature_cols=feature_cols,
        target_cols=target_cols,
        country_list=country_list,
        lag=seq_len,
        lead=lead
    )

    preproc.load_data()
    preproc.encode(encode_cols='datetime', encode_method='time_cycle')
    preproc.encode(encode_cols='country', encode_method='label')
    preproc.split_train_val_test(train_size=0.6, val_size=0.2, test_size=0.2, time_col='datetime')
    preproc.normalize_splits(value_cols=value_cols, normalization_method='minmax')

    # Constr√≥i janelas e salva TFRecords para cada split
    for split_name, split_df in preproc.splits.items():
        preproc.df_base = split_df
        preproc.build_sequence_matrix(
            value_cols=value_cols,
            target_cols=target_cols,
            seq_len=seq_len,
            lead=lead,
            group_cols=['country'],
            time_col='datetime'
        )
        preproc.save_sequence_tfrecords(output_basename=f'lstm_dataset_{split_name}', shard_size=1000, compression='GZIP')
    print("‚úÖ Pr√©-processamento sequencial conclu√≠do.")

    # ----------------------------
    # TFRecord datasets
    # ----------------------------
    meta = preproc._seq_data
    x_dim, y_dim = meta['x_dim'], meta['y_dim']

    dataset_train = LSTMPreprocessor.load_sequence_dataset(
        path_pattern=os.path.join(preproc.save_dir, 'lstm_dataset_train*.tfrecord'),
        seq_len=seq_len, x_dim=x_dim, y_dim=y_dim, batch_size=batch_size
    )
    dataset_val = LSTMPreprocessor.load_sequence_dataset(
        path_pattern=os.path.join(preproc.save_dir, 'lstm_dataset_val*.tfrecord'),
        seq_len=seq_len, x_dim=x_dim, y_dim=y_dim, batch_size=batch_size
    )
    print("üì¶ Dataset TFRecord carregado para treinamento.")

    # ----------------------------
    # Optuna hyperparameter search
    # ----------------------------
    if use_optuna:
        def objective(trial):
            lstm_layers = trial.suggest_int("n_lstm_layers", 1, 2)
            lstm_units = [trial.suggest_int(f"lstm_u{i}", 64, 256, step=64) for i in range(lstm_layers)]
            dense_units = [trial.suggest_int("dense_u", 64, 256, step=64)]

            params = {
                "lstm_units": lstm_units,
                "dense_units": dense_units,
                "dropout": trial.suggest_float("dropout", 0.0, 0.3),
                "rec_dropout": trial.suggest_float("rec_dropout", 0.0, 0.2),
                "act": trial.suggest_categorical("act", ["relu", "tanh", "gelu"]),
                "lr": trial.suggest_float("lr", 1e-4, 3e-3, log=True),
                "l2": trial.suggest_float("l2", 1e-7, 1e-4, log=True),
                "layer_norm": trial.suggest_categorical("layer_norm", [True, False])
            }

            model = build_lstm_model(seq_len=seq_len, x_dim=x_dim, y_dim=y_dim, params=params)
            es = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=0)

            hist = model.fit(dataset_train, validation_data=dataset_val,
                             epochs=60, callbacks=[es], verbose=0)
            return min(hist.history['val_loss'])

        study = optuna.create_study(direction="minimize")
        study.optimize(objective, n_trials=n_trials, n_jobs=-1)
        print("üéØ Melhores hiperpar√¢metros Optuna:", study.best_params)

        best_params = study.best_params
        lstm_layers = best_params.pop("n_lstm_layers")
        best_params["lstm_units"] = [best_params.pop(f"lstm_u{i}") for i in range(lstm_layers)]
        best_params["dense_units"] = [best_params.pop("dense_u")]
    else:
        best_params = {
            "lstm_units": [128, 64],
            "dense_units": [128],
            "dropout": 0.1,
            "rec_dropout": 0.0,
            "act": "relu",
            "lr": 1e-3,
            "l2": 1e-6,
            "layer_norm": True
        }

    # ----------------------------
    # Constru√ß√£o e Treinamento Final
    # ----------------------------
    model = build_lstm_model(seq_len=seq_len, x_dim=x_dim, y_dim=y_dim, params=best_params)

    early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=0)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=6, min_lr=1e-6, verbose=0)

    hist = model.fit(
        dataset_train,
        validation_data=dataset_val,
        epochs=100,
        callbacks=[early_stopping, reduce_lr]
    )

    # ----------------------------
    # Plot hist√≥rico
    # ----------------------------
    plt.figure(figsize=(10, 4))
    plt.plot(hist.history['loss'], label='train_loss')
    plt.plot(hist.history['val_loss'], label='val_loss')
    plt.title("Treinamento LSTM")
    plt.xlabel("√âpoca")
    plt.ylabel("MSE")
    plt.legend()
    plt.show()

    return preproc, model


In [None]:
import os, gc, tensorflow as tf

lstm_pipelines = [
    dict(name="Nivel 1 - A", data_dir="data/N1A", feats=["country","datetime","quantity_MW"], tgts=["quantity_MW"], seq_len=7*96, lead=96, vals=["quantity_MW"], countries=["ES"]),
    dict(name="Nivel 1 - B", data_dir="data/N1B", feats=["country","datetime","quantity_MW"], tgts=["quantity_MW"], seq_len=15*96, lead=3*96, vals=["quantity_MW"], countries=COUNTRY_DOMAINS.keys()),
    dict(name="Nivel 1 - C", data_dir="data/N1C", feats=["country","datetime","quantity_MW"], tgts=["quantity_MW"], seq_len=30*96, lead=7*96, vals=["quantity_MW"], countries=COUNTRY_DOMAINS.keys()),
    dict(name="Nivel 2 - A", data_dir="data/N2A", feats=["country","datetime","quantity_MW","price_EUR_MWh"], tgts=["quantity_MW","price_EUR_MWh"], seq_len=7*96, lead=96, vals=["quantity_MW","price_EUR_MWh"], countries=["ES"]),
    dict(name="Nivel 2 - B", data_dir="data/N2B", feats=["country","datetime","quantity_MW","price_EUR_MWh"], tgts=["quantity_MW","price_EUR_MWh"], seq_len=15*96, lead=3*96, vals=["quantity_MW","price_EUR_MWh"], countries=COUNTRY_DOMAINS.keys()),
    dict(name="Nivel 2 - C", data_dir="data/N2C", feats=["country","datetime","quantity_MW","price_EUR_MWh"], tgts=["quantity_MW","price_EUR_MWh"], seq_len=30*96, lead=7*96, vals=["quantity_MW","price_EUR_MWh"], countries=COUNTRY_DOMAINS.keys())
]

for cfg in lstm_pipelines:
    name = cfg["name"]
    print(f"\nüöÄ Running LSTM pipeline {name} ...")

    preproc, _ = lstm_pipeline(
        data_dir=cfg["data_dir"],
        feature_cols=cfg["feats"],
        target_cols=cfg["tgts"],
        seq_len=cfg["seq_len"],
        lead=cfg["lead"],
        value_cols=cfg["vals"],
        country_list=cfg["countries"],
        batch_size=256,
        use_optuna=False  # se quiser tuning, mude para True
    )

    del preproc
    tf.keras.backend.clear_session()
    gc.collect()

    print(f"‚úÖ Finished {name} - memory cleared\n{'-'*60}")



üöÄ Running LSTM pipeline Nivel 1 - A ...
[DIVIDIDO] train: 10,357 linhas
[DIVIDIDO] val: 3,452 linhas
[DIVIDIDO] test: 3,454 linhas
[JANELAS] X=(9590, 672, 1), Y=(9590, 1)
[‚úÖ] TFRecords salvos (10 shards) em data/N1A\lstm_model
[JANELAS] X=(2685, 672, 1), Y=(2685, 1)
[‚úÖ] TFRecords salvos (3 shards) em data/N1A\lstm_model
[JANELAS] X=(2687, 672, 1), Y=(2687, 1)
[‚úÖ] TFRecords salvos (3 shards) em data/N1A\lstm_model
‚úÖ Pr√©-processamento sequencial conclu√≠do.
[DATASET] 10 shards carregados ‚Üí batch_size=256
[DATASET] 3 shards carregados ‚Üí batch_size=256
üì¶ Dataset TFRecord carregado para treinamento.

Epoch 1/100
     38/Unknown [1m154s[0m 4s/step - loss: 0.2737 - mae: 0.3703



[1m38/38[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m163s[0m 4s/step - loss: 0.1130 - mae: 0.2222 - val_loss: 0.1601 - val_mae: 0.3492 - learning_rate: 0.0010
Epoch 2/100
[1m12/38[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [1m1:41[0m 4s/step - loss: 0.0308 - mae: 0.1391