# Introdução

Este notebook é responsável pelo pré-processamento dos dados contidos em ./data/raw, convertendo-os para formatos compatíveis e otimizados para o treinamento de cada modelo.
As definições dos problemas que os modelos deverão solucionar já foram apresentadas no notebook "Coleta de Dados".

Modelos a serem Criados:

1. Modelo Linear: MLP sem funções de ativação, composto apenas de somas lineares
2. MLP: rede neural efetivamente idêntica ao modelo linear, mas com função de ativação ao final do somatório de funções lineares
3. LSTM: modelo de rede neural recorrente, capaz de diferenciar informação de curto e de longo prazo
4. TFT (Temporal Fusion Transformer): modelo baseado em mecanismos de atenção (Transformer), desenvolvido pelo Google - servirá como um comparativo mais moderno


## Dependências

In [None]:
# Minimal dependencies for the TFT pipeline; kept simple, GPU-first.
# torch is only installed when it cannot already be imported.
try:
    import torch
except Exception:
    import shutil
    # Presence of the nvidia-smi executable is used as the GPU indicator.
    has_gpu = shutil.which("nvidia-smi") is not None
    if has_gpu:
        # Try the CUDA build (adjust the cu tag to match your stack, e.g. cu121).
        %pip install -q torch --index-url https://download.pytorch.org/whl/cu121
    else:
        %pip install -q torch --index-url https://download.pytorch.org/whl/cpu
    import torch

# Libraries used by the TFT pipeline (installed unconditionally on every run).
%pip install -q pytorch-lightning pytorch-forecasting


print(f"torch={torch.__version__} | cuda={torch.cuda.is_available()}")

print("Verificando depend√™ncias (pyarrow para Parquet)...")

# pyarrow is the Parquet engine; install it only when the import fails.
try:
    import pyarrow as pa
    print(f"PyArrow dispon√≠vel: {pa.__version__}")
except Exception:
    print("Instalando pyarrow...")
    !pip install --upgrade "pyarrow>=18" --quiet
    import importlib
    # Refresh the import system's caches so the freshly installed package is found.
    importlib.invalidate_caches()
    import pyarrow as pa
    print(f"PyArrow instalado: {pa.__version__}")

# fastparquet is optional: report it when present, stay silent otherwise.
try:
    import fastparquet  # noqa: F401
    print("fastparquet dispon√≠vel (opcional)")
except Exception:
    pass

# Outras bibliotecas sob demanda
for lib in [
    "numpy", "python-dotenv", "pandas", "matplotlib", "seaborn",
    "scikit-learn", "tensorflow[and-cuda]", "keras", "lxml", "pytz"
]:
    try:
        __import__(lib)
    except ImportError:
        print(f"Instalando {lib}...")
        !pip install {lib} --quiet

print("Depend√™ncias prontas")

## VARIÁVEIS NECESSÁRIAS

In [None]:
# Imports para a API e utilidades
import os
from dotenv import load_dotenv
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import mixed_precision
import tensorflow as tf
import pandas as pd

# Silence warnings for the whole notebook session.
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=RuntimeWarning)
# Warning is the base class of every warning category, so this filter alone
# already covers the more specific ones listed around it.
warnings.filterwarnings("ignore", category=Warning)  # last resort
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)


# ==============================================
# GPU CONFIGURATION
# ==============================================
# When at least one GPU is visible: enable on-demand memory growth on each
# device and switch the global Keras policy to mixed_float16.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
    mixed_precision.set_global_policy('mixed_float16')
    print(f"‚úÖ GPU detected ({gpus[0].name}) - using mixed precision.")
else:
    print("‚ö†Ô∏è No GPU detected, running on CPU.")

# Load environment variables from the .env file.
load_dotenv()
# ---------------- CONFIG ---------------- #
# Country code -> domain identifier.
# NOTE(review): these look like ENTSO-E EIC bidding-zone codes - confirm.
COUNTRY_DOMAINS = {
    "FR": {"domain": "10YFR-RTE------C"},
    "ES": {"domain": "10YES-REE------0"},
    "PT": {"domain": "10YPT-REN------W"}
}

# One entry per dataset: request parameters plus the name of the parser to use.
# NOTE(review): not referenced in this part of the notebook; presumably shared
# with the data-collection notebook - confirm.
DATA_ITEMS = [
    {'key': 'load_total', 'documentType': 'A65', 'processType': 'A16', 'domainParam': 'outBiddingZone_Domain', 'parser': 'load'},
    {'key': 'market_prices', 'documentType': 'A44', 'processType': 'A07', 'domainParamIn': 'in_Domain', 'domainParamOut': 'out_Domain', 'parser': 'price'}
]

# API token read from the environment; None when the variable is not set.
ENTSOE_TOKEN = os.environ.get("ENTSOE_SECURITY_TOKEN")
BASE_URL = "https://web-api.tp.entsoe.eu/api"
MAX_WORKERS = 100
# Raw data directory; created on import of this cell if it does not exist.
RAW_DIR = os.path.join("data", "raw")
# Default Parquet compression codec, picked up by Preprocessor.save_df_base.
PARQUET_COMPRESSION = "zstd"
os.makedirs(RAW_DIR, exist_ok=True)

# Capítulo 1: Pré-processamento de dados

Etapa de construção das pipelines de pré-processamento de dados


## Classe geral de preprocessamento

In [None]:
import os
from typing import Optional, List, Tuple
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
import numpy as np
import time

class Preprocessor:
    """Base preprocessor shared by the model-specific pipelines.

    The instance keeps a working frame in ``self.df_base``; every step
    (load, encode, normalize, split, save) reads that frame and replaces it
    with its result. Splits produced by ``split_train_val_test`` live in
    ``self.splits``.

    - ``lag`` / ``lead`` are stored as given; ``_expand_steps`` turns an
      integer N into the step list [1..N].
    - ``feature_cols`` / ``target_cols`` name the columns kept by
      ``load_data`` and are extended by encoders that add derived columns.
    """

    # Cyclical columns added by the ``time_cycle`` encoding.
    _TIME_CYCLE_COLS = (
        "year_sin", "year_cos",
        "month_sin", "month_cos",
        "day_sin", "day_cos",
        "hour_sin", "hour_cos",
        "minute_sin", "minute_cos",
    )

    def __init__(
        self,
        lag: int,
        lead: int,
        country_list: Optional[List[str]] = None,
        *,
        model_name: str = "linear",
        data_dir: str = "data/processed",
        feature_cols: Optional[List[str]] = None,
        target_cols: Optional[List[str]] = None,
    ):
        """Store the configuration and create the output directory.

        Args:
            lag: Maximum number of past steps.
            lead: Maximum number of future steps.
            country_list: Countries kept by ``load_data`` (None keeps all).
            model_name: Label of the model this preprocessor feeds.
            data_dir: Directory where outputs are written (created if missing).
            feature_cols: Feature column names (copied).
            target_cols: Target column names (copied).
        """
        self.lag = lag
        self.lead = lead
        self.country_list = country_list
        self.model_name = model_name
        self.data_dir = data_dir
        self.save_dir = self.data_dir
        os.makedirs(self.save_dir, exist_ok=True)

        self.feature_cols: List[str] = list(feature_cols) if feature_cols else []
        self.target_cols: List[str] = list(target_cols) if target_cols else []

        self.norm_objects = {}
        self.encod_objects = {}
        self.df_base = pd.DataFrame()
        # Always defined so the *_splits methods can test it before any split
        # has been generated instead of failing with AttributeError.
        self.splits: dict = {}

    def _expand_steps(self, steps, default_max: Optional[int]) -> List[int]:
        """Normalize steps: int -> [1..N], None -> [1..default_max], list -> as is.

        Falls back to ``[1]`` for a non-positive int or any unsupported input.
        """
        if isinstance(steps, int):
            return list(range(1, steps + 1)) if steps > 0 else [1]
        if steps is None and isinstance(default_max, int) and default_max > 0:
            return list(range(1, default_max + 1))
        if isinstance(steps, (list, tuple)):
            return list(steps)
        return [1]

    @staticmethod
    def _make_scaler(normalization_method: str):
        """Return a fresh scaler for ``'minmax'`` or ``'standard'``.

        Raises:
            ValueError: For any other method name.
        """
        if normalization_method == 'minmax':
            return MinMaxScaler()
        if normalization_method == 'standard':
            return StandardScaler()
        raise ValueError("normalization_method deve ser 'minmax' ou 'standard'")

    def load_data(self, raw_dir: Optional[str] = None) -> pd.DataFrame:
        """Load the unified Parquet file and store it in ``self.df_base``.

        Reads ``raw_dataset.parquet`` from ``raw_dir`` (default ``data/raw``),
        parses ``datetime`` as UTC, filters by ``country_list``, sorts by
        (country, datetime) and keeps only the configured feature/target
        columns, in the order they were configured.

        Raises:
            FileNotFoundError: If the unified Parquet file does not exist.
        """
        base_raw = raw_dir or os.path.join('data', 'raw')
        unified_path = os.path.join(base_raw, 'raw_dataset.parquet')
        if not os.path.exists(unified_path):
            raise FileNotFoundError(f"Arquivo unificado n√£o encontrado: {unified_path}. Execute a coleta primeiro.")
        df = pd.read_parquet(unified_path, engine='pyarrow')
        if 'datetime' in df.columns:
            df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
        if self.country_list and 'country' in df.columns:
            df = df[df['country'].isin(self.country_list)].copy()
        sort_cols = [c for c in ['country', 'datetime'] if c in df.columns]
        if sort_cols:
            df = df.sort_values(sort_cols).reset_index(drop=True)

        # Drop duplicated column labels first so the selection below is unambiguous.
        df = df.loc[:, ~df.columns.duplicated()]
        # dict.fromkeys de-duplicates while keeping the configured order; a set
        # made the resulting column order vary from run to run.
        wanted = dict.fromkeys(self.feature_cols + self.target_cols)
        cols = [c for c in wanted if c in df.columns]
        if cols:
            df = df[cols]
        else:
            # Selecting nothing would leave a zero-column frame that every later
            # step rejects as empty, so keep the full frame instead.
            print("[WARN] No configured feature/target column found in the raw data; keeping all columns.")

        self.df_base = df
        return self.df_base

    def encode(self, encode_cols: str = 'datetime', encode_method: str = 'label') -> pd.DataFrame:
        """Encode a column and update ``self.df_base``.

        - ``label``: replaces the column with ``LabelEncoder`` codes and keeps
          the fitted encoder for ``decode``.
        - ``time_cycle``: keeps the source column and adds calendar components
          (year, month, day, hour, minute) plus their sin/cos encodings; the
          sin/cos names are appended to ``feature_cols``.

        An unsupported method only prints a message.
        """
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Chame load_data() primeiro.")
            return self.df_base
        df = self.df_base.copy()
        if encode_method == 'label':
            le = LabelEncoder()
            s = df[encode_cols].astype(object)
            le.fit(s)
            df[encode_cols] = le.transform(s)
            # Keep the fitted encoder so decode() can invert the codes.
            self.encod_objects['label'] = {
                'encode_cols': encode_cols,
                'label_encoder': le,
            }
        elif encode_method == 'time_cycle':
            if encode_cols not in df.columns:
                print(f"Coluna {encode_cols} n√£o encontrada para time_cycle.")
                self.df_base = df
                return df
            dt = pd.to_datetime(df[encode_cols], utc=True)
            # Discrete calendar components (the source column is kept).
            df['year'] = dt.dt.year
            df['month'] = dt.dt.month
            df['day'] = dt.dt.day
            df['hour'] = dt.dt.hour
            df['minute'] = dt.dt.minute
            # The year has no natural period; the current year is used as one.
            current_year = time.localtime().tm_year
            df['year_sin'] = np.sin(2 * np.pi * df['year'] / max(current_year, 1))
            df['year_cos'] = np.cos(2 * np.pi * df['year'] / max(current_year, 1))
            df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
            df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
            df['day_sin'] = np.sin(2 * np.pi * df['day'] / 31)
            df['day_cos'] = np.cos(2 * np.pi * df['day'] / 31)
            df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
            df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
            df['minute_sin'] = np.sin(2 * np.pi * df['minute'] / 60)
            df['minute_cos'] = np.cos(2 * np.pi * df['minute'] / 60)
            self.encod_objects['time_cycle'] = {'encode_cols': encode_cols}
            # Register each derived feature once, even if encode() is re-run.
            for name in self._TIME_CYCLE_COLS:
                if name not in self.feature_cols:
                    self.feature_cols.append(name)
        else:
            print(f"encode_method '{encode_method}' n√£o suportado.")
        self.df_base = df
        return self.df_base

    def decode(self, encode_method: str = 'label', target_col: Optional[str] = None) -> pd.DataFrame:
        """Revert a supported encoding (``label`` or ``time_cycle``).

        For ``time_cycle`` the timestamp is rebuilt from the ``year`` column
        and the sin/cos pairs and written to ``target_col`` (default
        ``'decoded_datetime'``).
        """
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Nada para decodificar.")
            return self.df_base
        df = self.df_base.copy()
        if encode_method == 'label':
            info = self.encod_objects.get('label')
            if not info:
                print("Nenhuma informa√ß√£o de label encoding salva.")
                return self.df_base
            col = info['encode_cols']
            le: LabelEncoder = info['label_encoder']
            # encode() stores no placeholder, so the default is used here.
            placeholder = info.get('na_placeholder', '__NA__')
            try:
                inv = le.inverse_transform(df[col].astype(int))
                # Map the placeholder, if present, back to NaN.
                inv = pd.Series(inv).replace(placeholder, np.nan).values
                df[col] = inv
            except Exception as e:
                print(f"Falha ao decodificar label para coluna {col}: {e}")
        elif encode_method == 'time_cycle':
            if 'year' not in df.columns:
                print("Componentes de tempo ausentes para reconstru√ß√£o.")
                return self.df_base
            tgt = target_col or 'decoded_datetime'

            def _recover_component(sin_col, cos_col, period, one_based):
                """Invert one sin/cos pair back to its integer component."""
                if sin_col not in df.columns or cos_col not in df.columns:
                    return pd.Series([np.nan] * len(df), index=df.index)
                ang = np.arctan2(df[sin_col], df[cos_col])
                ang = (ang + 2 * np.pi) % (2 * np.pi)
                idx = np.round((ang / (2 * np.pi)) * period).astype('Int64') % period
                if one_based:
                    # encode() used the raw 1..period value, so the largest value
                    # wraps to angle 0: map 0 back to `period`, leave 1..period-1
                    # unchanged (adding 1 shifted every month/day by one).
                    idx = (idx - 1) % period + 1
                return idx

            month = _recover_component('month_sin', 'month_cos', 12, True)
            day = _recover_component('day_sin', 'day_cos', 31, True)
            hour = _recover_component('hour_sin', 'hour_cos', 24, False)
            minute = _recover_component('minute_sin', 'minute_cos', 60, False)
            year = df['year']
            dt = pd.to_datetime({
                'year': year.astype('Int64'),
                'month': month.astype('Int64'),
                'day': day.astype('Int64'),
                'hour': hour.astype('Int64'),
                'minute': minute.astype('Int64'),
            }, errors='coerce', utc=True)
            df[tgt] = dt
        else:
            print(f"encode_method '{encode_method}' n√£o suportado para decode.")
        self.df_base = df
        return self.df_base

    def normalize(self, value_cols: List[str], normalization_method: str = 'minmax') -> pd.DataFrame:
        """Fit a scaler on ``df_base[value_cols]``, apply it and update ``df_base``.

        The fitted scaler is stored in ``norm_objects[normalization_method]``.

        Raises:
            ValueError: If the method is neither ``'minmax'`` nor ``'standard'``.
        """
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Chame load_data() primeiro.")
            return self.df_base
        df = self.df_base.copy()
        scaler = self._make_scaler(normalization_method)
        df[value_cols] = scaler.fit_transform(df[value_cols])
        self.norm_objects[normalization_method] = {'value_cols': value_cols, 'scaler': scaler}
        self.df_base = df
        return self.df_base

    def normalize_splits(self, value_cols: List[str], normalization_method: str = 'minmax') -> dict:
        """Normalize the train/val/test splits with a single shared scaler.

        The scaler is fitted on the ``train`` split (or, when that one is
        missing or empty, on the first non-empty split) and then applied to
        every split. Fitting one scaler per split put the splits on different
        scales and kept only the last scaler for ``denormalize``.

        Returns:
            The normalized splits, also stored in ``self.splits``; ``{}`` when
            no splits exist.

        Raises:
            ValueError: If the method is neither ``'minmax'`` nor ``'standard'``.
        """
        if not self.splits:
            print("Nenhum conjunto dividido encontrado.")
            return {}
        scaler = self._make_scaler(normalization_method)
        fit_name = next(
            (name for name in ('train', *self.splits)
             if name in self.splits and not self.splits[name].empty),
            None,
        )
        if fit_name is None:
            # Every split is empty: nothing to fit or transform.
            return dict(self.splits)
        scaler.fit(self.splits[fit_name][value_cols])

        normalized_splits = {}
        for split_name, split_df in self.splits.items():
            scaled = split_df.copy()
            if not scaled.empty:
                scaled[value_cols] = scaler.transform(scaled[value_cols])
            normalized_splits[split_name] = scaled
            # As before, df_base ends up pointing at the last processed split.
            self.df_base = scaled
        self.norm_objects[normalization_method] = {'value_cols': value_cols, 'scaler': scaler}
        self.splits = normalized_splits
        return normalized_splits

    def denormalize(self, normalization_method: str = 'minmax') -> pd.DataFrame:
        """Revert a normalization using the scaler saved for that method."""
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Nada para denormalizar.")
            return self.df_base
        info = self.norm_objects.get(normalization_method)
        if not info:
            print(f"Nenhum scaler salvo para o m√©todo '{normalization_method}'.")
            return self.df_base
        cols: List[str] = info['value_cols']
        scaler = info['scaler']
        df = self.df_base.copy()
        try:
            df[cols] = scaler.inverse_transform(df[cols])
        except Exception as e:
            print(f"Falha ao denormalizar colunas {cols}: {e}")
            return self.df_base
        self.df_base = df
        return self.df_base

    def save_df_base(self, filename: Optional[str] = None, compression: Optional[str] = None, partition_by: Optional[List[str]] = None) -> Optional[str]:
        """Save ``self.df_base`` as Parquet inside ``self.save_dir``.

        Args:
            filename: Output file name; defaults to ``raw_dataset.parquet``.
            compression: Codec; defaults to the notebook-level
                ``PARQUET_COMPRESSION`` when defined, otherwise ``'zstd'``.
            partition_by: Optional partition columns.

        Returns:
            The output path, or None when there is nothing to save or the
            write fails.
        """
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Nada para salvar.")
            return None
        comp = compression
        if comp is None:
            try:
                comp = PARQUET_COMPRESSION
            except NameError:
                comp = 'zstd'
        # Honor the caller's file name; it used to be overwritten unconditionally.
        filename = filename or "raw_dataset.parquet"
        out_path = os.path.join(self.save_dir, filename)
        df = self.df_base.copy()
        if 'datetime' in df.columns:
            df['datetime'] = pd.to_datetime(df['datetime'], utc=True)
        try:
            if partition_by:
                df.to_parquet(out_path, engine='pyarrow', compression=comp, index=False, partition_cols=partition_by)
            else:
                df.to_parquet(out_path, engine='pyarrow', compression=comp, index=False)
            print(f"[SALVO] df_base: {len(df):,} linhas ‚Üí {out_path}")
            return out_path
        except Exception as e:
            print(f"Falha ao salvar df_base em {out_path}: {e}")
            return None

    def split_train_val_test(self, train_size: float = 0.7, val_size: float = 0.15, test_size: float = 0.15, time_col: str = 'datetime') -> Optional[dict]:
        """Split ``df_base`` chronologically into train / val / test.

        Rows are sorted by ``time_col`` and cut by position, so train holds
        the earliest rows and test the latest. The result is stored in
        ``self.splits``.

        Returns:
            Dict with ``'train'``, ``'val'`` and ``'test'`` frames, or None
            when ``df_base`` is empty, the sizes do not sum to 1.0 or
            ``time_col`` is missing.
        """
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Nada para dividir.")
            return None
        if not np.isclose(train_size + val_size + test_size, 1.0):
            print("train_size, val_size e test_size devem somar 1.0")
            return None
        df = self.df_base.copy()
        if time_col not in df.columns:
            print(f"Coluna de tempo '{time_col}' n√£o encontrada em df_base.")
            return None
        df = df.sort_values(time_col).reset_index(drop=True)
        n = len(df)
        train_end = int(n * train_size)
        val_end = train_end + int(n * val_size)
        splits = {
            'train': df.iloc[:train_end].reset_index(drop=True),
            'val': df.iloc[train_end:val_end].reset_index(drop=True),
            # The test split absorbs the rounding remainder.
            'test': df.iloc[val_end:].reset_index(drop=True),
        }
        for split_name, split_df in splits.items():
            print(f"[DIVIDIDO] {split_name}: {len(split_df):,} linhas")
        self.splits = splits
        return splits

## Preprocessamento do Modelo Linear

Esse modelo será construído a partir de lags e leads passados como parâmetros, resultando na construção de novas colunas de lag e lead e gerando assim uma matriz flat 2D que será usada no modelo linear.

Observação importante: lag e lead são inteiros e representam o máximo de passos; o pipeline expande para intervalos 1..N automaticamente. Por exemplo, lag=96 gera features com defasagens de 1 a 96; lead=96 gera alvos de 1 a 96.

Os arquivos do modelo serão salvos em TFRecords, já que o modelo linear será construído usando TensorFlow.

O pré-processador do modelo linear será o mesmo utilizado para o MLP.

In [None]:
class LinearPreprocessor(Preprocessor):
    """Linear preprocessor: builds a flat lag/lead matrix and exports Parquet + TFRecords."""

    def build_flat_matrix(
        self,
        value_cols: Optional[List[str]] = None,
        target_cols: Optional[List[str]] = None,
        lags: Optional[int] = None,
        leads: Optional[int] = None,
        dropna: bool = True,
        group_cols: Optional[List[str]] = None,
        time_col: str = "datetime",
    ) -> pd.DataFrame:
        """Add ``<col>_lag<k>`` and ``<target>_lead<k>`` columns to ``df_base``.

        Shifts are computed per group (``group_cols``, default ``country`` when
        present) after sorting by group and ``time_col``. The new lag names are
        appended to ``feature_cols`` and the new lead names to ``target_cols``.

        Args:
            value_cols: Columns to lag; defaults to ``self.feature_cols``.
            target_cols: Columns to lead; defaults to ``self.target_cols``.
            lags: Maximum lag (steps 1..lags); defaults to ``self.lag``.
            leads: Maximum lead (steps 1..leads); defaults to ``self.lead``.
            dropna: Drop rows with a missing value in any generated column.
            group_cols: Columns identifying independent series.
            time_col: Column used for chronological ordering.

        Returns:
            The updated ``df_base``.

        Raises:
            ValueError: If no feature or target is given, or ``time_col`` is missing.
        """
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Chame load_data() primeiro.")
            return self.df_base

        df = self.df_base.copy()
        feats = value_cols or self.feature_cols
        tgts = target_cols or self.target_cols
        if not feats:
            raise ValueError("Nenhuma coluna de feature informada.")
        if not tgts:
            raise ValueError("Nenhum target informado.")

        group_cols = group_cols or [c for c in ["country"] if c in df.columns]

        if time_col not in df.columns:
            raise ValueError(f"Coluna temporal '{time_col}' n√£o encontrada no DataFrame.")

        sort_cols = (group_cols or []) + [time_col]
        df = df.sort_values(sort_cols).reset_index(drop=True)

        # Group key kept as a separate Series so no helper column is added to df.
        if group_cols:
            group_id = df[group_cols].astype(str).agg("_".join, axis=1)
        else:
            group_id = pd.Series("global", index=df.index)
        grouped = df.groupby(group_id, group_keys=False, sort=False)

        lag_steps = range(1, (lags or self.lag or 0) + 1)
        lead_steps = range(1, (leads or self.lead or 0) + 1)

        # Only columns present BEFORE any shift are valid sources. Checking the
        # growing frame let a column generated in this call (e.g. "x_lag1") be
        # lagged again when its name was already listed in feature_cols.
        source_cols = set(df.columns)
        derived = {}

        # ---- Lags ----
        for col in feats:
            if col not in source_cols:
                print(f"[WARN] Coluna de feature '{col}' n√£o encontrada.")
                continue
            for k in lag_steps:
                derived[f"{col}_lag{k}"] = grouped[col].shift(k)

        # ---- Leads ----
        for tgt in tgts:
            if tgt not in source_cols:
                print(f"[WARN] Target '{tgt}' n√£o encontrado. Ignorando leads.")
                continue
            for k in lead_steps:
                derived[f"{tgt}_lead{k}"] = grouped[tgt].shift(-k)

        new_cols = list(derived)
        if derived:
            # Replace same-named columns left by an earlier call, then attach all
            # new columns in one concat instead of fragmenting the frame with
            # one insertion per column.
            df = df.drop(columns=[c for c in new_cols if c in source_cols])
            df = pd.concat([df, pd.DataFrame(derived, index=df.index)], axis=1)

        if dropna and new_cols:
            df = df.dropna(subset=new_cols).reset_index(drop=True)

        self.df_base = df
        self.feature_cols.extend([c for c in new_cols if "_lag" in c and c not in self.feature_cols])
        self.target_cols.extend([c for c in new_cols if "_lead" in c and c not in self.target_cols])
        return self.df_base

    def build_flat_matrices_splits(self, *args, **kwargs) -> Optional[dict]:
        """Build the flat matrix for every split (train/val/test).

        Each split is built from the same base feature/target lists.
        ``build_flat_matrix`` extends those lists with the generated names, so
        they are reset before each split; otherwise later splits would try to
        lag the previous split's lag columns.

        Returns:
            Dict of built frames (also stored in ``self.splits``), or None when
            no splits exist.
        """
        splits = getattr(self, "splits", None)
        if not splits:
            print("Nenhum conjunto dividido encontrado.")
            return None

        base_feats = list(self.feature_cols)
        base_tgts = list(self.target_cols)
        all_feats = dict.fromkeys(base_feats)
        all_tgts = dict.fromkeys(base_tgts)

        built_splits = {}
        for split_name, split_df in splits.items():
            self.df_base = split_df
            self.feature_cols = list(base_feats)
            self.target_cols = list(base_tgts)
            built_splits[split_name] = self.build_flat_matrix(*args, **kwargs)
            # Ordered union, so an empty split does not erase the derived names.
            all_feats.update(dict.fromkeys(self.feature_cols))
            all_tgts.update(dict.fromkeys(self.target_cols))

        self.feature_cols = list(all_feats)
        self.target_cols = list(all_tgts)
        self.splits = built_splits
        return built_splits

    def save_tfrecords(
        self,
        output_basename: str = 'dataset',
        shard_size: int = 100_000,
        compression: Optional[str] = None,
    ) -> Optional[List[str]]:
        """Export ``df_base`` as an intermediate Parquet file plus sharded TFRecords.

        Each record stores two float lists, ``x`` (features) and ``y``
        (targets). Only numeric/bool columns are exported, and
        ``feature_cols`` / ``target_cols`` are narrowed to those columns. A
        ``<basename>.meta.json`` file with dimensions and column names is
        written next to the shards.

        Args:
            output_basename: Prefix of every output file.
            shard_size: Maximum number of rows per TFRecord shard.
            compression: TFRecord compression type; defaults to ``'GZIP'``.

        Returns:
            The shard paths, or None when there is nothing valid to export.
        """
        import json
        import tensorflow as tf

        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Nada para salvar em TFRecords.")
            return None

        # --- Intermediate Parquet (best effort: a failure only warns) ---
        parquet_path = os.path.join(self.save_dir, f"{output_basename}.parquet")
        try:
            self.df_base.to_parquet(parquet_path, index=False)
            print(f"[üíæ] Parquet salvo em: {parquet_path}")
        except Exception as e:
            print(f"[WARN] Falha ao salvar Parquet: {e}")

        # --- Numeric column selection ---
        numeric_cols = self.df_base.select_dtypes(include=["number", "bool"]).columns
        present_feats = [c for c in self.feature_cols if c in numeric_cols]
        present_tgts = [c for c in self.target_cols if c in numeric_cols]
        if not present_feats or not present_tgts:
            print("Nenhuma feature/target v√°lida encontrada. Abortando export.")
            return None

        self.feature_cols = present_feats
        self.target_cols = present_tgts

        df = self.df_base.reset_index(drop=True)
        X = df[present_feats].astype('float32').to_numpy(copy=False)
        y = df[present_tgts].astype('float32').to_numpy(copy=False)
        n = len(df)
        x_dim, y_dim = X.shape[1], y.shape[1]

        comp = compression or 'GZIP'
        options = tf.io.TFRecordOptions(compression_type=comp)
        paths: List[str] = []

        def _float_feature(v):
            return tf.train.Feature(float_list=tf.train.FloatList(value=v))

        for shard_idx, start in enumerate(range(0, n, shard_size)):
            end = min(start + shard_size, n)
            shard_path = os.path.join(self.save_dir, f"{output_basename}_{shard_idx}.tfrecord")
            with tf.io.TFRecordWriter(shard_path, options=options) as w:
                for i in range(start, end):
                    ex = tf.train.Example(features=tf.train.Features(feature={
                        'x': _float_feature(X[i]),
                        'y': _float_feature(y[i]),
                    }))
                    w.write(ex.SerializeToString())
            paths.append(shard_path)

        # --- Metadata ---
        meta = {
            'x_dim': int(x_dim),
            'y_dim': int(y_dim),
            'feature_cols': present_feats,
            'target_cols': present_tgts,
            'count': int(n),
            'compression': comp,
            'basename': output_basename,
            'parquet_path': parquet_path,
        }
        try:
            with open(os.path.join(self.save_dir, f"{output_basename}.meta.json"), 'w', encoding='utf-8') as f:
                json.dump(meta, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"[WARN] Falha ao salvar metadados: {e}")

        print(f"[‚úÖ] TFRecords salvos ({len(paths)} shards) + Parquet em {self.save_dir}")
        return paths

    def save_splits_tfrecords(
        self,
        output_basename: str = 'dataset',
        shard_size: int = 100_000,
        compression: Optional[str] = None,
    ) -> Optional[dict]:
        """Save TFRecords and Parquet for each split (train/val/test).

        Files are named ``<output_basename>_<split>``. ``df_base`` is left
        pointing at the last split processed.

        Returns:
            Dict mapping split name to its shard paths, or None when no splits
            exist.
        """
        splits = getattr(self, "splits", None)
        if not splits:
            print("Nenhum conjunto dividido encontrado.")
            return None
        paths_dict = {}
        for split_name, split_df in splits.items():
            self.df_base = split_df
            paths_dict[split_name] = self.save_tfrecords(
                output_basename=f"{output_basename}_{split_name}",
                shard_size=shard_size,
                compression=compression,
            )
        return paths_dict


## Preprocessamento do Modelo LSTM

O pré-processador do LSTM deve ser capaz de gerar tensores de entrada de dimensão 3, no formato (n_batch, seq_len, features), e alvos no formato (n_batch, targets), para alimentação do modelo e validação das métricas do modelo.

Os arquivos do modelo serão salvos em TFRecords, já que o modelo LSTM será construído usando TensorFlow.

In [None]:
import os, json, re
import numpy as np
import pandas as pd
import tensorflow as tf
from typing import List, Optional, Dict, Any

class LSTMPreprocessor(Preprocessor):
    """Sequential preprocessor for LSTM: builds 3D windows (N, seq_len, features)."""

    def build_sequence_matrix(
        self,
        value_cols: Optional[List[str]] = None,
        target_cols: Optional[List[str]] = None,
        seq_len: Optional[int] = None,
        lead: Optional[int] = None,
        group_cols: Optional[List[str]] = None,
        time_col: str = "datetime",
        drop_last_incomplete: bool = True,
    ) -> Dict[str, np.ndarray]:
        """Build the X (inputs) and Y (targets) arrays for the LSTM model.

        Each group (e.g. a country) is windowed separately and the windows are
        concatenated in group order. Window ``i`` holds rows ``[i, i+seq_len)``
        of the features; its target is the target row at
        ``i + seq_len + lead - 1``. Groups shorter than ``seq_len + lead`` rows
        are skipped.

        Args:
            value_cols: Feature columns; defaults to ``self.feature_cols``.
            target_cols: Target columns; defaults to ``self.target_cols``.
            seq_len: Window length; defaults to ``self.seq_len`` or 24.
            lead: Forecast horizon in steps; defaults to ``self.lead`` or 1.
            group_cols: Columns identifying independent series.
            time_col: Column used for chronological ordering.
            drop_last_incomplete: Accepted for interface compatibility; not
                used (only complete windows are ever generated).

        Returns:
            Dict with ``X`` (N, seq_len, x_dim), ``Y`` (N, y_dim), ``seq_len``,
            ``lead``, ``x_dim`` and ``y_dim``; ``{}`` when nothing is generated.

        Raises:
            ValueError: If no feature or target is given, or ``time_col`` is missing.
        """
        if self.df_base is None or self.df_base.empty:
            print("df_base vazio. Chame load_data() primeiro.")
            return {}

        df = self.df_base.copy()
        feats = value_cols or self.feature_cols
        tgts = target_cols or self.target_cols
        if not feats:
            raise ValueError("Nenhuma coluna de feature informada.")
        if not tgts:
            raise ValueError("Nenhum target informado.")

        if time_col not in df.columns:
            raise ValueError(f"Coluna temporal '{time_col}' n√£o encontrada.")

        group_cols = group_cols or [c for c in ["country"] if c in df.columns]
        sort_cols = (group_cols or []) + [time_col]
        df = df.sort_values(sort_cols).reset_index(drop=True)

        # Group key kept as a separate Series so no helper column is added to df.
        if group_cols:
            group_id = df[group_cols].astype(str).agg("_".join, axis=1)
        else:
            group_id = pd.Series("global", index=df.index)

        seq_len = seq_len or getattr(self, "seq_len", 24)
        lead = lead or getattr(self, "lead", 1)

        # Offset from a window's first row to its target row.
        span = seq_len + lead - 1
        # A group yields len(g) - span windows; keep only groups that yield >= 1.
        usable = [g for _, g in df.groupby(group_id, sort=False) if len(g) > span]
        if not usable:
            print("[WARN] Nenhuma janela gerada.")
            return {}

        total = sum(len(g) - span for g in usable)
        X = Y = None
        pos = 0
        for g in usable:
            X_src = g[feats].to_numpy(np.float32)
            Y_src = g[tgts].to_numpy(np.float32)
            n_win = len(g) - span
            if X is None:
                # Allocate the outputs once and fill them group by group.
                X = np.empty((total, seq_len, X_src.shape[1]), dtype=np.float32)
                Y = np.empty((total, Y_src.shape[1]), dtype=np.float32)
            # sliding_window_view yields (windows, features, seq_len); transpose
            # to (windows, seq_len, features). Replaces the per-row Python loop.
            windows = np.lib.stride_tricks.sliding_window_view(X_src, seq_len, axis=0)[:n_win]
            X[pos:pos + n_win] = windows.transpose(0, 2, 1)
            Y[pos:pos + n_win] = Y_src[span:span + n_win]
            pos += n_win

        print(f"[JANELAS] X={X.shape}, Y={Y.shape}, seq_len={seq_len}, lead={lead}")
        self._seq_data = dict(X=X, Y=Y, seq_len=seq_len, lead=lead, x_dim=X.shape[-1], y_dim=Y.shape[-1])
        return self._seq_data

    def save_sequence_tfrecords(
        self,
        output_basename: str = 'lstm_dataset',
        shard_size: int = 50_000,
        compression: str = 'GZIP',
    ) -> Optional[List[str]]:
        """Save the (X, Y) windows as compressed, sharded TFRecords.

        Each record stores the raw float32 bytes of one window (``x_raw``) and
        its target (``y_raw``). The shapes needed to decode them are written to
        ``<output_basename>.meta.json``.

        Returns:
            The shard paths, or None when ``build_sequence_matrix`` has not
            produced data yet.
        """
        if not hasattr(self, "_seq_data"):
            print("Nenhum dado sequencial encontrado. Execute build_sequence_matrix() antes.")
            return None

        X, Y = self._seq_data["X"], self._seq_data["Y"]
        seq_len, x_dim, y_dim = self._seq_data["seq_len"], self._seq_data["x_dim"], self._seq_data["y_dim"]
        lead = int(self._seq_data.get("lead", getattr(self, "lead", 1)))
        n = len(X)
        os.makedirs(self.save_dir, exist_ok=True)

        options = tf.io.TFRecordOptions(compression_type=compression)
        paths = []

        def _bytes_feature(arr: np.ndarray) -> tf.train.Feature:
            return tf.train.Feature(bytes_list=tf.train.BytesList(value=[arr.tobytes()]))

        for shard_idx, start in enumerate(range(0, n, shard_size)):
            end = min(start + shard_size, n)
            shard_path = os.path.join(self.save_dir, f"{output_basename}_{shard_idx}.tfrecord")
            with tf.io.TFRecordWriter(shard_path, options=options) as w:
                for i in range(start, end):
                    ex = tf.train.Example(features=tf.train.Features(feature={
                        'x_raw': _bytes_feature(X[i]),
                        'y_raw': _bytes_feature(Y[i]),
                    }))
                    w.write(ex.SerializeToString())
            paths.append(shard_path)

        meta = {
            'seq_len': seq_len,
            'lead': lead,
            'x_dim': x_dim,
            'y_dim': y_dim,
            'compression': compression,
            'count': int(n),
            'basename': output_basename,
        }
        with open(os.path.join(self.save_dir, f"{output_basename}.meta.json"), 'w') as f:
            json.dump(meta, f, indent=2)

        print(f"[‚úÖ] TFRecords salvos ({len(paths)} shards) em {self.save_dir} ‚Äî lead={lead}")
        return paths

    @staticmethod
    def parse_tfrecord(example_proto, seq_len:int, x_dim:int, y_dim:int):
        """Decode one serialized example into ``(x, y)`` float32 tensors.

        ``x`` is reshaped to ``[seq_len, x_dim]`` and ``y`` to ``[y_dim]``.
        """
        features = {
            'x_raw': tf.io.FixedLenFeature([], tf.string),
            'y_raw': tf.io.FixedLenFeature([], tf.string),
        }
        parsed = tf.io.parse_single_example(example_proto, features)
        x = tf.io.decode_raw(parsed['x_raw'], tf.float32)
        y = tf.io.decode_raw(parsed['y_raw'], tf.float32)
        x = tf.reshape(x, [seq_len, x_dim])
        y = tf.reshape(y, [y_dim])
        return x, y

    @staticmethod
    def load_sequence_dataset(
        path_pattern: str,
        seq_len: Optional[int] = None,
        x_dim: Optional[int] = None,
        y_dim: Optional[int] = None,
        batch_size: int = 256,
        compression: str = 'GZIP',
        meta_path: Optional[str] = None,
        return_meta: bool = False,
    ) -> tf.data.Dataset | tuple:
        """Load the TFRecords as a batched, prefetched dataset.

        When ``seq_len`` / ``x_dim`` / ``y_dim`` are not given they are read
        from the meta.json; its path is guessed from the first shard name when
        ``meta_path`` is None.

        Returns:
            The dataset, or ``(dataset, meta_dict)`` when ``return_meta=True``.

        Raises:
            ValueError: If a shape is still unknown after consulting the meta.
        """
        files = tf.io.gfile.glob(path_pattern)

        # Locate and read the meta file when needed.
        meta = None
        if meta_path is None and files:
            # e.g. .../lstm_dataset_train_0.tfrecord -> .../lstm_dataset_train.meta.json
            guess = re.sub(r'_[0-9]+\.tfrecord$', '.meta.json', files[0])
            if tf.io.gfile.exists(guess):
                meta_path = guess
        if meta_path and tf.io.gfile.exists(meta_path):
            try:
                with tf.io.gfile.GFile(meta_path, 'r') as f:
                    meta = json.load(f)
            except Exception as e:
                # Still best effort, but no longer silent.
                print(f"[WARN] Falha ao ler meta.json ({meta_path}): {e}")
                meta = None

        def _from_meta(key):
            """Return meta[key] as int, or None when absent."""
            value = meta.get(key) if isinstance(meta, dict) else None
            return int(value) if value is not None else None

        # Shape resolution: explicit arguments win, meta fills the gaps. A
        # missing key stays None so the ValueError below reports it (calling
        # int(None) used to raise a TypeError instead).
        if seq_len is None:
            seq_len = _from_meta('seq_len')
        if x_dim is None:
            x_dim = _from_meta('x_dim')
        if y_dim is None:
            y_dim = _from_meta('y_dim')

        if seq_len is None or x_dim is None or y_dim is None:
            raise ValueError("seq_len/x_dim/y_dim n√£o definidos e meta.json ausente ou incompleto.")

        if not files:
            # An empty match would otherwise silently yield an empty dataset.
            print(f"[WARN] Nenhum TFRecord encontrado para o padrão: {path_pattern}")

        ds = tf.data.TFRecordDataset(files, compression_type=compression)
        ds = ds.map(lambda ex: LSTMPreprocessor.parse_tfrecord(ex, seq_len, x_dim, y_dim),
                    num_parallel_calls=tf.data.AUTOTUNE)
        ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

        # Informational message.
        if meta is not None:
            lead = meta.get('lead')
            if lead is not None:
                print(f"[DATASET] {len(files)} shards ‚Üí batch_size={batch_size} (seq_len={seq_len}, lead={lead})")
            else:
                print(f"[DATASET] {len(files)} shards ‚Üí batch_size={batch_size} (seq_len={seq_len}) ‚Äî lead ausente no meta")
        else:
            print(f"[DATASET] {len(files)} shards carregados ‚Üí batch_size={batch_size}")

        if return_meta:
            return ds, (meta or {})
        return ds


## Preprocessamento do Modelo TFT (PyTorch)

O pré-processador do TFT reutiliza as etapas da classe base e adiciona apenas a estruturação final dos splits (ordenação por grupo e tempo, definição de `_group_id` e cálculo de `time_idx` por série) no formato compatível com o PyTorch Forecasting.

Os splits do modelo serão salvos em arquivos Parquet, já que o TFT será construído usando PyTorch (e não TensorFlow).

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.models import TemporalFusionTransformer
from lightning.pytorch import Trainer, seed_everything
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from typing import List, Optional


class TFTPreprocessor(Preprocessor):
    """
    Preprocessor for the Temporal Fusion Transformer (PyTorch Forecasting).

    Extends ``Preprocessor`` with the last stage of the TFT data flow only:
    writing the already-built splits as parquet files and loading them back
    either as DataFrames or as ``TimeSeriesDataSet`` objects.
    """

    def __init__(
        self,
        data_dir: str,
        model_name: str,
        feature_cols: List[str],
        target_cols: List[str],
        country_list: List[str],
        seq_len: int,
        lead: int,
    ):
        """
        Forward the configuration to ``Preprocessor`` and keep the window sizes.

        Args:
            data_dir: forwarded to the base class (presumably stored as
                ``self.data_dir``, which the parquet methods read).
            model_name: forwarded to the base class.
            feature_cols: forwarded to the base class.
            target_cols: forwarded to the base class.
            country_list: forwarded to the base class.
            seq_len: history window; passed to the base class as ``lag`` and
                later used as ``max_encoder_length``.
            lead: forecast horizon; later used as ``max_prediction_length``.
        """
        # The base class names the history window ``lag``.
        super().__init__(
            lag=seq_len,
            lead=lead,
            country_list=country_list,
            model_name=model_name,
            data_dir=data_dir,
            feature_cols=feature_cols,
            target_cols=target_cols,
        )
        self.seq_len = seq_len
        self.lead = lead


    def build_tft_parquets(
        self,
        group_cols: list[str] | tuple[str, ...] | None = ("country",),
        time_col: str = "datetime",
        dropna: bool = True,
    ):
        """
        Structure every existing split for the TFT and save each one as parquet.

        For each entry of ``self.splits`` the method:
        - converts ``time_col`` to UTC datetimes and sorts by ``group_cols + [time_col]``;
        - optionally drops rows with nulls in ``time_col``, ``group_cols`` and
          ``self.target_cols`` (only the columns actually present);
        - builds ``_group_id`` by joining the group columns with ``"_"``
          (or the constant ``"global"`` when there are no group columns);
        - builds ``time_idx`` as a per-group running counter (0..N-1);
        - writes ``tft_dataset_{split}.parquet`` inside ``self.data_dir``.

        Args:
            group_cols: columns that identify one series. ``None`` or an empty
                sequence means a single global series.
            time_col: name of the timestamp column.
            dropna: whether to drop rows with nulls in the critical columns.

        Raises:
            ValueError: if ``self.splits`` is missing or empty.
        """
        if not hasattr(self, "splits") or not self.splits:
            raise ValueError("Os splits ainda n√£o foram gerados. Execute split_train_val_test() primeiro.")

        # Private list copy: the default is immutable and the caller's
        # sequence is never shared or modified.
        group_cols = list(group_cols) if group_cols else []

        for name, df in self.splits.items():
            df = df.copy()

            # Normalise the time column, then order rows per series.
            df[time_col] = pd.to_datetime(df[time_col], utc=True)
            df = df.sort_values(group_cols + [time_col]).reset_index(drop=True)

            # Basic null removal on the columns the TFT cannot do without.
            if dropna:
                subset_cols = ([time_col] if time_col else []) + group_cols + list(self.target_cols or [])
                present = [c for c in subset_cols if c in df.columns]
                before = len(df)
                df = df.dropna(subset=present).reset_index(drop=True)
                if before - len(df) > 0:
                    print(f"üßπ Drop NA ({name}): {before - len(df)} linhas removidas nas colunas {present}.")

            # One string id per series.
            if group_cols:
                df["_group_id"] = df[group_cols].astype(str).agg("_".join, axis=1)
            else:
                df["_group_id"] = "global"

            # Running counter inside each series (not a global index). It is
            # computed after the null drop, so it stays contiguous per group.
            df["time_idx"] = df.groupby("_group_id").cumcount().astype("int64")

            path = os.path.join(self.data_dir, f"tft_dataset_{name}.parquet")
            df.to_parquet(path, index=False)
            print(f"üíæ Split '{name}' salvo em {path} ({df.shape[0]} linhas, grupos={df['_group_id'].nunique()}, max local time_idx={df.groupby('_group_id')['time_idx'].max().max()}).")


    def load_tft_dataset(
        self,
        split_name: str,
        target_col: str,
        known_reals: Optional[List[str]] = None,
        return_df: bool = False,
    ):
        """
        Load a saved split as a DataFrame or as a ``TimeSeriesDataSet``.

        Args:
            split_name: split suffix of the parquet file name
                (``tft_dataset_{split_name}.parquet``), e.g. 'train' | 'val' | 'test'.
            target_col: main target column.
            known_reals: time-varying known real features. When ``None`` the
                method uses ``self.feature_cols`` minus ``self.target_cols``;
                any list given explicitly (even an empty one) is used as is.
            return_df: when True, return the raw DataFrame instead of building
                the ``TimeSeriesDataSet``.

        Returns:
            A DataFrame when ``return_df`` is True, otherwise a ``TimeSeriesDataSet``.

        Raises:
            FileNotFoundError: if the parquet file for the split does not exist.
        """
        path = os.path.join(self.data_dir, f"tft_dataset_{split_name}.parquet")
        if not os.path.exists(path):
            raise FileNotFoundError(f"Arquivo n√£o encontrado: {path}")

        df = pd.read_parquet(path)

        if return_df:
            print(f"üì• Parquet '{split_name}' carregado ({len(df)} linhas) ‚Äî retornando DataFrame.")
            return df

        target_cols = list(self.target_cols or [])

        # Only fall back to the configured features when nothing was passed;
        # an explicit empty list means "no known reals".
        if known_reals is None:
            known_reals = [c for c in (self.feature_cols or []) if c not in target_cols]

        ds = TimeSeriesDataSet(
            df,
            time_idx="time_idx",
            target=target_col,
            group_ids=["_group_id"],
            max_encoder_length=self.seq_len,
            max_prediction_length=self.lead,
            time_varying_known_reals=known_reals,
            time_varying_unknown_reals=target_cols,
            add_relative_time_idx=True,
            add_target_scales=True,
            add_encoder_length=True,
        )
        print(f"üì¶ TimeSeriesDataSet ({split_name}) criado com {len(df)} amostras.")
        return ds

# Cap√≠tulo 2 ‚Äî Constru√ß√£o dos Modelos

A seguir, definimos construtores simples e eficientes para cada modelo (Linear, LSTM, TFT e TimesFM),
prontos para uso em rotinas de otimização de hiperparâmetros (por exemplo, Optuna). Cada construtor
recebe um dicionário de parâmetros (`params`) e retorna um modelo compilado.

## Constru√ß√£o do Modelo Linear/MLP

Objetivo: Criar um regressor simples (MLP), com capacidade de redução para um modelo apenas linear - pela exclusão da camada de ativação - para prever `target_cols` a partir de `feature_cols`.

Contrato r√°pido:
- Entrada: vetor de tamanho `x_dim` (n√∫mero de features)
- Sa√≠da: vetor de tamanho `y_dim` (n√∫mero de targets)
- Par√¢metros (exemplos): hidden_units, activation, dropout, lr, l2

In [None]:
from typing import Dict, Any
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


def build_linear_model(x_dim: int, y_dim: int, params: Dict[str, Any], linear: bool = False) -> keras.Model:
    """
    Build and compile either a single-layer linear regressor or an MLP.

    Args:
        x_dim: number of input features.
        y_dim: number of regression targets.
        params: hyperparameter dictionary. Recognised keys:
            - hidden_units: list of layer widths (MLP only, default [128, 64])
            - activation: hidden activation (MLP only, default 'relu')
            - dropout: dropout rate after each hidden layer (MLP only, default 0.0)
            - l2: L2 kernel regularisation factor (default 0.0)
            - lr: Adam learning rate (default 1e-3)
        linear: when True, build one Dense layer without activation.

    Returns:
        A Keras model compiled with Adam, MSE loss and the MAE metric.
    """
    layer_sizes = params.get('hidden_units', [128, 64])
    act_fn = params.get('activation', 'relu')
    drop_rate = float(params.get('dropout', 0.0))
    l2_weight = float(params.get('l2', 0.0))
    learning_rate = float(params.get('lr', 1e-3))

    def _l2_penalty():
        # Every regularised Dense layer receives its own regulariser object.
        return keras.regularizers.l2(l2_weight)

    features = keras.Input(shape=(x_dim,), name='features')

    if not linear:
        # MLP: stack of activated Dense layers, each optionally followed by Dropout.
        hidden = features
        for idx, width in enumerate(layer_sizes):
            hidden = layers.Dense(
                width,
                activation=act_fn,
                kernel_regularizer=_l2_penalty(),
                name=f'dense_{idx}'
            )(hidden)
            if drop_rate > 0:
                hidden = layers.Dropout(drop_rate, name=f'dropout_{idx}')(hidden)
        predictions = layers.Dense(y_dim, name='targets')(hidden)
        model_name = 'mlp_model'
    else:
        # Purely linear map from features to targets (no activation).
        predictions = layers.Dense(
            y_dim,
            activation=None,
            kernel_regularizer=_l2_penalty(),
            name='linear_output'
        )(features)
        model_name = 'linear_model_true'

    net = keras.Model(features, predictions, name=model_name)
    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae']
    )
    return net


# Fun√ß√µes auxiliares para carregar TFRecords
def parse_tfrecord(example_proto, x_dim, y_dim):
    """
    Decode one serialized example into dense ``(x, y)`` float32 tensors.

    The record is expected to carry variable-length float features named
    'x' and 'y'; they are densified and reshaped to ``[x_dim]`` and ``[y_dim]``.
    """
    spec = {key: tf.io.VarLenFeature(tf.float32) for key in ('x', 'y')}
    record = tf.io.parse_single_example(example_proto, spec)

    features = tf.reshape(tf.sparse.to_dense(record['x']), [x_dim])
    targets = tf.reshape(tf.sparse.to_dense(record['y']), [y_dim])
    return features, targets

def load_tfrecord_dataset(
    path_pattern: str,
    x_dim: int | None = None,
    y_dim: int | None = None,
    batch_size: int = 64,
    compression: str = 'GZIP',
    meta_path: str | None = None,
    return_meta: bool = False,
):
    """
    Load flat (x, y) TFRecord shards as a batched, prefetched ``tf.data.Dataset``.

    Args:
        path_pattern: glob pattern matching the TFRecord shards.
        x_dim: number of features; read from the meta file when None.
        y_dim: number of targets; read from the meta file when None.
        batch_size: batch size of the returned dataset.
        compression: compression type of the shards.
        meta_path: explicit path to the ``.meta.json`` file. When None, it is
            derived from the first shard by replacing the trailing
            ``_<digits>.tfrecord`` with ``.meta.json``.
        return_meta: when True, also return the meta dictionary (``{}`` if
            no meta file could be read).

    Returns:
        The dataset, or ``(dataset, meta)`` when ``return_meta`` is True.

    Raises:
        FileNotFoundError: if no shard matches ``path_pattern``.
        ValueError: if ``x_dim``/``y_dim`` are neither given nor found in the meta file.
    """
    import re, json
    import tensorflow as tf

    files = tf.io.gfile.glob(path_pattern)
    if not files:
        raise FileNotFoundError(f"Nenhum TFRecord encontrado para o padr√£o: {path_pattern}")

    meta = None
    # Try to derive the meta.json path from the first shard.
    if meta_path is None:
        # The dot must be escaped with a single backslash inside a raw string.
        # The substitution count guards against "guessing" the shard itself
        # when the file name does not follow the '<base>_<digits>.tfrecord' scheme.
        guess, n_replaced = re.subn(r'_[0-9]+\.tfrecord$', '.meta.json', files[0])
        if n_replaced and tf.io.gfile.exists(guess):
            meta_path = guess
    if meta_path and tf.io.gfile.exists(meta_path):
        try:
            with tf.io.gfile.GFile(meta_path, 'r') as f:
                meta = json.load(f)
        except Exception as exc:
            # Best effort: an unreadable meta file is not fatal as long as the
            # dimensions are supplied by the caller, but it should be visible.
            print(f"[DATASET] Aviso: falha ao ler {meta_path}: {exc}")
            meta = None

    # Fill in the dimensions from the meta file when the caller omitted them.
    if meta is not None:
        if x_dim is None and meta.get('x_dim') is not None:
            x_dim = int(meta['x_dim'])
        if y_dim is None and meta.get('y_dim') is not None:
            y_dim = int(meta['y_dim'])

    if x_dim is None or y_dim is None:
        raise ValueError("x_dim/y_dim n√£o definidos e meta.json ausente ou incompleto.")

    ds = tf.data.TFRecordDataset(files, compression_type=compression)
    ds = ds.map(lambda ex: parse_tfrecord(ex, x_dim, y_dim),
                num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    # Informative message.
    if meta is not None:
        seq_len = meta.get('seq_len')  # present only in some meta formats
        if seq_len is not None:
            print(f"[DATASET] {len(files)} shards ‚Üí batch_size={batch_size} (x_dim={x_dim}, y_dim={y_dim}, seq_len={seq_len})")
        else:
            print(f"[DATASET] {len(files)} shards ‚Üí batch_size={batch_size} (x_dim={x_dim}, y_dim={y_dim})")
    else:
        print(f"[DATASET] {len(files)} shards carregados ‚Üí batch_size={batch_size}")

    if return_meta:
        return ds, (meta or {})
    return ds

## Constru√ß√£o do Modelo LSTM

Objetivo: um regressor recorrente (camadas LSTM empilhadas seguidas de camadas densas) para prever `target_cols` a partir de janelas temporais de `feature_cols`.

Contrato rápido:
- Entrada: tensor de formato `(seq_len, x_dim)` (janela temporal × número de features)
- Saída: vetor de tamanho `y_dim` (número de targets)
- Parâmetros (exemplos): lstm_units, dense_units, dropout, rec_dropout, act, layer_norm, lr, l2

In [None]:
from typing import Dict, Any
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


def build_lstm_model(seq_len: int, x_dim: int, y_dim: int, params: Dict[str, Any]) -> keras.Model:
    """
    Build and compile a stacked-LSTM regressor for multivariate sequences.

    NaN entries in the input are always replaced by 0.0. Timestep masking is
    applied only when ``params['mask_value']`` is given: a ``Masking`` layer is
    then inserted after the NaN replacement, so timesteps whose features all
    equal that value are skipped by the LSTM layers. Without ``mask_value``
    no masking is performed.

    Args:
        seq_len: number of timesteps per input window.
        x_dim: number of features per timestep.
        y_dim: number of regression targets.
        params: hyperparameter dictionary. Recognised keys:
            - lstm_units: list of LSTM widths (default [128, 64]); must not be empty
            - dense_units: list of Dense widths after the LSTM stack (default [128])
            - dropout: dropout rate for LSTM inputs and after Dense layers (default 0.1)
            - rec_dropout: recurrent dropout rate (default 0.0)
            - act: Dense activation (default 'relu')
            - lr: Adam learning rate (default 1e-3)
            - l2: L2 kernel regularisation of the LSTM layers (default 0.0)
            - layer_norm: add LayerNormalization after each LSTM (default True)
            - mask_value: optional sentinel enabling timestep masking

    Returns:
        A Keras model compiled with Adam (clipnorm=1.0), MSE loss and the MAE metric.

    Raises:
        ValueError: if ``lstm_units`` is empty.
    """

    lstm_units = params.get('lstm_units', [128, 64])
    dense_units = params.get('dense_units', [128])
    dropout = float(params.get('dropout', 0.1))
    rec_dropout = float(params.get('rec_dropout', 0.0))
    act = params.get('act', 'relu')
    lr = float(params.get('lr', 1e-3))
    l2 = float(params.get('l2', 0.0))
    layer_norm = bool(params.get('layer_norm', True))

    # Masking is opt-in so models built without the key keep their behaviour.
    mask_value = params.get('mask_value')
    if mask_value is not None:
        mask_value = float(mask_value)

    # Without a recurrent layer the Dense head would run per timestep and the
    # output would no longer have shape (batch, y_dim).
    if not lstm_units:
        raise ValueError("lstm_units deve conter ao menos uma camada LSTM.")

    # --- Inputs & NaN handling ---
    inputs = keras.Input(shape=(seq_len, x_dim), name='sequence_input')

    # NaNs would propagate through the whole network, so zero them first.
    x = layers.Lambda(
        lambda v: tf.where(tf.math.is_nan(v), tf.zeros_like(v), v),
        name="replace_nans"
    )(inputs)

    if mask_value is not None:
        x = layers.Masking(mask_value=mask_value, name='mask_padding')(x)

    # --- LSTM stack ---
    last_idx = len(lstm_units) - 1
    for i, units in enumerate(lstm_units):
        x = layers.LSTM(
            units,
            # Only the last LSTM collapses the time axis.
            return_sequences=i < last_idx,
            dropout=dropout,
            recurrent_dropout=rec_dropout,
            kernel_regularizer=keras.regularizers.l2(l2),
            name=f'lstm_{i}'
        )(x)
        if layer_norm:
            x = layers.LayerNormalization(name=f'ln_{i}')(x)

    # --- Dense head ---
    for i, units in enumerate(dense_units):
        x = layers.Dense(units, activation=act, name=f'dense_{i}')(x)
        if dropout > 0:
            x = layers.Dropout(dropout, name=f'dropout_{i}')(x)

    outputs = layers.Dense(y_dim, name='output')(x)

    # --- Compile ---
    model = keras.Model(inputs, outputs, name='lstm_regressor')
    optimizer = keras.optimizers.Adam(learning_rate=lr, clipnorm=1.0)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])

    return model
def parse_seq_tfrecord(example_proto, seq_len, x_dim, y_dim):
    """
    Decode one serialized sequence example stored as raw float32 bytes.

    Expected features:
        'x_raw': input window, reshaped to ``[seq_len, x_dim]``
        'y_raw': target vector, reshaped to ``[y_dim]``

    Returns:
        The ``(x, y)`` tensor pair.
    """
    raw_bytes = tf.io.FixedLenFeature([], tf.string)
    record = tf.io.parse_single_example(
        example_proto,
        {'x_raw': raw_bytes, 'y_raw': raw_bytes},
    )

    window = tf.reshape(tf.io.decode_raw(record['x_raw'], tf.float32), [seq_len, x_dim])
    target = tf.reshape(tf.io.decode_raw(record['y_raw'], tf.float32), [y_dim])

    return window, target


def load_seq_tfrecord_dataset(path_pattern, seq_len, x_dim, y_dim, batch_size=64, compression='GZIP'):
    """
    Load sequence TFRecord shards as a batched, prefetched ``tf.data.Dataset``.

    Each element before batching is a pair with:
        X.shape = (seq_len, x_dim)
        Y.shape = (y_dim,)

    Raises:
        FileNotFoundError: if no file matches ``path_pattern``.
    """
    shard_paths = tf.io.gfile.glob(path_pattern)
    if len(shard_paths) == 0:
        raise FileNotFoundError(f"Nenhum arquivo TFRecord encontrado em {path_pattern}")

    def _decode(serialized):
        # Bind the shapes so ``map`` only has to pass the serialized record.
        return parse_seq_tfrecord(serialized, seq_len, x_dim, y_dim)

    records = tf.data.TFRecordDataset(shard_paths, compression_type=compression)
    dataset = (
        records
        .map(_decode, num_parallel_calls=tf.data.AUTOTUNE)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

    print(f"[DATASET] {len(shard_paths)} shards carregados | batch_size={batch_size}")
    return dataset

def save_model(model, path: str):
    """
    Save a TensorFlow/Keras model (.keras/.h5) or a PyTorch module (.pt/.pth).

    The framework is detected from the model type. When ``path`` lacks one of
    the accepted extensions, the default one (``.keras`` or ``.pt``) is
    appended. For PyTorch only the ``state_dict`` is written.

    Args:
        model: a ``tf.keras.Model`` or a ``torch.nn.Module``.
        path: destination file path; parent directories are created as needed.

    Returns:
        The final path the model was written to (including any appended extension).

    Raises:
        TypeError: if ``model`` is neither a Keras model nor a PyTorch module.
    """
    # ``os.makedirs('')`` raises, so only create a directory when the path has one.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if isinstance(model, tf.keras.Model):
        if not path.endswith(('.keras', '.h5')):
            path += ".keras"
        model.save(path)
        print(f"‚úÖ Modelo TensorFlow salvo em: {path}")

    elif isinstance(model, torch.nn.Module):
        if not path.endswith(('.pt', '.pth')):
            path += ".pt"
        torch.save(model.state_dict(), path)
        print(f"‚úÖ Modelo PyTorch salvo em: {path}")

    else:
        raise TypeError("‚ùå Tipo de modelo n√£o suportado. Deve ser TensorFlow (keras.Model) ou PyTorch (nn.Module).")

    return path






## Constru√ß√£o do Modelo TFT (Temporal Fusion Transformer)

**Objetivo:** prever `target_cols` a partir de `feature_cols` utilizando a implementação `TemporalFusionTransformer` da biblioteca `pytorch_forecasting`,  
que integra **seleção de variáveis dinâmicas**, **blocos LSTM**, **atenção temporal multi-cabeças** e **gating residual** em um único modelo interpretável.

**Contrato rápido:**
- **Entrada:** `TimeSeriesDataSet` (PyTorch Forecasting), informado em `params["dataset"]`  
- **Saída:** tensor contínuo de tamanho `y_dim` *(ou `dec_len × y_dim` para horizontes múltiplos)*

**Parâmetros (exemplos):**  
`hidden_size` (tamanho interno das camadas GRN) · `lstm_layers` · `num_heads` (atenção) · `dropout` · `lr` · `dataset`

**Componentes internos (`TemporalFusionTransformer`):**  
Variable Selection Network → LSTM Encoder/Decoder → Multi-Head Temporal Attention → Gated Residual Network → Camada de projeção final

**Compatibilidade:**  
Consome os arquivos Parquet gerados pelo `TFTPreprocessor`, carregados como `TimeSeriesDataSet`; não utiliza o pipeline de TFRecords  
(`(batch, seq_len, features)`) dos modelos Linear e LSTM, portanto exige o pré-processamento próprio do TFT.



In [None]:
from typing import Dict, Any
import torch
from torch import nn
from pytorch_forecasting.models import TemporalFusionTransformer
from pytorch_forecasting.metrics import QuantileLoss

def build_tft_model(
    params: Dict[str, Any]
):
    """
    Build a Temporal Fusion Transformer from a prepared ``TimeSeriesDataSet``.

    Args:
        params: hyperparameter dictionary. Keys:
            - dataset (required): the ``TimeSeriesDataSet`` the model is derived from
            - hidden_size: hidden state size (default 64)
            - dropout: dropout rate (default 0.1)
            - lstm_layers: number of LSTM layers (default 1)
            - num_heads: passed as ``attention_head_size`` (default 4)
            - lr: learning rate (default 1e-3)

    Returns:
        The model created by ``TemporalFusionTransformer.from_dataset`` with a
        ``QuantileLoss([0.5])`` loss.

    Raises:
        KeyError: if ``params`` has no 'dataset' entry.
    """
    # Fail fast on the only mandatory entry, before anything else is touched.
    dataset = params["dataset"]

    hidden_size = int(params.get("hidden_size", 64))
    dropout = float(params.get("dropout", 0.1))
    lstm_layers = int(params.get("lstm_layers", 1))
    attention_head_size = int(params.get("num_heads", 4))
    lr = float(params.get("lr", 1e-3))

    model = TemporalFusionTransformer.from_dataset(
        dataset,  # dataset prepared via TimeSeriesDataSet
        learning_rate=lr,
        hidden_size=hidden_size,
        dropout=dropout,
        lstm_layers=lstm_layers,
        attention_head_size=attention_head_size,
        loss=QuantileLoss([0.5]),
        log_interval=10,
        log_val_interval=1
    )

    return model


# Capítulo 3 - Construção das Pipelines de dados dos modelos

A função de pipeline organiza o fluxo de dados para treinar o modelo de forma mais concisa e organizada, sendo capaz de mostrar a progressão das perdas à medida que as épocas de treinamento passam - Esse display está disponível no notebook "Resultados"

O resultado da pipeline é um gráfico com a evolução de todas as métricas e o salvamento do modelo treinado dentro da pasta ./modelos/{Nome_Problema}/{Nome_Modelo}

Assim podendo ser facilmente reutilizado futuramente para um notebook comparativo

## Pipeline dos Modelos Lineares

Pipeline de preprocessamento e de treinamento

In [None]:
from typing import Dict, Any
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


def build_linear_model(x_dim: int, y_dim: int, params: Dict[str, Any], linear: bool = False) -> keras.Model:
    """
    Build and compile either a single-layer linear regressor or an MLP.

    Args:
        x_dim: number of input features.
        y_dim: number of regression targets.
        params: hyperparameter dictionary. Recognised keys:
            - hidden_units: list of layer widths (MLP only, default [128, 64])
            - activation: hidden activation (MLP only, default 'relu')
            - dropout: dropout rate after each hidden layer (MLP only, default 0.0)
            - l2: L2 kernel regularisation factor (default 0.0)
            - lr: Adam learning rate (default 1e-3)
        linear: when True, build one Dense layer without activation.

    Returns:
        A Keras model compiled with Adam, MSE loss and the MAE metric.
    """
    layer_sizes = params.get('hidden_units', [128, 64])
    act_fn = params.get('activation', 'relu')
    drop_rate = float(params.get('dropout', 0.0))
    l2_weight = float(params.get('l2', 0.0))
    learning_rate = float(params.get('lr', 1e-3))

    def _l2_penalty():
        # Every regularised Dense layer receives its own regulariser object.
        return keras.regularizers.l2(l2_weight)

    features = keras.Input(shape=(x_dim,), name='features')

    if not linear:
        # MLP: stack of activated Dense layers, each optionally followed by Dropout.
        hidden = features
        for idx, width in enumerate(layer_sizes):
            hidden = layers.Dense(
                width,
                activation=act_fn,
                kernel_regularizer=_l2_penalty(),
                name=f'dense_{idx}'
            )(hidden)
            if drop_rate > 0:
                hidden = layers.Dropout(drop_rate, name=f'dropout_{idx}')(hidden)
        predictions = layers.Dense(y_dim, name='targets')(hidden)
        model_name = 'mlp_model'
    else:
        # Purely linear map from features to targets (no activation).
        predictions = layers.Dense(
            y_dim,
            activation=None,
            kernel_regularizer=_l2_penalty(),
            name='linear_output'
        )(features)
        model_name = 'linear_model_true'

    net = keras.Model(features, predictions, name=model_name)
    net.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='mse',
        metrics=['mae']
    )
    return net


# Fun√ß√µes auxiliares para carregar TFRecords
def parse_tfrecord(example_proto, x_dim, y_dim):
    """
    Decode one serialized example into dense ``(x, y)`` float32 tensors.

    The record is expected to carry variable-length float features named
    'x' and 'y'; they are densified and reshaped to ``[x_dim]`` and ``[y_dim]``.
    """
    spec = {key: tf.io.VarLenFeature(tf.float32) for key in ('x', 'y')}
    record = tf.io.parse_single_example(example_proto, spec)

    features = tf.reshape(tf.sparse.to_dense(record['x']), [x_dim])
    targets = tf.reshape(tf.sparse.to_dense(record['y']), [y_dim])
    return features, targets

def load_tfrecord_dataset(
    path_pattern: str,
    x_dim: int | None = None,
    y_dim: int | None = None,
    batch_size: int = 64,
    compression: str = 'GZIP',
    meta_path: str | None = None,
    return_meta: bool = False,
):
    """
    Load flat (x, y) TFRecord shards as a batched, prefetched ``tf.data.Dataset``.

    Args:
        path_pattern: glob pattern matching the TFRecord shards.
        x_dim: number of features; read from the meta file when None.
        y_dim: number of targets; read from the meta file when None.
        batch_size: batch size of the returned dataset.
        compression: compression type of the shards.
        meta_path: explicit path to the ``.meta.json`` file. When None, it is
            derived from the first shard by replacing the trailing
            ``_<digits>.tfrecord`` with ``.meta.json``.
        return_meta: when True, also return the meta dictionary (``{}`` if
            no meta file could be read).

    Returns:
        The dataset, or ``(dataset, meta)`` when ``return_meta`` is True.

    Raises:
        FileNotFoundError: if no shard matches ``path_pattern``.
        ValueError: if ``x_dim``/``y_dim`` are neither given nor found in the meta file.
    """
    import re, json
    import tensorflow as tf

    files = tf.io.gfile.glob(path_pattern)
    if not files:
        raise FileNotFoundError(f"Nenhum TFRecord encontrado para o padr√£o: {path_pattern}")

    meta = None
    # Try to derive the meta.json path from the first shard.
    if meta_path is None:
        # The dot must be escaped with a single backslash inside a raw string.
        # The substitution count guards against "guessing" the shard itself
        # when the file name does not follow the '<base>_<digits>.tfrecord' scheme.
        guess, n_replaced = re.subn(r'_[0-9]+\.tfrecord$', '.meta.json', files[0])
        if n_replaced and tf.io.gfile.exists(guess):
            meta_path = guess
    if meta_path and tf.io.gfile.exists(meta_path):
        try:
            with tf.io.gfile.GFile(meta_path, 'r') as f:
                meta = json.load(f)
        except Exception as exc:
            # Best effort: an unreadable meta file is not fatal as long as the
            # dimensions are supplied by the caller, but it should be visible.
            print(f"[DATASET] Aviso: falha ao ler {meta_path}: {exc}")
            meta = None

    # Fill in the dimensions from the meta file when the caller omitted them.
    if meta is not None:
        if x_dim is None and meta.get('x_dim') is not None:
            x_dim = int(meta['x_dim'])
        if y_dim is None and meta.get('y_dim') is not None:
            y_dim = int(meta['y_dim'])

    if x_dim is None or y_dim is None:
        raise ValueError("x_dim/y_dim n√£o definidos e meta.json ausente ou incompleto.")

    ds = tf.data.TFRecordDataset(files, compression_type=compression)
    ds = ds.map(lambda ex: parse_tfrecord(ex, x_dim, y_dim),
                num_parallel_calls=tf.data.AUTOTUNE)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    # Informative message.
    if meta is not None:
        seq_len = meta.get('seq_len')  # present only in some meta formats
        if seq_len is not None:
            print(f"[DATASET] {len(files)} shards ‚Üí batch_size={batch_size} (x_dim={x_dim}, y_dim={y_dim}, seq_len={seq_len})")
        else:
            print(f"[DATASET] {len(files)} shards ‚Üí batch_size={batch_size} (x_dim={x_dim}, y_dim={y_dim})")
    else:
        print(f"[DATASET] {len(files)} shards carregados ‚Üí batch_size={batch_size}")

    if return_meta:
        return ds, (meta or {})
    return ds

In [None]:
import os
import matplotlib.pyplot as plt
from typing import List, Tuple, Dict
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping as TFEarlyStopping, ReduceLROnPlateau as TFReduceLROnPlateau


def linear_preproccess_pipeline(
    data_dir: str,
    country_list: List[str],
    feature_cols: List[str],
    target_cols: List[str],
    lag: int,
    lead: int,
    value_cols: List[str]
) -> LinearPreprocessor:
    """
    Run the preprocessing pipeline for the linear/MLP models.

    Steps, in order: load the data, encode 'datetime' (time_cycle) and
    'country' (label), split into train/val/test (60/20/20 by 'datetime'),
    min-max normalise ``value_cols``, build the flat matrices per split and
    save them as GZIP TFRecords named ``linear_dataset``.

    This function does not train any model.

    Args:
        data_dir: directory handed to ``LinearPreprocessor``.
        country_list: countries handed to ``LinearPreprocessor``.
        feature_cols: feature columns handed to ``LinearPreprocessor``.
        target_cols: target columns.
        lag: history window handed to ``LinearPreprocessor``.
        lead: forecast horizon handed to ``LinearPreprocessor``.
        value_cols: columns to normalise and to use when building the matrices.

    Returns:
        The configured ``LinearPreprocessor`` after all steps have run.
    """

    # ----------------------------
    # Preprocessing
    # ----------------------------
    preproc = LinearPreprocessor(
        data_dir=data_dir,
        model_name='linear_model',
        feature_cols=feature_cols,
        target_cols=target_cols,
        lag=lag,
        lead=lead,
        country_list=country_list
    )

    preproc.load_data()
    preproc.encode(encode_cols='datetime', encode_method='time_cycle')
    preproc.encode(encode_cols='country', encode_method='label')
    preproc.split_train_val_test(train_size=0.6, val_size=0.2, test_size=0.2, time_col='datetime')
    preproc.normalize_splits(value_cols=value_cols, normalization_method='minmax')
    preproc.build_flat_matrices_splits(
        value_cols=value_cols,
        target_cols=target_cols,
        dropna=True,
        group_cols=['country'],
        time_col='datetime'
    )
    preproc.save_splits_tfrecords(output_basename='linear_dataset', shard_size=1000, compression='GZIP')
    print("‚úÖ Pr√©-processamento linear conclu√≠do.")

    return preproc


def linear_train_pipeline(
    problem_name: str,
    data_dir: str,
    batch_size: int = 128,
    configs: Dict[str, Dict[str, Any]] = None
) -> "Dict[str, keras.Model]":
    """
    Train one linear/MLP model per entry of ``configs`` and save them all.

    The input dimensions are read from ``linear_dataset_train.meta.json`` in
    ``data_dir``; training and validation data come from the
    ``linear_dataset_train*.tfrecord`` / ``linear_dataset_val*.tfrecord`` shards.
    Each model is fitted for up to 100 epochs with early stopping and
    learning-rate reduction on ``val_loss``, then saved under
    ``./modelos/{problem_name}/{name}``.

    Args:
        problem_name: folder name used when saving the models.
        data_dir: directory holding the TFRecords and the meta file.
        batch_size: batch size for both datasets.
        configs: mapping ``{model_name: params}``. ``params`` is forwarded to
            ``build_linear_model``; its optional 'linear' entry (default False)
            selects the purely linear model.

    Returns:
        Mapping ``{model_name: trained model}``.

    Raises:
        ValueError: if ``configs`` is None.
        FileNotFoundError: if the meta file is missing.
    """
    # Local import: this cell does not import json itself.
    import json

    # Fail before any data is loaded instead of crashing later on ``None.items()``.
    if configs is None:
        raise ValueError("Nenhuma config de modelo informada: passe `configs` como {nome: params}.")

    # ----------------------------
    # TFRecord datasets
    # ----------------------------
    meta_path = os.path.join(data_dir, "linear_dataset_train.meta.json")
    if not os.path.exists(meta_path):
        raise FileNotFoundError(f"‚ùå Arquivo de metadados n√£o encontrado: {meta_path}")

    with open(meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)

    x_dim = int(meta["x_dim"])
    y_dim = int(meta["y_dim"])

    dataset_train = load_tfrecord_dataset(
        path_pattern=os.path.join(data_dir, 'linear_dataset_train*.tfrecord'),
        x_dim=x_dim, y_dim=y_dim, batch_size=batch_size
    )
    dataset_val = load_tfrecord_dataset(
        path_pattern=os.path.join(data_dir, 'linear_dataset_val*.tfrecord'),
        x_dim=x_dim, y_dim=y_dim, batch_size=batch_size
    )
    print("üì¶ Dataset TFRecord carregado para treinamento.")

    models = {}

    early_stopping = TFEarlyStopping(monitor="val_loss", patience=20, restore_best_weights=True, verbose=0)
    reduce_lr = TFReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=6, min_lr=1e-6, verbose=0)

    # ----------------------------
    # Train each model
    # ----------------------------
    for name, params in configs.items():
        print(f"\nüöÄ Treinando modelo {name}...")
        model = build_linear_model(
            x_dim=x_dim,
            y_dim=y_dim,
            params=params,
            # A config without the key is treated as an MLP.
            linear=bool(params.get("linear", False)),
        )
        hist = model.fit(
            dataset_train,
            validation_data=dataset_val,
            epochs=100,
            callbacks=[early_stopping, reduce_lr],
            verbose=0
        )

        models[name] = model
        print(f"‚úÖ {name} conclu√≠do - Val Loss: {min(hist.history['val_loss']):.6f}")

    # ----------------------------
    # Save the models
    # ----------------------------
    for name, model in models.items():
        # Saved under ./modelos/{problem name}/{model name}
        save_model(model, path = f"./modelos/{problem_name}/{name}")

    return models


## Pipelines dos modelos LSTM
Implementa√ß√£o e uso dos preprocessors e treinadores LSTM para s√©ries temporais (janelas seq_len e lead).

In [None]:
import os
import matplotlib.pyplot as plt
from typing import List, Tuple, Dict
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping as TFEarlyStopping, ReduceLROnPlateau as TFReduceLROnPlateau


def lstm_preproccess_pipeline(
    data_dir: str,
    country_list: List[str],
    feature_cols: List[str],
    target_cols: List[str],
    seq_len: int,
    lead: int,
    value_cols: List[str]
) -> LSTMPreprocessor:
    """
    Run the preprocessing pipeline for the LSTM models.

    Steps, in order: load the data, encode 'datetime' (time_cycle) and
    'country' (label), split into train/val/test (60/20/20 by 'datetime'),
    min-max normalise ``value_cols``, then, for every split, build the
    sequence windows and save them as GZIP TFRecords named
    ``lstm_dataset_{split}``.

    This function does not train any model.

    Args:
        data_dir: directory handed to ``LSTMPreprocessor``.
        country_list: countries handed to ``LSTMPreprocessor``.
        feature_cols: feature columns handed to ``LSTMPreprocessor``.
        target_cols: target columns.
        seq_len: window length; also passed to the preprocessor as ``lag``.
        lead: forecast horizon.
        value_cols: columns to normalise and to use when building the windows.

    Returns:
        The configured ``LSTMPreprocessor`` after all steps have run.
    """

    # ----------------------------
    # Preprocessing
    # ----------------------------
    preproc = LSTMPreprocessor(
        data_dir=data_dir,
        model_name="lstm_model",
        feature_cols=feature_cols,
        target_cols=target_cols,
        country_list=country_list,
        lag=seq_len,
        lead=lead,
    )

    preproc.load_data()
    preproc.encode(encode_cols="datetime", encode_method="time_cycle")
    preproc.encode(encode_cols="country", encode_method="label")
    preproc.split_train_val_test(train_size=0.6, val_size=0.2, test_size=0.2, time_col="datetime")
    preproc.normalize_splits(value_cols=value_cols, normalization_method="minmax")

    # Build the windows and save TFRecords for each split.
    for split_name, split_df in preproc.splits.items():
        # NOTE(review): ``df_base`` is overwritten with the current split and
        # never restored, so the returned preprocessor ends up holding the
        # last split there - confirm no caller relies on the original value.
        preproc.df_base = split_df
        preproc.build_sequence_matrix(
            value_cols=value_cols,
            target_cols=target_cols,
            seq_len=seq_len,
            lead=lead,
            group_cols=["country"],
            time_col="datetime",
        )
        preproc.save_sequence_tfrecords(
            output_basename=f"lstm_dataset_{split_name}", shard_size=1000, compression="GZIP"
        )
    print("‚úÖ Pr√©-processamento sequencial conclu√≠do.")
    return preproc

def lstm_train_pipeline(
    problem_name: str,
    data_dir: str,
    seq_len: int,
    batch_size: int = 128,
    configs: Dict[str, Dict[str, Any]] = None
) -> "Dict[str, keras.Model]":
    """
    Train one LSTM model per entry of ``configs`` and save them all.

    The feature/target dimensions are read from ``lstm_dataset_train.meta.json``
    in ``data_dir``; training and validation data come from the
    ``lstm_dataset_train*.tfrecord`` / ``lstm_dataset_val*.tfrecord`` shards.
    Each model is fitted for up to 100 epochs with early stopping and
    learning-rate reduction on ``val_loss``, then saved under
    ``./modelos/{problem_name}/{name}``.

    Args:
        problem_name: folder name used when saving the models.
        data_dir: directory holding the TFRecords and the meta file.
        seq_len: window length used to reshape the stored sequences.
        batch_size: batch size for both datasets.
        configs: mapping ``{model_name: params}``; ``params`` is forwarded to
            ``build_lstm_model``.

    Returns:
        Mapping ``{model_name: trained model}``.

    Raises:
        ValueError: if ``configs`` is None, or if the meta file records a
            ``seq_len`` different from the one given.
        FileNotFoundError: if the meta file is missing.
    """
    # Local import: this cell does not import json itself.
    import json

    # Fail before any data is loaded instead of crashing later on ``None.items()``.
    if configs is None:
        raise ValueError("Nenhuma config de modelo informada: passe `configs` como {nome: params}.")

    # ----------------------------
    # TFRecord datasets
    # ----------------------------
    meta_path = os.path.join(data_dir, "lstm_dataset_train.meta.json")
    if not os.path.exists(meta_path):
        raise FileNotFoundError(f"‚ùå Arquivo de metadados n√£o encontrado: {meta_path}")

    with open(meta_path, "r", encoding="utf-8") as f:
        meta = json.load(f)

    x_dim = int(meta["x_dim"])
    y_dim = int(meta["y_dim"])

    # A mismatching window length would otherwise only surface as a reshape
    # error deep inside the tf.data pipeline.
    meta_seq_len = meta.get("seq_len")
    if meta_seq_len is not None and int(meta_seq_len) != seq_len:
        raise ValueError(f"seq_len={seq_len} difere do valor gravado no meta.json ({meta_seq_len}).")

    dataset_train = LSTMPreprocessor.load_sequence_dataset(
        path_pattern=os.path.join(data_dir, "lstm_dataset_train*.tfrecord"),
        seq_len=seq_len,
        x_dim=x_dim,
        y_dim=y_dim,
        batch_size=batch_size,
    )
    dataset_val = LSTMPreprocessor.load_sequence_dataset(
        path_pattern=os.path.join(data_dir, "lstm_dataset_val*.tfrecord"),
        seq_len=seq_len,
        x_dim=x_dim,
        y_dim=y_dim,
        batch_size=batch_size,
    )
    print("üì¶ Dataset TFRecord carregado para treinamento.")

    models = {}

    early_stopping = TFEarlyStopping(monitor="val_loss", patience=20, restore_best_weights=True, verbose=0)
    reduce_lr = TFReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=6, min_lr=1e-6, verbose=0)

    for name, params in configs.items():
        print(f"\nüöÄ Treinando modelo {name}...")
        model = build_lstm_model(seq_len=seq_len, x_dim=x_dim, y_dim=y_dim, params=params)
        hist = model.fit(
            dataset_train,
            validation_data=dataset_val,
            epochs=100,
            callbacks=[early_stopping, reduce_lr],
            verbose=0,
        )

        models[name] = model

        print(f"‚úÖ {name} conclu√≠do - Val Loss: {min(hist.history['val_loss']):.6f}")

    # ----------------------------
    # Save the models
    # ----------------------------
    for name, model in models.items():
        # Saved under ./modelos/{problem name}/{model name}
        save_model(model, path = f"./modelos/{problem_name}/{name}")

    return models


## Pipelines dos modelos TFT
Pr√©-processamento em parquet e treino com PyTorch Forecasting (Temporal Fusion Transformer) via Lightning.

In [None]:
import os
import pandas as pd
import torch
import torch.nn as nn
from typing import Dict, Any, List
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.models import TemporalFusionTransformer
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.callbacks import EarlyStopping as LGEarlyStopping, LearningRateMonitor as LGLearningRateMonitor, ModelCheckpoint as LGModelCheckpoint


def tft_preproccess_pipeline(
    data_dir: str,
    country_list: List[str],
    feature_cols: List[str],
    target_cols: List[str],
    seq_len: int,
    lead: int,
    value_cols: List[str]
) -> TFTPreprocessor:
    """
    Run the full preprocessing pipeline that prepares data for the TFT models.

    The steps are executed in this order on a single ``TFTPreprocessor``:
    load the data, encode ``datetime`` (``time_cycle``) and ``country``
    (``label``), split 60/20/20 into train/val/test along ``datetime``,
    min-max normalise ``value_cols`` and finally build the TFT parquet files
    grouped by ``country``.

    Args:
        data_dir: Directory handed to ``TFTPreprocessor`` as ``data_dir``.
        country_list: Countries handed to ``TFTPreprocessor``.
        feature_cols: Feature column names.
        target_cols: Target column names.
        seq_len: Input window length forwarded to the preprocessor.
        lead: Forecast horizon forwarded to the preprocessor.
        value_cols: Columns normalised with the ``minmax`` method.

    Returns:
        The ``TFTPreprocessor`` instance after all steps have run.
    """

    # ----------------------------
    # Preprocessing
    # ----------------------------
    preproc = TFTPreprocessor(
        data_dir=data_dir,
        # Same model name tft_train_pipeline uses when it reloads the data;
        # the previous 'linear_model' value was a copy-paste leftover.
        model_name='tft_model',
        feature_cols=feature_cols,
        target_cols=target_cols,
        seq_len=seq_len,
        lead=lead,
        country_list=country_list
    )

    preproc.load_data()
    preproc.encode(encode_cols='datetime', encode_method='time_cycle')
    preproc.encode(encode_cols='country', encode_method='label')
    preproc.split_train_val_test(train_size=0.6, val_size=0.2, test_size=0.2, time_col='datetime')
    preproc.normalize_splits(value_cols=value_cols, normalization_method='minmax')
    preproc.build_tft_parquets(
        group_cols=['country'],
        time_col='datetime'
    )
    print("‚úÖ Pr√©-processamento tft conclu√≠do.")

    return preproc



def tft_train_pipeline(
    problem_name: str,
    data_dir: str,
    feature_cols: List[str],
    target_cols: List[str],
    seq_len: int,
    lead: int,
    batch_size: int = 128,
    configs: Dict[str, Dict[str, Any]] = None,
):
    """
    Train one Temporal Fusion Transformer per preset with Lightning.

    The train/val datasets are obtained from ``TFTPreprocessor.load_tft_dataset``
    and turned into dataloaders; every preset in ``configs`` then gets its own
    model, callbacks, ``Trainer`` and checkpoint directory
    ``modelos/<problem_name>/TFT/<preset>``.

    Args:
        problem_name: Used only to build the checkpoint directory.
        data_dir: Directory handed to ``TFTPreprocessor``.
        feature_cols: Feature column names handed to ``TFTPreprocessor``.
        target_cols: Target column names. Only ``target_cols[0]`` is passed to
            ``load_tft_dataset``.
        seq_len: Input window length handed to ``TFTPreprocessor``.
        lead: Forecast horizon handed to ``TFTPreprocessor``.
        batch_size: Batch size of both dataloaders.
        configs: Mapping preset name -> hyper-parameters. ``epochs`` (default 50)
            and ``patience`` (default 5) are read here; the whole mapping plus a
            ``"dataset"`` entry is forwarded to ``build_tft_model``. ``None`` or
            an empty mapping trains nothing.

    Returns:
        Dict preset name -> model object as it is after ``trainer.fit`` (the
        best checkpoint is written to disk but not reloaded into the model).
    """

    # ----------------------------
    # 1. Load the preprocessed datasets through TFTPreprocessor
    # ----------------------------
    dataset_source = TFTPreprocessor(
        data_dir=data_dir,
        model_name='tft_model',
        feature_cols=feature_cols,
        target_cols=target_cols,
        seq_len=seq_len,
        lead=lead,
        country_list=[],
    )

    # NOTE(review): the returned objects must expose ``to_dataloader`` —
    # presumably pytorch_forecasting TimeSeriesDataSet instances; confirm.
    primary_target = target_cols[0]
    training_set = dataset_source.load_tft_dataset('train', target_col=primary_target)
    validation_set = dataset_source.load_tft_dataset('val', target_col=primary_target)

    # ----------------------------
    # 2. Dataloaders
    # ----------------------------
    train_batches = training_set.to_dataloader(train=True, batch_size=batch_size, num_workers=0)
    val_batches = validation_set.to_dataloader(train=False, batch_size=batch_size, num_workers=0)

    print(f"üì¶ Dados TFT ‚Äî batches: train={len(train_batches)} | val={len(val_batches)}")

    # ----------------------------
    # 3. One training run per preset
    # ----------------------------
    trained = {}
    seed_everything(42)

    if torch.cuda.is_available():
        device_kind = "gpu"
    else:
        device_kind = "cpu"

    presets = configs or {}
    for preset_name, preset in presets.items():
        print(f"\nüöÄ Treinando TFT preset: {preset_name} [{device_kind}]")

        build_args = {**preset, "dataset": training_set}
        network = build_tft_model(params=build_args)

        checkpoint_dir = os.path.join("modelos", problem_name, "TFT", preset_name)
        os.makedirs(checkpoint_dir, exist_ok=True)

        # Fresh callback instances per preset so no state leaks between runs.
        stop_on_plateau = LGEarlyStopping(
            monitor="val_loss",
            patience=int(preset.get("patience", 5)),
            mode="min",
        )
        lr_logger = LGLearningRateMonitor(logging_interval="epoch")
        keep_best = LGModelCheckpoint(
            dirpath=checkpoint_dir,
            filename="best",
            monitor="val_loss",
            save_top_k=1,
            mode="min",
        )

        runner = Trainer(
            max_epochs=int(preset.get("epochs", 50)),
            accelerator=device_kind,
            devices=1,
            callbacks=[stop_on_plateau, lr_logger, keep_best],
            default_root_dir=checkpoint_dir,
            log_every_n_steps=10,
            logger=True,
        )

        runner.fit(network, train_batches, val_batches)
        print(f"‚úÖ {preset_name} conclu√≠do ‚Äî melhor checkpoint salvo em {checkpoint_dir}")

        trained[preset_name] = network

    return trained

# Capítulo 4: Definição algorítmica dos problemas e resolução
Configuração dos problemas (dados, features, janelas) e presets de hiperparâmetros para Linear/MLP, LSTM e TFT.

In [None]:
import os, gc, tensorflow as tf

# Problem catalogue: one dict per forecasting task.
#   name      -> problem identifier (also used in the model save paths)
#   data_dir  -> directory given to every preprocessing/training pipeline
#   feats     -> passed as feature_cols
#   tgts      -> passed as target_cols
#   lag       -> input window (passed as `lag` to the linear pipeline and as
#                `seq_len` to the LSTM/TFT pipelines)
#   lead      -> forecast horizon
#   vals      -> passed as value_cols (columns to normalise)
#   countries -> passed as country_list
# NOTE(review): lag/lead are written as <days>*24, so they are presumably
# counted in hourly steps — confirm against the raw data resolution.
#
# `countries` is materialised with list(...) instead of handing over the live
# `dict_keys` view: the pipelines annotate the argument as List[str], and a
# view is neither indexable nor a stable snapshot if COUNTRY_DOMAINS changes.
problemas = [
    dict(
        name="Nivel_1_A",
        data_dir="data/N1A",
        feats=["country", "datetime", "quantity_MW"],
        tgts=["quantity_MW"],
        lag=7 * 24,
        lead=24,
        vals=["quantity_MW"],
        countries=["ES"],
    ),
    dict(
        name="Nivel_1_B",
        data_dir="data/N1B",
        feats=["country", "datetime", "quantity_MW"],
        tgts=["quantity_MW"],
        lag=15 * 24,
        lead=3 * 24,
        vals=["quantity_MW"],
        countries=list(COUNTRY_DOMAINS),
    ),
    dict(
        name="Nivel_1_C",
        data_dir="data/N1C",
        feats=["country", "datetime", "quantity_MW"],
        tgts=["quantity_MW"],
        lag=30 * 24,
        lead=7 * 24,
        vals=["quantity_MW"],
        countries=list(COUNTRY_DOMAINS),
    ),
    dict(
        name="Nivel_2_A",
        data_dir="data/N2A",
        feats=["country", "datetime", "quantity_MW", "price_EUR_MWh"],
        tgts=["quantity_MW", "price_EUR_MWh"],
        lag=7 * 24,
        lead=24,
        vals=["quantity_MW", "price_EUR_MWh"],
        countries=["ES"],
    ),
    dict(
        name="Nivel_2_B",
        data_dir="data/N2B",
        feats=["country", "datetime", "quantity_MW", "price_EUR_MWh"],
        tgts=["quantity_MW", "price_EUR_MWh"],
        lag=15 * 24,
        lead=3 * 24,
        vals=["quantity_MW", "price_EUR_MWh"],
        countries=list(COUNTRY_DOMAINS),
    ),
    dict(
        name="Nivel_2_C",
        data_dir="data/N2C",
        feats=["country", "datetime", "quantity_MW", "price_EUR_MWh"],
        tgts=["quantity_MW", "price_EUR_MWh"],
        lag=30 * 24,
        lead=7 * 24,
        vals=["quantity_MW", "price_EUR_MWh"],
        countries=list(COUNTRY_DOMAINS),
    ),
]

# Presets trained in the "Linear / MLP" stage (handed to linear_train_pipeline).
# NOTE(review): "mlp_Medium" and "mlp_Deep" carry the same names and values as
# the entries in configs_mlp, so both stages train them — confirm whether the
# duplication is intended.
configs_linear = dict(
    linear_Simple=dict(
        linear=True,
        units=[],
        dropout=0.0,
        lr=1e-3,
        l2=0.0,
        layer_norm=False,
    ),
    mlp_Medium=dict(
        linear=False,
        units=[128, 64],
        dropout=0.1,
        lr=1e-3,
        l2=1e-6,
        act="relu",
        layer_norm=False,
    ),
    mlp_Deep=dict(
        linear=False,
        units=[256, 128, 64],
        dropout=0.2,
        lr=1e-3,
        l2=1e-6,
        act="relu",
        layer_norm=True,
    ),
)

# Presets trained in the "MLP" stage (also handed to linear_train_pipeline),
# ordered from the smallest to the largest hidden stack.
configs_mlp = dict(
    mlp_Simple=dict(
        linear=False,
        units=[64],
        dropout=0.05,
        lr=1e-3,
        l2=1e-6,
        act="relu",
        layer_norm=False,
    ),
    mlp_Medium=dict(
        linear=False,
        units=[128, 64],
        dropout=0.1,
        lr=1e-3,
        l2=1e-6,
        act="relu",
        layer_norm=False,
    ),
    mlp_Deep=dict(
        linear=False,
        units=[256, 128, 64],
        dropout=0.2,
        lr=1e-3,
        l2=1e-6,
        act="relu",
        layer_norm=True,
    ),
)

# Presets handed to lstm_train_pipeline; each one is forwarded as `params`
# to build_lstm_model.
configs_lstm = dict(
    lstm_Simple=dict(
        lstm_units=[64],
        dense_units=[64],
        dropout=0.05,
        rec_dropout=0.0,
        act="tanh",
        lr=1e-3,
        l2=1e-6,
        layer_norm=False,
    ),
    lstm_Medium=dict(
        lstm_units=[128, 64],
        dense_units=[128, 64],
        dropout=0.15,
        rec_dropout=0.05,
        act="tanh",
        lr=1e-3,
        l2=1e-6,
        layer_norm=True,
    ),
)

# TFT presets consumed by tft_train_pipeline.
# `epochs` and `patience` are read by the training pipeline itself (Trainer
# max_epochs and early-stopping patience); the full preset is also forwarded
# to build_tft_model, which uses hidden_size, dropout, lstm_layers, num_heads
# and lr.
config_tft = dict(
    tft_Simple=dict(
        hidden_size=64,
        dropout=0.1,
        lstm_layers=1,
        num_heads=4,
        lr=1e-3,
        epochs=50,
        patience=5,
    ),
    tft_Medium=dict(
        hidden_size=128,
        dropout=0.15,
        lstm_layers=2,
        num_heads=8,
        lr=1e-3,
        epochs=75,
        patience=8,
    ),
    tft_Deep=dict(
        hidden_size=256,
        dropout=0.2,
        lstm_layers=2,
        num_heads=8,
        lr=1e-3,
        epochs=100,
        patience=10,
    ),
)


# Capítulo 5: Pré-processamento de dados

In [None]:
import concurrent.futures
import tensorflow as tf
import gc


# Module-level dict created empty before the preprocessing runs.
# NOTE(review): nothing in the visible code reads or writes it — presumably
# meant to collect per-problem results; confirm it is still needed.
preprocess_collector = {}
def run_preprocessing(cfg):
    """Run the linear/MLP, LSTM and TFT preprocessing pipelines for one problem.

    Args:
        cfg: Problem description with the keys ``name``, ``data_dir``,
            ``feats``, ``tgts``, ``lag``, ``lead``, ``vals`` and ``countries``.

    Returns:
        ``(name, "OK")`` when all three pipelines finish, otherwise
        ``(name, "ERRO: <message>")``. Any ``Exception`` raised by a stage is
        reported this way instead of propagating, and the remaining stages
        are skipped.
    """
    name = cfg["name"]

    # Each stage is a (banner, thunk) pair. The thunk defers both the pipeline
    # lookup and the cfg reads until the stage runs inside the try below.
    stages = (
        (
            "üß† Pr√©-processando dados do modelo linear/MLP",
            lambda: linear_preproccess_pipeline(
                data_dir=cfg["data_dir"],
                feature_cols=cfg["feats"],
                target_cols=cfg["tgts"],
                lag=cfg["lag"],
                lead=cfg["lead"],
                value_cols=cfg["vals"],
                country_list=cfg["countries"],
            ),
        ),
        (
            "üß† Pr√©-processando dados do modelo LSTM",
            lambda: lstm_preproccess_pipeline(
                data_dir=cfg["data_dir"],
                feature_cols=cfg["feats"],
                target_cols=cfg["tgts"],
                seq_len=cfg["lag"],
                lead=cfg["lead"],
                value_cols=cfg["vals"],
                country_list=cfg["countries"],
            ),
        ),
        (
            "üß† Pr√©-processando dados do modelo TFT",
            lambda: tft_preproccess_pipeline(
                data_dir=cfg["data_dir"],
                feature_cols=cfg["feats"],
                target_cols=cfg["tgts"],
                seq_len=cfg["lag"],
                lead=cfg["lead"],
                value_cols=cfg["vals"],
                country_list=cfg["countries"],
            ),
        ),
    )

    try:
        print(f"\nüöÄ Executando Preprocessamento do problema {name} ...")

        for banner, run_stage in stages:
            print(banner)
            preprocessor = run_stage()

            # Drop the preprocessor and reclaim memory before the next stage.
            del preprocessor
            tf.keras.backend.clear_session()
            gc.collect()

        print(f"‚úÖ Finalizado {name} - mem√≥ria liberada\n{'-'*60}")
        return (name, "OK")

    except Exception as exc:
        print(f"‚ùå Erro em {name}: {exc}")
        return (name, f"ERRO: {exc}")


# ================================
# Parallel execution
# ================================
# One worker per problem, capped at 8 (tune to the available cores / VRAM).
# Clamped to at least 1 because ThreadPoolExecutor rejects max_workers <= 0,
# which would otherwise crash this cell when `problemas` is empty.
MAX_WORKERS = max(1, min(8, len(problemas)))

with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    futures = [executor.submit(run_preprocessing, cfg) for cfg in problemas]

    # Report each problem as soon as it finishes; run_preprocessing turns
    # failures into an "ERRO: ..." status instead of raising.
    for future in concurrent.futures.as_completed(futures):
        name, status = future.result()
        print(f"üß© Resultado {name}: {status}")


# Capítulo 6: Treinamento dos modelos
Este capítulo executa, por problema: Linear/MLP (configs_linear), MLP (configs_mlp), LSTM (configs_lstm) e TFT (config_tft), liberando memória entre execuções.

In [18]:
import gc

import tensorflow as tf
import torch


def _release_training_memory():
    """Free what the previous training stage left behind in both frameworks."""
    tf.keras.backend.clear_session()
    gc.collect()
    # The TFT stage trains with PyTorch; clearing the Keras session does not
    # hand back PyTorch's cached CUDA blocks, so release them explicitly.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


# Sequential training loop: Linear/MLP -> MLP -> LSTM -> TFT.
# A failing stage is reported and skipped so the remaining stages and problems
# still run; memory is released after every stage whether it succeeded or not.
# NOTE(review): configs_linear and configs_mlp both contain "mlp_Medium" and
# "mlp_Deep", so those presets are trained twice per problem — confirm intent.
for cfg in problemas:
    name = cfg["name"]
    print(f"\nüöÄ Iniciando treinamento do problema {name} ...")

    # (label used in the error message, thunk that runs the stage). Thunks keep
    # the pipeline lookup and the cfg reads inside the try block below.
    stages = (
        (
            "Linear",
            lambda: linear_train_pipeline(
                problem_name=name,
                data_dir=cfg["data_dir"],
                batch_size=256,
                configs=configs_linear,
            ),
        ),
        (
            "MLP (configs_mlp)",
            lambda: linear_train_pipeline(
                problem_name=name,
                data_dir=cfg["data_dir"],
                batch_size=256,
                configs=configs_mlp,
            ),
        ),
        (
            "LSTM",
            lambda: lstm_train_pipeline(
                problem_name=name,
                data_dir=cfg["data_dir"],
                seq_len=cfg.get("lag") or cfg.get("seq_len"),
                batch_size=256,
                configs=configs_lstm,
            ),
        ),
        (
            "TFT",
            lambda: tft_train_pipeline(
                problem_name=name,
                data_dir=cfg["data_dir"],
                feature_cols=cfg.get("feats") or cfg.get("feature_cols"),
                target_cols=cfg.get("tgts") or cfg.get("target_cols"),
                seq_len=cfg.get("lag"),
                lead=cfg.get("lead"),
                batch_size=256,
                configs=config_tft,
            ),
        ),
    )

    for stage_label, train_stage in stages:
        try:
            trained_models = train_stage()
            # Only the artifacts written by the pipeline are kept; drop the
            # in-memory models right away.
            del trained_models
        except Exception as e:
            print(f"‚ùå Erro ao treinar {stage_label} para {name}: {e}")
        finally:
            _release_training_memory()

    print(f"‚úÖ Problema {name} conclu√≠do ‚Äî mem√≥ria limpa\n{'-'*60}")

KeyboardInterrupt: 