In [2]:
# ============================ Celda 01 — Bootstrap y Control de Ejecución ============================
# Esta celda NO crea carpetas ni archivos y NO se conecta a MT5.
# Objetivo:
#   1) Iniciar contexto de ejecución con RUN_ID único (YYYYMMDD_HHMMSS).
#   2) Fijar explícitamente la zona horaria objetivo IANA: America/Guayaquil (GYE).
#      - Imprimir SIEMPRE hora en UTC y en GYE (para trazabilidad) y además la TZ local detectada.
#   3) Verificar espacio en disco mínimo (umbral configurable; por defecto 3 GiB).
#   4) Detectar e imprimir versión de Python, S.O., usuario (si aplica).
#   5) Definir flags de control de ejecuci√≥n (solo inicializaci√≥n):
#         - FORCE_REDOWNLOAD_BULK_M5 = False
#         - FORCE_REWRITE_DAY        = False
#         - TICKS_RECENT_DAYS        = 30
#         - DATA_VERSION             = "v1"
#         - WRITE_MODE               = "append"
#   6) Preparar variables simples para celdas siguientes (en memoria).
#
# Formato de salida (impresiones obligatorias, en este orden):
#   1) RUN_ID
#   2) Hora GYE (ISO 8601), hora UTC (ISO 8601), TZ objetivo (IANA) y TZ local detectada
#   3) Versi√≥n de Python y S.O. (y usuario si aplica)
#   4) Espacio en disco libre (GiB) y umbral requerido
#   5) Valores iniciales de los flags
#   6) Mensaje final:
#        - "‚úÖ Bootstrap listo: OK para continuar con Celda 02"  (si pasa todo)
#        - Excepci√≥n con mensaje claro si NO hay espacio suficiente o si no se puede fijar la TZ IANA
# =====================================================================================================

from datetime import datetime, timezone
from pathlib import Path
import shutil
import platform
import getpass
import sys

# -------------------------------- 0) Target time zone --------------------------------
# Load the IANA zone 'America/Guayaquil' through the standard-library zoneinfo module.
# On Windows or other environments without an IANA database, the 'tzdata' package
# must be installed (pip install tzdata).
TIMEZONE_IANA: str = "America/Guayaquil"
try:
    from zoneinfo import ZoneInfo  # Python 3.9+
    GYE_TZ = ZoneInfo(TIMEZONE_IANA)
except Exception as exc:
    # Abort the bootstrap with an actionable message; `from exc` keeps the
    # original failure attached as the explicit cause in the traceback.
    raise RuntimeError(
        f"No se pudo fijar la zona horaria IANA '{TIMEZONE_IANA}'. "
        f"Detalle: {exc}. Sugerencia: instala el paquete 'tzdata' e int√©ntalo de nuevo."
    ) from exc

# ----------------------------- 1) Unique RUN_ID -----------------------------
# The clock is read exactly once. RUN_ID and the UTC / GYE / local timestamps
# are all views of that single instant, so they can never disagree by a second.
_now_utc = datetime.now(timezone.utc)
_local_dt = _now_utc.astimezone()  # same instant expressed in the system-local zone

# RUN_ID keeps its original meaning: local wall-clock time as YYYYMMDD_HHMMSS.
RUN_ID: str = _local_dt.strftime("%Y%m%d_%H%M%S")

# ------------------ 2) Times and time zones (UTC, GYE, local) ---------
UTC_TIME_ISO: str = _now_utc.isoformat()

_now_gye = _now_utc.astimezone(GYE_TZ)
GYE_TIME_ISO: str = _now_gye.isoformat()

# tzname() may return None; fall back to a fixed placeholder label in that case.
LOCAL_TZ_NAME: str = _local_dt.tzname() or "LOCAL_TZ"
LOCAL_TIME_ISO: str = _local_dt.isoformat()

# ------------------------- 3) Python, OS and user -----------------------
# First whitespace-separated token of sys.version is the bare version number.
PYTHON_VERSION: str = sys.version.split(maxsplit=1)[0]

OS_NAME: str
OS_RELEASE: str
OS_VERSION: str
OS_NAME, OS_RELEASE, OS_VERSION = (
    platform.system(),
    platform.release(),
    platform.version(),
)

USER_NAME: str
try:
    USER_NAME = getpass.getuser()
except Exception:
    # Best effort only: any lookup failure is reported as "N/A".
    USER_NAME = "N/A"

# --------------- 4) Control flags (initialisation only) ----------------
FORCE_REDOWNLOAD_BULK_M5: bool = False
FORCE_REWRITE_DAY: bool = False
TICKS_RECENT_DAYS: int = 30
DATA_VERSION: str = "v1"
WRITE_MODE: str = "append"   # possible future value: "overwrite_part"

# ---------- 5) Disk space probe (nothing is written) -------
# The current working directory selects which disk is measured.
DISK_CHECK_PATH = Path.cwd()
_disk_usage = shutil.disk_usage(DISK_CHECK_PATH)
_total_b, _used_b, _free_b = _disk_usage.total, _disk_usage.used, _disk_usage.free

def _to_gib(bytes_val: int) -> float:
    """Convert a byte count to GiB (2**30 bytes), rounded to two decimals."""
    gib = bytes_val / 2**30
    return round(gib, 2)

FREE_GIB: float = _to_gib(_free_b)
DISK_MIN_GIB_REQUIRED: float = 3.00  # configurable threshold

# --------------------------------- REPORT ---------------------------------
# The whole report is assembled first and emitted in one write; the line
# order is the one required by the cell header.
_RULE_HEAVY = "================================================================================"
_RULE_LIGHT = "--------------------------------------------------------------------------------"

_report_lines = [
    _RULE_HEAVY,
    f"RUN_ID: {RUN_ID}",
    _RULE_LIGHT,
    f"TZ objetivo (IANA): {TIMEZONE_IANA}",
    f"Hora GYE (ISO 8601): {GYE_TIME_ISO}",
    f"Hora UTC (ISO 8601): {UTC_TIME_ISO}",
    f"TZ local detectada : {LOCAL_TZ_NAME}",
    f"Hora local (ISO)   : {LOCAL_TIME_ISO}",
    _RULE_LIGHT,
    f"Python: {PYTHON_VERSION} | S.O.: {OS_NAME} {OS_RELEASE}",
    f"S.O. versi√≥n detallada: {OS_VERSION}",
    f"Usuario: {USER_NAME}",
    _RULE_LIGHT,
    f"Espacio libre en disco: {FREE_GIB:.2f} GiB | Umbral requerido: {DISK_MIN_GIB_REQUIRED:.2f} GiB",
    f"Directorio base de verificaci√≥n de disco: {DISK_CHECK_PATH}",
    _RULE_LIGHT,
    "FLAGS de control iniciales:",
    f"  - FORCE_REDOWNLOAD_BULK_M5 = {FORCE_REDOWNLOAD_BULK_M5}",
    f"  - FORCE_REWRITE_DAY        = {FORCE_REWRITE_DAY}",
    f"  - TICKS_RECENT_DAYS        = {TICKS_RECENT_DAYS}",
    f"  - DATA_VERSION             = {DATA_VERSION}",
    f"  - WRITE_MODE               = {WRITE_MODE}",
    _RULE_HEAVY,
]
print("\n".join(_report_lines))

# ---------------------------------- VALIDATION ----------------------------------
# Runs after the report so the measured figures are visible even when it fails.
_has_enough_disk = not (FREE_GIB < DISK_MIN_GIB_REQUIRED)
if not _has_enough_disk:
    raise RuntimeError(
        f"Espacio insuficiente en disco: {FREE_GIB:.2f} GiB libres < {DISK_MIN_GIB_REQUIRED:.2f} GiB requeridos. "
        f"Libera espacio o ajusta el umbral antes de continuar con la Celda 02."
    )

# Names left in memory for the following cells (nothing is persisted yet):
#   RUN_ID, TIMEZONE_IANA, GYE_TZ, GYE_TIME_ISO, UTC_TIME_ISO, LOCAL_TZ_NAME, LOCAL_TIME_ISO,
#   PYTHON_VERSION, OS_NAME, OS_RELEASE, OS_VERSION, USER_NAME,
#   FORCE_REDOWNLOAD_BULK_M5, FORCE_REWRITE_DAY, TICKS_RECENT_DAYS,
#   DATA_VERSION, WRITE_MODE, DISK_CHECK_PATH, FREE_GIB, DISK_MIN_GIB_REQUIRED

print("‚úÖ Bootstrap listo: OK para continuar con Celda 02")
# =====================================================================================================


RUN_ID: 20251202_232253
--------------------------------------------------------------------------------
TZ objetivo (IANA): America/Guayaquil
Hora GYE (ISO 8601): 2025-12-02T23:22:53.165378-05:00
Hora UTC (ISO 8601): 2025-12-03T04:22:53.165378+00:00
TZ local detectada : Hora est. Pac√≠fico, Sudam√©rica
Hora local (ISO)   : 2025-12-02T23:22:53.165378-05:00
--------------------------------------------------------------------------------
Python: 3.11.9 | S.O.: Windows 10
S.O. versi√≥n detallada: 10.0.26200
Usuario: PC
--------------------------------------------------------------------------------
Espacio libre en disco: 257.97 GiB | Umbral requerido: 3.00 GiB
Directorio base de verificaci√≥n de disco: c:\Quant\MT5_Data_Extraction
--------------------------------------------------------------------------------
FLAGS de control iniciales:
  - FORCE_REDOWNLOAD_BULK_M5 = False
  - FORCE_REWRITE_DAY        = False
  - TICKS_RECENT_DAYS        = 30
  - DATA_VERSION             = v1
  - WRITE_

In [3]:
# ============================ Celda 02 — Configuración y Rutas base ============================
# Esta celda crea/valida las rutas base del pipeline y persiste un "snapshot" de configuración.
# Reglas:
#   - NO se conecta a MT5.
#   - NO descarga datos.
#   - Crea directorios de forma idempotente (no falla si ya existen).
#   - Escribe metadata/config_snapshot.json con rutas, flags, parámetros globales y QA/GOLD.
#   - Imprime TZ objetivo, configuración Parquet, árbol de rutas y la ubicación del snapshot.
# ------------------------------------------------------------------------------------------------

from pathlib import Path
from datetime import datetime, timezone
import os
import json
import platform
import sys
import getpass

# --- Unified path contract (single source of truth) ---
# Walk from the working directory up through its parents; the first directory
# holding shared/contracts/path_contract.py has that folder put at the front of
# sys.path so the import below can resolve it.
import sys as _sys
from pathlib import Path as _Path

_cwd = _Path.cwd().resolve()
for _p in (_cwd, *_cwd.parents):
    _contract = _p / "shared" / "contracts" / "path_contract.py"
    if not _contract.exists():
        continue
    _contract_dir = str(_contract.parent)
    if _contract_dir not in _sys.path:
        _sys.path.insert(0, _contract_dir)
    break
import path_contract

# -------------------------- 0) Bootstrap dependencies (guards) --------------------------
# Cell 01 (Bootstrap) is expected to have defined these names. When one is
# missing, a fallback is supplied so this cell can still run on its own.
# Every fallback mirrors the Cell 01 default; in particular both FORCE_* flags
# fall back to False, so skipping the bootstrap never forces a re-download or
# a rewrite by accident.
if 'RUN_ID' not in globals():
    RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")
    print(f"‚ö†Ô∏è  Aviso: RUN_ID no estaba definido. Se gener√≥ temporalmente: {RUN_ID}")

# Also replaces a value that exists but is not a non-empty string.
if 'TIMEZONE_IANA' not in globals() or not isinstance(TIMEZONE_IANA, str) or not TIMEZONE_IANA:
    TIMEZONE_IANA = "America/Guayaquil"
    print(f"‚ö†Ô∏è  Aviso: TIMEZONE_IANA no estaba definido. Se fij√≥ a: {TIMEZONE_IANA}")

if 'FORCE_REDOWNLOAD_BULK_M5' not in globals():
    FORCE_REDOWNLOAD_BULK_M5 = False
if 'FORCE_REWRITE_DAY' not in globals():
    FORCE_REWRITE_DAY = False
if 'TICKS_RECENT_DAYS' not in globals():
    TICKS_RECENT_DAYS = 30
if 'DATA_VERSION' not in globals():
    DATA_VERSION = "v1"
if 'WRITE_MODE' not in globals():
    WRITE_MODE = "append"

# -------------------------- 1) General dataset parameters --------------------------
TIMEFRAME_LABEL: str = "M5"            # 5-minute intraday timeframe label
PERSISTENCE_TIMEZONE: str = "UTC"      # data is always persisted in UTC
PARQUET_COMPRESSION: str = "zstd"      # Parquet compression codec
PARQUET_WRITE_STATISTICS: bool = True  # write column statistics into Parquet files

# -------------------------- 2) PROJECT_ROOT & DATA_ROOT (via path_contract) --------------------------
# Resolution priority for DATA_ROOT:
#   1) the MT5_DE_DATA_ROOT environment variable, when set and non-empty;
#   2) otherwise path_contract.data_root() for the detected project root.
PROJECT_ROOT = path_contract.detect_project_root()
_env_data = os.environ.get("MT5_DE_DATA_ROOT")
DATA_ROOT = (
    Path(_env_data).expanduser().resolve()
    if _env_data
    else path_contract.data_root(PROJECT_ROOT)
)

# -------------------------- 3) Pipeline directory layout --------------------------
# Every location below is derived from DATA_ROOT, the single root of the layout.

# Bulk layer
BULK_DATA_DIR = DATA_ROOT / "bulk_data"
M5_RAW_DIR = BULK_DATA_DIR / "m5_raw"              # raw historical M5 (partitioned symbol/year/month)
TICKS_RECENT_DIR = BULK_DATA_DIR / "ticks_recent"  # recent ticks used for QA (30-90 days)

# Historical layer
HISTORICAL_DATA_DIR = DATA_ROOT / "historical_data"
M5_CLEAN_DIR = HISTORICAL_DATA_DIR / "m5_clean"    # gold layer: normalised M5 (UTC, fixed schema)

# Processed layer
PROCESSED_DATA_DIR = DATA_ROOT / "processed_data"
M5_WINDOWS_DIR = PROCESSED_DATA_DIR / "m5_windows"  # ready-made windows (last_30d/90d/180d)

# Support directories
METADATA_DIR = DATA_ROOT / "metadata"
LOGS_DIR = DATA_ROOT / "logs"
REPORTS_DIR = DATA_ROOT / "reports"
BACKUPS_DIR = DATA_ROOT / "backups"

# Key metadata files. Only the paths are defined here; according to the original
# notes, every file except the config snapshot is written by later cells.
CONFIG_SNAPSHOT_PATH = METADATA_DIR / "config_snapshot.json"
SCHEMA_M5_PATH = METADATA_DIR / "schema_m5.json"
SYMBOLS_BROKER_PATH = METADATA_DIR / "symbols_broker.parquet"
SERVER_TIME_INFO_PATH = METADATA_DIR / "server_time_info.json"
DATASET_CATALOG_PATH = METADATA_DIR / "dataset_catalog.parquet"
QA_M5_BULK_PATH = METADATA_DIR / "qa_m5_bulk.parquet"
RUN_LOG_JSONL_PATH = METADATA_DIR / "run_log.jsonl"      # updated during ingestion/QA
MANIFEST_PATH = METADATA_DIR / "manifest.json"           # generated at close-out
CHECKSUMS_JSONL_PATH = METADATA_DIR / "checksums.jsonl"  # generated at close-out

# -------------------------- 4) Idempotent directory creation --------------------------
# Parents are listed before their children; mkdir(parents=True, exist_ok=True)
# makes re-running this cell harmless when the directories already exist.
dirs_to_create = [
    DATA_ROOT,
    # bulk layer
    BULK_DATA_DIR, M5_RAW_DIR, TICKS_RECENT_DIR,
    # historical layer
    HISTORICAL_DATA_DIR, M5_CLEAN_DIR,
    # processed layer
    PROCESSED_DATA_DIR, M5_WINDOWS_DIR,
    # support directories
    METADATA_DIR, LOGS_DIR, REPORTS_DIR, BACKUPS_DIR,
]
for d in dirs_to_create:
    d.mkdir(exist_ok=True, parents=True)

# -------------------------- 5) QA & GOLD parameters --------------------------
# According to the original notes these values drive the M5 operational QA
# (Cell 10) and the filters towards the GOLD layer (Cell 12C).
# Any of them can be overridden by defining the same name in the notebook
# BEFORE running this cell, for example:
#   QA_OK_MIN_OK_RATIO = 0.85
#   QA_GOLD_KEEP_STATUSES = ["OK", "WARN"]

def _cfg_float(name: str, default: float) -> float:
    """Return the already-defined global *name* as a float, or *default* as a float."""
    return float(globals().get(name, default))


# Per-symbol operational QA thresholds.
QA_OK_MIN_OK_RATIO = _cfg_float("QA_OK_MIN_OK_RATIO", 0.90)            # >= 90% OK days
QA_OK_MAX_FAIL_RATIO = _cfg_float("QA_OK_MAX_FAIL_RATIO", 0.05)        # <= 5% FAIL days
QA_OK_MIN_REJILLA_MEAN = _cfg_float("QA_OK_MIN_REJILLA_MEAN", 80.0)    # mean grid coverage >= 80%

QA_BAD_MIN_FAIL_RATIO = _cfg_float("QA_BAD_MIN_FAIL_RATIO", 0.20)      # >= 20% FAIL -> BAD
QA_BAD_MAX_REJILLA_MEAN = _cfg_float("QA_BAD_MAX_REJILLA_MEAN", 60.0)  # mean grid coverage < 60% -> BAD
QA_BAD_MAX_EMPTY_RATIO = _cfg_float("QA_BAD_MAX_EMPTY_RATIO", 0.20)    # > 20% EMPTY -> BAD

# Global M5 coverage sanity check: minimum acceptable mean grid coverage (%).
MIN_MEAN_REJILLA_HARD = _cfg_float("MIN_MEAN_REJILLA_HARD", 5.0)

# The default for the mean bar count is 20% of the theoretical M5 grid
# (288 bars per day); it can be overridden like the values above.
EXPECTED_BARS_M5 = 288
MIN_MEAN_NBARS_HARD = _cfg_float("MIN_MEAN_NBARS_HARD", EXPECTED_BARS_M5 * 0.20)

# Day statuses accepted into GOLD when nothing overrides them beforehand.
#   OK   = fully healthy day.
#   WARN = structurally healthy day that carries warnings (24h grid, small gaps, ...).
QA_GOLD_KEEP_STATUSES = list(globals().get("QA_GOLD_KEEP_STATUSES", ["OK", "WARN"]))

# Per the original notes: used only for metrics/reporting in 12C, not as a veto on days.
MIN_REJILLA_FOR_GOLD = _cfg_float("MIN_REJILLA_FOR_GOLD", 80.0)

# -------------------------- 6) Snapshot de configuraci√≥n --------------------------
def _safe_get_user():
    """Return the current user name, or "N/A" when getpass cannot determine it."""
    user = "N/A"
    try:
        user = getpass.getuser()
    except Exception:
        # Best effort: the placeholder assigned above is kept on any failure.
        pass
    return user

# Each top-level section of the snapshot is built separately and assembled at
# the end; section order and key order are what ends up in the JSON file.
_snapshot_run = {
    "RUN_ID": RUN_ID,
    "created_ts_utc": datetime.now(timezone.utc).isoformat(),
}

_snapshot_environment = {
    "python_version": sys.version.split()[0],
    "os_name": platform.system(),
    "os_release": platform.release(),
    "os_version": platform.version(),
    "user": _safe_get_user(),
}

_snapshot_dataset = {
    "TIMEFRAME_LABEL": TIMEFRAME_LABEL,
    "PERSISTENCE_TIMEZONE": PERSISTENCE_TIMEZONE,
    "TIMEZONE_IANA": TIMEZONE_IANA,  # target time zone is recorded in the snapshot
    "DATA_VERSION": DATA_VERSION,
    "WRITE_MODE": WRITE_MODE,
    "TICKS_RECENT_DAYS": TICKS_RECENT_DAYS,
    "FORCE_REDOWNLOAD_BULK_M5": bool(FORCE_REDOWNLOAD_BULK_M5),
    "FORCE_REWRITE_DAY": bool(FORCE_REWRITE_DAY),
    "parquet": {
        "compression": PARQUET_COMPRESSION,
        "write_statistics": PARQUET_WRITE_STATISTICS,
    },
}

# Path objects keyed by the name they are stored under; they are converted to
# strings in one place when the snapshot is assembled.
_snapshot_path_objects = {
    # directories
    "DATA_ROOT": DATA_ROOT,
    "BULK_DATA_DIR": BULK_DATA_DIR,
    "M5_RAW_DIR": M5_RAW_DIR,
    "TICKS_RECENT_DIR": TICKS_RECENT_DIR,
    "HISTORICAL_DATA_DIR": HISTORICAL_DATA_DIR,
    "M5_CLEAN_DIR": M5_CLEAN_DIR,
    "PROCESSED_DATA_DIR": PROCESSED_DATA_DIR,
    "M5_WINDOWS_DIR": M5_WINDOWS_DIR,
    "METADATA_DIR": METADATA_DIR,
    "LOGS_DIR": LOGS_DIR,
    "REPORTS_DIR": REPORTS_DIR,
    "BACKUPS_DIR": BACKUPS_DIR,
    # key files
    "CONFIG_SNAPSHOT_PATH": CONFIG_SNAPSHOT_PATH,
    "SCHEMA_M5_PATH": SCHEMA_M5_PATH,
    "SYMBOLS_BROKER_PATH": SYMBOLS_BROKER_PATH,
    "SERVER_TIME_INFO_PATH": SERVER_TIME_INFO_PATH,
    "DATASET_CATALOG_PATH": DATASET_CATALOG_PATH,
    "QA_M5_BULK_PATH": QA_M5_BULK_PATH,
    "RUN_LOG_JSONL_PATH": RUN_LOG_JSONL_PATH,
    "MANIFEST_PATH": MANIFEST_PATH,
    "CHECKSUMS_JSONL_PATH": CHECKSUMS_JSONL_PATH,
}

_snapshot_qa = {
    "operativa_m5": {
        "QA_OK_MIN_OK_RATIO": QA_OK_MIN_OK_RATIO,
        "QA_OK_MAX_FAIL_RATIO": QA_OK_MAX_FAIL_RATIO,
        "QA_OK_MIN_REJILLA_MEAN": QA_OK_MIN_REJILLA_MEAN,
        "QA_BAD_MIN_FAIL_RATIO": QA_BAD_MIN_FAIL_RATIO,
        "QA_BAD_MAX_REJILLA_MEAN": QA_BAD_MAX_REJILLA_MEAN,
        "QA_BAD_MAX_EMPTY_RATIO": QA_BAD_MAX_EMPTY_RATIO,
    },
    "sanity_m5": {
        "MIN_MEAN_REJILLA_HARD": MIN_MEAN_REJILLA_HARD,
        "EXPECTED_BARS_M5": EXPECTED_BARS_M5,
        "MIN_MEAN_NBARS_HARD": MIN_MEAN_NBARS_HARD,
    },
    "gold_m5": {
        "QA_GOLD_KEEP_STATUSES": QA_GOLD_KEEP_STATUSES,
        "MIN_REJILLA_FOR_GOLD": MIN_REJILLA_FOR_GOLD,
    },
}

config_snapshot = {
    "run": _snapshot_run,
    "environment": _snapshot_environment,
    "dataset": _snapshot_dataset,
    "paths": {name: str(location) for name, location in _snapshot_path_objects.items()},
    "qa": _snapshot_qa,
}

# Persist the snapshot; the file is overwritten on every run.
CONFIG_SNAPSHOT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(CONFIG_SNAPSHOT_PATH, "w", encoding="utf-8") as _snapshot_file:
    json.dump(config_snapshot, _snapshot_file, ensure_ascii=False, indent=2)

# -------------------------- 7) Mandatory printouts --------------------------
_RULE_HEAVY = "================================================================================"
_RULE_LIGHT = "--------------------------------------------------------------------------------"

print(
    _RULE_HEAVY,
    f"TZ objetivo (IANA): {TIMEZONE_IANA}",
    f"DATA_ROOT         : {DATA_ROOT}",
    _RULE_LIGHT,
    "Rutas creadas/validadas (idempotente):",
    sep="\n",
)
# One line per directory handled by the creation step.
for d in dirs_to_create:
    print(f"OK  {d}")
print(
    _RULE_LIGHT,
    f"Parquet -> compression: {PARQUET_COMPRESSION} | write_statistics: {PARQUET_WRITE_STATISTICS}",
    _RULE_LIGHT,
    f"Snapshot de configuraci√≥n escrito en:\n  {CONFIG_SNAPSHOT_PATH}",
    _RULE_HEAVY,
    "‚úÖ Config & Rutas listas: OK para continuar con Celda 03",
    sep="\n",
)

# =====================================================================================================
# Variables disponibles para celdas siguientes:
#   DATA_ROOT, BULK_DATA_DIR, M5_RAW_DIR, TICKS_RECENT_DIR,
#   HISTORICAL_DATA_DIR, M5_CLEAN_DIR, PROCESSED_DATA_DIR, M5_WINDOWS_DIR,
#   METADATA_DIR, LOGS_DIR, REPORTS_DIR, BACKUPS_DIR,
#   CONFIG_SNAPSHOT_PATH, SCHEMA_M5_PATH, SYMBOLS_BROKER_PATH, SERVER_TIME_INFO_PATH,
#   DATASET_CATALOG_PATH, QA_M5_BULK_PATH, RUN_LOG_JSONL_PATH, MANIFEST_PATH, CHECKSUMS_JSONL_PATH,
#   TIMEFRAME_LABEL, PERSISTENCE_TIMEZONE, TIMEZONE_IANA, PARQUET_COMPRESSION, PARQUET_WRITE_STATISTICS,
#   RUN_ID, DATA_VERSION, WRITE_MODE, TICKS_RECENT_DAYS, FORCE_REDOWNLOAD_BULK_M5, FORCE_REWRITE_DAY,
#   # QA & GOLD:
#   QA_OK_MIN_OK_RATIO, QA_OK_MAX_FAIL_RATIO, QA_OK_MIN_REJILLA_MEAN,
#   QA_BAD_MIN_FAIL_RATIO, QA_BAD_MAX_REJILLA_MEAN, QA_BAD_MAX_EMPTY_RATIO,
#   MIN_MEAN_REJILLA_HARD, EXPECTED_BARS_M5, MIN_MEAN_NBARS_HARD,
#   QA_GOLD_KEEP_STATUSES, MIN_REJILLA_FOR_GOLD
# =====================================================================================================


TZ objetivo (IANA): America/Guayaquil
DATA_ROOT         : C:\Quant\MT5_Data_Extraction\data
--------------------------------------------------------------------------------
Rutas creadas/validadas (idempotente):
OK  C:\Quant\MT5_Data_Extraction\data
OK  C:\Quant\MT5_Data_Extraction\data\bulk_data
OK  C:\Quant\MT5_Data_Extraction\data\bulk_data\m5_raw
OK  C:\Quant\MT5_Data_Extraction\data\bulk_data\ticks_recent
OK  C:\Quant\MT5_Data_Extraction\data\historical_data
OK  C:\Quant\MT5_Data_Extraction\data\historical_data\m5_clean
OK  C:\Quant\MT5_Data_Extraction\data\processed_data
OK  C:\Quant\MT5_Data_Extraction\data\processed_data\m5_windows
OK  C:\Quant\MT5_Data_Extraction\data\metadata
OK  C:\Quant\MT5_Data_Extraction\data\logs
OK  C:\Quant\MT5_Data_Extraction\data\reports
OK  C:\Quant\MT5_Data_Extraction\data\backups
--------------------------------------------------------------------------------
Parquet -> compression: zstd | write_statistics: True
-----------------------------------

In [4]:
# =============================== Celda 03 — Logging y auditoría ===============================
# Objetivo:
#   - Configurar un logger rotativo (archivo + consola) en logs/mt5_de_5m_<RUN_ID>.log.
#   - Formato: [YYYY-MM-DD HH:MM:SS] [RUN_ID] [LEVEL] [CELDA] Mensaje
#   - Banner inicial con: Python/OS, versiones de Polars/PyArrow (sin pandas), RUN_ID,
#     y TZ objetivo IANA (America/Guayaquil) + horas de referencia (UTC y GYE).
#   - Proveer helper get_logger(celda) y log_msg(celda, level, message).
# Reglas:
#   - NO conecta a MT5.
#   - NO usa pandas ni “fallbacks” a pandas.
#   - No bloquea si alguna librería no está instalada (reporta “no disponible”).
#   - Imprime al final: ruta exacta del log, primera línea efectivamente escrita y mensaje final.
# =================================================================================================

import logging
from logging.handlers import RotatingFileHandler
from datetime import datetime, timezone
from pathlib import Path
import sys, platform, importlib
from typing import Optional

# ----------------------- 0) Dependencies from earlier cells (guards) -----------------------
# Cell 02 is expected to define LOGS_DIR; Cell 01 is expected to define RUN_ID
# and TIMEZONE_IANA. Missing names get a fallback so the session keeps going.
if "RUN_ID" not in globals():
    RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")
    print(f"‚ö†Ô∏è  Aviso: RUN_ID no estaba definido. Se gener√≥ temporalmente: {RUN_ID}")

if "LOGS_DIR" not in globals():
    LOGS_DIR = Path.cwd().joinpath("data", "logs").resolve()
    LOGS_DIR.mkdir(parents=True, exist_ok=True)
    print(f"‚ö†Ô∏è  Aviso: LOGS_DIR no estaba definido. Usando: {LOGS_DIR}")

# Anything other than a non-empty string (including an undefined name) is replaced.
_tz_candidate = globals().get("TIMEZONE_IANA")
if not (isinstance(_tz_candidate, str) and _tz_candidate):
    TIMEZONE_IANA = "America/Guayaquil"
    print(f"‚ö†Ô∏è  Aviso: TIMEZONE_IANA no estaba definido. Se fij√≥ a: {TIMEZONE_IANA}")

# Try to load the target zone. On any failure GYE_TZ is None and the log
# formatter falls back to the system-local time zone.
try:
    from zoneinfo import ZoneInfo  # Python 3.9+
    GYE_TZ = ZoneInfo(TIMEZONE_IANA)
except Exception:
    GYE_TZ = None

# ------------------------------ 1) Target log file ------------------------------
LOG_FILE_PATH = LOGS_DIR / f"mt5_de_5m_{RUN_ID}.log"

# ------------------------------ 2) Record layout ------------------------------
# 'run_id' and 'celda' are not standard LogRecord fields; they have to be
# supplied per record (via `extra`) or injected by a filter.
LOG_DATEFMT = "%Y-%m-%d %H:%M:%S"
LOG_FORMAT  = "[%(asctime)s] [%(run_id)s] [%(levelname)s] [%(celda)s] %(message)s"

class _RunContextFilter(logging.Filter):
    """Guarantee that every record carries the 'run_id' and 'celda' fields.

    Records that already have a field keep their value. A missing 'run_id'
    is filled from the module-level RUN_ID and a missing 'celda' becomes "NA".
    """

    def filter(self, record: logging.LogRecord) -> bool:
        """Fill in the missing context fields and always accept the record."""
        run_id_missing = not hasattr(record, "run_id")
        if run_id_missing:
            # RUN_ID is read only when needed.
            record.run_id = RUN_ID
        record.celda = getattr(record, "celda", "NA")
        return True

class _TzFormatter(logging.Formatter):
    """Formatter that renders record timestamps in a chosen time zone.

    With ``tz`` unset (or falsy) the timestamps are naive system-local times.
    """

    def __init__(self, fmt: Optional[str] = None, datefmt: Optional[str] = None, tz=None):
        super().__init__(fmt=fmt, datefmt=datefmt)
        self._tz = tz  # tzinfo instance (e.g. ZoneInfo) or None

    def formatTime(self, record, datefmt=None):
        """Format ``record.created`` using *datefmt*, or ISO 8601 (seconds) without it."""
        # `or None` maps any falsy tz onto the naive local-time conversion.
        moment = datetime.fromtimestamp(record.created, tz=self._tz or None)
        if not datefmt:
            return moment.isoformat(timespec="seconds")
        return moment.strftime(datefmt)

# ------------------------------ 3) Logger construction ------------------------------
LOGGER_NAME = "mt5_de_5m"
LOGGER = logging.getLogger(LOGGER_NAME)
LOGGER.setLevel(logging.DEBUG)
LOGGER.propagate = False  # do not hand records on to ancestor loggers

# Re-running the cell must not stack handlers. Each old handler is detached AND
# closed: removeHandler() alone leaves the previous RotatingFileHandler holding
# its log file open.
for h in list(LOGGER.handlers):
    LOGGER.removeHandler(h)
    h.close()

file_handler = RotatingFileHandler(
    LOG_FILE_PATH,
    mode="a",
    maxBytes=10 * 1024 * 1024,  # ~10 MiB per file before rotation
    backupCount=5,
    encoding="utf-8",
    delay=False
)
console_handler = logging.StreamHandler(stream=sys.stdout)

# Both handlers share one formatter; timestamps use GYE_TZ when it is available.
formatter_gye = _TzFormatter(LOG_FORMAT, datefmt=LOG_DATEFMT, tz=GYE_TZ)
file_handler.setFormatter(formatter_gye)
console_handler.setFormatter(formatter_gye)

# Handler levels (adjustable). The logger itself is at DEBUG, so these decide
# what is actually emitted.
file_handler.setLevel(logging.INFO)
console_handler.setLevel(logging.INFO)

# Filter that injects run_id/celda so LOG_FORMAT can always be rendered.
ctx_filter = _RunContextFilter()
file_handler.addFilter(ctx_filter)
console_handler.addFilter(ctx_filter)

# Register the handlers.
LOGGER.addHandler(file_handler)
LOGGER.addHandler(console_handler)

# ------------------------------ 4) Utilitarios de logging ------------------------------
def get_logger(celda: str = "NA") -> logging.LoggerAdapter:
    """
    Build a LoggerAdapter around LOGGER that carries 'run_id' and 'celda'.

    Args:
        celda: Cell label placed in the 'celda' extra field (default "NA").

    Returns:
        A LoggerAdapter whose extra dict holds the current RUN_ID and *celda*.

    Example:
        log = get_logger("03-Logging")
        log.info("Mensaje")
    """
    context = {"run_id": RUN_ID, "celda": celda}
    return logging.LoggerAdapter(LOGGER, context)

def log_msg(celda: str, level: str, message: str):
    """
    Log *message* for the given cell at a level named by a string.

    Args:
        celda: Cell label passed on to get_logger().
        level: One of DEBUG, INFO, WARNING, ERROR, CRITICAL (case-insensitive).
            An empty or None value is treated as INFO.
        message: Text to log.

    An unrecognised level name is logged at INFO with a note about the
    unknown level appended to the message.
    """
    adapter = get_logger(celda)
    level_name = (level or "INFO").upper()
    emitters = {
        "DEBUG": adapter.debug,
        "INFO": adapter.info,
        "WARNING": adapter.warning,
        "ERROR": adapter.error,
        "CRITICAL": adapter.critical,
    }
    emit = emitters.get(level_name)
    if emit is not None:
        emit(message)
        return
    adapter.info(message + f"  (nivel desconocido: {level})")

# ------------------------------ 5) Detecci√≥n de versiones necesarias ------------------------------
def _detect_version(modname: str) -> str:
    """Return the ``__version__`` of module *modname* without ever raising.

    Returns "desconocida" when the module imports but has no ``__version__``,
    and "no disponible" when importing it (or reading the attribute) fails.
    """
    version = "no disponible"
    try:
        version = getattr(importlib.import_module(modname), "__version__", "desconocida")
    except Exception:
        # Keep the "no disponible" placeholder; a missing library must not block the cell.
        pass
    return version

PYTHON_VERSION = sys.version.split()[0]
OS_NAME, OS_RELEASE, OS_VERSION = platform.system(), platform.release(), platform.version()

VERS_POLARS, VERS_PYARROW = _detect_version("polars"), _detect_version("pyarrow")

# ------------------------------ 6) Initial banner ------------------------------
_log = get_logger("03-Logging")

# Emit one record right away so the log file has content from the start.
_log.info("Logger inicializado (demostraci√≥n de formato).")

# Reference times: the GYE value is the same instant as the UTC value,
# converted only when the target zone could be loaded.
_now_utc = datetime.now(timezone.utc)
utc_iso  = _now_utc.isoformat(timespec="seconds")
if GYE_TZ:
    gye_iso = _now_utc.astimezone(GYE_TZ).isoformat(timespec="seconds")
else:
    gye_iso = "(GYE no disponible; usando TZ local)"

_banner_lines = (
    "=== INICIO DE SESI√ìN DE LOG ===",
    f"RUN_ID: {RUN_ID} | TIMEFRAME_LABEL: {globals().get('TIMEFRAME_LABEL', 'M5')} | DATA_VERSION: {globals().get('DATA_VERSION', 'v1')}",
    f"Python: {PYTHON_VERSION} | SO: {OS_NAME} {OS_RELEASE} ({OS_VERSION})",
    f"polars: {VERS_POLARS} | pyarrow: {VERS_PYARROW}",
    f"TZ objetivo (IANA): {TIMEZONE_IANA} | Hora GYE: {gye_iso} | Hora UTC: {utc_iso}",
    "========================================================================",
)
for _banner_line in _banner_lines:
    _log.info(_banner_line)

# Flush every handler explicitly before the file is read back.
for h in LOGGER.handlers:
    try:
        h.flush()
    except Exception:
        # Best effort: a failing flush must not abort the cell.
        pass

# ------------------------------ 7) Mandatory printouts ------------------------------
print(
    "================================================================================",
    f"Archivo de log activo: {LOG_FILE_PATH}",
    sep="\n",
)

# Read back the first line of the log file; a read failure is shown in place
# of the line instead of raising.
first_line = ""
try:
    with open(LOG_FILE_PATH, "r", encoding="utf-8") as _log_file:
        first_line = _log_file.readline().rstrip("\n")
except Exception as _read_error:
    first_line = f"ERROR al leer primera l√≠nea del log: {_read_error}"

print(
    "Primera l√≠nea efectivamente escrita (formato de ejemplo):",
    first_line or "(vac√≠o)",
    "--------------------------------------------------------------------------------",
    "‚úÖ Logging y auditor√≠a listos: OK para continuar con Celda 04",
    sep="\n",
)

# ========================= Utilitarios disponibles a partir de aqu√≠ =========================
#   - LOGGER (logger base)
#   - LOG_FILE_PATH (path del archivo de log)
#   - get_logger(celda: str) -> LoggerAdapter
#   - log_msg(celda: str, level: str, message: str)
#   - Formato garantizado: [YYYY-MM-DD HH:MM:SS] [RUN_ID] [LEVEL] [CELDA] Mensaje
# ===========================================================================================


[2025-12-02 23:22:53] [20251202_232253] [INFO] [03-Logging] Logger inicializado (demostraci√≥n de formato).
[2025-12-02 23:22:53] [20251202_232253] [INFO] [03-Logging] === INICIO DE SESI√ìN DE LOG ===
[2025-12-02 23:22:53] [20251202_232253] [INFO] [03-Logging] RUN_ID: 20251202_232253 | TIMEFRAME_LABEL: M5 | DATA_VERSION: v1
[2025-12-02 23:22:53] [20251202_232253] [INFO] [03-Logging] Python: 3.11.9 | SO: Windows 10 (10.0.26200)
[2025-12-02 23:22:53] [20251202_232253] [INFO] [03-Logging] polars: 1.35.1 | pyarrow: 22.0.0
[2025-12-02 23:22:53] [20251202_232253] [INFO] [03-Logging] TZ objetivo (IANA): America/Guayaquil | Hora GYE: 2025-12-02T23:22:53-05:00 | Hora UTC: 2025-12-03T04:22:53+00:00
Archivo de log activo: C:\Quant\MT5_Data_Extraction\data\logs\mt5_de_5m_20251202_232253.log
Primera l√≠nea efectivamente escrita (formato de ejemplo):
[2025-12-02 23:22:53] [20251202_232253] [INFO] [03-Logging] Logger inicializado (demostraci√≥n de formato).
-------------------------------------------

In [5]:
# ========================== Celda 04 — Inventario de símbolos del bróker (snapshot) ==========================
# Objetivos (UNA sola función):
#   1) Conectar a MT5 con reintentos exponenciales e imprimir diagnóstico.
#   2) Capturar info básica de cuenta/terminal y estimar hora del servidor (aprox) vs. UTC.
#   3) Descargar la lista COMPLETA de símbolos (sin filtrar) y normalizar campos clave.
#   4) (Opcional) Añadir tick actual por símbolo (bid/ask/last/time) si está disponible.
#   5) Escribir Parquet: metadata/symbols_broker.parquet (Polars-only).
#   6) Escribir JSON: metadata/server_time_info.json.
#   7) Imprimir: ruta escrita, conteos, esquema/dtypes, min/max y percentiles (point/contract_size), % nulos.
#
# Validaciones y prints:
#   - Inicio/fin (UTC y GYE) + TZ objetivo (IANA).
#   - Estado de conexi√≥n MT5 y detalles de cuenta/terminal (si existen).
#   - Ruta exacta del Parquet y JSON escritos.
#   - #s√≠mbolos total, visibles y proxy "tradeable" por trade_mode>=0.
#   - Esquema/dtypes Polars.
#   - min/max y p50/p90/p95/p99 de point y contract_size.
#   - % de nulos en columnas cr√≠ticas (point, contract_size, digits, trade_mode).
#   - Top-5 s√≠mbolos (symbol, path).
#
# Reglas:
#   - NO usa pandas ni loops de transformaci√≥n (solo loops de extracci√≥n MT5).
#   - Conexi√≥n a MT5 se cierra siempre.
#   - Si no hay s√≠mbolos -> error claro. Si MT5 no est√° disponible -> error claro.
# ============================================================================================================

from __future__ import annotations

import time
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Any, Optional

import polars as pl
import sys, platform

# ------------------------------------ 0) Prior dependencies / guards ------------------------------------
# Every name below is only assigned when it is not already present in globals(),
# so values set by earlier notebook cells win and these act as stand-alone fallbacks.
if 'METADATA_DIR' not in globals():
    METADATA_DIR = (Path.cwd() / "data" / "metadata").resolve()
    METADATA_DIR.mkdir(parents=True, exist_ok=True)

if 'SYMBOLS_BROKER_PATH' not in globals():
    SYMBOLS_BROKER_PATH = METADATA_DIR / "symbols_broker.parquet"

if 'SERVER_TIME_INFO_PATH' not in globals():
    SERVER_TIME_INFO_PATH = METADATA_DIR / "server_time_info.json"

if 'TIMEFRAME_LABEL' not in globals():
    TIMEFRAME_LABEL = "M5"
if 'DATA_VERSION' not in globals():
    DATA_VERSION = "v1"
if 'RUN_ID' not in globals():
    # Fallback run id built from the local (naive) clock, formatted YYYYMMDD_HHMMSS.
    RUN_ID = datetime.now().strftime("%Y%m%d_%H%M%S")

if 'PARQUET_COMPRESSION' not in globals():
    PARQUET_COMPRESSION = "zstd"
if 'PARQUET_WRITE_STATISTICS' not in globals():
    PARQUET_WRITE_STATISTICS = True

# Also replaces TIMEZONE_IANA when it exists but is not a non-empty string.
if 'TIMEZONE_IANA' not in globals() or not isinstance(TIMEZONE_IANA, str) or not TIMEZONE_IANA:
    TIMEZONE_IANA = "America/Guayaquil"

# Target time zone (GYE), used only for printed timestamps
try:
    from zoneinfo import ZoneInfo
    GYE_TZ = ZoneInfo(TIMEZONE_IANA)
except Exception:
    GYE_TZ = None  # non-blocking: _now_iso_gye() prints a placeholder instead of a time

def _now_iso_utc() -> str:
    """Return the current UTC time as an ISO 8601 string truncated to seconds."""
    utc_now = datetime.now(tz=timezone.utc)
    return utc_now.isoformat(timespec="seconds")

def _now_iso_gye() -> str:
    """Return the current time in the target zone (GYE_TZ) as ISO 8601, seconds precision.

    When GYE_TZ could not be loaded (it is None), a fixed placeholder message is
    returned instead of a timestamp.
    """
    if GYE_TZ is not None:
        return datetime.now(tz=GYE_TZ).isoformat(timespec="seconds")
    return "(GYE no disponible; instala 'tzdata')"

# ----------------------------------------- 1) MT5 connection -----------------------------------------
# Start-of-cell banner: target time zone plus the current time in GYE and UTC.
print("================================================================================")
print(f"Inicio Celda 04 ‚Äî Inventario de s√≠mbolos | TZ objetivo: {TIMEZONE_IANA}")
print(f"Hora GYE: {_now_iso_gye()} | Hora UTC: {_now_iso_utc()}")
print("--------------------------------------------------------------------------------")

# Any failure while importing MetaTrader5 is re-raised as ImportError with an
# actionable message; the original exception is kept as the cause.
try:
    import MetaTrader5 as mt5
except Exception as e:
    raise ImportError(
        "No se pudo importar 'MetaTrader5'. Instala el paquete y aseg√∫rate de tener el terminal MT5 operativo."
    ) from e

def connect_mt5_with_retries(max_tries: int = 5, base_sleep: float = 1.0) -> bool:
    """Call ``mt5.initialize()`` up to ``max_tries`` times with exponential back-off.

    Args:
        max_tries: Maximum number of initialization attempts.
        base_sleep: Delay in seconds before the first retry; it doubles on each
            subsequent retry (base_sleep * 2**i).

    Returns:
        True as soon as one attempt succeeds, False when every attempt failed.
    """
    for i in range(max_tries):
        if mt5.initialize():
            return True
        # No retry follows the final attempt, so do not announce one or wait for it.
        if i + 1 >= max_tries:
            break
        sleep_s = base_sleep * (2 ** i)
        print(f"‚ö†Ô∏è  Conexi√≥n MT5 fallida #{i+1}/{max_tries}. Reintentando en {sleep_s:.1f}s...")
        time.sleep(sleep_s)
    return False

# Connect (5 attempts, exponential back-off) and abort with the MT5 error when it fails.
connected = connect_mt5_with_retries(max_tries=5, base_sleep=1.0)
if not connected:
    last_error = mt5.last_error()
    raise RuntimeError(
        f"No fue posible conectar a MT5 tras varios intentos. last_error={last_error}. "
        f"Verifica terminal, cuenta, permisos y red."
    )
print("‚úÖ Conexi√≥n MT5 establecida.")

# ----------------------------------- 2) Account info and server time -----------------------------------
account_info = mt5.account_info()
terminal_info = mt5.terminal_info()

# Each field is None when account_info is missing or lacks the attribute.
server_name  = getattr(account_info, "server", None) if account_info else None
company_name = getattr(account_info, "company", None) if account_info else None
login_id     = getattr(account_info, "login", None) if account_info else None

# Server time estimate: probe EURUSD, or fall back to the first symbol returned by
# symbols_get() when EURUSD is missing or not visible.
probe_symbol = "EURUSD"
use_alt_symbol = False
si = mt5.symbol_info(probe_symbol)
if si is None or (not getattr(si, "visible", True)):
    all_syms_for_pick = mt5.symbols_get()
    if all_syms_for_pick and len(all_syms_for_pick) > 0:
        probe_symbol = getattr(all_syms_for_pick[0], "name", "EURUSD")
        use_alt_symbol = True
    else:
        mt5.shutdown()
        raise RuntimeError("La lista de s√≠mbolos est√° vac√≠a; no es posible continuar con el inventario.")

# Take the probe symbol's last tick time; if there is no tick (or its time is falsy),
# fall back to the time of the most recent M1 bar. The value is interpreted as a UTC epoch.
server_time_epoch = None
server_time_iso_utc = None
server_time_source = None
try:
    tick = mt5.symbol_info_tick(probe_symbol)
    if tick is not None and getattr(tick, "time", None):
        server_time_epoch = int(getattr(tick, "time"))
        server_time_iso_utc = datetime.fromtimestamp(server_time_epoch, tz=timezone.utc).isoformat(timespec="seconds")
        server_time_source = "last_tick_time"
    else:
        rates = mt5.copy_rates_from_pos(probe_symbol, mt5.TIMEFRAME_M1, 0, 1)
        if rates is not None and len(rates) > 0 and "time" in rates.dtype.names:
            server_time_epoch = int(rates["time"][0])
            server_time_iso_utc = datetime.fromtimestamp(server_time_epoch, tz=timezone.utc).isoformat(timespec="seconds")
            server_time_source = "last_rate_time"
except Exception as e:
    # Best-effort: the estimate stays None and the cell continues.
    print(f"‚ö†Ô∏è  No se pudo estimar hora de servidor: {e}")

# Offset = probe time minus local "now" in UTC, in whole seconds.
# NOTE(review): presumably a stale last tick/bar (closed market) is folded into this
# offset together with the server's TZ shift -- confirm before using it as a TZ offset.
offset_seconds_estimate = None
if server_time_epoch is not None:
    offset_seconds_estimate = int(server_time_epoch - int(datetime.now(timezone.utc).timestamp()))

server_time_info = {
    "collected_at_utc": _now_iso_utc(),
    "server_name": server_name,
    "company_name": company_name,
    "login_id": login_id,
    "terminal_info": {
        "name": getattr(terminal_info, "name", None),
        "community": getattr(terminal_info, "community", None),
        "path": getattr(terminal_info, "path", None),
        "data_path": getattr(terminal_info, "data_path", None),
        "build": getattr(terminal_info, "build", None),
    },
    "time_probe": {
        "symbol_used": probe_symbol,
        "used_alternative_symbol": use_alt_symbol,
        "server_time_source": server_time_source,
        "server_time_epoch": server_time_epoch,
        "server_time_iso_utc": server_time_iso_utc,
        "local_now_iso_utc": _now_iso_utc(),
        "offset_seconds_estimate": offset_seconds_estimate,
        "note": "Estimaci√≥n aproximada (no TZ oficial del servidor).",
    },
}

# Persist the auxiliary JSON (a write failure is reported but does not stop the cell)
try:
    SERVER_TIME_INFO_PATH.parent.mkdir(parents=True, exist_ok=True)
    with open(SERVER_TIME_INFO_PATH, "w", encoding="utf-8") as f:
        json.dump(server_time_info, f, ensure_ascii=False, indent=2)
    print(f"üìù server_time_info.json escrito en: {SERVER_TIME_INFO_PATH}")
except Exception as e:
    print(f"‚ö†Ô∏è  No se pudo escribir server_time_info.json: {e}")

# ------------------------------------ 3) Download and normalization ------------------------------------
print("Descargando lista completa de s√≠mbolos del br√≥ker...")
syms = mt5.symbols_get()
# None and an empty result are reported separately; both close MT5 before raising.
if syms is None:
    err = mt5.last_error()
    mt5.shutdown()
    raise RuntimeError(f"symbols_get() devolvi√≥ None. last_error={err}. Verifica conexi√≥n/permisos.")

if len(syms) == 0:
    mt5.shutdown()
    raise RuntimeError("La lista de s√≠mbolos est√° vac√≠a. ¬øCuenta sin permisos o instrumentos ocultos?")

# (source attribute on the MT5 symbol object, output column name)
FIELDS = [
    ("name", "symbol"),
    ("path", "path"),
    ("visible", "visible"),
    ("select", "selected"),
    ("digits", "digits"),
    ("point", "point"),
    ("trade_mode", "trade_mode"),
    ("trade_contract_size", "contract_size"),
    ("trade_stops_level", "stops_level"),
    ("trade_freeze_level", "freeze_level"),
    ("spread", "spread"),
    ("spread_float", "spread_float"),
    ("margin_initial", "margin_initial"),
    ("margin_maintenance", "margin_maintenance"),
    ("swap_mode", "swap_mode"),
    ("swap_long", "swap_long"),
    ("swap_short", "swap_short"),
    ("currency_base", "currency_base"),
    ("currency_profit", "currency_profit"),
    ("currency_margin", "currency_margin"),
    ("description", "description"),
]

# One dict per symbol; an attribute missing on the MT5 object becomes None.
rows: List[Dict[str, Any]] = []
for s in syms:
    row = {dst: getattr(s, src, None) for src, dst in FIELDS}
    rows.append(row)

# (Optional) Current tick per symbol (best-effort). May be slow for large symbol lists.
GET_TICKS = True
if GET_TICKS:
    tick_failures = 0
    for i, r in enumerate(rows):
        # Seed every row with the same tick keys so all row dicts share one shape,
        # including symbols without a name or without a tick.
        r["bid"] = r["ask"] = r["last"] = None
        r["tick_time_utc"] = None

        sym = r.get("symbol")
        if sym:
            try:
                t = mt5.symbol_info_tick(sym)
                if t is not None:
                    for price_field in ("bid", "ask", "last"):
                        raw = getattr(t, price_field, None)
                        r[price_field] = float(raw) if raw is not None else None
                    te = getattr(t, "time", None)
                    if te is not None:
                        # The tick epoch is rendered as an ISO 8601 string in UTC.
                        r["tick_time_utc"] = datetime.fromtimestamp(
                            int(te), tz=timezone.utc
                        ).isoformat(timespec="seconds")
            except Exception:
                # Best-effort: drop any partially filled values and keep going,
                # but count the failure so it is reported below instead of vanishing.
                r["bid"] = r["ask"] = r["last"] = None
                r["tick_time_utc"] = None
                tick_failures += 1

        # Progress is reported for every 500th row, whether or not its tick succeeded.
        if (i + 1) % 500 == 0:
            print(f"  ‚Ä¢ Ticks consultados para {i+1} s√≠mbolos...")

    if tick_failures:
        print(f"  - Ticks con error (sin datos): {tick_failures}")

# ------------------------------ 4) Polars DataFrame with explicit dtypes ------------------------------
columns_order = [
    "symbol","path","visible","selected","digits","point","trade_mode","contract_size",
    "stops_level","freeze_level","spread","spread_float","margin_initial","margin_maintenance",
    "swap_mode","swap_long","swap_short","currency_base","currency_profit","currency_margin",
    "description","bid","ask","last","tick_time_utc"
]

# infer_schema_length=None scans every row, so a column whose leading rows are all
# None still gets its dtype from the rows that do carry a value.
df = pl.DataFrame(rows, strict=False, infer_schema_length=None)

# Add any missing column as an all-null column so the select below never fails.
missing_cols = [c for c in columns_order if c not in df.columns]
if missing_cols:
    df = df.with_columns([pl.lit(None).alias(c) for c in missing_cols])

# tick_time_utc -> Datetime with UTC time zone.
# Only parse when the column really holds strings with at least one value; a column
# with no data at all (ticks disabled or unavailable) has no string dtype to parse
# from, so it becomes a typed all-null Datetime column instead.
_utc_datetime = pl.Datetime(time_zone="UTC")
_tick_col_has_data = (
    df.schema["tick_time_utc"] == pl.Utf8
    and df.get_column("tick_time_utc").null_count() < df.height
)
if _tick_col_has_data:
    tick_time_expr = pl.col("tick_time_utc").str.strptime(_utc_datetime, strict=False, format=None)
else:
    tick_time_expr = pl.lit(None).cast(_utc_datetime)

# Explicit casts and run metadata
df = df.select(columns_order).with_columns([
    pl.col("symbol", "path", "currency_base", "currency_profit", "currency_margin", "description").cast(pl.Utf8),
    pl.col("visible", "selected", "spread_float").cast(pl.Boolean),
    pl.col("digits", "stops_level", "freeze_level", "trade_mode", "spread", "swap_mode").cast(pl.Int32),
    pl.col("point", "contract_size", "margin_initial", "margin_maintenance", "swap_long", "swap_short",
           "bid", "ask", "last").cast(pl.Float64),

    tick_time_expr.alias("tick_time_utc"),

    # snapshot_ts_utc -> Datetime with UTC time zone (parsed from the ISO string)
    pl.lit(_now_iso_utc()).str.strptime(_utc_datetime, strict=False, format=None).alias("snapshot_ts_utc"),

    pl.lit(TIMEFRAME_LABEL).alias("timeframe_label"),
    pl.lit(DATA_VERSION).alias("data_version"),
    pl.lit(RUN_ID).alias("run_id"),
    # Cast so these stay string columns even when the account info was unavailable (None).
    pl.lit(server_name).cast(pl.Utf8).alias("broker_server"),
    pl.lit(company_name).cast(pl.Utf8).alias("company"),
])

# Final column order (metadata first)
meta_cols = ["snapshot_ts_utc","broker_server","company","timeframe_label","data_version","run_id"]
df = df.select(meta_cols + [c for c in df.columns if c not in meta_cols])

# ------------------------------ 5) M√©tricas (percentiles, nulls, esquema) ------------------------------
def _pct_null(col: str) -> float:
    """Return the percentage (0-100) of null values of ``col`` in the global ``df``.

    A column that does not exist counts as 100 % null; an empty frame yields 0.0.
    """
    if col not in df.columns:
        return 100.0
    total_rows = df.height
    if total_rows == 0:
        return 0.0
    null_rows = df.get_column(col).null_count()
    return float(null_rows) * 100.0 / total_rows

def _percentiles_summary(col: str) -> Optional[pl.DataFrame]:
    """Summarize ``col`` of the global ``df`` as a one-row frame of min/percentiles/max.

    NaN and null values are dropped first. The result has the columns
    min, p50, p90, p95, p99, max and n (count of remaining values); quantiles
    use the "nearest" interpolation.

    Returns:
        The one-row summary, or None when the column is missing or has no
        valid values.
    """
    if col not in df.columns:
        return None
    sub = df.select(pl.col(col).drop_nans().drop_nulls())
    if sub.height == 0:
        return None
    return sub.select([
        pl.col(col).min().alias("min"),
        pl.col(col).quantile(0.50, "nearest").alias("p50"),
        pl.col(col).quantile(0.90, "nearest").alias("p90"),
        pl.col(col).quantile(0.95, "nearest").alias("p95"),
        pl.col(col).quantile(0.99, "nearest").alias("p99"),
        pl.col(col).max().alias("max"),
        # pl.len() replaces the deprecated pl.count() (same row-count semantics).
        pl.len().alias("n"),
    ])

# Headline counts: total rows, visible symbols (null counts as not visible) and the
# "tradeable" proxy trade_mode >= 0 (null counts as not tradeable).
total_symbols = df.height
visible_count = df.select(pl.col("visible").fill_null(False).sum()).item() if "visible" in df.columns else None
tradeable_proxy = df.select((pl.col("trade_mode").fill_null(-1) >= 0).sum()).item() if "trade_mode" in df.columns else None

point_stats = _percentiles_summary("point")
cs_stats    = _percentiles_summary("contract_size")

# ------------------------------ 6) Parquet write (Polars) ------------------------------
SYMBOLS_BROKER_PATH.parent.mkdir(parents=True, exist_ok=True)
df.write_parquet(
    SYMBOLS_BROKER_PATH,
    compression=PARQUET_COMPRESSION,
    statistics=bool(PARQUET_WRITE_STATISTICS)
)

# ------------------------------ 7) MT5 shutdown ------------------------------
# NOTE(review): this only runs when every statement above succeeded; an exception
# raised earlier in the cell (other than the explicit shutdown-then-raise paths)
# skips it -- confirm whether a try/finally around the cell is wanted.
try:
    mt5.shutdown()
except Exception:
    pass

# ------------------------------ 8) Mandatory prints ------------------------------
print("--------------------------------------------------------------------------------")
print(f"Inventario de s√≠mbolos escrito en: {SYMBOLS_BROKER_PATH}")
# The counts are emitted on one line; the optional parts are appended with end="".
print(f"Total de s√≠mbolos: {total_symbols}", end="")
if visible_count is not None:
    print(f" | visibles: {visible_count}", end="")
if tradeable_proxy is not None:
    print(f" | 'tradeable'(proxy trade_mode>=0): {tradeable_proxy}", end="")
print()

print("Esquema/dtypes (Polars):")
for c, t in df.schema.items():
    print(f"  - {c}: {t}")

# Null percentage of the columns treated as critical.
crit_cols = ["point","contract_size","digits","trade_mode"]
print("Nulos en columnas cr√≠ticas (%):")
for c in crit_cols:
    print(f"  - {c}: {_pct_null(c):.2f}%")

def _print_stats(label: str, stats_df: Optional[pl.DataFrame]):
    """Print a labelled min/percentile/max summary, one ``- key: value`` line per metric.

    ``stats_df`` is the one-row frame produced by ``_percentiles_summary``; when it
    is None a "no valid data" line is printed instead. Floats are rendered with
    10 decimals, everything else (including None) via ``str``.
    """
    print(f"{label}:")
    if stats_df is None:
        print("  (sin datos v√°lidos)")
        return

    def _render(value) -> str:
        return f"{value:.10f}" if isinstance(value, float) else str(value)

    metric_names = ("min", "p50", "p90", "p95", "p99", "max", "n")
    first_row = stats_df.row(0)
    for metric, value in zip(metric_names, first_row):
        print(f"  - {metric}: {_render(value)}")

# Distribution summaries for point and contract_size.
_print_stats("point (min/p50/p90/p95/p99/max, n)", point_stats)
_print_stats("contract_size (min/p50/p90/p95/p99/max, n)", cs_stats)

print("--------------------------------------------------------------------------------")
# First five rows of the inventory, in DataFrame order.
print("Top-5 s√≠mbolos (symbol, path):")
for r in df.select(["symbol","path"]).head(5).iter_rows():
    print(f"  - {r[0]} | {r[1]}")

# NOTE(review): printed unconditionally, even if the earlier JSON write reported a failure.
print(f"server_time_info.json escrito en: {SERVER_TIME_INFO_PATH}")
print("--------------------------------------------------------------------------------")
print(f"Hora GYE fin: {_now_iso_gye()} | Hora UTC fin: {_now_iso_utc()}")
print("‚úÖ Inventario de s√≠mbolos completado: OK para continuar con Celda 05")
# ============================================================================================================


Inicio Celda 04 ‚Äî Inventario de s√≠mbolos | TZ objetivo: America/Guayaquil
Hora GYE: 2025-12-02T23:22:53-05:00 | Hora UTC: 2025-12-03T04:22:53+00:00
--------------------------------------------------------------------------------
‚úÖ Conexi√≥n MT5 establecida.
üìù server_time_info.json escrito en: C:\Quant\MT5_Data_Extraction\data\metadata\server_time_info.json
Descargando lista completa de s√≠mbolos del br√≥ker...
--------------------------------------------------------------------------------
Inventario de s√≠mbolos escrito en: C:\Quant\MT5_Data_Extraction\data\metadata\symbols_broker.parquet
Total de s√≠mbolos: 131 | visibles: 131 | 'tradeable'(proxy trade_mode>=0): 131
Esquema/dtypes (Polars):
  - snapshot_ts_utc: Datetime(time_unit='us', time_zone='UTC')
  - broker_server: String
  - company: String
  - timeframe_label: String
  - data_version: String
  - run_id: String
  - symbol: String
  - path: String
  - visible: Boolean
  - selected: Boolean
  - digits: Int32
  - point: F

(Deprecated in version 0.20.5)
  pl.count().alias("n"),


In [6]:
# ====================== Celda 05 — Filtro de costes (spread + comisiones) [“3B”] ======================
# Propósito:
#   - Estimar el coste round-trip en bps por símbolo: cost_bps = spread_bps + commission_bps + slippage_bps.
#   - Filtrar el universo elegible según umbrales por clase de activo.
# Entradas:
#   - metadata/symbols_broker.parquet  (de Celda 04; debe incluir: symbol, path, bid, ask, contract_size, point, trade_mode, visible, description)
#   - metadata/fees/commissions.json   (opcional; comisiones por clase/símbolo; USD por round-trip o por lado)
#   - metadata/filters/cost_filter_config.json (opcional; umbrales por clase, slippage_bps, require_valid_tick)
# Salidas:
#   - metadata/filters/eligible_symbols_by_cost.parquet  (símbolos elegibles con columnas clave)
#   - metadata/filters/eligible_symbols_by_cost.txt      (lista de símbolos elegibles, uno por línea)
#   - metadata/filters/cost_filter_report.json           (reporte con métricas y conteos)
#   - metadata/costs_summary.parquet                     (resumen de costes por símbolo, 1 fila por símbolo)
#
# Impresiones obligatorias:
#   - Input path, parámetros (slippage_bps, require_valid_tick), thresholds por clase.
#   - p50/p90/p95/p99 para spread_bps, commission_bps y cost_bps (ignorando nulls).
#   - Σ total / elegibles / no elegibles; faltantes por métrica.
#   - Top-10 mayor coste y Top-10 elegibles con menor coste.
#   - Rutas exactas de salidas (parquet/txt/json) con número de filas.
#
# Reglas:
#   - Polars-only para cálculo/transformaciones (sin pandas).
#   - NO re-conecta a MT5; usa el snapshot de Celda 04 (bid/ask).
#   - Una celda = una función.
# Nota importante (costes actuales vs histórico):
#   - Este filtro 3B utiliza symbols_broker.parquet + commissions.json ACTUALES.
#   - En los backtests sobre 4 años de histórico, el coste aplicado es una aproximación
#     "con condiciones actuales" (NO reproduce las condiciones históricas exactas).
#   - Interpretación: PnL backtest ≈ PnL con spreads/comisiones de hoy.

# ======================================================================================================

from __future__ import annotations

import json
import re
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, Any, Optional

import polars as pl

# --------------------------- Logger (reutiliza Celda 03 si existe) ---------------------------
def _log(level: str, msg: str):
    """Forward ``msg`` to the notebook logger ``log_msg`` under the "05-Costs" tag.

    Does nothing when ``log_msg`` is not defined in globals(), and never lets a
    logger failure propagate to the caller.
    """
    logger_fn = globals().get("log_msg")
    if logger_fn is None:
        return
    try:
        logger_fn("05-Costs", level, msg)
    except Exception:
        # Deliberately silent: logging problems must not break the cell.
        pass

# --------------------------- Guards and base paths (aligned with earlier cells) ---------------------------
if 'METADATA_DIR' not in globals():
    METADATA_DIR = (Path.cwd() / "data" / "metadata").resolve()
    METADATA_DIR.mkdir(parents=True, exist_ok=True)

# Input/output locations; note SYMBOLS_BROKER_PATH is (re)assigned here unconditionally.
SYMBOLS_BROKER_PATH   = METADATA_DIR / "symbols_broker.parquet"
FILTERS_DIR           = METADATA_DIR / "filters"
FEES_DIR              = METADATA_DIR / "fees"
FILTERS_DIR.mkdir(parents=True, exist_ok=True)
FEES_DIR.mkdir(parents=True, exist_ok=True)

ELIGIBLE_PARQUET      = FILTERS_DIR / "eligible_symbols_by_cost.parquet"
ELIGIBLE_TXT          = FILTERS_DIR / "eligible_symbols_by_cost.txt"
REPORT_JSON           = FILTERS_DIR / "cost_filter_report.json"
COMMISSIONS_PATH      = FEES_DIR / "commissions.json"
COST_CFG_PATH         = FILTERS_DIR / "cost_filter_config.json"

# New: per-symbol cost summary, written directly under metadata/
COSTS_SUMMARY_PATH    = METADATA_DIR / "costs_summary.parquet"

# Reuse the run context of earlier cells when present, otherwise fall back to defaults.
RUN_ID                = globals().get("RUN_ID", datetime.now().strftime("%Y%m%d_%H%M%S"))
TIMEFRAME_LABEL       = globals().get("TIMEFRAME_LABEL", "M5")
DATA_VERSION          = globals().get("DATA_VERSION", "v1")

# --------------------------- Default config (can be overridden by cost_filter_config.json) ---------------------------
# threshold_bps: maximum cost in bps allowed per asset class.
# classify: token lists intended for the asset-class classification.
DEFAULT_CFG = {
    "slippage_bps": 0.0,
    "require_valid_tick": True,
    "threshold_bps": {
        "FX_MAJOR": 3.0,
        "FX_MINOR": 6.0,
        "METAL": 10.0,
        "INDEX": 8.0,
        "CRYPTO": 50.0,
        "ENERGY": 12.0,
        "EQUITY": 12.0,
        "OTHER": 12.0
    },
    "classify": {
        "fx_majors_suffix": ["USD","EUR","JPY","GBP","AUD","NZD","CAD","CHF"],
        "metals_prefix": ["XAU","XAG","XPT","XPD"],
        "indices_keywords": ["US500","US100","US30","GER40","FRA40","UK100","EU50","JP225",".CASH"],
        "crypto_prefix": ["BTC","ETH","SOL","BNB","ADA","XRP","DOGE","LTC","DOT","AVAX","UNI","LINK","XLM","ATOM","ETC","FIL","AAVE","ALGO","NEAR","SUI","ARB","OP","APT","MATIC","TON","PEPE","SHIB"],
        "energy_keywords": ["WTI","BRENT","UKOIL","USOIL","NGAS","NATGAS"],
        "crypto_path_markers": ["CRYPTO", "CRYPTOCURRENCY"]
    }
}

# Load user overrides (if present).
# deepcopy keeps DEFAULT_CFG pristine: a shallow copy would share the nested
# "threshold_bps"/"classify" dicts, so merging overrides would silently edit the defaults.
import copy

cfg = copy.deepcopy(DEFAULT_CFG)
if COST_CFG_PATH.exists():
    try:
        with open(COST_CFG_PATH, "r", encoding="utf-8") as f:
            user_cfg = json.load(f)
        # Merge into a scratch copy and only adopt it once the whole merge succeeded,
        # so a failure really leaves the defaults in place (as the warning states).
        merged_cfg = copy.deepcopy(DEFAULT_CFG)
        for k, v in user_cfg.items():
            if isinstance(v, dict) and k in merged_cfg:
                merged_cfg[k].update(v)   # nested section: merge key by key
            else:
                merged_cfg[k] = v         # scalar or unknown key: replace/add
        cfg = merged_cfg
        print(f"‚öôÔ∏è  cost_filter_config.json cargado: {COST_CFG_PATH}")
        _log("INFO", f"cost_filter_config.json cargado desde {COST_CFG_PATH}")
    except Exception as e:
        msg = f"No se pudo leer cost_filter_config.json: {e}. Se usan valores por defecto."
        print(f"‚ö†Ô∏è  {msg}")
        _log("WARNING", msg)

# Commissions (USD).
# Layout: a "default" entry, an optional "SYMBOL_OVERRIDES" mapping, and (as read by
# _commission_usd_rt) optional top-level entries keyed by asset class.
COMMISSIONS: Dict[str, Any] = {
    "default": {"unit": "per_lot", "round_trip": True, "value_usd": 0.0},
    "SYMBOL_OVERRIDES": {}
}
if COMMISSIONS_PATH.exists():
    try:
        with open(COMMISSIONS_PATH, "r", encoding="utf-8") as f:
            # Top-level keys from the file replace the built-in ones (no deep merge).
            COMMISSIONS.update(json.load(f))
        print(f"üí≤ commissions.json cargado: {COMMISSIONS_PATH}")
        _log("INFO", f"commissions.json cargado desde {COMMISSIONS_PATH}")
    except Exception as e:
        msg = f"No se pudo leer commissions.json: {e}. Se asume comisi√≥n 0.0 USD (round-trip)."
        print(f"‚ö†Ô∏è  {msg}")
        _log("WARNING", msg)
else:
    msg = "commissions.json no encontrado. Se asume comisi√≥n 0.0 USD (round-trip)."
    print(f"‚ö†Ô∏è  {msg}")
    _log("WARNING", msg)

# Effective parameters; thresholds always start from the defaults so every class has a value.
slippage_bps: float             = float(cfg.get("slippage_bps", 0.0))
require_valid_tick: bool        = bool(cfg.get("require_valid_tick", True))
thresholds_map: Dict[str,float] = {**DEFAULT_CFG["threshold_bps"], **cfg.get("threshold_bps", {})}

# --------------------------- Read the symbols snapshot ---------------------------
# A missing or empty snapshot is logged and then raised as RuntimeError.
if not SYMBOLS_BROKER_PATH.exists():
    msg = f"Falta entrada requerida: {SYMBOLS_BROKER_PATH}"
    _log("ERROR", msg)
    raise RuntimeError(msg)

sb = pl.read_parquet(SYMBOLS_BROKER_PATH)
if sb.height == 0:
    msg = "symbols_broker.parquet est√° vac√≠o; no hay s√≠mbolos para evaluar."
    _log("ERROR", msg)
    raise RuntimeError(msg)

_log("INFO", f"Snapshot de s√≠mbolos cargado: {SYMBOLS_BROKER_PATH} ({sb.height} filas)")

# Ensure the required columns exist (a missing one is created as all-null)
need_cols = [
    "symbol","path","visible","trade_mode","digits","point","contract_size",
    "currency_profit","description","spread","spread_float","bid","ask"
]
for c in need_cols:
    if c not in sb.columns:
        sb = sb.with_columns(pl.lit(None).alias(c))

# Upper-cased helper columns (_SYM, _PATH, _DESC) used for classification
sb = sb.with_columns([
    pl.col("symbol").cast(pl.Utf8).alias("symbol"),
    pl.col("path").cast(pl.Utf8).alias("path"),
    pl.col("description").cast(pl.Utf8).alias("description"),
    pl.col("symbol").cast(pl.Utf8).str.to_uppercase().alias("_SYM"),
    pl.col("path").cast(pl.Utf8).str.to_uppercase().alias("_PATH"),
    pl.col("description").cast(pl.Utf8).str.to_uppercase().alias("_DESC"),
])

# --------------------------- Asset classification (Polars expressions) ---------------------------
# All token lists come from cfg["classify"], so overrides in cost_filter_config.json
# take effect. With the default config the generated patterns match the same symbols
# as the previously hard-coded ones.
_classify_cfg = cfg["classify"]


def _token_pattern(tokens, anchored: bool = False) -> Optional[str]:
    """Build a regex alternation of literal, upper-cased tokens.

    Returns None for an empty list: an empty group such as ``^()`` would match
    every string and classify the whole universe into one class.
    """
    escaped = [re.escape(str(tok).upper()) for tok in (tokens or []) if str(tok)]
    if not escaped:
        return None
    body = "|".join(escaped)
    return f"^({body})" if anchored else f"({body})"


def _contains(col_name: str, pattern: Optional[str]) -> pl.Expr:
    """Regex-contains on ``col_name``; a missing pattern never matches."""
    if pattern is None:
        return pl.lit(False)
    return pl.col(col_name).str.contains(pattern)


maj = [str(tok).upper() for tok in _classify_cfg.get("fx_majors_suffix", [])]
met_pat = _token_pattern(_classify_cfg.get("metals_prefix"), anchored=True)
idx_pat = _token_pattern(_classify_cfg.get("indices_keywords"))
ene_pat = _token_pattern(_classify_cfg.get("energy_keywords"))
cry_pre_pat = _token_pattern(_classify_cfg.get("crypto_prefix"), anchored=True)
cry_path_pat = _token_pattern(_classify_cfg.get("crypto_path_markers"))

is_index  = _contains("_SYM", idx_pat) | _contains("_PATH", idx_pat)
is_metal  = _contains("_SYM", met_pat)
is_energy = _contains("_SYM", ene_pat) | _contains("_PATH", ene_pat)
is_crypto_path = _contains("_PATH", cry_path_pat) | _contains("_DESC", cry_path_pat)
is_crypto_pref = _contains("_SYM", cry_pre_pat)

# FX heuristic: 6-10 characters, last three characters are a major currency code;
# FX_MAJOR additionally requires the first three characters to be one as well.
sym_len   = pl.col("_SYM").str.len_chars()
base3     = pl.col("_SYM").str.slice(0, 3)
quote3    = pl.col("_SYM").str.slice(-3)
is_len_ok = (sym_len >= 6) & (sym_len <= 10)
is_fx     = is_len_ok & quote3.is_in(maj)
is_fx_maj = is_fx & base3.is_in(maj)
is_equity = pl.col("_PATH").str.contains("STOCK|SHARE|EQUITY") | pl.col("_DESC").str.contains("STOCK|SHARE|EQUITY")

# First matching branch wins, so the order below is the classification priority.
asset_class_expr = (
    pl.when(is_index).then(pl.lit("INDEX"))
      .when(is_metal).then(pl.lit("METAL"))
      .when(is_energy).then(pl.lit("ENERGY"))
      .when(is_crypto_path | is_crypto_pref).then(pl.lit("CRYPTO"))
      .when(is_fx).then(pl.when(is_fx_maj).then(pl.lit("FX_MAJOR")).otherwise(pl.lit("FX_MINOR")))
      .when(is_equity).then(pl.lit("EQUITY"))
      .otherwise(pl.lit("OTHER"))
)
sb = sb.with_columns(asset_class_expr.alias("asset_class"))

# --------------------------- Threshold per class (reference join) ---------------------------
# Small lookup frame: one row per asset class with its threshold in bps.
th_df = pl.DataFrame({
    "asset_class": list(thresholds_map.keys()),
    "threshold_bps": list(thresholds_map.values())
}).with_columns(pl.col("asset_class").cast(pl.Utf8))

# Left join: a class without an entry in thresholds_map gets a null threshold_bps.
sb = sb.join(th_df, on="asset_class", how="left")

# --------------------------- Comisi√≥n USD round-trip (override‚Üíclase‚Üídefault) ---------------------------
def _extract_comm_value(entry: Dict[str, Any]) -> float:
    """Return the round-trip commission in USD described by one commission entry.

    Args:
        entry: Mapping with ``value_usd`` and an optional ``round_trip`` flag
            (defaults to True). Anything that is not a dict yields 0.0.

    Returns:
        ``value_usd`` as-is when the entry is already round-trip, otherwise twice
        that value (per-side commission). A missing or null ``value_usd`` is 0.0.

    Raises:
        TypeError / ValueError: if ``value_usd`` is present but not numeric.
    """
    if not isinstance(entry, dict):
        return 0.0
    raw_value = entry.get("value_usd")
    # JSON null arrives as None; treat it like an absent value instead of crashing in float().
    val = 0.0 if raw_value is None else float(raw_value)
    rt  = bool(entry.get("round_trip", True))
    return val if rt else (2.0 * val)

# Per-symbol overrides, keyed by UPPER-CASED symbol so the lookup below (which
# upper-cases the symbol) also matches keys written in lower or mixed case.
_raw_sym_over = COMMISSIONS.get("SYMBOL_OVERRIDES", {}) or {}
sym_over = (
    {str(key).upper(): value for key, value in _raw_sym_over.items()}
    if isinstance(_raw_sym_over, dict) else {}
)

def _commission_usd_rt(symbol: Optional[str], asset_class: Optional[str]) -> float:
    """Resolve the round-trip commission (USD) for a symbol.

    Lookup order: symbol override -> asset-class entry -> "default" entry.

    Args:
        symbol: Symbol name; matched case-insensitively against the overrides.
        asset_class: Asset class label used as a top-level key in COMMISSIONS.

    Returns:
        Round-trip commission in USD as computed by ``_extract_comm_value``.
    """
    s = (symbol or "").upper()
    if s in sym_over:
        return _extract_comm_value(sym_over[s])
    cls = COMMISSIONS.get(asset_class or "", None)
    if cls is not None:
        return _extract_comm_value(cls)
    return _extract_comm_value(COMMISSIONS.get("default", {}))

# Per-row commission lookup. return_dtype is stated explicitly so the column is
# always Float64 instead of relying on Polars inferring it from the first results.
sb = sb.with_columns(
    pl.struct(["symbol","asset_class"])
      .map_elements(
          lambda r: _commission_usd_rt(r["symbol"], r["asset_class"]),
          return_dtype=pl.Float64,
      )
      .alias("commission_usd_rt")
)

# --------------------------- Metrics in bps (Polars expressions) ---------------------------
# A tick is valid when bid and ask are both present and strictly positive.
valid_tick = pl.col("bid").is_not_null() & pl.col("ask").is_not_null() & (pl.col("bid") > 0) & (pl.col("ask") > 0)
mid       = pl.when(valid_tick).then((pl.col("bid") + pl.col("ask")) * 0.5).otherwise(None)

# Spread relative to mid, in basis points; null unless ask is strictly above bid.
spread_bps = (
    pl.when(valid_tick & (pl.col("ask") > pl.col("bid")))
      .then(((pl.col("ask") - pl.col("bid")) / mid) * 10000.0)
      .otherwise(None)
)

# Commission relative to the notional mid * contract_size, in basis points.
# Without a usable mid/contract_size the result is 0 for a zero commission, else null.
# NOTE(review): the USD commission is divided by a notional expressed in the
# instrument's price units -- presumably only exact for USD-quoted symbols; confirm.
commission_bps = (
    pl.when((mid.is_not_null()) & (pl.col("contract_size").is_not_null()) & (pl.col("contract_size") > 0))
      .then((pl.col("commission_usd_rt") / (mid * pl.col("contract_size"))) * 10000.0)
      .otherwise(
          pl.when(pl.col("commission_usd_rt") == 0.0).then(0.0).otherwise(None)
      )
)

sb = sb.with_columns([
    spread_bps.alias("spread_bps"),
    commission_bps.alias("commission_bps"),
    pl.lit(float(slippage_bps)).alias("slippage_bps"),
])

# Total cost: null when a valid tick is required but missing; otherwise the sum of
# the three components, with null components counted as 0.
cost_bps = (
    pl.when((~valid_tick) & pl.lit(require_valid_tick))
      .then(None)
      .otherwise(pl.col("spread_bps").fill_null(0.0) + pl.col("commission_bps").fill_null(0.0) + pl.col("slippage_bps").fill_null(0.0))
)
sb = sb.with_columns(cost_bps.alias("cost_bps"))

# Null visible / trade_mode are treated as acceptable (only explicit values can exclude).
visible_ok    = pl.when(pl.col("visible").is_null()).then(True).otherwise(pl.col("visible"))
trade_mode_ok = pl.when(pl.col("trade_mode").is_null()).then(True).otherwise(pl.col("trade_mode") >= 0)
within_th     = pl.col("cost_bps").is_not_null() & (pl.col("cost_bps") <= pl.col("threshold_bps"))

# Eligible = tick requirement met AND cost within threshold AND visible AND trade_mode ok.
eligible = (
    pl.when(pl.lit(require_valid_tick) & (~valid_tick)).then(False).otherwise(True)
    & within_th & visible_ok & trade_mode_ok
)
sb = sb.with_columns(eligible.alias("eligible"))

# One reason code per failed check ("" when the check passed).
reason_no_tick   = pl.when(pl.lit(require_valid_tick) & (~valid_tick)).then(pl.lit("NO_TICK")).otherwise(pl.lit(""))
reason_spread_na = pl.when(pl.col("spread_bps").is_null()).then(pl.lit("SPREAD_BPS_MISSING")).otherwise(pl.lit(""))
reason_comm_na   = pl.when(pl.col("commission_bps").is_null()).then(pl.lit("COMMISSION_BPS_MISSING")).otherwise(pl.lit(""))
reason_cost_gt   = pl.when(pl.col("cost_bps").is_not_null() & (pl.col("cost_bps") > pl.col("threshold_bps"))) \
                    .then(pl.concat_str([pl.lit("COST>"), pl.col("threshold_bps").round(2).cast(pl.Utf8), pl.lit("bps")], separator="")) \
                    .otherwise(pl.lit(""))
reason_not_vis   = pl.when(pl.col("visible").is_not_null() & (pl.col("visible") == False)).then(pl.lit("NOT_VISIBLE")).otherwise(pl.lit(""))
reason_tm_neg    = pl.when(pl.col("trade_mode").is_not_null() & (pl.col("trade_mode") < 0)).then(pl.lit("TRADE_MODE<0")).otherwise(pl.lit(""))

# Collect the codes, drop the empty ones and join the rest with ";".
sb = sb.with_columns(
    pl.concat_list([reason_no_tick, reason_spread_na, reason_comm_na, reason_cost_gt, reason_not_vis, reason_tm_neg])
      .list.eval(pl.element().filter(pl.element() != ""))
      .list.join(";")
      .alias("reason")
)

# --------------------------- Res√∫menes de percentiles ---------------------------
def pct_summary(df: pl.DataFrame, col: str) -> Optional[dict]:
    """Return p50/p90/p95/p99 ("nearest" quantiles) and the count ``n`` for ``col``.

    Nulls are ignored. Returns None when the column is absent or has no
    non-null values.
    """
    if col not in df.columns:
        return None

    non_null = df.select(pl.col(col).drop_nulls())
    if non_null.height == 0:
        return None

    quantile_levels = (("p50", 0.50), ("p90", 0.90), ("p95", 0.95), ("p99", 0.99))
    summary_exprs = [
        pl.col(col).quantile(level, "nearest").alias(label)
        for label, level in quantile_levels
    ]
    summary_exprs.append(pl.len().alias("n"))
    return non_null.select(summary_exprs).row(0, named=True)

# Percentile summaries of the three cost metrics (None when a metric has no data).
spread_stats = pct_summary(sb, "spread_bps")
comm_stats   = pct_summary(sb, "commission_bps")
cost_stats   = pct_summary(sb, "cost_bps")

# --------------------------- Existing outputs (parquet/txt/json) ---------------------------
# Keep only eligible rows and the key cost columns.
eligible_out = sb.filter(pl.col("eligible")).select([
    "symbol","asset_class","spread_bps","commission_bps","slippage_bps","cost_bps","threshold_bps","reason"
])

eligible_out.write_parquet(ELIGIBLE_PARQUET, compression="zstd", statistics=True)

# Plain-text list: one eligible symbol per line.
with open(ELIGIBLE_TXT, "w", encoding="utf-8") as f:
    for s in eligible_out.select("symbol").to_series().to_list():
        f.write(f"{s}\n")

# JSON report: run context, inputs, parameters, counts, percentiles and notes.
report = {
    "run_id": RUN_ID,
    "timeframe": TIMEFRAME_LABEL,
    "data_version": DATA_VERSION,
    "generated_at_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
    "inputs": {
        "symbols_broker": str(SYMBOLS_BROKER_PATH),
        "commissions_json": str(COMMISSIONS_PATH),
        "cost_filter_config": str(COST_CFG_PATH),
    },
    "params": {
        "slippage_bps": slippage_bps,
        "require_valid_tick": require_valid_tick,
        "threshold_bps": thresholds_map,
    },
    "counts": {
        "total_symbols": int(sb.height),
        "eligible": int(eligible_out.height),
        "not_eligible": int(sb.height - eligible_out.height),
        "missing_spread_bps": int(sb.select(pl.col("spread_bps").is_null().sum()).item()),
        "missing_commission_bps": int(sb.select(pl.col("commission_bps").is_null().sum()).item()),
        "cost_undefined": int(sb.select(pl.col("cost_bps").is_null().sum()).item()),
    },
    "percentiles": {
        "spread_bps": spread_stats,
        "commission_bps": comm_stats,
        "cost_bps": cost_stats,
    },
    "notes": [
        "spread_bps = ((ask - bid) / mid) * 10000, con mid=(bid+ask)/2 y guardas.",
        "commission_bps = (commission_usd_rt / (mid * contract_size)) * 10000. Si commission_usd_rt=0 ‚Üí 0 bps.",
        "slippage_bps es constante desde config.",
        "Elegible requiere (por defecto): tick v√°lido, cost_bps ‚â§ threshold_bps, visible!=False y trade_mode>=0 si viene informado."
    ],
}
with open(REPORT_JSON, "w", encoding="utf-8") as f:
    json.dump(report, f, ensure_ascii=False, indent=2)

_log("INFO", f"eligible_symbols_by_cost.parquet escrito ({eligible_out.height} filas)")

# --------------------------- NEW: costs_summary.parquet (per-symbol summary) ---------------------------
# Recommended structure:
#   - symbol, asset_class
#   - spread_bps, commission_bps, slippage_bps, cost_bps, threshold_bps
#   - eligible, reason
#   - cost_flag: OK / CARO / PROHIBITIVO / UNKNOWN

# cost_flag is defined by bands relative to threshold_bps:
cost_flag_expr = (
    pl.when(pl.col("cost_bps").is_null() | pl.col("threshold_bps").is_null())
      .then(pl.lit("UNKNOWN"))
      .when(pl.col("cost_bps") <= pl.col("threshold_bps") * 0.5)
      .then(pl.lit("OK"))                  # cost at most half of the 3B threshold
      .when(pl.col("cost_bps") <= pl.col("threshold_bps"))
      .then(pl.lit("CARO"))                # high cost but still within the 3B threshold
      .otherwise(pl.lit("PROHIBITIVO"))    # cost above the 3B threshold
)

# One row per row of sb (eligible or not), with the flag appended.
costs_summary = (
    sb.select([
        "symbol",
        "asset_class",
        "spread_bps",
        "commission_bps",
        "slippage_bps",
        "cost_bps",
        "threshold_bps",
        "eligible",
        "reason",
    ])
    .with_columns(cost_flag_expr.alias("cost_flag"))
)

costs_summary.write_parquet(COSTS_SUMMARY_PATH, compression="zstd", statistics=True)
_log("INFO", f"costs_summary.parquet escrito en {COSTS_SUMMARY_PATH} ({costs_summary.height} filas)")

# --------------------------- Audit prints ---------------------------
# Input path with row count, effective parameters and the threshold per asset class.
print("================================================================================")
print(">>> Celda 05 :: Filtro de Costes (Spread + Comisiones)")
print(f"üìÅ INPUT ‚Üí {SYMBOLS_BROKER_PATH}  (rows={sb.height})")
print(f"‚öôÔ∏è  slippage_bps={slippage_bps} | require_valid_tick={require_valid_tick}")
print("‚öôÔ∏è  thresholds_bps por clase:")
for k, v in thresholds_map.items():
    print(f"   - {k:<9}: {v:.2f} bps")

print("--------------------------------------------------------------------------------")
def _print_stats(name: str, stats: Optional[dict]):
    print(f"{name} percentiles (bps):")
    if not stats:
        print("  (sin datos v√°lidos)")
        return
    print(f"  p50={stats['p50']:.2f} | p90={stats['p90']:.2f} | p95={stats['p95']:.2f} | p99={stats['p99']:.2f} | n={int(stats['n'])}")

# Percentile summaries for each cost component and for the total cost.
_print_stats("spread_bps", spread_stats)
_print_stats("commission_bps", comm_stats)
_print_stats("cost_bps", cost_stats)

print("--------------------------------------------------------------------------------")
# Eligibility counters and missing-data counters, all read from report["counts"].
total = report["counts"]["total_symbols"]; n_ok = report["counts"]["eligible"]; n_bad = report["counts"]["not_eligible"]
print(f"Œ£ s√≠mbolos={total} | ‚úÖ elegibles={n_ok} | ‚ùå no elegibles={n_bad}")
print(f"faltan spread_bps={report['counts']['missing_spread_bps']} | faltan commission_bps={report['counts']['missing_commission_bps']} | cost_undefined={report['counts']['cost_undefined']}")

def _fmt_bps(value) -> str:
    """Render a bps figure with two decimals, or "NA" when the value is missing."""
    return "NA" if value is None else f"{value:.2f}"


def _fmt_text(value) -> str:
    """Render a label as text, or "NA" when the value is missing."""
    return "NA" if value is None else str(value)


# Ten most expensive symbols (nulls sorted last) and ten cheapest eligible symbols.
worst = (
    sb.sort("cost_bps", descending=True, nulls_last=True)
      .head(10)
      .select(["symbol", "asset_class", "cost_bps", "spread_bps", "commission_bps", "threshold_bps", "eligible", "reason"])
)
best_ok = eligible_out.sort("cost_bps").head(10)

print("--------------------------------------------------------------------------------")
print("Top-10 COST m√°s altos (bps):")
for i, r in enumerate(worst.to_dicts(), start=1):
    cb = _fmt_bps(r["cost_bps"])
    sp = _fmt_bps(r["spread_bps"])
    cm = _fmt_bps(r["commission_bps"])
    # threshold_bps / asset_class may be null; formatting None with a format
    # spec raises TypeError, so they go through the same None-safe helpers.
    thr = _fmt_bps(r["threshold_bps"])
    cls = _fmt_text(r["asset_class"])
    print(f"  #{i:02d} {r['symbol']:<15} class={cls:<8} cost={cb:>6} | spread={sp:>6} | comm={cm:>6} | thr={thr} | elig={bool(r['eligible'])} | {r['reason']}")

print("--------------------------------------------------------------------------------")
print("Top-10 elegibles con menor COST (bps):")
for i, r in enumerate(best_ok.to_dicts(), start=1):
    cb = _fmt_bps(r["cost_bps"])
    sp = _fmt_bps(r["spread_bps"])
    cm = _fmt_bps(r["commission_bps"])
    thr = _fmt_bps(r["threshold_bps"])
    cls = _fmt_text(r["asset_class"])
    print(f"  #{i:02d} {r['symbol']:<15} class={cls:<8} cost={cb:>6} | spread={sp:>6} | comm={cm:>6} | thr={thr}")

print("--------------------------------------------------------------------------------")
# Summary of the artifacts of this cell, with row/column counts where a frame is in memory.
print(f"üíæ OUTPUT ‚Üí {ELIGIBLE_PARQUET}     (OK, rows={eligible_out.height}, cols={len(eligible_out.columns)})")
print(f"üíæ OUTPUT ‚Üí {ELIGIBLE_TXT}         (OK, lines={eligible_out.height})")
print(f"üíæ OUTPUT ‚Üí {REPORT_JSON}         (OK)")
print(f"üíæ OUTPUT ‚Üí {COSTS_SUMMARY_PATH}  (OK, rows={costs_summary.height}, cols={len(costs_summary.columns)})")
print(">>> Celda 05 :: OK ‚Äî Universo elegible por coste y resumen de costes listos para consumo")
# ======================================================================================================


‚ö†Ô∏è  commissions.json no encontrado. Se asume comisi√≥n 0.0 USD (round-trip).
[2025-12-02 23:22:58] [20251202_232253] [INFO] [05-Costs] Snapshot de s√≠mbolos cargado: C:\Quant\MT5_Data_Extraction\data\metadata\symbols_broker.parquet (131 filas)
[2025-12-02 23:22:58] [20251202_232253] [INFO] [05-Costs] eligible_symbols_by_cost.parquet escrito (107 filas)
[2025-12-02 23:22:58] [20251202_232253] [INFO] [05-Costs] costs_summary.parquet escrito en C:\Quant\MT5_Data_Extraction\data\metadata\costs_summary.parquet (131 filas)
>>> Celda 05 :: Filtro de Costes (Spread + Comisiones)
üìÅ INPUT ‚Üí C:\Quant\MT5_Data_Extraction\data\metadata\symbols_broker.parquet  (rows=131)
‚öôÔ∏è  slippage_bps=0.0 | require_valid_tick=True
‚öôÔ∏è  thresholds_bps por clase:
   - FX_MAJOR : 3.00 bps
   - FX_MINOR : 6.00 bps
   - METAL    : 10.00 bps
   - INDEX    : 8.00 bps
   - CRYPTO   : 50.00 bps
   - ENERGY   : 12.00 bps
   - EQUITY   : 12.00 bps
   - OTHER    : 12.00 bps
-----------------------------------

In [7]:
# ======================== Celda 06 — Descarga masiva M5 (4 años, UTC+GYE, Polars-only, v2-debug) ========================
# Reglas:
#   - TF estricto: M5.
#   - Ventana "teórica": últimos 4 años hasta ayer 23:55 UTC (WS, WE).
#   - Modo profesional:
#       * BULK_MODE_08 = "AUTO" (defecto):
#             - Símbolo NO está en catálogo -> descarga FULL [WS, WE] (bootstrap).
#             - Símbolo SÍ está en catálogo -> descarga INCREMENTAL desde last_ts_utc + 1 vela M5
#               con backfill de INCR_BACKFILL_DAYS_08 (para cubrir huecos / correcciones).
#       * BULK_MODE_08 = "FULL": siempre descarga [WS, WE] para todos los símbolos.
#       * BULK_MODE_08 = "INCREMENTAL": siempre usa modo incremental si hay last_ts_utc; si no, cae a FULL.
#   - Descarga mensual -> partición diaria -> 1 archivo/día.
#   - PAD:
#       <DATA_ROOT>/bulk_data/m5_raw/symbol=SYMBOL/year=YYYY/month=MM/part=YYYYMMDD.parquet
#   - Idempotencia: si el día existe, no reescribe salvo FORCE_*.
#   - Catálogo actualizado + logs JSONL.
#   - Lista elegible por coste (Celda 3B/5): metadata/filters/eligible_symbols_by_cost.parquet|.txt (columna 'symbol')
#   - Esquema: timestamp_utc (Int64 ms UTC) + timestamp_gye (Datetime[ms,'America/Guayaquil'])
#   - Polars-only (sin pandas), sin paths absolutos hardcodeados.
#
# v2-debug:
#   - Verificación explícita de cobertura M5:
#       * Barras totales por mes para primeros símbolos.
#       * Barras por día (min/p50/max) en cada mes (para debug).
#       * Reapertura de varios part=YYYYMMDD.parquet recién escritos para comprobar n_bars y rango temporal.
#   - Mensajes claros si medianas de barras/día << 288 (posible D1/M15 o extracción rota).
# =======================================================================================================================

from __future__ import annotations
import json, time
from pathlib import Path
from datetime import datetime, timedelta, timezone

# Dependencias
try:
    import polars as pl
except Exception as e:
    raise RuntimeError("Polars es obligatorio en la Celda 06. Instala 'polars'.") from e

try:
    import MetaTrader5 as mt5
except Exception as e:
    raise RuntimeError("MetaTrader5 no est√° disponible. Instala 'MetaTrader5'.") from e

# Ventanas exactas por mes/a√±o (opcional)
try:
    from dateutil.relativedelta import relativedelta
    _HAS_REL = True
except Exception:
    _HAS_REL = False

# ----------------------- Global config and output paths (PADs) -----------------------
# Label stamped on every log line and JSONL record emitted by this cell.
CELL_LABEL = "06-BulkM5"
# Reuse RUN_ID from an earlier cell when present; otherwise derive one from the current UTC time.
RUN_ID = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))

# DATA_ROOT must be defined by an earlier cell (single project root)
if "DATA_ROOT" not in globals():
    raise RuntimeError("DATA_ROOT no est√° definido. Establ√©celo en la Celda 02 (config & rutas).")

DATA_ROOT      = Path(globals()["DATA_ROOT"])
BULK_M5_DIR    = DATA_ROOT / "bulk_data" / "m5_raw"
META_DIR       = DATA_ROOT / "metadata"
FILTERS_DIR    = META_DIR / "filters"
CATALOG_PATH   = META_DIR / "dataset_catalog.parquet"
SYMBOLS_BROKER = META_DIR / "symbols_broker.parquet"
SCHEMA_JSON    = META_DIR / "schema_m5.json"
RUN_LOG        = META_DIR / "run_log.jsonl"

# Local time zone used for the timestamp_gye column (explicit IANA name)
TIMEZONE_IANA = globals().get("TIMEZONE_IANA", "America/Guayaquil")

# Cost-eligible symbol list produced by the cost filter cell (parquet preferred, txt as fallback)
ELIGIBLE_PARQUET = FILTERS_DIR / "eligible_symbols_by_cost.parquet"
ELIGIBLE_TXT     = FILTERS_DIR / "eligible_symbols_by_cost.txt"

# Flags (read from notebook globals, with defaults)
TIMEFRAME_LABEL   = globals().get("TIMEFRAME_LABEL", "M5")
FORCE_REDOWNLOAD  = bool(globals().get("FORCE_REDOWNLOAD_BULK_M5", False))
FORCE_REWRITE_DAY = bool(globals().get("FORCE_REWRITE_DAY", False))

# Fixed "theoretical" window: last 4 years
WINDOW_YEARS = 4
WINDOW_DAYS_FALLBACK = 1461  # used when dateutil is not available

# Retries: attempt counts and base delay (seconds) for exponential back-off
RETRIES_CONN = 5
RETRIES_REQ  = 3
SLEEP_BASE   = 0.8

# Tuning knobs: optional cap and include/exclude lists for the symbol universe
MAX_SYMBOLS: int | None = globals().get("MAX_SYMBOLS_08", None)
WHITELIST = globals().get("WHITELIST_08", None)
BLACKLIST = globals().get("BLACKLIST_08", None)

# Flag controlling the strict 4-year coverage precheck (DISABLED by default)
STRICT_4Y_PRECHECK = bool(globals().get("STRICT_4Y_PRECHECK", False))

# ----------------- Download mode FULL / INCREMENTAL -----------------
# Unknown values silently fall back to "AUTO".
BULK_MODE = str(globals().get("BULK_MODE_08", "AUTO")).upper()
if BULK_MODE not in {"AUTO", "FULL", "INCREMENTAL"}:
    BULK_MODE = "AUTO"

INCR_BACKFILL_DAYS = int(globals().get("INCR_BACKFILL_DAYS_08", 3))  # overlap days in incremental mode

# ----------------- M5 coverage debug parameters -----------------
DEBUG_08 = bool(globals().get("DEBUG_08", True))
DEBUG_MAX_SYMBOLS = int(globals().get("DEBUG_MAX_SYMBOLS_08", 2))                  # symbols with detailed debug
DEBUG_MAX_MONTHS_PER_SYMBOL = int(globals().get("DEBUG_MAX_MONTHS_08", 2))         # months with detailed stats
DEBUG_SAMPLE_DAYS_PER_MONTH = int(globals().get("DEBUG_SAMPLE_DAYS_PER_MONTH_08", 3))   # sample days per month
DEBUG_SAMPLE_DAYS_PER_SYMBOL = int(globals().get("DEBUG_SAMPLE_DAYS_PER_SYMBOL_08", 3)) # days re-read from disk
MIN_EXPECTED_BARS_PER_DAY_DEBUG = int(globals().get("MIN_EXPECTED_BARS_PER_DAY_DEBUG_08", 200))
EXPECTED_BARS_M5 = int(globals().get("EXPECTED_BARS_M5", 288))

# ----------------------- Utilidades -----------------------
def _log(level: str, msg: str) -> None:
    """Print one log line tagged with UTC time, RUN_ID, level and CELL_LABEL.

    The line is written to stdout and flushed immediately.
    """
    stamp = datetime.now(timezone.utc).isoformat(timespec="seconds")
    prefix = f"[{stamp}] [{RUN_ID}] [{level}] [{CELL_LABEL}]"
    print(f"{prefix} {msg}", flush=True)

def _ensure_dirs() -> None:
    """Create the output folders and, if missing, an empty dataset catalog.

    The empty catalog is written with one Utf8 ``symbol`` column, eight Int64
    bookkeeping columns and a trailing Utf8 ``notes`` column.
    """
    for folder in (BULK_M5_DIR, META_DIR, FILTERS_DIR):
        folder.mkdir(parents=True, exist_ok=True)

    if CATALOG_PATH.exists():
        return

    int_columns = (
        "first_ts_utc",
        "last_ts_utc",
        "n_files",
        "n_days",
        "n_months",
        "n_years",
        "bytes",
        "last_update_ts_utc",
    )
    # Insertion order of the dict defines the column order of the catalog.
    empty_columns = {"symbol": pl.Series([], dtype=pl.Utf8)}
    for column in int_columns:
        empty_columns[column] = pl.Series([], dtype=pl.Int64)
    empty_columns["notes"] = pl.Series([], dtype=pl.Utf8)

    pl.DataFrame(empty_columns).write_parquet(CATALOG_PATH, compression="zstd", statistics=True)

def _write_jsonl(path: Path, obj: dict) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "a", encoding="utf-8") as f:
        f.write(json.dumps(obj, ensure_ascii=False) + "\n")

def _mt5_init_with_retries() -> None:
    """Print the run banner and connect to MT5 with exponential back-off.

    Up to RETRIES_CONN calls to ``mt5.initialize()`` are made, waiting
    ``(2 ** attempt) * SLEEP_BASE`` seconds between consecutive attempts.

    Raises:
        RuntimeError: if no attempt succeeds.
    """
    print("=" * 110)
    print(f"Inicio Celda 06 ‚Äî Descarga masiva M5 | TZ objetivo: {TIMEZONE_IANA}")
    try:
        from zoneinfo import ZoneInfo
        gye_now = datetime.now(timezone.utc).astimezone(ZoneInfo(TIMEZONE_IANA)).isoformat(timespec="seconds")
        print(f"Hora GYE: {gye_now} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
    except Exception:
        # Best-effort banner: fall back to UTC only when the zone cannot be loaded.
        print(f"Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')} (instala 'tzdata' para TZ locales)")
    print("-" * 110)
    print(f"DATA_ROOT       : {DATA_ROOT}")
    print(f"PAD salida M5   : {BULK_M5_DIR}")
    print(f"BULK_MODE_08    : {BULK_MODE}")
    print(f"INCR_BACKFILL_DAYS_08 : {INCR_BACKFILL_DAYS}")
    print(f"STRICT_4Y_PRECHECK    : {STRICT_4Y_PRECHECK}")
    print(f"FORCE_REDOWNLOAD_BULK_M5 = {FORCE_REDOWNLOAD}")
    print(f"FORCE_REWRITE_DAY        = {FORCE_REWRITE_DAY}")
    print(f"DEBUG_08                 = {DEBUG_08}")
    print("-" * 110)
    _log("INFO", "Conectando a MT5 con reintentos...")
    for attempt in range(RETRIES_CONN):
        if mt5.initialize():
            _log("INFO", "Conexi√≥n MT5 establecida.")
            return
        # Back off only when another attempt follows; sleeping after the last
        # failure would just delay the error by the longest interval.
        if attempt < RETRIES_CONN - 1:
            time.sleep((2 ** attempt) * SLEEP_BASE)
    raise RuntimeError("No se pudo inicializar MT5 tras varios intentos.")

def _mt5_shutdown() -> None:
    """Close the MT5 connection on a best-effort basis.

    Any exception raised by ``mt5.shutdown()`` or by the log call is swallowed,
    so this function is safe to call from cleanup paths.
    """
    try:
        mt5.shutdown()
        _log("INFO", "Conexi√≥n MT5 cerrada.")
    except Exception:
        # Deliberately ignored: a failed shutdown must not mask the caller's outcome.
        pass

def _read_eligible_symbols_if_any(strict_on_empty: bool = True) -> list[str] | None:
    """Load the cost-eligible symbol list, preferring the parquet over the txt file.

    Values are stripped, de-duplicated and sorted. Null and blank entries are
    dropped (a null is no longer turned into the literal symbol "None").

    Args:
        strict_on_empty: when True, an existing but empty list is an error.

    Returns:
        The sorted symbol list, or None when neither file exists or when the
        list is empty and ``strict_on_empty`` is False.

    Raises:
        RuntimeError: if the chosen file yields no symbols and
            ``strict_on_empty`` is True.
    """
    if ELIGIBLE_PARQUET.exists():
        df = pl.read_parquet(ELIGIBLE_PARQUET)
        col = "symbol" if "symbol" in df.columns else df.columns[0]
        raw_values = df.get_column(col).to_list()
        source_name = "eligible_symbols_by_cost.parquet"
    elif ELIGIBLE_TXT.exists():
        with open(ELIGIBLE_TXT, "r", encoding="utf-8") as f:
            raw_values = list(f)
        source_name = "eligible_symbols_by_cost.txt"
    else:
        return None

    cleaned = {str(v).strip() for v in raw_values if v is not None}
    cleaned.discard("")
    syms = sorted(cleaned)

    if not syms and strict_on_empty:
        raise RuntimeError(f"{source_name} est√° vac√≠o. Revisa la Celda 05.")
    return syms or None

def _read_symbols() -> list[str]:
    """Return the broker symbols after whitelist, blacklist and cap are applied.

    The symbol column is ``symbol`` when present, otherwise the first column of
    the broker snapshot. File order is preserved.

    Raises:
        FileNotFoundError: if the broker snapshot parquet does not exist.
    """
    if not SYMBOLS_BROKER.exists():
        raise FileNotFoundError(f"No existe {SYMBOLS_BROKER}. Ejecuta la Celda 04 (inventario de s√≠mbolos).")

    broker_df = pl.read_parquet(SYMBOLS_BROKER)
    key = "symbol" if "symbol" in broker_df.columns else broker_df.columns[0]

    if WHITELIST:
        broker_df = broker_df.filter(pl.col(key).is_in(WHITELIST))
    if BLACKLIST:
        broker_df = broker_df.filter(~pl.col(key).is_in(BLACKLIST))

    names = list(map(str, broker_df.get_column(key).to_list()))
    return names if MAX_SYMBOLS is None else names[:int(MAX_SYMBOLS)]

def _window_edges() -> tuple[datetime, datetime]:
    """Return the (start, end) UTC datetimes of the theoretical download window.

    The end is yesterday at 23:55 UTC. The start is WINDOW_YEARS before the
    end (via dateutil when available, otherwise WINDOW_DAYS_FALLBACK - 1 days
    before), truncated to midnight.
    """
    yesterday = datetime.now(timezone.utc) - timedelta(days=1)
    # 23:55 is the last M5 bar of the day, which keeps the still-forming bar of today out.
    window_end = yesterday.replace(hour=23, minute=55, second=0, microsecond=0)

    if _HAS_REL:
        lookback = relativedelta(years=WINDOW_YEARS)
    else:
        lookback = timedelta(days=WINDOW_DAYS_FALLBACK - 1)

    window_start = (window_end - lookback).replace(hour=0, minute=0, second=0, microsecond=0)
    return window_start, window_end

def _month_iter(start_dt: datetime, end_dt: datetime):
    ms = start_dt.replace(day=1, hour=0, minute=0, second=0, microsecond=0)
    while ms <= end_dt:
        if _HAS_REL:
            me = ms + relativedelta(months=1)
        else:
            me = (ms + timedelta(days=32)).replace(day=1, hour=0, minute=0, second=0, microsecond=0)
        yield ms, me
        ms = me

def _load_schema_column_order() -> list[str]:
    """Return the M5 column order from SCHEMA_JSON, or a built-in default.

    The default is used whenever the schema file cannot be read, parsed, or
    lacks a usable ``column_order`` entry.
    """
    default_order = [
        "timestamp_utc", "timestamp_gye", "symbol", "open", "high", "low", "close",
        "tick_volume", "real_volume", "spread_points", "broker", "server_tz",
    ]
    try:
        parsed = json.loads(SCHEMA_JSON.read_text(encoding="utf-8"))
        return list(parsed["column_order"])
    except Exception:
        return default_order

def _ensure_schema_order(df: pl.DataFrame, col_order: list[str]) -> pl.DataFrame:
    """Return ``df`` with exactly the columns of ``col_order``, in that order.

    Columns missing from ``df`` are added as typed all-null columns: Int64 for
    ``timestamp_utc`` and the volume columns, a millisecond Datetime in
    TIMEZONE_IANA for ``timestamp_gye``, Float64 for prices and
    ``spread_points``, and Utf8 for anything else.
    """
    int_columns = {"timestamp_utc", "tick_volume", "real_volume"}
    float_columns = {"open", "high", "low", "close", "spread_points"}

    def _null_dtype(column: str):
        # Dtype of the placeholder column created for a missing name.
        if column in int_columns:
            return pl.Int64
        if column == "timestamp_gye":
            return pl.Datetime("ms", TIMEZONE_IANA)
        if column in float_columns:
            return pl.Float64
        return pl.Utf8

    present = set(df.columns)
    fillers = [
        pl.lit(None, dtype=_null_dtype(column)).alias(column)
        for column in col_order
        if column not in present
    ]
    if fillers:
        df = df.with_columns(fillers)
    return df.select(col_order)

def _update_catalog_for_symbol(symbol: str) -> None:
    """Recompute and upsert the catalog row of ``symbol`` from its files on disk.

    Scans every ``part=*.parquet`` under the symbol folder to obtain file
    count, distinct days, total bytes and the min/max ``timestamp_utc``, then
    replaces the symbol's row in CATALOG_PATH. Returns without writing when
    the symbol has no folder, no files, or no timestamps.
    """
    sym_dir = BULK_M5_DIR / f"symbol={symbol}"
    if not sym_dir.exists():
        return
    parts = list(sym_dir.rglob("part=*.parquet"))
    if not parts:
        return

    total_b = 0
    days = set()
    for p in parts:
        try:
            total_b += p.stat().st_size
            # File name is part=YYYYMMDD.parquet; keep the 8-digit day token.
            days.add(p.name.split("part=")[1].split(".parquet")[0][:8])
        except (OSError, IndexError):
            # Best-effort accounting: a file that vanished or has an odd name is skipped.
            continue
    n_files = len(parts)
    n_days = len(days)

    pattern = str(sym_dir / "year=*" / "month=*" / "part=*.parquet")
    minmax = pl.scan_parquet(pattern).select(
        pl.min("timestamp_utc").alias("ts_min"),
        pl.max("timestamp_utc").alias("ts_max"),
    ).collect()

    if minmax.height == 0 or minmax[0, "ts_min"] is None:
        return

    ts_min = int(minmax[0, "ts_min"])
    ts_max = int(minmax[0, "ts_max"])
    dt0 = datetime.fromtimestamp(ts_min / 1000, tz=timezone.utc)
    dt1 = datetime.fromtimestamp(ts_max / 1000, tz=timezone.utc)
    # Inclusive calendar spans between the first and last bar.
    n_months = (dt1.year - dt0.year) * 12 + (dt1.month - dt0.month) + 1
    n_years = dt1.year - dt0.year + 1

    # Single schema shared by the empty fallback and the new row. A mapping is
    # a supported `schema` form for pl.DataFrame, unlike a list of pl.Field.
    catalog_schema = {
        "symbol": pl.Utf8,
        "first_ts_utc": pl.Int64,
        "last_ts_utc": pl.Int64,
        "n_files": pl.Int64,
        "n_days": pl.Int64,
        "n_months": pl.Int64,
        "n_years": pl.Int64,
        "bytes": pl.Int64,
        "last_update_ts_utc": pl.Int64,
        "notes": pl.Utf8,
    }

    if CATALOG_PATH.exists():
        base = pl.read_parquet(CATALOG_PATH)
    else:
        base = pl.DataFrame(schema=catalog_schema)

    updated_row = pl.DataFrame(
        {
            "symbol": [symbol],
            "first_ts_utc": [ts_min],
            "last_ts_utc": [ts_max],
            "n_files": [n_files],
            "n_days": [n_days],
            "n_months": [n_months],
            "n_years": [n_years],
            "bytes": [total_b],
            "last_update_ts_utc": [int(datetime.now(timezone.utc).timestamp() * 1000)],
            "notes": [""],
        },
        schema=catalog_schema,
    )
    # Upsert: drop any previous row of this symbol, then append the fresh one.
    others = base.filter(pl.col("symbol") != symbol)
    pl.concat([others, updated_row], how="vertical").write_parquet(CATALOG_PATH, compression="zstd", statistics=True)

def _precheck_symbol(symbol: str, ws: datetime, we: datetime) -> bool:
    """Check that M5 bars exist at both ends of the window for ``symbol``.

    Probes the first week after ``ws`` and the last week up to ``we`` plus one
    day. Each probe is retried up to RETRIES_REQ times with exponential
    back-off. On the first probe that yields no bars, a ``precheck_fail``
    record is appended to RUN_LOG.

    Returns:
        True when both probes return at least one bar, False otherwise.
    """
    try:
        mt5.symbol_select(symbol, True)
    except Exception:
        # Best-effort: the probes below decide whether the symbol is usable.
        pass
    wk = timedelta(days=7)
    ranges = [(ws, ws + wk), (we - wk, we + timedelta(days=1))]
    for (a, b) in ranges:
        got = False
        last_err = None
        for r in range(RETRIES_REQ):
            try:
                arr = mt5.copy_rates_range(symbol, mt5.TIMEFRAME_M5, a, b)
                got = (arr is not None) and (len(arr) > 0)
                if got:
                    break
            except Exception as e:
                last_err = e
            # Wait only when another attempt follows; sleeping after the final
            # failure just delays the verdict.
            if r < RETRIES_REQ - 1:
                time.sleep((2 ** r) * SLEEP_BASE)
        if not got:
            _write_jsonl(
                RUN_LOG,
                {
                    "run_id": RUN_ID,
                    "cell": CELL_LABEL,
                    "symbol": symbol,
                    "status": "precheck_fail",
                    "msg": f"sin_barras_en_{a.date()}_{b.date()}",
                    "err": str(last_err) if last_err else "",
                },
            )
            return False
    return True

def _month_span_limited(ms: datetime, me: datetime, ws: datetime, we: datetime) -> tuple[datetime, datetime]:
    real_start = max(ms, ws)
    real_end   = min(me, we + timedelta(days=1))
    return real_start, real_end  # [real_start, real_end) exclusivo

def _mt5_array_to_polars(arr) -> pl.DataFrame:
    """Convert an MT5 rates result into a Polars DataFrame (no pandas).

    ``None`` or an empty result gives an empty DataFrame. When ``arr`` exposes
    named dtype fields, one column per field is built from Python lists;
    otherwise a direct construction is tried, then a list-of-dicts conversion.

    Raises:
        ValueError: if none of the conversion strategies works.
    """
    if arr is None or len(arr) == 0:
        return pl.DataFrame()

    dtype = getattr(arr, "dtype", None)
    field_names = None if dtype is None else dtype.names
    if field_names:
        columns = {field: arr[field].tolist() for field in field_names}
        return pl.DataFrame(columns)

    try:
        return pl.DataFrame(arr)
    except Exception:
        try:
            records = [dict(item) for item in arr]
            return pl.from_dicts(records)
        except Exception as e:
            raise ValueError(f"No se pudo convertir la respuesta MT5 a Polars: {e}")

# ----------------------- Startup and setup -----------------------
# Make sure output folders and the (possibly empty) catalog exist before reading them.
_ensure_dirs()

# Load the current catalog to detect symbols that were already downloaded
# (symbol -> last stored timestamp_utc).
_catalog_df = pl.read_parquet(CATALOG_PATH)
catalog_last_ts = {row["symbol"]: row["last_ts_utc"] for row in _catalog_df.to_dicts()} if _catalog_df.height > 0 else {}

# When a cost-eligible list exists it becomes the whitelist for _read_symbols().
# NOTE(review): this overwrites any WHITELIST_08 supplied by the user — confirm that is intended.
eligible_syms = _read_eligible_symbols_if_any(strict_on_empty=True)
if eligible_syms is not None:
    _log("INFO", f"Usando lista elegible por coste (Celda 05): {len(eligible_syms)} s√≠mbolos")
    WHITELIST = eligible_syms
else:
    _log("WARNING", "No se encontr√≥ lista elegible por coste. Se usar√°n todos los s√≠mbolos del broker.")

# Theoretical download window shared by all symbols.
WS, WE = _window_edges()
print(f"Ventana objetivo (UTC te√≥rica): {WS.date()} ‚Üí {WE.date()} (√∫ltimos {WINDOW_YEARS} a√±os)")

# Total symbol count without filters (denominator for the summary line);
# None when the broker snapshot cannot be read.
try:
    total_all = pl.read_parquet(SYMBOLS_BROKER).height
except Exception:
    total_all = None

# _read_symbols() already applies MAX_SYMBOLS; the slice below repeats that cap.
symbols_all = _read_symbols()
if MAX_SYMBOLS is not None:
    symbols_all = symbols_all[:int(MAX_SYMBOLS)]

print(
    f"Universo de s√≠mbolos (filtro costes Celda 05 + broker): {len(symbols_all)}"
    + (f" / {total_all}" if total_all is not None else "")
)
if symbols_all:
    print("Ejemplo de s√≠mbolos:", ", ".join(symbols_all[:min(10, len(symbols_all))]))
print(f"STRICT_4Y_PRECHECK = {STRICT_4Y_PRECHECK} | BULK_MODE_08 = {BULK_MODE}")
print(f"FORCE_REDOWNLOAD_BULK_M5={FORCE_REDOWNLOAD} | FORCE_REWRITE_DAY={FORCE_REWRITE_DAY}")
print("-" * 110)

# Connect to MT5 (raises RuntimeError when every attempt fails)
_mt5_init_with_retries()

# Timeframe guard: this cell only supports M5. An explicit check is used so
# the guard survives `python -O`, and the connection opened above is released
# before failing.
TIMEFRAME = mt5.TIMEFRAME_M5
if TIMEFRAME_LABEL != "M5":
    _mt5_shutdown()
    raise AssertionError("TIMEFRAME_LABEL debe ser 'M5'.")
_log("INFO", f"Usando TIMEFRAME_M5 de MT5 (valor interno={TIMEFRAME}).")

# ----------------------- Coverage prefilter (optional) -----------------------
kept: list[str] = []
dropped: list[str] = []

if STRICT_4Y_PRECHECK:
    # Strict mode: keep only symbols with bars at both ends of the window.
    for i, s in enumerate(symbols_all, start=1):
        _log("INFO", f"[Prefiltro {i}/{len(symbols_all)}] {s} ‚Üí chequeo 4y (estricto)...")
        if _precheck_symbol(s, WS, WE):
            kept.append(s)
        else:
            dropped.append(s)

    print("-" * 110)
    print(f"Prefiltro cobertura >=4y (estricto) ‚Üí kept={len(kept)} | dropped={len(dropped)}")
    if kept:
        print("Ejemplo kept:", ", ".join(kept[:10]))
    # The dropped sample is shown whenever something was dropped, including
    # the case where nothing was kept.
    if dropped:
        print("Ejemplo dropped:", ", ".join(dropped[:10]))
    if not kept:
        # Nothing to download: close the connection and stop the cell cleanly.
        _mt5_shutdown()
        print("‚ö†Ô∏è  Ning√∫n s√≠mbolo cumple el requisito de 4 a√±os en esta sesi√≥n/servidor (modo estricto).")
        print("‚úÖ Descarga M5: SIN cambios. OK para continuar con la siguiente celda.")
        raise SystemExit
else:
    # Precheck disabled: every symbol of the universe is attempted.
    kept = list(symbols_all)
    dropped = []
    print("-" * 110)
    print(
        "Prefiltro cobertura >=4y DESACTIVADO (STRICT_4Y_PRECHECK=False). "
        f"Se intentar√° descargar M5 para todos los s√≠mbolos elegibles: {len(kept)}"
    )

# ----------------------- Descarga por s√≠mbolo en bloques mensuales -----------------------
total_days_written = 0
total_bytes = 0
symbols_ok = 0
col_order = _load_schema_column_order()

for idx, symbol in enumerate(kept, start=1):
    # -------- Decidir ventana por s√≠mbolo seg√∫n cat√°logo + BULK_MODE --------
    last_ts = catalog_last_ts.get(symbol, None)

    if BULK_MODE == "FULL" or last_ts is None:
        sym_mode = "FULL"
        sym_ws, sym_we = WS, WE
    else:
        last_dt = datetime.fromtimestamp(int(last_ts) / 1000, tz=timezone.utc)
        incr_start = (last_dt + timedelta(minutes=5)).replace(second=0, microsecond=0)
        incr_start -= timedelta(days=INCR_BACKFILL_DAYS)
        if incr_start < WS:
            incr_start = WS
        sym_ws, sym_we = incr_start, WE
        if sym_ws >= sym_we:
            _log("INFO", f"[{idx}/{len(kept)}] {symbol}: ya actualizado hasta {last_dt.isoformat()}, sin d√≠as nuevos.")
            continue
        sym_mode = "INCREMENTAL"

    debug_this_symbol = DEBUG_08 and (idx <= DEBUG_MAX_SYMBOLS)
    _log(
        "INFO",
        f"[{idx}/{len(kept)}] {symbol}: modo={sym_mode}, ventana efectiva {sym_ws.date()} ‚Üí {sym_we.date()} | "
        f"BULK_MODE_08={BULK_MODE}, INCR_BACKFILL_DAYS_08={INCR_BACKFILL_DAYS}"
    )

    try:
        mt5.symbol_select(symbol, True)
    except Exception:
        pass

    months = list(_month_iter(sym_ws, sym_we))
    days_written_sym = 0
    debug_days_checked = 0  # relectura de part=YYYYMMDD.parquet

    for m_idx, (m_start, m_end) in enumerate(months, start=1):
        real_start, real_end = _month_span_limited(m_start, m_end, sym_ws, sym_we)
        if real_start >= real_end:
            continue

        yyyy, mm = f"{real_start.year:04d}", f"{real_start.month:02d}"
        yyyy_mm = f"{yyyy}-{mm}"

        ok = False
        last_err = None
        for r in range(RETRIES_REQ):
            try:
                arr = mt5.copy_rates_range(symbol, TIMEFRAME, real_start, real_end)
                if arr is None:
                    raise RuntimeError(f"copy_rates_range devolvi√≥ None (last_error={mt5.last_error()})")
                ok = True
                break
            except Exception as e:
                last_err = e
                time.sleep((2 ** r) * SLEEP_BASE)
        if not ok:
            _log("WARNING", f"{symbol} {yyyy_mm}: fallo mensual tras reintentos: {last_err}")
            _write_jsonl(
                RUN_LOG,
                {
                    "run_id": RUN_ID,
                    "cell": CELL_LABEL,
                    "symbol": symbol,
                    "status": "month_error",
                    "month": yyyy_mm,
                    "msg": str(last_err),
                },
            )
            continue

        if len(arr) == 0:
            _log("INFO", f"{symbol} {yyyy_mm}: mes sin barras (feriado/activo sin trading o fuera de ventana).")
            continue

        df = _mt5_array_to_polars(arr)
        if df.height == 0:
            _log("WARNING", f"{symbol} {yyyy_mm}: df vac√≠o tras conversi√≥n a Polars.")
            continue

        # Transformaciones b√°sicas y schema M5
        df = (
            df
            .with_columns(
                pl.lit(symbol).alias("symbol"),
                (pl.col("time").cast(pl.Int64) * pl.lit(1000, dtype=pl.Int64)).alias("timestamp_utc"),
            )
            .with_columns(
                pl.col("timestamp_utc")
                  .cast(pl.Datetime("ms"))
                  .dt.replace_time_zone("UTC")
                  .alias("_ts_utc_dt"),
            )
            .with_columns(
                pl.col("_ts_utc_dt").dt.convert_time_zone(TIMEZONE_IANA).alias("timestamp_gye"),
                pl.when(pl.col("spread").is_not_null())
                  .then(pl.col("spread").cast(pl.Float64))
                  .otherwise(pl.lit(None, dtype=pl.Float64))
                  .alias("spread_points"),
                pl.col("tick_volume").cast(pl.Int64).alias("tick_volume"),
                pl.when(pl.col("real_volume").is_not_null())
                  .then(pl.col("real_volume").cast(pl.Int64))
                  .otherwise(pl.lit(None, dtype=pl.Int64))
                  .alias("real_volume"),
                pl.lit(None, dtype=pl.Utf8).alias("broker"),
                pl.lit(None, dtype=pl.Utf8).alias("server_tz"),
            )
        )

        df = df.sort("timestamp_utc").unique(subset=["timestamp_utc"], keep="last")

        df = df.with_columns(
            pl.col("_ts_utc_dt").dt.strftime("%Y%m%d").alias("_date_utc_str"),
        )

        # ------------------------ DEBUG: cobertura mensual M5 ------------------------
        if debug_this_symbol and (m_idx <= DEBUG_MAX_MONTHS_PER_SYMBOL):
            n_bars_month = df.height
            day_counts = (
                df.group_by("_date_utc_str")
                  .agg(pl.len().alias("n_bars_day"))
                  .sort("_date_utc_str")
            )
            n_days_month = day_counts.height
            if n_days_month > 0:
                stats = day_counts.select([
                    pl.col("n_bars_day").min().alias("n_min"),
                    pl.col("n_bars_day").median().alias("n_p50"),
                    pl.col("n_bars_day").max().alias("n_max"),
                ]).to_dicts()[0]
                n_min = int(stats["n_min"])
                n_p50 = float(stats["n_p50"])
                n_max = int(stats["n_max"])
            else:
                n_min = n_max = 0
                n_p50 = 0.0

            _log(
                "INFO",
                f"[DEBUG] {symbol} {yyyy_mm}: barras_totales_mes={n_bars_month} | dias_mes={n_days_month} | "
                f"barras/d√≠a min={n_min}, p50={n_p50:.1f}, max={n_max}"
            )

            if n_p50 < MIN_EXPECTED_BARS_PER_DAY_DEBUG:
                _log(
                    "WARNING",
                    f"[DEBUG] {symbol} {yyyy_mm}: MEDIANA barras/d√≠a={n_p50:.1f} << {MIN_EXPECTED_BARS_PER_DAY_DEBUG} "
                    f"(EXPECTED_BARS_M5={EXPECTED_BARS_M5}). Posible timeframe incorrecto (D1/M15) o extracci√≥n rota."
                )

            # Imprimir algunos d√≠as de ejemplo
            print(f"Ejemplo d√≠as {symbol} {yyyy_mm} (barras/d√≠a):")
            for r in day_counts.head(DEBUG_SAMPLE_DAYS_PER_MONTH).iter_rows(named=True):
                print(f"  - {r['_date_utc_str']}: n_bars_day={r['n_bars_day']}")
            print("-" * 80)
        # ---------------------------------------------------------------------------

        unique_days = df.select(pl.col("_date_utc_str").unique().alias("_d")).to_series().to_list()

        for day_str in unique_days:
            y = int(day_str[0:4])
            m = int(day_str[4:6])
            d = int(day_str[6:8])
            day_date = datetime(y, m, d, tzinfo=timezone.utc).date()
            if not (WS.date() <= day_date <= WE.date()):
                continue

            out_dir  = BULK_M5_DIR / f"symbol={symbol}" / f"year={y:04d}" / f"month={m:02d}"
            out_file = out_dir / f"part={day_str}.parquet"

            if out_file.exists() and not (FORCE_REDOWNLOAD or FORCE_REWRITE_DAY):
                continue

            out_day = df.filter(pl.col("_date_utc_str") == day_str).select(
                "timestamp_utc",
                "timestamp_gye",
                "symbol",
                pl.col("open").cast(pl.Float64),
                pl.col("high").cast(pl.Float64),
                pl.col("low").cast(pl.Float64),
                pl.col("close").cast(pl.Float64),
                "tick_volume",
                "real_volume",
                "spread_points",
                "broker",
                "server_tz",
            )
            out_day = _ensure_schema_order(out_day, col_order)

            out_dir.mkdir(parents=True, exist_ok=True)
            out_day.write_parquet(out_file, compression="zstd", statistics=True)

            try:
                total_bytes += out_file.stat().st_size
            except Exception:
                pass
            total_days_written += 1
            days_written_sym += 1

            # ---------------- DEBUG: re-lectura de algunos d√≠as concretos ----------------
            if debug_this_symbol and (debug_days_checked < DEBUG_SAMPLE_DAYS_PER_SYMBOL):
                try:
                    df_chk = pl.read_parquet(out_file)
                    nb = int(df_chk.height)
                    nu = int(df_chk.get_column("timestamp_utc").n_unique()) if "timestamp_utc" in df_chk.columns else -1
                    if "timestamp_utc" in df_chk.columns and nb > 0:
                        tmin = int(df_chk["timestamp_utc"].min())
                        tmax = int(df_chk["timestamp_utc"].max())
                        dt0 = datetime.fromtimestamp(tmin / 1000, tz=timezone.utc)
                        dt1 = datetime.fromtimestamp(tmax / 1000, tz=timezone.utc)
                        _log(
                            "INFO",
                            f"[DEBUG] Relectura {out_file.name}: n_bars={nb}, n_unique_ts={nu}, "
                            f"ts_min_utc={dt0.isoformat()}, ts_max_utc={dt1.isoformat()}"
                        )
                    else:
                        _log(
                            "INFO",
                            f"[DEBUG] Relectura {out_file.name}: n_bars={nb}, columnas={df_chk.columns}"
                        )

                    if nb < MIN_EXPECTED_BARS_PER_DAY_DEBUG:
                        _log(
                            "WARNING",
                            f"[DEBUG] {symbol} d√≠a {day_str}: s√≥lo {nb} barras en part=*.parquet "
                            f"(esperado ~{EXPECTED_BARS_M5}). Comprueba TIMEFRAME y extracci√≥n."
                        )
                except Exception as e:
                    _log("WARNING", f"[DEBUG] No se pudo re-leer {out_file}: {type(e).__name__}: {e}")
                debug_days_checked += 1
            # ---------------------------------------------------------------------------

        _log("INFO", f"{symbol} {yyyy_mm} ‚Üí d√≠as escritos acumulados (run para este s√≠mbolo): {days_written_sym}")

        del df

    if days_written_sym > 0:
        symbols_ok += 1
        _log("INFO", f"{symbol}: total d√≠as escritos en este run = {days_written_sym}")
        try:
            _update_catalog_for_symbol(symbol)
        except Exception as e:
            _log("WARNING", f"Cat√°logo no actualizado para {symbol}: {e}")
    else:
        _log("WARNING", f"{symbol}: no se escribi√≥ ning√∫n d√≠a en esta corrida (modo={sym_mode}).")
        _write_jsonl(
            RUN_LOG,
            {
                "run_id": RUN_ID,
                "cell": CELL_LABEL,
                "symbol": symbol,
                "status": "no_new_days",
                "mode": sym_mode,
                "msg": "sin_d√≠as_nuevos_en_ventana_o_activo_sin_trading",
            },
        )

# Close MT5 through the notebook's shutdown helper (defined outside this excerpt).
_mt5_shutdown()

# ----------------------- Final summary printout -----------------------
# Everything below only reports on the run. It reads the counters accumulated by
# the download loop above (total_bytes, total_days_written, symbols_ok).
print("=" * 110)
print(f">>> Celda 06 :: Descarga masiva M5 (4 a√±os, UTC+GYE, Polars-only, v2-debug)")
print(f"Salida base M5 raw: {BULK_M5_DIR}")
# The label in front of the eligible-symbol count depends on the strict precheck flag.
if STRICT_4Y_PRECHECK:
    modo_label = "s√≠mbolos elegibles (>=4y, modo estricto)"
else:
    modo_label = "s√≠mbolos elegibles (sin precheck estricto; Celda 05 + broker)"
print(f"{modo_label}: {len(kept)} | S√≠mbolos con datos escritos (en esta corrida): {symbols_ok}")
print(f"D√≠as/archivos escritos (esta corrida): {total_days_written}")
print(f"BULK_MODE_08 = {BULK_MODE} | INCR_BACKFILL_DAYS_08 = {INCR_BACKFILL_DAYS}")
print(f"FORCE_REDOWNLOAD_BULK_M5={FORCE_REDOWNLOAD} | FORCE_REWRITE_DAY={FORCE_REWRITE_DAY}")
print("-" * 110)
# Byte total expressed in MiB and GiB; both stay 0.0 when nothing was written.
MiB = (total_bytes / (1024 * 1024)) if total_bytes else 0.0
GiB = (total_bytes / (1024 * 1024 * 1024)) if total_bytes else 0.0
print(f"Tama√±o total escrito: {total_bytes} bytes ({MiB:0.2f} MiB | {GiB:0.2f} GiB)")
print("-" * 110)
# Names of the "symbol=*" directories found directly under the output root.
sym_dirs = sorted([p.name for p in BULK_M5_DIR.glob("symbol=*") if p.is_dir()])
print(f"Subcarpetas en m5_raw: {len(sym_dirs)}")
if sym_dirs:
    print("Ejemplos de subcarpetas:", ", ".join(sym_dirs[:10]))
print("-" * 110)
# End-of-cell timestamp: target-zone time plus UTC when the zone can be loaded,
# UTC only if anything in the zoneinfo path fails.
try:
    from zoneinfo import ZoneInfo
    print(
        f"Hora GYE fin: {datetime.now(timezone.utc).astimezone(ZoneInfo(TIMEZONE_IANA)).isoformat(timespec='seconds')} "
        f"| Hora UTC fin: {datetime.now(timezone.utc).isoformat(timespec='seconds')}"
    )
except Exception:
    print(f"Hora UTC fin: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
print("‚úÖ Celda 06 ‚Äî Bulk M5 completada: OK para continuar con la siguiente celda (Cat√°logo m5_raw / QA).")
# =======================================================================================================================


[2025-12-03T04:22:58+00:00] [20251202_232253] [INFO] [06-BulkM5] Usando lista elegible por coste (Celda 05): 107 s√≠mbolos
Ventana objetivo (UTC te√≥rica): 2021-12-02 ‚Üí 2025-12-02 (√∫ltimos 4 a√±os)
Universo de s√≠mbolos (filtro costes Celda 05 + broker): 107 / 131
Ejemplo de s√≠mbolos: EURUSD, GBPUSD, USDCHF, USDJPY, USDCAD, AUDUSD, AUDNZD, AUDCAD, AUDCHF, AUDJPY
STRICT_4Y_PRECHECK = False | BULK_MODE_08 = AUTO
FORCE_REDOWNLOAD_BULK_M5=False | FORCE_REWRITE_DAY=False
--------------------------------------------------------------------------------------------------------------
Inicio Celda 06 ‚Äî Descarga masiva M5 | TZ objetivo: America/Guayaquil
Hora GYE: 2025-12-02T23:22:58-05:00 | Hora UTC: 2025-12-03T04:22:58+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT       : C:\Quant\MT5_Data_Extraction\data
PAD salida M5   : C:\Quant\MT5_Data_Extraction\data\bulk_data\m5_raw
BULK_MODE_08    : AUTO
INCR_BACKFILL

In [8]:
# ===================== Celda 07 — Catálogo m5_raw (sobre lo ya descargado) =====================
# Objetivo:
#   - Escanear bulk_data/m5_raw y construir:
#       (a) Catálogo por símbolo (ligero, agregado)  -> metadata/dataset_catalog.parquet
#       (b) Manifest por archivo con metadatos       -> metadata/m5_manifest.parquet
#   - Manifest incluye: first_ts_utc / last_ts_utc (Int64 ms UTC) por archivo, para búsquedas rápidas.
#   - Extrae símbolo y fecha del path: symbol=.../year=YYYY/month=MM/part=YYYYMMDD.parquet
#   - Polars-only, sin pandas.
#   - Se asume que la Celda 06 (Descarga masiva M5) YA corrió y llenó bulk_data/m5_raw.
# =======================================================================================================

from __future__ import annotations
from pathlib import Path
from datetime import datetime, timezone
import re
from typing import Dict, Any, List, Optional, Tuple

import polars as pl

# Cell identity used in log lines; RUN_ID is reused from an earlier cell when
# present, otherwise a fresh UTC-based YYYYMMDD_HHMMSS stamp is generated.
CELL_LABEL   = "07-Catalogo-M5"
RUN_ID       = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))

# *** Strict use of the ROOT defined in earlier cells (no absolute fallback) ***
if "DATA_ROOT" not in globals():
    raise RuntimeError("DATA_ROOT no est√° definido. Ejecuta primero la Celda 02 (Configuraci√≥n y Rutas base).")
DATA_ROOT    = Path(globals()["DATA_ROOT"]).resolve()

# Input directory to scan and output locations for the two metadata tables.
M5_DIR         = DATA_ROOT / "bulk_data" / "m5_raw"
META_DIR       = DATA_ROOT / "metadata"
CATALOG_PATH   = META_DIR / "dataset_catalog.parquet"   # aggregated per symbol
MANIFEST_PATH  = META_DIR / "m5_manifest.parquet"       # per-file detail (includes timestamp ranges)
TIMEZONE_IANA  = globals().get("TIMEZONE_IANA", "America/Guayaquil")

# Allows disabling the per-file min/max computation when only a quick inventory is wanted.
READ_TS_MINMAX = bool(globals().get("READ_TS_MINMAX", True))

def _log(level: str, msg: str):
    """Print one log line tagged with time, run id, level and cell label.

    The timestamp is the machine's naive local time; the level is upper-cased.
    Output is flushed immediately.
    """
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    tags = (stamp, RUN_ID, level.upper(), CELL_LABEL)
    prefix = " ".join(f"[{tag}]" for tag in tags)
    print(f"{prefix} {msg}", flush=True)

def _now_local():
    """Return the current time as an ISO 8601 string with seconds precision.

    The preferred form is the wall-clock time in the zone named by
    TIMEZONE_IANA. If that zone cannot be loaded (for example no IANA
    database is available), the current UTC time is returned instead.

    Returns:
        str: an offset-aware ISO 8601 timestamp, e.g.
        "2025-12-02T23:24:19-05:00" or, on fallback,
        "2025-12-03T04:24:19+00:00".
    """
    now_utc = datetime.now(timezone.utc)
    try:
        from zoneinfo import ZoneInfo
        return now_utc.astimezone(ZoneInfo(TIMEZONE_IANA)).isoformat(timespec="seconds")
    except Exception:
        # An aware UTC datetime already serialises with a "+00:00" offset.
        # Appending "Z" on top of it produced the malformed "...+00:00Z".
        return now_utc.isoformat(timespec="seconds")

def _fmt_size(b: int) -> str:
    """Render a byte count as "<b> bytes (<x.xx> MiB | <y.yy> GiB)"."""
    mebibytes, gibibytes = (b / (1024 ** power) for power in (2, 3))
    return "{} bytes ({:.2f} MiB | {:.2f} GiB)".format(b, mebibytes, gibibytes)

# --- Utilidad para min/max timestamp_utc por archivo (r√°pida y robusta) ---
def _minmax_ts_in_parquet(fp: Path) -> Tuple[Optional[int], Optional[int]]:
    """
    Return (min, max) of the 'timestamp_utc' column of one Parquet file as ints.

    Only that column is read. A lazy scan is tried first; if it yields a null
    bound, the column is read eagerly and the bounds are computed again.
    Any failure (unreadable file, missing column, ...) yields (None, None).
    """
    def _int_or_none(value):
        return None if value is None else int(value)

    try:
        source = str(fp)
        ts_as_int = pl.col("timestamp_utc").cast(pl.Int64, strict=False)

        # Pass 1: lazy scan with both aggregations in a single select.
        bounds = (
            pl.scan_parquet(source)
            .select([ts_as_int.min().alias("_min"), ts_as_int.max().alias("_max")])
            .collect()
        )
        lo = bounds["_min"][0]
        hi = bounds["_max"][0]
        if not (lo is None or hi is None):
            return int(lo), int(hi)

        # Pass 2: eager read of the single column (fallback).
        ts_frame = pl.read_parquet(source, columns=["timestamp_utc"])
        if ts_frame.height == 0:
            return None, None
        lo = ts_frame.select(ts_as_int.min()).item()
        hi = ts_frame.select(ts_as_int.max()).item()
        return _int_or_none(lo), _int_or_none(hi)
    except Exception:
        # Best effort: any failure is reported as "no range available".
        return None, None

# Header: cell banner with the current time in the target zone and in UTC.
print("="*100)
print(f"Inicio Celda 07 ‚Äî Cat√°logo m5_raw | TZ objetivo: {TIMEZONE_IANA}")
print(f"Hora GYE: {_now_local()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
print("-"*100)
print(f"Escaneando carpeta m5_raw: {M5_DIR}")

# This cell never downloads anything: a missing input directory is a hard error.
if not M5_DIR.exists():
    raise FileNotFoundError(
        f"No existe {M5_DIR}.\n"
        "Esta celda SOLO cataloga lo que ya est√° en disco.\n"
        "Ejecuta primero la Celda 06 ‚Äî Descarga masiva M5."
    )

# File scan: part=YYYYMMDD.parquet
# The glob accepts any "part=*.parquet"; the regex is applied later to pull out
# the 8-digit day (case-insensitive) from each file name.
part_re = re.compile(r"part=(\d{8})\.parquet$", re.IGNORECASE)
files: List[Path] = list(M5_DIR.rglob("part=*.parquet"))
n_files = len(files)

# An empty directory aborts the cell with an explanatory message.
if n_files == 0:
    print("‚ö†Ô∏è  Cat√°logo m5_raw: no se encontraron archivos 'part=YYYYMMDD.parquet' en m5_raw.")
    print("    Ruta escaneada : ", M5_DIR)
    print("")
    print("    Esta celda **NO** descarga datos. Solo cataloga lo que YA existe en disco.")
    print("    Acci√≥n: ejecuta antes la Celda 06 ‚Äî Descarga masiva M5 y vuelve a correr esta Celda 07.")
    raise SystemExit("Cat√°logo m5_raw abortado: carpeta vac√≠a. Ejecuta Celda 06 primero.")

# Running byte total over every file whose size could be read.
total_bytes = 0

# Per-symbol accumulators, keyed by symbol name.
agg: Dict[str, Dict[str, Any]] = {}
# One detailed manifest row per Parquet file.
manifest_rows: List[Dict[str, Any]] = []

for fp in files:
    # File size; a failed stat() counts as 0 bytes and leaves the total untouched.
    try:
        sz = fp.stat().st_size
    except Exception:
        sz = 0
    else:
        total_bytes += sz

    # Symbol = value of the first "symbol=..." path component, if there is one.
    sym = next(
        (seg.split("=", 1)[-1] for seg in fp.parts if seg.startswith("symbol=")),
        "(UNKNOWN)",
    )

    # Day taken from the file name: 'YYYYMMDD', or None when it does not match.
    m = part_re.search(fp.name)
    day_str = None if m is None else m.group(1)

    # First/last timestamp in ms for this file (skipped when disabled).
    fmin, fmax = _minmax_ts_in_parquet(fp) if READ_TS_MINMAX else (None, None)

    # Fetch the symbol's accumulator, creating it on first sight.
    rec = agg.get(sym)
    if rec is None:
        rec = agg[sym] = {
            "symbol": sym,
            "n_files": 0,
            "bytes": 0,
            "days": set(),          # set of 'YYYYMMDD' strings
            "first_ts_utc": None,   # min(ms) seen for the symbol
            "last_ts_utc": None,    # max(ms) seen for the symbol
        }

    rec["n_files"] += 1
    rec["bytes"] += sz
    if day_str:
        rec["days"].add(day_str)

    # Widen the symbol-level ms range with this file's bounds.
    if fmin is not None:
        seen_min = rec["first_ts_utc"]
        rec["first_ts_utc"] = fmin if seen_min is None else min(seen_min, fmin)
    if fmax is not None:
        seen_max = rec["last_ts_utc"]
        rec["last_ts_utc"] = fmax if seen_max is None else max(seen_max, fmax)

    # Manifest row (per file); timestamps are ms values or None if unavailable.
    manifest_rows.append(dict(
        symbol=sym,
        day=day_str,
        size_bytes=sz,
        relpath=str(fp.relative_to(DATA_ROOT)),
        first_ts_utc=fmin,
        last_ts_utc=fmax,
    ))

print(f"Archivos Parquet encontrados: {n_files} | Tama√±o total aprox: {_fmt_size(total_bytes)}")

# --------------------------------- Build final catalog (per symbol) ---------------------------------
# A single "last updated" stamp (ms since epoch, UTC) is shared by every row.
now_ms_utc = int(datetime.now(timezone.utc).timestamp() * 1000)
catalog_rows: List[Dict[str, Any]] = []

for sym, rec in agg.items():
    # Day coverage from the 'YYYYMMDD' strings collected during the scan.
    days_set = rec["days"]
    n_days = len(days_set)
    first_day, last_day = (min(days_set), max(days_set)) if days_set else (None, None)

    f_ts, l_ts = rec["first_ts_utc"], rec["last_ts_utc"]

    # Calendar months/years touched by the [first, last] timestamp range;
    # both stay 0 when the range is unknown.
    n_months = n_years = 0
    if f_ts is not None and l_ts is not None:
        dt0 = datetime.fromtimestamp(f_ts / 1000, tz=timezone.utc)
        dt1 = datetime.fromtimestamp(l_ts / 1000, tz=timezone.utc)
        year_gap = dt1.year - dt0.year
        n_years = year_gap + 1
        n_months = year_gap * 12 + (dt1.month - dt0.month) + 1

    bytes_total = rec["bytes"]
    mib = 0.0 if bytes_total is None else bytes_total / (1024**2)

    catalog_rows.append(dict(
        symbol=sym,
        first_ts_utc=f_ts,
        last_ts_utc=l_ts,
        n_files=rec["n_files"],
        n_days=n_days,
        n_months=n_months,
        n_years=n_years,
        bytes=bytes_total,
        last_update_ts_utc=now_ms_utc,
        notes="",
        # extra columns for QA / human reading
        first_day=first_day,
        last_day=last_day,
        mib=mib,
    ))

# --------------------------------- DataFrames and persistence ---------------------------------
META_DIR.mkdir(parents=True, exist_ok=True)

# Explicit schemas keep column order and dtypes stable even when values are None.
_catalog_schema = {
    "symbol": pl.Utf8,
    "first_ts_utc": pl.Int64,
    "last_ts_utc": pl.Int64,
    "n_files": pl.Int64,
    "n_days": pl.Int64,
    "n_months": pl.Int64,
    "n_years": pl.Int64,
    "bytes": pl.Int64,
    "last_update_ts_utc": pl.Int64,
    "notes": pl.Utf8,
    "first_day": pl.Utf8,
    "last_day": pl.Utf8,
    "mib": pl.Float64,
}
_manifest_schema = {
    "symbol": pl.Utf8,
    "day": pl.Utf8,            # 'YYYYMMDD' or None
    "size_bytes": pl.Int64,
    "relpath": pl.Utf8,
    "first_ts_utc": pl.Int64,  # ms (nullable)
    "last_ts_utc": pl.Int64,   # ms (nullable)
}

# Per-symbol catalog, sorted by symbol, written first.
catalog_df = pl.DataFrame(catalog_rows, schema=_catalog_schema)
catalog_df = catalog_df.sort("symbol")
catalog_df.write_parquet(CATALOG_PATH, compression="zstd", statistics=True)

# Per-file manifest, kept in scan order.
manifest_df = pl.DataFrame(manifest_rows, schema=_manifest_schema)
manifest_df.write_parquet(MANIFEST_PATH, compression="zstd", statistics=True)

# --------------------------------- Final prints / quick QA ---------------------------------
print("-"*100)
print(f"Cat√°logo por s√≠mbolo guardado en: {CATALOG_PATH}")
print(f"Manifest por archivo guardado en: {MANIFEST_PATH}")
print("-"*100)

# Top-10 symbols by number of files
top_by_files = catalog_df.sort("n_files", descending=True).head(10)

print("Top-10 por n_files:")
for r in top_by_files.iter_rows(named=True):
    # One line per symbol: counts, day range, ms range and size in MiB.
    print(
        f"  - {r['symbol']}: files={r['n_files']} | days={r['n_days']} | "
        f"rango_d√≠as={r['first_day']}‚Üí{r['last_day']} | "
        f"rango_ms={r['first_ts_utc']}‚Üí{r['last_ts_utc']} | "
        f"{r['mib']:.2f} MiB"
    )

print("-"*100)

# Top-10 symbols by number of distinct days
top_by_days  = catalog_df.sort("n_days", descending=True).head(10)
print("Top-10 por n_days:")
for r in top_by_days.iter_rows(named=True):
    # Same fields as above, with days listed before files.
    print(
        f"  - {r['symbol']}: days={r['n_days']} | files={r['n_files']} | "
        f"rango_d√≠as={r['first_day']}‚Üí{r['last_day']} | "
        f"rango_ms={r['first_ts_utc']}‚Üí{r['last_ts_utc']} | "
        f"{r['mib']:.2f} MiB"
    )

print("-"*100)
print("‚úÖ Cat√°logo m5_raw generado.")
print("   - metadata/dataset_catalog.parquet  (agregado por s√≠mbolo)")
print("   - metadata/m5_manifest.parquet      (detalle por archivo, con first_ts_utc/last_ts_utc en ms UTC)")
print("   OK para continuar con la siguiente celda del pipeline.")
# =======================================================================================================


Inicio Celda 07 ‚Äî Cat√°logo m5_raw | TZ objetivo: America/Guayaquil
Hora GYE: 2025-12-02T23:24:19-05:00 | Hora UTC: 2025-12-03T04:24:19+00:00
----------------------------------------------------------------------------------------------------
Escaneando carpeta m5_raw: C:\Quant\MT5_Data_Extraction\data\bulk_data\m5_raw
Archivos Parquet encontrados: 121428 | Tama√±o total aprox: 1256437183 bytes (1198.23 MiB | 1.17 GiB)
----------------------------------------------------------------------------------------------------
Cat√°logo por s√≠mbolo guardado en: C:\Quant\MT5_Data_Extraction\data\metadata\dataset_catalog.parquet
Manifest por archivo guardado en: C:\Quant\MT5_Data_Extraction\data\metadata\m5_manifest.parquet
----------------------------------------------------------------------------------------------------
Top-10 por n_files:
  - AAVUSD: files=1471 | days=1471 | rango_d√≠as=20211119‚Üí20251202 | rango_ms=1637280000000‚Üí1764654600000 | 12.70 MiB
  - AVAUSD: files=1471 | days=1

In [9]:
# ============================ Celda 08 — Contrato/schema M5 (UTC+GYE, idempotente) ============================
# Objetivo (UNA función):
#   - Definir y persistir el contrato de esquema M5 en metadata/schema_m5.json (SCHEMA_M5_PATH).
#   - EXIGIR doble sello temporal:
#         * timestamp_utc  -> int64 epoch milisegundos (UTC)
#         * timestamp_gye  -> datetime con zona IANA TIMEZONE_IANA (America/Guayaquil por defecto)
#   - Documentar columnas, tipos, unidades y políticas:
#         - Particionado físico en m5_raw: symbol=SYMBOL/year=YYYY/month=MM/part=YYYYMMDD.parquet
#         - Parquet (compresión, estadísticas, tz explícita en timestamp_gye)
#         - spread_points en "points" (no pips).
#   - NO leer ni escribir datos de mercado (solo metadata).
#
# Uso:
#   - Obligatoria al menos una vez por entorno/proyecto.
#   - Idempotente: si se re-ejecuta, reescribe schema_m5.json con el mismo contrato (o con ajustes explícitos).
# ============================================================================================================

from pathlib import Path
from datetime import datetime, timezone
import json

# --------------------------------- Pipeline context guards ---------------------------------
# Relies on the variables defined in Cells 01-02. If any is missing, a safe default is set.
if "DATA_ROOT" not in globals():
    # Safe fallback for when this cell is run in isolation.
    DATA_ROOT = (Path.cwd() / "data").resolve()

if "METADATA_DIR" not in globals():
    # The directory is created only on this fallback path.
    METADATA_DIR = (DATA_ROOT / "metadata").resolve()
    METADATA_DIR.mkdir(parents=True, exist_ok=True)

if "SCHEMA_M5_PATH" not in globals():
    SCHEMA_M5_PATH = METADATA_DIR / "schema_m5.json"

# Settings inherited from earlier cells when present, otherwise defaulted here.
# NOTE: the RUN_ID fallback uses naive local time.
RUN_ID = globals().get("RUN_ID", datetime.now().strftime("%Y%m%d_%H%M%S"))
TIMEFRAME_LABEL = globals().get("TIMEFRAME_LABEL", "M5")
DATA_VERSION = globals().get("DATA_VERSION", "v1")
PARQUET_COMPRESSION = globals().get("PARQUET_COMPRESSION", "zstd")
PARQUET_WRITE_STATISTICS = bool(globals().get("PARQUET_WRITE_STATISTICS", True))
TIMEZONE_IANA = globals().get("TIMEZONE_IANA", "America/Guayaquil")

# Relevant physical directories (for documentation only; no data is written here)
BULK_DATA_DIR = globals().get("BULK_DATA_DIR", DATA_ROOT / "bulk_data")
M5_RAW_DIR = globals().get("M5_RAW_DIR", BULK_DATA_DIR / "m5_raw")
HISTORICAL_DATA_DIR = globals().get("HISTORICAL_DATA_DIR", DATA_ROOT / "historical_data")
M5_CLEAN_DIR = globals().get("M5_CLEAN_DIR", HISTORICAL_DATA_DIR / "m5_clean")

# =================================== M5 schema contract ===================================
# Design decisions (in line with Cells 06 / 07 of the pipeline):
#   - timestamp_utc:
#         * Canonical column for ordering and for joins between datasets.
#         * Int64 ms UTC (epoch * 1000).
#   - timestamp_gye:
#         * Datetime[ms, TIMEZONE_IANA] (e.g. 'America/Guayaquil').
#         * ALWAYS derived from timestamp_utc in the write path (not computed "by hand" in each cell).
#   - OHLC fields, volumes and spread_points:
#         * float64/int64 numeric types to avoid surprises.
#         * spread_points in "points" (not pips), as returned by MT5 in the rates structure.
#   - Physical partitioning:
#         * m5_raw: symbol=SYMBOL/year=YYYY/month=MM/part=YYYYMMDD.parquet
#         * Each file = one day/symbol.
#   - GOLD layer (m5_clean):
#         * Must honour the same logical schema (double stamp + OHLC + volumes + optional spread_points).
#
# The dictionary below is plain, JSON-serialisable data: it is dumped to
# SCHEMA_M5_PATH and read back by the statements that follow it.
schema_m5 = {
    "dataset": {
        "name": "mt5_rates_m5",
        "layer_raw": str(M5_RAW_DIR),
        "layer_clean": str(M5_CLEAN_DIR),
        "description": (
            "Barras de 5 minutos (M5) normalizadas con doble sello temporal (UTC + local IANA) "
            "para el servidor MT5 usado en este proyecto."
        ),
        "timeframe_label": TIMEFRAME_LABEL,
        "data_version": DATA_VERSION,
        "run_id": RUN_ID,
        # Changes on every execution, so the file content is not byte-stable across runs.
        "created_ts_utc": datetime.now(timezone.utc).isoformat(),
        "notes": [
            "timestamp_utc es int64 (ms epoch UTC) y es la clave de ordenaci√≥n y joins.",
            f"timestamp_gye es datetime[ms,'{TIMEZONE_IANA}'], derivado 1:1 de timestamp_utc.",
            "spread_points est√° en 'points' (no pips); no se infiere desde ticks en este cuaderno.",
            "m5_raw y m5_clean deben respetar este contrato de columnas y tipos.",
        ],
    },
    # One entry per column: name, declared dtype label, required flag, unit, description.
    "columns": [
        {
            "name": "timestamp_utc",
            "dtype": "int64_epoch_ms_utc",
            "required": True,
            "unit": "ms_since_epoch_UTC",
            "description": "Marca temporal de la barra (UTC) en milisegundos desde epoch (1970-01-01T00:00:00Z).",
            "examples": [1713312000000, 1713312300000],
        },
        {
            "name": "timestamp_gye",
            "dtype": "datetime_tz_ms",
            "tz": TIMEZONE_IANA,
            "required": True,
            "unit": "datetime_with_tz",
            "description": f"Marca temporal en zona IANA '{TIMEZONE_IANA}', derivada de timestamp_utc.",
            "examples": ["2024-04-17T00:00:00-05:00", "2024-04-17T00:05:00-05:00"],
        },
        {
            "name": "symbol",
            "dtype": "string",
            "required": True,
            "unit": None,
            "description": "Nombre can√≥nico del s√≠mbolo en MT5 (ej. 'EURUSD', 'XAUUSD', 'US500.cash').",
            "examples": ["EURUSD", "XAUUSD"],
        },
        {
            "name": "open",
            "dtype": "float64",
            "required": True,
            "unit": "price_native",
            "description": "Precio de apertura de la barra.",
        },
        {
            "name": "high",
            "dtype": "float64",
            "required": True,
            "unit": "price_native",
            "description": "M√°ximo de la barra.",
        },
        {
            "name": "low",
            "dtype": "float64",
            "required": True,
            "unit": "price_native",
            "description": "M√≠nimo de la barra.",
        },
        {
            "name": "close",
            "dtype": "float64",
            "required": True,
            "unit": "price_native",
            "description": "Cierre de la barra.",
        },
        {
            "name": "tick_volume",
            "dtype": "int64",
            "required": True,
            "unit": "ticks_count",
            "description": "Volumen en ticks reportado por MT5 para la barra.",
        },
        {
            "name": "real_volume",
            "dtype": "int64_nullable",
            "required": False,
            "unit": "contracts_or_lots",
            "description": "Volumen real si el br√≥ker lo provee; en caso contrario NULL.",
        },
        {
            "name": "spread_points",
            "dtype": "float64_nullable",
            "required": False,
            "unit": "points",
            "description": (
                "Spread de la barra en 'points' (normalizado por Point del s√≠mbolo). "
                "Si MT5 no lo provee en el rate M5, se persiste NULL."
            ),
        },
        {
            "name": "broker",
            "dtype": "string_nullable",
            "required": False,
            "unit": None,
            "description": "Identificador del br√≥ker/servidor MT5 (opcional).",
        },
        {
            "name": "server_tz",
            "dtype": "string_nullable",
            "required": False,
            "unit": "IANA_or_vendor_label",
            "description": "Etiqueta de zona del servidor MT5 (opcional, orientativa; no se usa para joins).",
        },
    ],
    # Logical and physical order expected by Cell 06 (Bulk M5) and the GOLD layer
    "column_order": [
        "timestamp_utc",
        "timestamp_gye",
        "symbol",
        "open",
        "high",
        "low",
        "close",
        "tick_volume",
        "real_volume",
        "spread_points",
        "broker",
        "server_tz",
    ],
    "parquet": {
        "compression": PARQUET_COMPRESSION,
        "write_statistics": PARQUET_WRITE_STATISTICS,
        "encoding_notes": [
            "Se recomienda escribir con compresi√≥n y estad√≠sticas activadas para permitir predicate/pushdown.",
            f"timestamp_gye debe escribirse como Arrow timestamp[ms,tz='{TIMEZONE_IANA}'] (no como string).",
            "timestamp_utc se almacena como Int64 (ms epoch) para joins r√°pidos y robustos.",
        ],
    },
    "partitioning": {
        "raw_layout": "symbol=SYMBOL/year=YYYY/month=MM/part=YYYYMMDD.parquet",
        "raw_root": str(M5_RAW_DIR),
        "keys": ["symbol", "year", "month"],
        "filename_pattern": "part=YYYYMMDD.parquet",
        "rationale": (
            "Escrituras idempotentes por d√≠a y lecturas selectivas por rango/s√≠mbolo. "
            "El manifest y el dataset_catalog se basan en este layout."
        ),
    },
    "timestamp_policy": {
        "double_stamp_required": True,
        "utc_column": "timestamp_utc",
        "local_column": "timestamp_gye",
        "local_tz": TIMEZONE_IANA,
        "utc_representation": "int64_epoch_ms",
        "local_representation": "datetime_tz_ms",
        "conversion_rules": [
            "timestamp_gye = convert_time_zone(from='UTC', to=TIMEZONE_IANA) aplicado sobre timestamp_utc.",
            "NO se almacenan datetime 'naive'.",
            "En todas las celdas de I/O se deben reportar rangos en UTC y TZ local.",
        ],
    },
    "spread_policy": {
        "unit": "points",
        "source": "bar_level_if_available",
        "inference": "no_inference_from_ticks_in_this_notebook",
        "notes": [
            "Si el rate M5 entrega spread, se normaliza a points usando Point del s√≠mbolo (responsabilidad de la celda de descarga).",
            "Si no lo entrega, se persiste NULL y se reporta en QA (rejilla/coverage).",
        ],
    },
    "idempotency_policy": {
        "flags": {
            "FORCE_REDOWNLOAD_BULK_M5": (
                "Re-descarga hist√≥rica completa por s√≠mbolo/ventana en Celda 06 (ignora ficheros existentes)."
            ),
            "FORCE_REWRITE_DAY": (
                "Reescritura puntual de un d√≠a (symbol + YYYYMMDD) incluso si el archivo part=YYYYMMDD.parquet existe."
            ),
        },
        "default": "append_or_skip_if_exists",
        "notes": [
            "Antes de persistir: deduplicar por (symbol, timestamp_utc) en la celda de descarga/limpieza.",
            "Si existe el archivo del d√≠a y no hay flags de fuerza, se omite para evitar duplicados.",
        ],
    },
    "quality_checks": {
        "duplicates": "No se permiten duplicados por (symbol, timestamp_utc) dentro de cada archivo ni a nivel de s√≠mbolo.",
        "ohlc_consistency": "Debe cumplirse low <= high y no NaN en OHLC obligatorios.",
        "m5_grid": (
            "Validar ~288 barras/d√≠a por s√≠mbolo (rejilla M5 completa) salvo festivos/huecos explicados. "
            "La QA en celdas posteriores debe cuantificar esta rejilla."
        ),
        "tz_consistency": (
            "timestamp_gye debe corresponder EXACTAMENTE a timestamp_utc convertido a TIMEZONE_IANA "
            "(sin desfaces ni offset dobles)."
        ),
    },
}

# =================================== Persist and validate by reading back ===================================
SCHEMA_M5_PATH.parent.mkdir(parents=True, exist_ok=True)

# Write as UTF-8 JSON; ensure_ascii=False keeps accented text unescaped in the file.
with open(SCHEMA_M5_PATH, "w", encoding="utf-8") as f:
    json.dump(schema_m5, f, ensure_ascii=False, indent=2)

# Re-read the file just written; every print below uses this loaded copy,
# so the report reflects what is actually on disk.
with open(SCHEMA_M5_PATH, "r", encoding="utf-8") as f:
    loaded = json.load(f)

# =================================== Mandatory prints ===================================
print("================================================================================")
print("Celda 08 ‚Äî Contrato/schema M5 (UTC+GYE)")
print("--------------------------------------------------------------------------------")
print(f"Esquema M5 guardado en : {SCHEMA_M5_PATH}")
print(f"DATA_ROOT              : {DATA_ROOT}")
print(f"m5_raw (capa raw)      : {M5_RAW_DIR}")
print(f"m5_clean (capa GOLD)   : {M5_CLEAN_DIR}")
print("--------------------------------------------------------------------------------")
print("Columnas (orden, tipos, required, unidad):")
# Walk the declared column order and look up each column's definition by name;
# names without a matching definition are skipped silently.
for col in loaded["column_order"]:
    cdef = next((c for c in loaded["columns"] if c["name"] == col), None)
    if not cdef:
        continue
    extra = ""
    if cdef["name"] == "timestamp_gye":
        # Only the local-time column carries a tz attribute worth showing.
        extra = f" | tz={cdef.get('tz', TIMEZONE_IANA)}"
    print(
        f"  - {cdef['name']:<14} : {cdef['dtype']:<18} "
        f"| required={str(cdef['required']):<5} | unit={str(cdef['unit']):<18}{extra}"
    )

print("--------------------------------------------------------------------------------")
print("Parquet (config por defecto del contrato):")
print(f"  - compresi√≥n       : {loaded['parquet']['compression']}")
print(f"  - write_statistics : {loaded['parquet']['write_statistics']}")
print("--------------------------------------------------------------------------------")
print("Particionado f√≠sico (capa m5_raw):")
print(f"  - root             : {loaded['partitioning']['raw_root']}")
print(f"  - layout           : {loaded['partitioning']['raw_layout']}")
print(f"  - filename_pattern : {loaded['partitioning']['filename_pattern']}")
print(f"  - keys             : {', '.join(loaded['partitioning']['keys'])}")
print("--------------------------------------------------------------------------------")
tp = loaded["timestamp_policy"]
print("Pol√≠tica de doble sello (UTC + TZ local):")
print(f"  - double_stamp_required = {tp['double_stamp_required']}")
print(f"  - utc_column / local_column = {tp['utc_column']} / {tp['local_column']}")
print(f"  - utc_rep / local_rep      = {tp['utc_representation']} / {tp['local_representation']}")
print(f"  - local_tz                 = {tp['local_tz']}")
print("--------------------------------------------------------------------------------")
print("Notas de spread_points:")
for note in loaded["spread_policy"]["notes"]:
    print(f"  * {note}")
print("--------------------------------------------------------------------------------")
print("‚úÖ Celda 08 ‚Äî Contrato/schema M5 normalizado: OK para continuar con la siguiente celda del pipeline.")
# ============================================================================================================


Celda 08 ‚Äî Contrato/schema M5 (UTC+GYE)
--------------------------------------------------------------------------------
Esquema M5 guardado en : C:\Quant\MT5_Data_Extraction\data\metadata\schema_m5.json
DATA_ROOT              : C:\Quant\MT5_Data_Extraction\data
m5_raw (capa raw)      : C:\Quant\MT5_Data_Extraction\data\bulk_data\m5_raw
m5_clean (capa GOLD)   : C:\Quant\MT5_Data_Extraction\data\historical_data\m5_clean
--------------------------------------------------------------------------------
Columnas (orden, tipos, required, unidad):
  - timestamp_utc  : int64_epoch_ms_utc | required=True  | unit=ms_since_epoch_UTC
  - timestamp_gye  : datetime_tz_ms     | required=True  | unit=datetime_with_tz   | tz=America/Guayaquil
  - symbol         : string             | required=True  | unit=None              
  - open           : float64            | required=True  | unit=price_native      
  - high           : float64            | required=True  | unit=price_native      
  - low      

In [10]:
# ======================= Celda 08 — Diagnóstico rápido QA M5 (opcional) =======================
# Propósito:
#   - Inspeccionar rápidamente el resultado de qa_m5_bulk.parquet (Celda 10).
#   - NO forma parte del pipeline "duro": es sólo para debug / análisis manual.
#   - NO pisa DATA_ROOT: reutiliza el DATA_ROOT definido en la Celda 02.
#
# Qué hace:
#   1) Lee metadata/qa_m5_bulk.parquet.
#   2) Muestra el nº de días por status (OK / WARN / FAIL / EMPTY), con porcentaje.
#   3) Lista las causas de FAIL más frecuentes (columna 'notes'), top 20.
#   4) (Opcional) Si quieres, aquí puedes extender con más análisis de WARN, rejilla_pct, etc.
# ==============================================================================================

from pathlib import Path
import polars as pl

# Reuse the DATA_ROOT already present in the notebook namespace; this cell
# refuses to run without it instead of guessing a location.
if "DATA_ROOT" not in globals():
    raise RuntimeError(
        "DATA_ROOT no est√° definido en globals(). "
        "Ejecuta la Celda 02 antes de correr esta Celda 08 de diagn√≥stico QA."
    )

DATA_ROOT = Path(globals()["DATA_ROOT"]).resolve()
QA_OUT_PATH = DATA_ROOT.joinpath("metadata", "qa_m5_bulk.parquet")

# The QA report must exist on disk before it can be inspected.
if not QA_OUT_PATH.exists():
    raise FileNotFoundError(
        f"No se encontr√≥ qa_m5_bulk.parquet en:\n  {QA_OUT_PATH}\n"
        "Ejecuta la Celda 10 (QA operativa M5) antes de esta celda de diagn√≥stico."
    )

# Banner: one print call that emits the same five lines, in the same order.
print(
    "================================================================================",
    "Celda 08 ‚Äî Diagn√≥stico r√°pido de QA M5 (qa_m5_bulk.parquet)",
    f"DATA_ROOT   : {DATA_ROOT}",
    f"QA_OUT_PATH : {QA_OUT_PATH}",
    "================================================================================",
    sep="\n",
)

df_qa = pl.read_parquet(QA_OUT_PATH)

# -------------------- 1) Summary by status (OK / WARN / FAIL / EMPTY) ------------------------
# Prints the number of QA rows per status together with each status' share (in %)
# of all rows, sorted from the most to the least frequent status.
if df_qa.height == 0:
    print("‚ö†Ô∏è  qa_m5_bulk.parquet est√° vac√≠o: no hay d√≠as auditados para este RUN_ID.")
elif "status" not in df_qa.columns:
    # Guard: group_by("status") would raise on a table that lacks the column,
    # aborting the whole diagnostic cell instead of reporting the problem.
    print("[WARN] La tabla QA no tiene columna 'status'; se omite el resumen por status.")
else:
    print("‚úî Resumen de d√≠as por status:")
    resumen_status = (
        df_qa
        .group_by("status")
        .agg(pl.len().alias("n_dias"))
        .with_columns(
            # Share of each status over the total row count, as a percentage.
            (pl.col("n_dias") / pl.col("n_dias").sum() * 100.0).alias("pct")
        )
        .sort("n_dias", descending=True)
    )
    print(resumen_status)

# -------------------- 2) Most frequent FAIL causes (notes) -------------------------------
# Lists the 20 most repeated 'notes' values among rows whose status is FAIL.
if "status" not in df_qa.columns:
    print("\n‚ö†Ô∏è  La tabla QA no tiene columna 'status'; no se puede desglosar FAIL por notas.")
else:
    df_fail = df_qa.filter(pl.col("status") == "FAIL")

    if df_fail.height == 0:
        print("\n‚úî No hay d√≠as con status FAIL en qa_m5_bulk.parquet.")
    elif "notes" not in df_fail.columns:
        print("\n‚ö†Ô∏è  Hay d√≠as FAIL pero no existe columna 'notes'; no se pueden listar las causas.")
    else:
        print("\n‚úî Top 20 causas de FAIL (columna 'notes'):")
        # Count rows per note, then keep the 20 largest counts.
        notes_count = df_fail.group_by("notes").agg(pl.len().alias("n"))
        top_notes = notes_count.sort("n", descending=True).head(20)
        print(top_notes)

# -------------------- 3) (Optional) rejilla_pct statistics by status -----------
# Prints mean / median / min / max of rejilla_pct for every status. When the table
# has no 'status' column the same statistics are printed for the whole table
# instead of raising inside group_by("status").
if "rejilla_pct" not in df_qa.columns:
    print("\n‚ö†Ô∏è  La tabla QA no tiene columna 'rejilla_pct'; se omite resumen de rejilla.")
else:
    rejilla_aggs = [
        pl.col("rejilla_pct").mean().alias("rejilla_mean"),
        pl.col("rejilla_pct").quantile(0.5).alias("rejilla_p50"),
        pl.col("rejilla_pct").min().alias("rejilla_min"),
        pl.col("rejilla_pct").max().alias("rejilla_max"),
    ]
    if "status" in df_qa.columns:
        print("\n‚úî Rejilla_pct media por status (para entender por qu√© hay WARN/FAIL):")
        rejilla_por_status = (
            df_qa
            .group_by("status")
            .agg(rejilla_aggs)
            .sort("status")
        )
        print(rejilla_por_status)
    else:
        print("\n[WARN] La tabla QA no tiene columna 'status'; se muestra rejilla_pct global.")
        print(df_qa.select(rejilla_aggs))

print("================================================================================")
print("‚úÖ Fin de Celda 08 ‚Äî Diagn√≥stico QA M5 (esta celda NO altera el pipeline, s√≥lo lee metadata).")
print("================================================================================")


Celda 08 ‚Äî Diagn√≥stico r√°pido de QA M5 (qa_m5_bulk.parquet)
DATA_ROOT   : C:\Quant\MT5_Data_Extraction\data
QA_OUT_PATH : C:\Quant\MT5_Data_Extraction\data\metadata\qa_m5_bulk.parquet
‚úî Resumen de d√≠as por status:
shape: (2, 3)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îê
‚îÇ status ‚îÜ n_dias ‚îÜ pct       ‚îÇ
‚îÇ ---    ‚îÜ ---    ‚îÜ ---       ‚îÇ
‚îÇ str    ‚îÜ u32    ‚îÜ f64       ‚îÇ
‚ïû‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï™‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ï°
‚îÇ OK     ‚îÜ 63055  ‚îÜ 75.730826 ‚îÇ
‚îÇ WARN   ‚îÜ 20207  ‚îÜ 24.269174 ‚îÇ
‚îî‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¥‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¥‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îò

‚úî No hay d√≠as con status FAIL en qa_m5_bulk.parquet.

‚úî Rejilla_pct media por status (para entender por qu√© hay WARN/FAIL):
shape: (2, 5)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚î¨‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ

In [11]:
# ======================= Celda 10 — QA operativa M5 (Polars, 3B, EAGER, robusta v4 + perfil sesión + QA económica) =======================
# - Toma DATA_ROOT de la Celda 02 (si no existe, falla).
# - EAGER por archivo; tipado estable y schema fijo para evitar ComputeError al construir el DataFrame.
#
# Cambios v4 (respecto a v3):
#   * Mantiene el mismo esquema de salida y los mismos artefactos:
#       - metadata/qa_m5_bulk.parquet  (nivel día: símbolo+fecha)
#       - metadata/qa_operativa_summary.parquet (nivel símbolo)
#   * Nueva lógica de status por día:
#       - FAIL  → errores FATALES de estructura: missing_OHLC, OHLC_null, OHLC_nan, low>high, errores de lectura.
#       - WARN  → estructura sana pero problemas corregibles: dups>0 y/o rejilla_pct < WARN_GRID_PCT.
#       - OK    → estructura sana, sin dups y con rejilla_pct >= WARN_GRID_PCT.
#   * Mantiene días vacíos como status="EMPTY" (no contaminan QA).
#   * Añade sanity-check global de cobertura M5:
#       - Calcula n_bars_mean, n_bars_p50, rejilla_mean, rejilla_p50.
#       - Si rejilla_mean < MIN_MEAN_REJILLA_HARD y n_bars_mean < MIN_MEAN_NBARS_HARD:
#             lanza RuntimeError → el dataset NO se considera M5 operativo.
#
# Perfil de sesión + flags por símbolo:
#   * Resumen por símbolo (qa_operativa_summary.parquet):
#       - n_bars_mean, n_bars_p50
#       - expected_bars_per_day_symbol ≈ n_bars_p50
#       - session_type ∈ {SESSION_24H, SESSION_DIURNA, SESSION_ILLQ}
#       - qa_struct_flag ∈ {OK_STRUCT, WARN_STRUCT, BAD_STRUCT} (sólo estructura, sin rejilla_pct).
#
#   * qa_operativa_flag (OK/WARN/BAD), ahora *session-aware*:
#       - BAD:
#           · qa_struct_flag == "BAD_STRUCT"
#           · o fail_ratio ≥ QA_BAD_MIN_FAIL_RATIO
#           · o empty_ratio > QA_BAD_MAX_EMPTY_RATIO
#           · o (session_type == "SESSION_24H" y rejilla_pct_mean < QA_BAD_MAX_REJILLA_MEAN)
#         ⇒ sólo FX/CFD 24h se castigan por rejilla 24h muy baja.
#       - OK:
#           · ok_ratio alto, fail_ratio bajo,
#           · y cobertura razonable, ajustada al tipo de sesión:
#               · 24h → rejilla_pct_mean ≥ QA_OK_MIN_REJILLA_MEAN
#               · diurnas/ilíquidas → no se exige rejilla 24h alta.
#       - WARN:
#           · todo lo demás (incluidas acciones 100% WARN por rejilla 24h baja, pero estructura sana).
#
# Cambios extra en esta versión:
#   * QA económica diaria:
#       - pct_ret_zero: proporción de retornos 5m exactamente 0.0 (se almacena como fracción 0–1, no como %)
#       - qa_price_flag_day ∈ {OK, BAD, EMPTY, FAIL}
#   * QA económica por símbolo:
#       - days_price_ok, days_price_bad, ratios asociados
#       - qa_price_flag ∈ {OK, WARN, BAD} a nivel símbolo
# =======================================================================================================================================

from __future__ import annotations
from pathlib import Path
from datetime import datetime, timezone
from typing import Tuple, Optional, List
import re, time
import polars as pl

# ----------------------------- Parameters / Paths -----------------------------
CELL_LABEL = "10-QA-M5-Operativa"
# Reuse an existing RUN_ID from the notebook namespace; otherwise stamp the current UTC time.
RUN_ID = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))

# DATA_ROOT must already be defined in the notebook namespace (no fallback).
if "DATA_ROOT" not in globals():
    raise RuntimeError("DATA_ROOT no est√° definido. Ejecuta la Celda 02 antes de esta celda.")
DATA_ROOT = Path(globals()["DATA_ROOT"]).resolve()

# Input / output locations, all derived from DATA_ROOT.
META_DIR = DATA_ROOT / "metadata"
FILTERS_DIR = META_DIR / "filters"
BULK_M5_DIR = DATA_ROOT / "bulk_data" / "m5_raw"

ELIGIBLE_PARQUET = FILTERS_DIR / "eligible_symbols_by_cost.parquet"
ELIGIBLE_TXT = FILTERS_DIR / "eligible_symbols_by_cost.txt"
QA_OUT_PATH = META_DIR / "qa_m5_bulk.parquet"
QA_SUMMARY_PATH = META_DIR / "qa_operativa_summary.parquet"

# Short-lived alias: every tunable below is read from the notebook globals when
# present and otherwise takes the default written next to it.
_cfg = globals().get

# Expected bars and grid ("rejilla") / QA thresholds
EXPECTED_BARS_M5 = int(_cfg("EXPECTED_BARS_M5", 288))
EXPECTED_BARS = EXPECTED_BARS_M5  # local alias
WARN_GRID_PCT = float(_cfg("WARN_GRID_PCT", 80.0))
PARQUET_COMP = "zstd"

# Heartbeat cadence of the per-file loop
PROGRESS_EVERY_FILES = int(_cfg("PROGRESS_EVERY_FILES_QA10", 2000))
PROGRESS_EVERY_SECONDS = float(_cfg("PROGRESS_EVERY_SECONDS_QA10", 2.0))

# Optional universe filters (a None/empty override collapses to an empty set)
WHITELIST_QA10 = set(_cfg("WHITELIST_QA10", []) or [])
BLACKLIST_QA10 = set(_cfg("BLACKLIST_QA10", []) or [])
MAX_SYMBOLS_QA10 = _cfg("MAX_SYMBOLS_QA10", None)
TIMEZONE_IANA = _cfg("TIMEZONE_IANA", "America/Guayaquil")

# Thresholds for qa_operativa_flag (overridable through the notebook globals)
QA_OK_MIN_OK_RATIO = float(_cfg("QA_OK_MIN_OK_RATIO", 0.90))          # >= 90% OK days
QA_OK_MAX_FAIL_RATIO = float(_cfg("QA_OK_MAX_FAIL_RATIO", 0.05))      # <= 5% FAIL days
QA_OK_MIN_REJILLA_MEAN = float(_cfg("QA_OK_MIN_REJILLA_MEAN", 80.0))  # mean grid >= 80% (24h)

QA_BAD_MIN_FAIL_RATIO = float(_cfg("QA_BAD_MIN_FAIL_RATIO", 0.20))      # >= 20% FAIL -> BAD
QA_BAD_MAX_REJILLA_MEAN = float(_cfg("QA_BAD_MAX_REJILLA_MEAN", 60.0))  # mean grid < 60 (24h) -> BAD
QA_BAD_MAX_EMPTY_RATIO = float(_cfg("QA_BAD_MAX_EMPTY_RATIO", 0.20))    # > 20% EMPTY -> BAD

# Global sanity check of M5 coverage
MIN_MEAN_REJILLA_HARD = float(_cfg("MIN_MEAN_REJILLA_HARD", 5.0))
MIN_MEAN_NBARS_HARD = float(_cfg("MIN_MEAN_NBARS_HARD", EXPECTED_BARS * 0.20))

# Thresholds for the simple economic (price) QA per symbol/day
STUCK_RATIO_BAD = float(_cfg("STUCK_RATIO_BAD", 0.80))                # >= 80% zero returns -> "stuck"
RET_SPIKE_ABS_HARD = float(_cfg("RET_SPIKE_ABS_HARD", 0.50))          # |ret| >= 50% in one 5m bar -> suspicious spike
QA_PRICE_BAD_MIN_RATIO = float(_cfg("QA_PRICE_BAD_MIN_RATIO", 0.10))  # >= 10% BAD-price days -> symbol BAD

# Thresholds for the session profile
# (SESSION_ILLQ is simply n_bars_mean < SESSION_DIURNA_MIN_NBARS)
SESSION_24H_MIN_COVERAGE_RATIO = float(_cfg("SESSION_24H_MIN_COVERAGE_RATIO", 0.70))
SESSION_DIURNA_MIN_NBARS = float(_cfg("SESSION_DIURNA_MIN_NBARS", 60.0))

# Thresholds for qa_struct_flag (structure only, grid coverage not involved)
QA_STRUCT_OK_MAX_FAIL_RATIO = float(_cfg("QA_STRUCT_OK_MAX_FAIL_RATIO", 0.05))
QA_STRUCT_OK_MAX_LOW_GT_HIGH_RATIO = float(_cfg("QA_STRUCT_OK_MAX_LOW_GT_HIGH_RATIO", 0.0))
QA_STRUCT_BAD_MIN_FAIL_RATIO = float(_cfg("QA_STRUCT_BAD_MIN_FAIL_RATIO", 0.20))
QA_STRUCT_BAD_MIN_LOW_GT_HIGH_RATIO = float(_cfg("QA_STRUCT_BAD_MIN_LOW_GT_HIGH_RATIO", 0.01))

# Drop the alias so the cell leaves exactly the same names behind as before.
del _cfg

# ----------------------------- Logger helper (usa Celda 03 si existe) -----------------------------
def _log(level: str, msg: str):
    """Emit a log line for this cell.

    When a ``log_msg`` callable is present in the notebook globals it is called
    as ``log_msg(CELL_LABEL, level, msg)``. If it is absent, or if that call
    raises, the message is printed instead with a local timestamp, RUN_ID,
    the upper-cased level and CELL_LABEL.
    """
    if "log_msg" in globals():
        try:
            log_msg(CELL_LABEL, level, msg)
        except Exception:
            # Best-effort: a broken logger must not stop the cell; fall through to print().
            pass
        else:
            return
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    print(f"[{stamp}] [{RUN_ID}] [{level.upper()}] [{CELL_LABEL}] {msg}", flush=True)

# ----------------------------- Utilidades -----------------------------
def _now_local_iana() -> str:
    """Return the current time in TIMEZONE_IANA as an ISO 8601 string (seconds precision).

    Best-effort: any failure (zoneinfo missing, unknown zone, ...) yields an
    empty string instead of raising.
    """
    try:
        from zoneinfo import ZoneInfo

        utc_now = datetime.now(timezone.utc)
        target_zone = ZoneInfo(TIMEZONE_IANA)
        return utc_now.astimezone(target_zone).isoformat(timespec="seconds")
    except Exception:
        return ""

def _read_eligible_symbols() -> Tuple[Optional[set[str]], Optional[str]]:
    """Load the eligible-symbol list, trying ELIGIBLE_PARQUET first and ELIGIBLE_TXT second.

    Returns:
        ``(symbols, source)`` where ``symbols`` is the set of non-empty,
        whitespace-stripped symbols and ``source`` is ``"parquet"`` or ``"txt"``.
        ``(None, None)`` when neither file exists or both reads failed
        (each failure is logged as a WARNING).
    """
    if ELIGIBLE_PARQUET.exists():
        try:
            table = pl.read_parquet(ELIGIBLE_PARQUET)
            # Prefer the "symbol" column; otherwise fall back to the first column.
            column = "symbol" if "symbol" in table.columns else table.columns[0]
            values = table.get_column(column).cast(pl.Utf8, strict=False).str.strip_chars().to_list()
            return {value for value in values if value}, "parquet"
        except Exception as e:
            _log("WARNING", f"No se pudo leer {ELIGIBLE_PARQUET}: {e}. Probando TXT...")
    if ELIGIBLE_TXT.exists():
        try:
            with open(ELIGIBLE_TXT, "r", encoding="utf-8") as f:
                # One symbol per line; blank lines are ignored.
                stripped_lines = (line.strip() for line in f)
                return {line for line in stripped_lines if line}, "txt"
        except Exception as e:
            _log("WARNING", f"No se pudo leer {ELIGIBLE_TXT}: {e}.")
    return None, None

def _parse_date_from_filename(fp: Path) -> Optional[str]:
    """Extract the day from a ``part=<date>`` file name as ``YYYYMMDD``.

    Accepts eight contiguous digits (``part=20240115``) or year/month/day
    optionally separated by ``-`` or ``/`` (``part=2024-01-15``). Only the
    final path component is inspected. Returns None when nothing matches.
    """
    file_name = fp.name
    compact = re.search(r"part=([0-9]{8})", file_name)
    if compact is not None:
        return compact.group(1)
    separated = re.search(r"part=([0-9]{4})[-/]?([0-9]{2})[-/]?([0-9]{2})", file_name)
    if separated is None:
        return None
    year, month, day = separated.groups()
    return f"{year}{month}{day}"

def _safe_quantile(s: pl.Series, q: float) -> Optional[float]:
    """Return quantile ``q`` of the non-null values of ``s`` as a float.

    Returns None when ``s`` is None, empty or entirely null, and also when
    dropping nulls or computing the quantile raises.
    """
    if s is None:
        return None
    total = s.len()
    if total == 0 or s.null_count() == total:
        return None
    try:
        non_null = s.drop_nulls()
        return float(non_null.quantile(q)) if non_null.len() != 0 else None
    except Exception:
        return None

# ----------------------------- Header -----------------------------
# Banner with target time zone, current local/UTC time and the source paths;
# a single print call emits the same seven lines in the same order.
print(
    "=" * 110,
    f"Inicio Celda 10 ‚Äî QA operativa M5 (v4 + perfil sesi√≥n + QA econ√≥mica) | TZ local: {TIMEZONE_IANA}",
    f"Hora local: {_now_local_iana()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}",
    "-" * 110,
    f"DATA_ROOT: {DATA_ROOT}",
    f"Fuente: {BULK_M5_DIR}",
    "-" * 110,
    sep="\n",
)
_log("INFO", f"Inicio QA operativa M5 sobre {BULK_M5_DIR}")

# ----------------------------- Guards -----------------------------
# The raw M5 directory must exist; the metadata directory is created on demand.
if not BULK_M5_DIR.exists():
    raise FileNotFoundError(f"No existe {BULK_M5_DIR}. Ejecuta la celda de descarga M5 primero.")
META_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------- Universe (m5_raw ∩ 3B) -----------------------------
# Candidate symbols are the "symbol=<name>" directories under m5_raw, in sorted path order.
symbols_found = [
    entry.name.split("=", 1)[-1]
    for entry in sorted(BULK_M5_DIR.glob("symbol=*"))
    if entry.is_dir()
]
# Whitelist (only when non-empty) and blacklist are applied in one pass.
symbols_found = [
    name
    for name in symbols_found
    if (not WHITELIST_QA10 or name in WHITELIST_QA10) and name not in BLACKLIST_QA10
]
if MAX_SYMBOLS_QA10 is not None:
    symbols_found = symbols_found[: int(MAX_SYMBOLS_QA10)]

# Intersect with the eligible list when one could be loaded; otherwise audit everything found.
elig_set, elig_src = _read_eligible_symbols()
if elig_set is None:
    audited_symbols = sorted(set(symbols_found))
    omitted_symbols = []
    _log("WARNING", "No se encontr√≥ lista elegible (3B). Se auditan todos los s√≠mbolos presentes en m5_raw.")
else:
    base_set = set(symbols_found)
    audited_symbols = sorted(base_set & elig_set)
    omitted_symbols = sorted(base_set - elig_set)
    _log("INFO", f"Filtro 3B activo ({elig_src}). elegibles={len(elig_set)} | encontrados={len(base_set)} ‚Üí auditados={len(audited_symbols)} | omitidos={len(omitted_symbols)}")

if not audited_symbols:
    raise SystemExit("No hay s√≠mbolos para auditar despu√©s de aplicar filtros/elegible.")

# ----------------------------- File collection -----------------------------
# Gather every "part=*.parquet" file below each audited symbol directory,
# sorted within each symbol; symbols whose directory is missing are skipped.
files: List[Path] = []
for sym in audited_symbols:
    sdir = BULK_M5_DIR / f"symbol={sym}"
    if not sdir.exists():
        continue
    files += sorted(sdir.rglob("part=*.parquet"))
total_files = len(files)

print(
    "=" * 110,
    f"Inicio QA M5 (EAGER v4) ‚Üí archivos a auditar: {total_files} | s√≠mbolos auditados: {len(audited_symbols)} | omitidos: {len(omitted_symbols)}",
    sep="\n",
    flush=True,
)
_log("INFO", f"Archivos a auditar: {total_files} para {len(audited_symbols)} s√≠mbolo(s)")

# ----------------------------- Fixed-type per-column accumulators -----------------------------
# One independent list per output column; the insertion order below is the
# column order of the accumulator dict.
C = {
    column: []
    for column in (
        "symbol", "date",
        "n_bars", "n_unique_ts", "dups", "rejilla_pct",
        "has_null_ohlc", "has_nan_ohlc", "low_gt_high",
        "p50_spread_points", "p90_spread_points", "p99_spread_points",
        "p50_spread_bps", "p90_spread_bps", "p99_spread_bps",
        # Daily economic QA
        "pct_ret_zero",        # share of 5m returns that are exactly 0.0
        "qa_price_flag_day",   # {OK, BAD, EMPTY, FAIL}
        # Structural QA
        "status", "notes",
    )
}

# Loop timers (monotonic clock) and the columns read from each file.
t0 = last_beat = time.monotonic()
req_cols = "timestamp_utc open high low close spread_points".split()

# ----------------------------- Per-file EAGER loop -----------------------------
def _append_qa_row(
    acc: dict,
    *,
    symbol,
    date,
    status,
    notes,
    qa_price_flag_day,
    n_bars=0,
    n_unique_ts=0,
    dups=0,
    rejilla_pct=0.0,
    has_null_ohlc=False,
    has_nan_ohlc=False,
    low_gt_high=False,
    p50_spread_points=None,
    p90_spread_points=None,
    p99_spread_points=None,
    p50_spread_bps=None,
    p90_spread_bps=None,
    p99_spread_bps=None,
    pct_ret_zero=None,
) -> None:
    """Append one typed QA row to the per-column lists in ``acc``.

    Every value is coerced to its column type first and appended only
    afterwards, so a coercion error cannot leave the lists with different
    lengths. The defaults describe a row without metrics (used for EMPTY
    files and for read errors).

    Raises:
        KeyError: if ``acc`` lacks one of the row's columns (checked before
            anything is appended).
    """
    def _opt_float(value):
        return None if value is None else float(value)

    row = {
        "symbol": str(symbol),
        "date": str(date),
        "n_bars": int(n_bars),
        "n_unique_ts": int(n_unique_ts),
        "dups": int(dups),
        "rejilla_pct": float(rejilla_pct),
        "has_null_ohlc": bool(has_null_ohlc),
        "has_nan_ohlc": bool(has_nan_ohlc),
        "low_gt_high": bool(low_gt_high),
        "p50_spread_points": _opt_float(p50_spread_points),
        "p90_spread_points": _opt_float(p90_spread_points),
        "p99_spread_points": _opt_float(p99_spread_points),
        "p50_spread_bps": _opt_float(p50_spread_bps),
        "p90_spread_bps": _opt_float(p90_spread_bps),
        "p99_spread_bps": _opt_float(p99_spread_bps),
        "pct_ret_zero": _opt_float(pct_ret_zero),
        "qa_price_flag_day": str(qa_price_flag_day),
        "status": str(status),
        "notes": str(notes),
    }
    missing = [key for key in row if key not in acc]
    if missing:
        raise KeyError(f"accumulator is missing columns: {missing}")
    for key, value in row.items():
        acc[key].append(value)


def _daily_price_qa(close: "pl.Series") -> Tuple[Optional[float], str]:
    """Simple economic QA on one file's close prices.

    Returns ``(pct_ret_zero, flag)``:
      * ``pct_ret_zero`` is the fraction (0-1) of bar-to-bar returns that are
        exactly 0.0, or None when no return can be computed.
      * ``flag`` is "BAD" when the largest absolute return reaches
        RET_SPIKE_ABS_HARD or ``pct_ret_zero`` reaches STUCK_RATIO_BAD,
        "OK" otherwise, and "EMPTY" when there are no usable returns.

    Returns are computed only where the previous close is non-null and
    non-zero. Null returns (null current close) are dropped before the
    statistics, so an all-null result yields "EMPTY" instead of raising
    on ``float(None)``.
    NOTE(review): returns follow the row order of the file; presumably bars
    are stored chronologically - confirm against the writer cell.
    """
    if close.len() < 2:
        return None, "EMPTY"

    close_prev = close.shift(1)
    mask_valid = close_prev.is_not_null() & (close_prev != 0.0)
    ret = ((close.filter(mask_valid) / close_prev.filter(mask_valid)) - 1.0).drop_nulls()
    n_ret = ret.len()
    if n_ret == 0:
        return None, "EMPTY"

    ret_abs_max = ret.abs().max()
    pct_ret_zero = int((ret == 0.0).sum()) / float(n_ret)

    bad_spike = ret_abs_max is not None and float(ret_abs_max) >= RET_SPIKE_ABS_HARD
    bad_stuck = pct_ret_zero >= STUCK_RATIO_BAD
    return pct_ret_zero, ("BAD" if (bad_spike or bad_stuck) else "OK")


for i, fp in enumerate(files, 1):
    # Symbol comes from the "symbol=<name>" path component; the date from the file name when possible.
    sym_from_path = next((p.split("=", 1)[-1] for p in fp.parts if p.startswith("symbol=")), None)
    symbol = sym_from_path or "(UNKNOWN)"

    date_str = _parse_date_from_filename(fp)

    try:
        df = pl.read_parquet(fp, use_statistics=True)

        n_bars = int(df.height)

        # Derive the date from the earliest timestamp when the file name carries none.
        if date_str is None:
            if "timestamp_utc" in df.columns and n_bars > 0:
                dmin = pl.from_epoch(df.get_column("timestamp_utc"), time_unit="ms").dt.truncate("1d").min()
                try:
                    date_str = dmin.strftime("%Y%m%d")
                except Exception:
                    date_str = "(unknown)"
            else:
                date_str = "(unknown)"

        if n_bars == 0:
            # Empty file -> EMPTY row with default metrics; OHLC flags stay False.
            _append_qa_row(
                C,
                symbol=symbol,
                date=date_str,
                status="EMPTY",
                notes="no_bars_in_file",
                qa_price_flag_day="EMPTY",
            )
        else:
            # Keep only the relevant columns that are actually present.
            cols_available = set(df.columns)
            use_cols = [c for c in req_cols if c in cols_available]
            if use_cols:
                df = df.select(use_cols)

            # Lenient casts (strict=False): values that cannot be converted become null.
            casts = []
            if "timestamp_utc" in df.columns:
                casts.append(pl.col("timestamp_utc").cast(pl.Int64, strict=False))
            casts.extend(
                pl.col(c).cast(pl.Float64, strict=False)
                for c in ("open", "high", "low", "close", "spread_points")
                if c in df.columns
            )
            if casts:
                df = df.with_columns(casts)

            # Basic metrics
            if "timestamp_utc" in df.columns:
                n_unique_ts = int(df.get_column("timestamp_utc").n_unique())
            else:
                n_unique_ts = 0

            dups = int(max(n_bars - n_unique_ts, 0))
            # Coverage: unique timestamps as a percentage of the expected bars per day.
            rejilla_pct = float(round((n_unique_ts * 100.0 / float(EXPECTED_BARS)), 2)) if EXPECTED_BARS else 0.0

            have_ohlc = all(c in df.columns for c in ["open", "high", "low", "close"])
            have_low_high = all(c in df.columns for c in ["low", "high"])

            # Null / NaN / low>high detection through filtered row counts (no `.item()`).
            if have_ohlc and n_bars > 0:
                has_null_ohlc = (
                    df.filter(
                        pl.any_horizontal([
                            pl.col("open").is_null(),
                            pl.col("high").is_null(),
                            pl.col("low").is_null(),
                            pl.col("close").is_null(),
                        ])
                    ).height > 0
                )
                has_nan_ohlc = (
                    df.filter(
                        pl.any_horizontal([
                            pl.col("open").is_nan(),
                            pl.col("high").is_nan(),
                            pl.col("low").is_nan(),
                            pl.col("close").is_nan(),
                        ])
                    ).height > 0
                )
            else:
                has_null_ohlc = False
                has_nan_ohlc = False

            if have_low_high and n_bars > 0:
                low_gt_high = df.filter(pl.col("low") > pl.col("high")).height > 0
            else:
                low_gt_high = False

            # Spread quantiles (in points and in bps of the close)
            p50_sp = p90_sp = p99_sp = None
            p50_bps = p90_bps = p99_bps = None

            if "spread_points" in df.columns:
                sp = df.get_column("spread_points")
                p50_sp = _safe_quantile(sp, 0.50)
                p90_sp = _safe_quantile(sp, 0.90)
                p99_sp = _safe_quantile(sp, 0.99)

            if {"spread_points", "close"}.issubset(set(df.columns)):
                # NOTE(review): spread_points is divided by close directly, i.e. it is
                # treated as if expressed in price units - TODO confirm the unit.
                df_sp = df.with_columns(
                    pl.when(pl.col("close") > 0.0)
                      .then((pl.col("spread_points") / pl.col("close")) * 10_000.0)
                      .otherwise(None)
                      .alias("spread_bps")
                )
                sp_bps = df_sp.get_column("spread_bps")
                p50_bps = _safe_quantile(sp_bps, 0.50)
                p90_bps = _safe_quantile(sp_bps, 0.90)
                p99_bps = _safe_quantile(sp_bps, 0.99)

            # Economic QA: zero-return share ("stuck quotes") and return spikes.
            if "close" in df.columns:
                pct_ret_zero, qa_price_flag_day = _daily_price_qa(df.get_column("close"))
            else:
                pct_ret_zero, qa_price_flag_day = None, "EMPTY"

            # -------------------- Status / notes (v4 logic) --------------------
            fatal_reasons: List[str] = []
            warn_reasons: List[str] = []

            # Structural errors (fatal -> FAIL)
            if not have_ohlc:
                fatal_reasons.append("missing_OHLC")
            if has_null_ohlc:
                fatal_reasons.append("OHLC_null")
            if has_nan_ohlc:
                fatal_reasons.append("OHLC_nan")
            if low_gt_high:
                fatal_reasons.append("low>high")

            # Correctable problems (-> WARN)
            if dups > 0:
                warn_reasons.append(f"dups={dups}")

            if fatal_reasons:
                status = "FAIL"
                # A structurally broken day is also FAIL for the price QA.
                qa_price_flag_day = "FAIL"
            else:
                status = "OK"
                # Insufficient coverage is a WARN, never a FAIL.
                if rejilla_pct < WARN_GRID_PCT:
                    warn_reasons.append(f"rejilla<{WARN_GRID_PCT}%")
                    status = "WARN"
                # Duplicates alone also downgrade OK to WARN.
                if dups > 0 and status == "OK":
                    status = "WARN"

            reasons = fatal_reasons + warn_reasons
            notes = ",".join(reasons)

            _append_qa_row(
                C,
                symbol=symbol,
                date=date_str,
                status=status,
                notes=notes,
                qa_price_flag_day=qa_price_flag_day,
                n_bars=n_bars,
                n_unique_ts=n_unique_ts,
                dups=dups,
                rejilla_pct=rejilla_pct,
                has_null_ohlc=has_null_ohlc,
                has_nan_ohlc=has_nan_ohlc,
                low_gt_high=low_gt_high,
                p50_spread_points=p50_sp,
                p90_spread_points=p90_sp,
                p99_spread_points=p99_sp,
                p50_spread_bps=p50_bps,
                p90_spread_bps=p90_bps,
                p99_spread_bps=p99_bps,
                pct_ret_zero=pct_ret_zero,
            )

    except Exception as e:
        # Read/parse error -> FAIL row with default metrics (OHLC flags are not inflated).
        msg = f"exception_read:{type(e).__name__}:{e}"
        _append_qa_row(
            C,
            symbol=symbol,
            date=date_str or "(unknown)",
            status="FAIL",
            notes=msg,
            qa_price_flag_day="FAIL",
        )

    # Heartbeat: every PROGRESS_EVERY_FILES files (when positive) or PROGRESS_EVERY_SECONDS seconds.
    now = time.monotonic()
    due_by_count = PROGRESS_EVERY_FILES > 0 and i % PROGRESS_EVERY_FILES == 0
    if due_by_count or (now - last_beat >= PROGRESS_EVERY_SECONDS):
        rate = i / max(now - t0, 1e-6)
        _log("INFO", f"Progreso QA: {i}/{total_files} archivos | {rate:.1f} files/s")
        last_beat = now

# Wall-clock summary of the per-file loop.
elapsed = time.monotonic() - t0
rate = total_files / max(elapsed, 1e-6)
_log("INFO", f"QA completada. Procesados {total_files} archivos en {elapsed:.1f}s ({rate:.1f} files/s)")

# ----------------------------- Persistence (FIXED schema) -----------------------------
# Explicit dtypes for every column, so the frame has the same schema whether or
# not any rows were collected.
schema_map = {
    "symbol": pl.Utf8,
    "date": pl.Utf8,
    "n_bars": pl.Int64,
    "n_unique_ts": pl.Int64,
    "dups": pl.Int64,
    "rejilla_pct": pl.Float64,
    "has_null_ohlc": pl.Boolean,
    "has_nan_ohlc": pl.Boolean,
    "low_gt_high": pl.Boolean,
    "p50_spread_points": pl.Float64,
    "p90_spread_points": pl.Float64,
    "p99_spread_points": pl.Float64,
    "p50_spread_bps": pl.Float64,
    "p90_spread_bps": pl.Float64,
    "p99_spread_bps": pl.Float64,
    # Economic QA columns of qa_m5_bulk.parquet
    "pct_ret_zero": pl.Float64,
    "qa_price_flag_day": pl.Utf8,
    "status": pl.Utf8,
    "notes": pl.Utf8,
}
# With no collected rows an empty frame carrying the same schema is written.
out_df = pl.DataFrame(C, schema=schema_map) if C["symbol"] else pl.DataFrame(schema=schema_map)

QA_OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
out_df.write_parquet(QA_OUT_PATH, compression=PARQUET_COMP)
_log("INFO", f"qa_m5_bulk.parquet escrito en {QA_OUT_PATH} ({out_df.height} filas)")

# ----------------------------- Res√∫menes / Prints (nivel d√≠a) -----------------------------
# Show where the data was read from and where the QA report was written.
print(
    "-" * 110,
    f"Lectura desde: {BULK_M5_DIR}",
    f"Reporte QA escrito en: {QA_OUT_PATH}",
    "-" * 110,
    sep="\n",
)

if out_df.height == 0:
    print("Dataset QA vac√≠o; no hay m√©tricas para resumir.")
else:
    by_status = out_df.group_by("status").agg(pl.len().alias("n")).sort("status")
    counts = {r["status"]: int(r["n"]) for r in by_status.iter_rows(named=True)}
    tot_days = int(out_df.height)
    n_ok    = counts.get("OK", 0)
    n_warn  = counts.get("WARN", 0)
    n_fail  = counts.get("FAIL", 0)
    n_empty = counts.get("EMPTY", 0)
    _pct = lambda n: (100.0 * n / tot_days) if tot_days else 0.0
    print(
        f"Totales por status (d√≠as auditados): "
        f"OK={n_ok} ({_pct(n_ok):.1f}%) | "
        f"WARN={n_warn} ({_pct(n_warn):.1f}%) | "
        f"FAIL={n_fail} ({_pct(n_fail):.1f}%) | "
        f"EMPTY={n_empty} ({_pct(n_empty):.1f}%)"
    )
    print("-"*110)

    # ====================== Sanity-check global de cobertura M5 ======================
    coverage_stats = (
        out_df
        .select([
            pl.col("n_bars").mean().alias("n_bars_mean"),
            pl.col("n_bars").median().alias("n_bars_p50"),
            pl.col("rejilla_pct").mean().alias("rejilla_mean"),
            pl.col("rejilla_pct").quantile(0.5).alias("rejilla_p50"),
        ])
        .to_dicts()[0]
    )

    n_bars_mean   = float(coverage_stats.get("n_bars_mean", 0.0) or 0.0)
    n_bars_p50    = float(coverage_stats.get("n_bars_p50", 0.0) or 0.0)
    rejilla_mean  = float(coverage_stats.get("rejilla_mean", 0.0) or 0.0)
    rejilla_p50   = float(coverage_stats.get("rejilla_p50", 0.0) or 0.0)

    print(
        f"Cobertura global M5 (sobre d√≠as auditados): "
        f"n_bars_mean={n_bars_mean:.1f} | n_bars_p50={n_bars_p50:.1f} | "
        f"rejilla_mean={rejilla_mean:.2f}% | rejilla_p50={rejilla_p50:.2f}%"
    )

    if (rejilla_mean < MIN_MEAN_REJILLA_HARD) and (n_bars_mean < MIN_MEAN_NBARS_HARD):
        msg = (
            "‚ùå QA M5: Cobertura media M5 rid√≠culamente baja.\n"
            f"   EXPECTED_BARS={EXPECTED_BARS}, n_bars_mean={n_bars_mean:.1f}, rejilla_mean={rejilla_mean:.2f}%.\n"
            "   Esto no parece un dataset M5 operativo (puede ser D1 mal etiquetado, "
            "extracci√≥n rota en Celda 08 o datos vac√≠os).\n"
            "   Acci√≥n profesional:\n"
            "     1) Revisar Celda 08 (TIMEFRAME=MT5.TIMEFRAME_M5, ventanas WS/WE, etc.).\n"
            "     2) Forzar re-descarga con FORCE_REDOWNLOAD_BULK_M5=True y FORCE_REWRITE_DAY=True.\n"
            "     3) Inspeccionar manualmente 2‚Äì3 archivos de m5_raw (al menos 100‚Äì200 velas/d√≠a) "
            "antes de volver a correr QA y GOLD."
        )
        _log("ERROR", msg.replace("\n", " "))
        raise RuntimeError(msg)
    # ================================================================================

    # ---------------------- Resumen por s√≠mbolo (qa_operativa_summary) ----------------------
    qa_sym_summary = (
        out_df.group_by("symbol")
              .agg([
                  pl.len().alias("n_days"),
                  (pl.col("status") == "OK").cast(pl.Int32).sum().alias("days_ok"),
                  (pl.col("status") == "WARN").cast(pl.Int32).sum().alias("days_warn"),
                  (pl.col("status") == "FAIL").cast(pl.Int32).sum().alias("days_fail"),
                  (pl.col("status") == "EMPTY").cast(pl.Int32).sum().alias("days_empty"),
                  pl.col("n_bars").mean().alias("n_bars_mean"),
                  pl.col("n_bars").median().alias("n_bars_p50"),
                  pl.col("rejilla_pct").mean().alias("rejilla_pct_mean"),
                  (pl.col("rejilla_pct") >= WARN_GRID_PCT).cast(pl.Int32).sum().alias("days_rejilla_ge_warn"),
                  pl.col("low_gt_high").cast(pl.Int32).sum().alias("days_low_gt_high"),
                  pl.col("p50_spread_bps").mean().alias("p50_spread_bps_mean"),
                  pl.col("p90_spread_bps").mean().alias("p90_spread_bps_mean"),
                  pl.col("p99_spread_bps").mean().alias("p99_spread_bps_mean"),
                  # <<< NEW QA ECON >>> contadores de QA econ√≥mica diaria
                  (pl.col("qa_price_flag_day") == "OK").cast(pl.Int32).sum().alias("days_price_ok"),
                  (pl.col("qa_price_flag_day") == "WARN").cast(pl.Int32).sum().alias("days_price_warn"),
                  (pl.col("qa_price_flag_day") == "BAD").cast(pl.Int32).sum().alias("days_price_bad"),
                  (pl.col("qa_price_flag_day") == "EMPTY").cast(pl.Int32).sum().alias("days_price_empty"),
              ])
              .with_columns([
                  (pl.col("days_ok")    / pl.col("n_days")).alias("ok_ratio"),
                  (pl.col("days_warn")  / pl.col("n_days")).alias("warn_ratio"),
                  (pl.col("days_fail")  / pl.col("n_days")).alias("fail_ratio"),
                  (pl.col("days_empty") / pl.col("n_days")).alias("empty_ratio"),
                  (pl.col("days_rejilla_ge_warn") / pl.col("n_days")).alias("rejilla_ge_warn_ratio"),
                  (pl.col("days_low_gt_high") / pl.col("n_days")).alias("low_gt_high_ratio"),
                  # <<< NEW QA ECON >>> ratios de QA econ√≥mica por s√≠mbolo
                  (pl.col("days_price_ok")  / pl.col("n_days")).alias("price_ok_ratio"),
                  (pl.col("days_price_bad") / pl.col("n_days")).alias("price_bad_ratio"),
              ])
    )

    # Campos de perfil de sesi√≥n y estructura primero
    qa_sym_summary = qa_sym_summary.with_columns([
        # Expected bars por s√≠mbolo (perfil de sesi√≥n puro, sin 24h hardcoded)
        pl.col("n_bars_p50").alias("expected_bars_per_day_symbol"),

        # session_type s√≥lo en funci√≥n de n_bars_mean (volumen temporal / cobertura)
        pl.when(
            pl.col("n_bars_mean") >= pl.lit(EXPECTED_BARS * SESSION_24H_MIN_COVERAGE_RATIO)
        ).then(pl.lit("SESSION_24H"))
         .when(
            pl.col("n_bars_mean") >= pl.lit(SESSION_DIURNA_MIN_NBARS)
        ).then(pl.lit("SESSION_DIURNA"))
         .otherwise(pl.lit("SESSION_ILLQ"))
         .alias("session_type"),

        # qa_struct_flag: solo estructura (fail_ratio + low_gt_high_ratio), sin rejilla_pct
        pl.when(
            (pl.col("fail_ratio")        <= QA_STRUCT_OK_MAX_FAIL_RATIO) &
            (pl.col("low_gt_high_ratio") <= QA_STRUCT_OK_MAX_LOW_GT_HIGH_RATIO)
        ).then(pl.lit("OK_STRUCT"))
         .when(
            (pl.col("fail_ratio")        >= QA_STRUCT_BAD_MIN_FAIL_RATIO) |
            (pl.col("low_gt_high_ratio") >= QA_STRUCT_BAD_MIN_LOW_GT_HIGH_RATIO)
        ).then(pl.lit("BAD_STRUCT"))
         .otherwise(pl.lit("WARN_STRUCT"))
         .alias("qa_struct_flag"),
    ])

    # Flag qa_operativa_flag (OK/WARN/BAD), ahora *session-aware*:
    # - BAD: problemas estructurales serios (BAD_STRUCT), muchos FAIL/EMPTY,
    #        o cobertura 24h muy mala en s√≠mbolos SESSION_24H.
    # - OK : mayor√≠a de d√≠as OK, pocos FAIL, y cobertura aceptable para su tipo de sesi√≥n.
    # - WARN: el resto (incluidas acciones diurnas con rejilla 24h "baja" pero estructura sana).
    qa_sym_summary = qa_sym_summary.with_columns([
        pl.when(
            (pl.col("qa_struct_flag") == "BAD_STRUCT") |
            (pl.col("fail_ratio")   >= QA_BAD_MIN_FAIL_RATIO) |
            (pl.col("empty_ratio")  >  QA_BAD_MAX_EMPTY_RATIO) |
            (
                (pl.col("session_type") == "SESSION_24H") &
                (pl.col("rejilla_pct_mean") < QA_BAD_MAX_REJILLA_MEAN)
            )
        ).then(pl.lit("BAD"))
         .when(
            (pl.col("ok_ratio")   >= QA_OK_MIN_OK_RATIO) &
            (pl.col("fail_ratio") <= QA_OK_MAX_FAIL_RATIO) &
            (
                ((pl.col("session_type") == "SESSION_24H") & (pl.col("rejilla_pct_mean") >= QA_OK_MIN_REJILLA_MEAN)) |
                (pl.col("session_type") != "SESSION_24H")
            )
        ).then(pl.lit("OK"))
         .otherwise(pl.lit("WARN"))
         .alias("qa_operativa_flag")
    ])

    # <<< NEW QA ECON >>> Flag de QA econ√≥mica por s√≠mbolo (OK/WARN/BAD) seg√∫n price_bad_ratio
    qa_sym_summary = qa_sym_summary.with_columns([
        pl.when(pl.col("price_bad_ratio") >= QA_PRICE_BAD_MIN_RATIO)
          .then(pl.lit("BAD"))
          .when(pl.col("price_bad_ratio") > 0.0)
          .then(pl.lit("WARN"))
          .otherwise(pl.lit("OK"))
          .alias("qa_price_flag")
    ])

    # Persistir resumen de QA operativa por s√≠mbolo
    qa_sym_summary.write_parquet(QA_SUMMARY_PATH, compression=PARQUET_COMP)
    _log("INFO", f"qa_operativa_summary.parquet escrito en {QA_SUMMARY_PATH} ({qa_sym_summary.height} filas)")

    # ---------------------- Prints de resumen por s√≠mbolo (usando qa_sym_summary) ----------------------
    sym_summary_for_print = (
        qa_sym_summary
        .with_columns([
            (pl.col("ok_ratio")   * 100.0).round(1).alias("ok_pct"),
            (pl.col("warn_ratio") * 100.0).round(1).alias("warn_pct"),
            (pl.col("fail_ratio") * 100.0).round(1).alias("fail_pct"),
            (pl.col("empty_ratio")* 100.0).round(1).alias("empty_pct"),
            pl.col("rejilla_pct_mean").round(2).alias("rejilla_pct_avg"),
        ])
        .select([
            "symbol","n_days","ok_pct","warn_pct","fail_pct","empty_pct",
            "rejilla_pct_avg","qa_operativa_flag"
        ])
    )

    top_worst = (
        sym_summary_for_print
        .sort(["fail_pct","warn_pct","ok_pct"], descending=[True, True, False])
        .head(10)
    )

    if top_worst.height > 0:
        print("Top-10 s√≠mbolos con mayor % de d√≠as FAIL/WARN (y flag QA):")
        for r in top_worst.iter_rows(named=True):
            print(
                f"  - {r['symbol']}: "
                f"FAIL={r['fail_pct']}% | WARN={r['warn_pct']}% | "
                f"EMPTY={r['empty_pct']}% | OK={r['ok_pct']}% | "
                f"rejilla_avg={r['rejilla_pct_avg']}% | "
                f"flag={r['qa_operativa_flag']} "
                f"(n_days={r['n_days']})"
            )
    else:
        print("No hay filas para resumen por s√≠mbolo.")

    print("-"*110)

    best = out_df.sort("rejilla_pct", descending=True).head(5)
    worst_days = out_df.sort("rejilla_pct", descending=False).head(5)

    if best.height > 0:
        print("Top-5 d√≠as con mayor %rejilla_5m:")
        for r in best.iter_rows(named=True):
            print(f"  - {r['symbol']} {r['date']} | rejilla={r['rejilla_pct']}% | status={r['status']} | dups={r['dups']}")
        print("-"*110)
    if worst_days.height > 0:
        print("Top-5 d√≠as con menor %rejilla_5m:")
        for r in worst_days.iter_rows(named=True):
            print(f"  - {r['symbol']} {r['date']} | rejilla={r['rejilla_pct']}% | status={r['status']} | dups={r['dups']}")

# Final Cell 10 summary: number of audited symbols, number of omitted symbols,
# where the per-symbol QA summary lives (QA_SUMMARY_PATH), and the completion banner.
print("-"*110)
print(f"S√≠mbolos auditados (elegibles): {len(audited_symbols)}")
print(f"S√≠mbolos encontrados fuera de la elegible (omitidos): {len(omitted_symbols)}")
print(f"Resumen QA operativa por s√≠mbolo: {QA_SUMMARY_PATH}")
print("-"*110)
print("‚úÖ QA operativa M5 completada (EAGER, robusta v4 + perfil de sesi√≥n + QA econ√≥mica por s√≠mbolo, Polars, 3B, UTC + resumen por s√≠mbolo).")
# ===========================================================================================================


Inicio Celda 10 — QA operativa M5 (v4 + perfil sesión + QA económica) | TZ local: America/Guayaquil
Hora local: 2025-12-02T23:27:42-05:00 | Hora UTC: 2025-12-03T04:27:42+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT: C:\Quant\MT5_Data_Extraction\data
Fuente: C:\Quant\MT5_Data_Extraction\data\bulk_data\m5_raw
--------------------------------------------------------------------------------------------------------------
[2025-12-02 23:27:42] [20251202_232253] [INFO] [10-QA-M5-Operativa] Inicio QA operativa M5 sobre C:\Quant\MT5_Data_Extraction\data\bulk_data\m5_raw
[2025-12-02 23:27:42] [20251202_232253] [INFO] [10-QA-M5-Operativa] Filtro 3B activo (parquet). elegibles=107 | encontrados=110 → auditados=107 | omitidos=3
Inicio QA M5 (EAGER v4) → archivos a auditar: 118385 | símbolos auditados: 107 | omitidos: 3
[2025-12-02 23:27:43] [20251202_232253] [INFO] [10-QA-M5-Operativa] Archivos a auditar: 118

In [12]:
# ======================= Celda 11 — Ingesta incremental diaria M5 (Polars lazy + UTC/GYE) =======================
# Propósito:
#   - Detectar días faltantes por símbolo en bulk_data/m5_raw y descargar incrementos hasta AYER 23:55 UTC.
#   - Escribir idempotente: symbol=SYMBOL/year=YYYY/month=MM/part=YYYYMMDD.parquet
#   - Actualizar catálogo (metadata/dataset_catalog.parquet) usando Polars (lazy) para conteos/bytes/yyyy-mm.
#   - Registrar acciones en metadata/run_log.jsonl.
#   - (NUEVO) Reprocesar días señalados por QA (rejilla < umbral) mediante:
#       * REPROCESS_RANGE_YYYYMMDD="YYYYMMDD:YYYYMMDD" [aplica a todos o a REPROCESS_SYMBOLS]
#       * metadata/reprocess_days.txt con líneas "SYMBOL,YYYYMMDD" o "YYYYMMDD" (comodín para todos)
#
# Impresiones obligatorias:
#   - "Incremento M5 escrito en: <DATA_ROOT>/bulk_data/m5_raw/"
#   - "Días añadidos por símbolo (top-5): ..."
#   - Totales agregados de archivos/bytes
#   - "Catálogo actualizado: <DATASET_CATALOG_PATH> | Filas=<n> | Columnas=<m>"
#   - Esquema (columnas:tipo) del catálogo
#   - Timestamps en UTC y TZ local America/Guayaquil
# ==============================================================================================================

from __future__ import annotations

import json, time, re, math, os
from pathlib import Path
from datetime import datetime, date, timedelta, timezone
from typing import Dict, List, Optional, Tuple, Set

import polars as pl

# --------- MT5 (requiere terminal instalado y logueado) ---------
try:
    import MetaTrader5 as mt5
except Exception as e:
    raise ImportError("No se pudo importar 'MetaTrader5'. Verifica instalaci√≥n/terminal MT5.") from e

# --------------------------------- Config/paths ---------------------------------
# Label used by _log() to tag every message emitted by this cell.
CELL_LABEL   = "11-IngestaM5"
# Reuse the RUN_ID set by an earlier cell if present; otherwise derive one from the current UTC time.
RUN_ID       = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))

# *** Requires the single data ROOT to have been defined by a previous cell ***
if "DATA_ROOT" not in globals():
    raise RuntimeError("DATA_ROOT no est√° definido. Ejecuta la celda de configuraci√≥n (ROOT √∫nico) antes de la Celda 11.")
DATA_ROOT    = Path(globals()["DATA_ROOT"]).resolve()

# Directory layout below DATA_ROOT.
M5_RAW_DIR   = DATA_ROOT / "bulk_data" / "m5_raw"
META_DIR     = DATA_ROOT / "metadata"
FILTERS_DIR  = META_DIR / "filters"

# Metadata artefacts read/written by this cell.
SCHEMA_JSON  = META_DIR / "schema_m5.json"
DATASET_CATALOG_PATH = META_DIR / "dataset_catalog.parquet"
RUN_LOG_JSONL_PATH   = META_DIR / "run_log.jsonl"

# Execution parameters (each one may be overridden by a global set in an earlier cell)
TIMEFRAME_LABEL = globals().get("TIMEFRAME_LABEL", "M5")
FORCE_REWRITE_DAY = bool(globals().get("FORCE_REWRITE_DAY", False))  # if True, rewrite part=YYYYMMDD.parquet
FILL_GAPS        = bool(globals().get("FILL_GAPS_M5", False))        # if True, also fill intermediate gaps
TIMEZONE_IANA    = globals().get("TIMEZONE_IANA", "America/Guayaquil")

# (NEW) Simple reprocess mechanism for days flagged by QA (grid coverage below threshold)
REPROCESS_RANGE_YYYYMMDD: Optional[str] = globals().get("REPROCESS_RANGE_YYYYMMDD", None)  # "YYYYMMDD:YYYYMMDD"
REPROCESS_SYMBOLS: Set[str] = set(globals().get("REPROCESS_SYMBOLS", []) or [])            # restricts the range above
REPROCESS_DAYS_FILE = META_DIR / "reprocess_days.txt"  # lines "SYMBOL,YYYYMMDD" or "YYYYMMDD" (wildcard)

# End of the incremental range: YESTERDAY in UTC (end of day exclusive)
TODAY_UTC   = datetime.now(timezone.utc).date()
END_DAY_UTC = TODAY_UTC - timedelta(days=1)   # [00:00, 24:00) of YESTERDAY

# Retries: attempt counts and the base (seconds) of the exponential back-off sleeps
RETRIES_CONN = 5
RETRIES_DAY  = 3
SLEEP_BASE   = 1.0

# Parquet
PARQUET_COMPRESSION = "zstd"

# --------------------------------- Utilidades ---------------------------------
def _log(level: str, msg: str):
    """Print one log line prefixed with UTC time, RUN_ID, upper-cased level and CELL_LABEL."""
    stamp = datetime.now(timezone.utc).isoformat(timespec="seconds")
    tags = (stamp, RUN_ID, level.upper(), CELL_LABEL)
    prefix = " ".join(f"[{tag}]" for tag in tags)
    print(f"{prefix} {msg}", flush=True)

def _now_local_iana() -> str:
    """Return the current time in TIMEZONE_IANA as an ISO-8601 string (second precision).

    If the zone cannot be loaded (or anything else fails), a fixed hint string
    is returned instead of raising.
    """
    try:
        from zoneinfo import ZoneInfo

        target_zone = ZoneInfo(TIMEZONE_IANA)
        utc_now = datetime.now(timezone.utc)
        local_now = utc_now.astimezone(target_zone)
    except Exception:
        return "(instala 'tzdata' para TZ locales)"
    return local_now.isoformat(timespec="seconds")

def _append_runlog(record: dict) -> None:
    """Append `record` as one JSON line to RUN_LOG_JSONL_PATH, creating the parent directory if needed."""
    log_path = RUN_LOG_JSONL_PATH
    log_path.parent.mkdir(parents=True, exist_ok=True)
    with log_path.open("a", encoding="utf-8") as handle:
        serialized = json.dumps(record, ensure_ascii=False)
        handle.write(serialized + "\n")

def _ensure_dirs():
    """Check that the raw M5 directory exists and make sure metadata scaffolding is in place.

    Raises FileNotFoundError when M5_RAW_DIR is missing. Creates META_DIR and,
    if no catalogue file exists yet, writes an empty catalogue parquet with the
    expected columns and dtypes.
    """
    if not M5_RAW_DIR.exists():
        raise FileNotFoundError(f"No existe {M5_RAW_DIR}. Ejecuta la Celda 06 primero.")

    META_DIR.mkdir(parents=True, exist_ok=True)
    if DATASET_CATALOG_PATH.exists():
        return

    # Column order matters: it becomes the on-disk schema of the empty catalogue.
    catalog_columns = (
        "symbol",
        "first_ts_utc",
        "last_ts_utc",
        "n_files",
        "n_days",
        "n_months",
        "n_years",
        "bytes",
        "last_update_ts_utc",
        "notes",
    )
    text_columns = {"symbol", "notes"}
    empty_catalog = pl.DataFrame(
        {
            name: pl.Series([], dtype=(pl.Utf8 if name in text_columns else pl.Int64))
            for name in catalog_columns
        }
    )
    empty_catalog.write_parquet(DATASET_CATALOG_PATH, compression="zstd", statistics=True)

def _day_edges_utc(d: date) -> Tuple[datetime, datetime]:
    s = datetime(d.year, d.month, d.day, 0, 0, 0, tzinfo=timezone.utc)
    e = s + timedelta(days=1)  # exclusivo
    return s, e

def _parse_day_from_name(name: str) -> Optional[str]:
    # part=YYYYMMDD.parquet
    m = re.search(r"part=([0-9]{8})", name)
    return m.group(1) if m else None

def _yyyymmdd_to_date(s: str) -> date:
    return date(int(s[0:4]), int(s[4:6]), int(s[6:8]))

def _ms_to_iso(ms: Optional[int]) -> str:
    if ms is None or (isinstance(ms, float) and (math.isnan(ms) or math.isinf(ms))):
        return "N/A"
    return datetime.fromtimestamp(int(ms)/1000, tz=timezone.utc).isoformat(timespec="seconds")

def _load_schema_column_order() -> list[str]:
    """Return the column order declared under "column_order" in SCHEMA_JSON.

    Falls back to a built-in default order when the file cannot be read or
    parsed, or when the declared list is empty.
    """
    # Reasonable fallback carrying both timestamp columns
    default_order = [
        "timestamp_utc", "timestamp_gye", "symbol", "open", "high", "low", "close",
        "tick_volume", "real_volume", "spread_points", "broker", "server_tz",
    ]
    try:
        raw_text = SCHEMA_JSON.read_text(encoding="utf-8")
        declared = list(json.loads(raw_text).get("column_order", []))
    except Exception:
        return default_order
    return declared if declared else default_order

def _ensure_schema_order(df: pl.DataFrame, col_order: list[str]) -> pl.DataFrame:
    """Return `df` with exactly the columns of `col_order`, in that order.

    Columns listed in `col_order` but absent from `df` are added as all-null
    columns whose dtype is chosen from the column name.
    """
    present = set(df.columns)

    def _null_dtype(column: str):
        # dtype used for a column that has to be synthesised as nulls
        if column == "timestamp_utc":
            return pl.Int64
        if column == "timestamp_gye":
            return pl.Datetime("ms", TIMEZONE_IANA)
        if column in {"open", "high", "low", "close", "spread_points"}:
            return pl.Float64
        if column in {"tick_volume", "real_volume"}:
            return pl.Int64
        return pl.Utf8

    fillers = [
        pl.lit(None, dtype=_null_dtype(column)).alias(column)
        for column in col_order
        if column not in present
    ]
    completed = df.with_columns(fillers) if fillers else df
    return completed.select(col_order)

def _mt5_array_to_polars(arr) -> pl.DataFrame:
    """Turn the array returned by MT5 into a Polars DataFrame without going through pandas.

    None or an empty input yields an empty DataFrame. When the input exposes
    named dtype fields, one column per field is built; otherwise the input is
    handed to Polars directly, with a list-of-dicts conversion as last resort.
    """
    if arr is None or not len(arr):
        return pl.DataFrame()

    dtype = getattr(arr, "dtype", None)
    field_names = None if dtype is None else dtype.names
    if field_names:
        columns = {}
        for field in field_names:
            columns[field] = arr[field].tolist()
        return pl.DataFrame(columns)

    try:
        return pl.DataFrame(arr)
    except Exception:
        records = [dict(item) for item in arr]
        return pl.from_dicts(records)

# --------------------------- NUEVO: carga de d√≠as a reprocesar ---------------------------
def _expand_range_yyyymmdd(rng: str) -> List[date]:
    """Expand 'YYYYMMDD:YYYYMMDD' into the inclusive list of dates it covers.

    The two bounds may be given in either order. A malformed range still
    yields an empty list (so callers keep working), but a WARNING is logged:
    otherwise a mistyped reprocess request would be dropped without a trace.
    """
    try:
        a, b = rng.split(":")
        first, last = sorted((_yyyymmdd_to_date(a.strip()), _yyyymmdd_to_date(b.strip())))
    except Exception as exc:
        _log(
            "WARNING",
            f"Rango de reproceso ignorado (se esperaba 'YYYYMMDD:YYYYMMDD', recibido {rng!r}): {exc!r}",
        )
        return []
    span_days = (last - first).days
    return [first + timedelta(days=offset) for offset in range(span_days + 1)]

def _load_reprocess_days(symbols_all: List[str]) -> Dict[str, Set[date]]:
    """
    Build the mapping {symbol -> set of dates} of days that must be re-downloaded.

    Sources:
      - REPROCESS_DAYS_FILE (lines 'SYMBOL,YYYYMMDD', or a bare 'YYYYMMDD' acting as wildcard)
      - REPROCESS_RANGE_YYYYMMDD (applies to REPROCESS_SYMBOLS when set, otherwise to every symbol)

    The wildcard is expanded to every symbol in `symbols_all`. Dates later than
    END_DAY_UTC are discarded, and symbols left without any date are omitted so
    that an "empty" request produces an empty (falsy) mapping.
    """
    mapping: Dict[str, Set[date]] = {}

    # 1) Optional reprocess file
    if REPROCESS_DAYS_FILE.exists():
        bad_lines = 0
        try:
            for ln in REPROCESS_DAYS_FILE.read_text(encoding="utf-8").splitlines():
                s = ln.strip()
                if not s or s.startswith("#"):
                    continue
                if "," in s:
                    sym, d = [t.strip() for t in s.split(",", 1)]
                else:
                    sym, d = "*", s  # wildcard
                try:
                    dd = _yyyymmdd_to_date(d)
                except Exception:
                    # Keep going, but remember the line so it can be reported below.
                    bad_lines += 1
                    continue
                mapping.setdefault(sym, set()).add(dd)
        except Exception as e:
            _log("WARNING", f"No se pudo leer {REPROCESS_DAYS_FILE.name}: {e}")
        if bad_lines:
            _log("WARNING", f"{REPROCESS_DAYS_FILE.name}: {bad_lines} linea(s) con fecha invalida ignoradas.")

    # 2) Optional global range
    if REPROCESS_RANGE_YYYYMMDD:
        days = set(_expand_range_yyyymmdd(REPROCESS_RANGE_YYYYMMDD))
        if days:
            targets = REPROCESS_SYMBOLS if REPROCESS_SYMBOLS else {"*"}
            for sym in targets:
                mapping.setdefault(sym, set()).update(days)

    # Expand the '*' wildcard to every known symbol
    if "*" in mapping:
        wildcard = set(mapping.pop("*"))
        for sym in symbols_all:
            mapping.setdefault(sym, set()).update(wildcard)

    # Never reprocess the future; drop symbols that end up with nothing to do
    bounded: Dict[str, Set[date]] = {}
    for sym, requested in mapping.items():
        kept = {d for d in requested if d <= END_DAY_UTC}
        if kept:
            bounded[sym] = kept

    return bounded

# --------------------------------- Header ---------------------------------
# Banner: cell title, current time in the target zone and in UTC, and the paths in use.
print("="*110)
print(f"Inicio Celda 11 ‚Äî Ingesta incremental diaria M5 | TZ local: {TIMEZONE_IANA}")
print(f"Hora local: {_now_local_iana()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
print("-"*110)
print(f"DATA_ROOT: {DATA_ROOT}")
print(f"Fuente (m5_raw): {M5_RAW_DIR}")
print("-"*110)

# --------------------------------- Guards ---------------------------------
# Fails fast if the raw directory is missing; creates metadata scaffolding otherwise.
_ensure_dirs()
# NOTE(review): `assert` is stripped when Python runs with -O, so this check is not enforced there.
assert TIMEFRAME_LABEL == "M5", "TIMEFRAME_LABEL debe ser 'M5' en esta celda."
TIMEFRAME = mt5.TIMEFRAME_M5
_log("INFO", "Verificaci√≥n TF: usando expl√≠citamente TIMEFRAME_M5.")

# --------------------------------- Descubrimiento de archivos (lazy) ---------------------------------
# Inventario de part-files con Polars (para gap-detection/cat√°logo)
# Walk symbol=*/.../part=YYYYMMDD.parquet and record one dict per part-file.
files_meta: List[dict] = []
symbol_dirs = [entry for entry in sorted(M5_RAW_DIR.glob("symbol=*")) if entry.is_dir()]
for sym_dir in symbol_dirs:
    sym = sym_dir.name.partition("=")[2]
    for pf in sym_dir.rglob("part=*.parquet"):
        d = _parse_day_from_name(pf.name)
        if d:
            y, m = d[:4], d[4:6]
            # A file whose size cannot be read is still inventoried, with 0 bytes.
            try:
                sz = pf.stat().st_size
            except Exception:
                sz = 0
            files_meta.append(
                dict(symbol=sym, day_str=d, year=y, month=m, path=str(pf), bytes=int(sz))
            )

# Nothing to extend incrementally: stop the cell here.
if len(files_meta) == 0:
    raise SystemExit("No hay part=*.parquet en m5_raw; ejecuta primero la ingesta masiva inicial.")

files_df = pl.DataFrame(files_meta).lazy()

# √öltimo d√≠a existente por s√≠mbolo (lazy)
# Per symbol: the greatest day string found on disk and the number of part-files.
_last_day_aggs = [
    pl.col("day_str").max().alias("last_day_str"),
    pl.len().alias("n_files"),
]
last_day_per_symbol = files_df.group_by("symbol").agg(_last_day_aggs).collect()

symbols = last_day_per_symbol.get_column("symbol").to_list()
_last_days = last_day_per_symbol.get_column("last_day_str").to_list()
last_day_map = {name: last for name, last in zip(symbols, _last_days)}

# (opcional) detectar huecos intermedios si se habilita FILL_GAPS
# (optional) index of existing days per symbol, only needed when FILL_GAPS is enabled
days_by_symbol: Dict[str, set] = {}
if FILL_GAPS:
    by_sym = files_df.group_by("symbol").agg(pl.col("day_str")).collect()
    days_by_symbol.update(
        (str(name), set(day_list))
        for name, day_list in zip(by_sym["symbol"].to_list(), by_sym["day_str"].to_list())
    )

# (NEW) Load the days that must be reprocessed
reprocess_map = _load_reprocess_days(symbols)

if reprocess_map:
    n_pairs = sum(map(len, reprocess_map.values()))
    _log("INFO", f"D√≠as a reprocesar (desde QA / par√°metros): {n_pairs} pares s√≠mbolo-fecha detectados.")

# --------------------------------- Conexi√≥n MT5 ---------------------------------
# Connect to MT5, sleeping SLEEP_BASE * 2**attempt after every failed attempt.
_log("INFO", "Conectando a MT5 con reintentos...")
connected = False
attempt = 0
while attempt < RETRIES_CONN and not connected:
    connected = bool(mt5.initialize())
    if not connected:
        time.sleep(SLEEP_BASE * (2 ** attempt))
        attempt += 1
if not connected:
    raise RuntimeError("No se pudo inicializar MT5 tras m√∫ltiples intentos.")
_log("INFO", "Conexi√≥n a MT5 establecida.")

# Accumulators filled by the download loop.
total_days_added = 0
total_bytes_added = 0
per_symbol_added: Dict[str, int] = {}
per_symbol_minmax: Dict[str, Dict[str, Optional[int]]] = {}  # ms
per_symbol_reprocess_forced: Dict[str, int] = {}             # NEW

# Output column order for every written part-file.
col_order = _load_schema_column_order()

# --------------------------------- C√°lculo y descarga de incrementos ---------------------------------
# For every symbol: work out which UTC days must be (re)downloaded, fetch each day
# from MT5 with retries, and write exactly one part=YYYYMMDD.parquet per day.
#
# Fixes over the previous version:
#   * Bars are restricted to the half-open window [day_start, day_end). MT5's
#     copy_rates_range is documented as inclusive of `date_to`, so the 00:00 bar of
#     the NEXT day used to come back with day d; it was then written as a one-bar
#     file for d+1 (later skipped as "already exists") or, on forced reprocess,
#     overwrote the existing full d+1 file.
#   * The min/max dict no longer reuses the name `mm` (the month string), which
#     corrupted the date reported in the failure log after a retry.
#   * De-duplication keeps row order (maintain_order=True) so files stay sorted.
for sym in symbols:
    last_day_str = last_day_map.get(sym, None)
    if not last_day_str or not last_day_str.isdigit():
        _log("WARNING", f"{sym}: no se pudo determinar √∫ltimo d√≠a; omitido.")
        continue

    last_day = _yyyymmdd_to_date(last_day_str)
    start_missing = last_day + timedelta(days=1)

    # 1) "natural" missing days: from the day after the last stored one up to END_DAY_UTC
    missing_days: List[date] = []
    d = start_missing
    while d <= END_DAY_UTC:
        missing_days.append(d)
        d += timedelta(days=1)

    # 2) (optional) intermediate gaps between the first and last stored day
    if FILL_GAPS:
        existing = days_by_symbol.get(sym, set())
        if existing:
            min_d = _yyyymmdd_to_date(min(existing))
            max_d = _yyyymmdd_to_date(max(existing))
            cursor = min_d
            while cursor <= max_d:
                ds = f"{cursor.year:04d}{cursor.month:02d}{cursor.day:02d}"
                if ds not in existing:
                    missing_days.append(cursor)
                cursor += timedelta(days=1)

    # 3) (NEW) days flagged for reprocessing by QA / parameters
    repro_days: Set[date] = reprocess_map.get(sym, set())

    # Work list: unique, chronological, and never beyond the incremental window.
    union_days: List[date] = [
        day for day in sorted(set(missing_days).union(repro_days)) if day <= END_DAY_UTC
    ]

    if not union_days:
        continue

    # Download day by day
    for d in union_days:
        yyyy, mm, dd = f"{d.year:04d}", f"{d.month:02d}", f"{d.day:02d}"
        out_dir  = M5_RAW_DIR / f"symbol={sym}" / f"year={yyyy}" / f"month={mm}"
        out_file = out_dir / f"part={yyyy}{mm}{dd}.parquet"

        # Rewrite when the day is in the reprocess set or FORCE_REWRITE_DAY is on
        force_this_day = FORCE_REWRITE_DAY or (d in repro_days)

        if out_file.exists() and not force_this_day:
            _append_runlog({
                "ts_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
                "run_id": RUN_ID, "stage": "incremental_m5",
                "symbol": sym, "date": f"{yyyy}-{mm}-{dd}",
                "status": "skipped_exists", "path": str(out_file),
                "message": "Archivo existente (idempotencia)."
            })
            continue

        day_start, day_end = _day_edges_utc(d)
        # Window bounds in epoch milliseconds, same unit as the timestamp_utc column.
        day_start_ms = int(day_start.timestamp()) * 1000
        day_end_ms = int(day_end.timestamp()) * 1000

        ok = False; last_err = None
        for k in range(RETRIES_DAY):
            try:
                rates = mt5.copy_rates_range(sym, mt5.TIMEFRAME_M5, day_start, day_end)
                if rates is None:
                    raise RuntimeError(f"copy_rates_range devolvi√≥ None (last_error={mt5.last_error()})")

                # Conversion to Polars (no pandas); empty input gives an empty frame
                df = _mt5_array_to_polars(rates)
                if df.height > 0:
                    # Mandatory / derived fields
                    df = (
                        df
                        .with_columns([
                            pl.lit(sym).alias("symbol"),
                            (pl.col("time").cast(pl.Int64) * 1000).alias("timestamp_utc"),
                        ])
                        # Keep only bars that open inside [day_start, day_end)
                        .filter(
                            (pl.col("timestamp_utc") >= day_start_ms) &
                            (pl.col("timestamp_utc") < day_end_ms)
                        )
                        .with_columns([
                            pl.col("timestamp_utc").cast(pl.Datetime("ms")).dt.replace_time_zone("UTC").alias("_ts_utc_dt"),
                        ])
                        .with_columns([
                            pl.col("_ts_utc_dt").dt.convert_time_zone(TIMEZONE_IANA).alias("timestamp_gye"),
                            pl.col("open").cast(pl.Float64).alias("open"),
                            pl.col("high").cast(pl.Float64).alias("high"),
                            pl.col("low").cast(pl.Float64).alias("low"),
                            pl.col("close").cast(pl.Float64).alias("close"),
                            pl.when(pl.col("tick_volume").is_not_null()).then(pl.col("tick_volume").cast(pl.Int64)).otherwise(pl.lit(None, dtype=pl.Int64)).alias("tick_volume"),
                            pl.when(pl.col("real_volume").is_not_null()).then(pl.col("real_volume").cast(pl.Int64)).otherwise(pl.lit(None, dtype=pl.Int64)).alias("real_volume"),
                            pl.when(pl.col("spread").is_not_null()).then(pl.col("spread").cast(pl.Float64)).otherwise(pl.lit(None, dtype=pl.Float64)).alias("spread_points"),
                            pl.lit(None, dtype=pl.Utf8).alias("broker"),
                            pl.lit(None, dtype=pl.Utf8).alias("server_tz"),
                        ])
                        .sort("timestamp_utc")
                        .unique(subset=["timestamp_utc"], keep="last", maintain_order=True)
                    )

                if df.height == 0:
                    # No bar opens inside this day: record it and move on (no file written)
                    _append_runlog({
                        "ts_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
                        "run_id": RUN_ID, "stage": "incremental_m5",
                        "symbol": sym, "date": f"{yyyy}-{mm}-{dd}",
                        "status": "empty", "path": str(out_file),
                        "message": "Sin barras (feriado/cierre)."
                    })
                    ok = True
                    break

                out_day = df.select(
                    "timestamp_utc","timestamp_gye","symbol",
                    "open","high","low","close",
                    "tick_volume","real_volume","spread_points","broker","server_tz",
                )
                out_day = _ensure_schema_order(out_day, col_order)

                out_dir.mkdir(parents=True, exist_ok=True)
                out_day.write_parquet(out_file, compression=PARQUET_COMPRESSION, statistics=True)

                # Size is informational only; a failed stat must not fail the day
                try:
                    written_bytes: Optional[int] = int(out_file.stat().st_size)
                except OSError:
                    written_bytes = None
                if written_bytes is not None:
                    total_bytes_added += written_bytes
                total_days_added += 1
                per_symbol_added[sym] = per_symbol_added.get(sym, 0) + 1

                # Count forced rewrites (only when the file was actually written)
                if force_this_day:
                    per_symbol_reprocess_forced[sym] = per_symbol_reprocess_forced.get(sym, 0) + 1

                # min/max timestamp per symbol (for the range print-out)
                ts_range = out_day.select([pl.min("timestamp_utc").alias("mn"), pl.max("timestamp_utc").alias("mx")]).to_dicts()[0]
                vmin, vmax = ts_range["mn"], ts_range["mx"]
                if sym not in per_symbol_minmax:
                    per_symbol_minmax[sym] = {"min": vmin, "max": vmax}
                else:
                    per_symbol_minmax[sym]["min"] = min(per_symbol_minmax[sym]["min"], vmin)
                    per_symbol_minmax[sym]["max"] = max(per_symbol_minmax[sym]["max"], vmax)

                _append_runlog({
                    "ts_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
                    "run_id": RUN_ID, "stage": "incremental_m5",
                    "symbol": sym, "date": f"{yyyy}-{mm}-{dd}",
                    "status": ("reprocess_forced" if force_this_day else "ok"),
                    "path": str(out_file),
                    "bytes": written_bytes,
                    "message": ("Reescritura forzada por QA (rejilla<umbral)" if force_this_day else "Escritura incremental completada")
                })

                ok = True
                break

            except Exception as e:
                # Remember the error and back off exponentially before the next attempt
                last_err = repr(e)
                time.sleep(SLEEP_BASE * (2 ** k))

        if not ok:
            _append_runlog({
                "ts_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
                "run_id": RUN_ID, "stage": "incremental_m5",
                "symbol": sym, "date": f"{yyyy}-{mm}-{dd}",
                "status": "fail", "path": str(out_file),
                "message": f"Fallo tras reintentos: {last_err}"
            })
            _log("WARNING", f"{sym} {yyyy}-{mm}-{dd}: fallo definitivo tras reintentos.")

# Close the MT5 connection. This is best effort: a failure here must not abort
# the cell, but it is reported instead of being silently discarded.
try:
    mt5.shutdown()
    _log("INFO", "Conexi√≥n MT5 cerrada.")
except Exception as exc:
    # Plain print on purpose: the failure may have come from _log itself.
    print(f"[WARNING] No se pudo cerrar MT5 limpiamente: {exc!r}", flush=True)

# --------------------------------- Final re-scan (for the catalog) ---------------------------------
# Rebuild the on-disk inventory after ingestion, since new day files may have been written.
files_meta2: List[dict] = []
for sym_dir in sorted(M5_RAW_DIR.glob("symbol=*")):
    if not sym_dir.is_dir():
        continue
    sym = sym_dir.name.split("=", 1)[-1]
    for pf in sym_dir.rglob("part=*.parquet"):
        d = _parse_day_from_name(pf.name)
        if not d:
            continue
        y = d[0:4]
        m = d[4:6]
        try:
            sz = pf.stat().st_size
        except Exception:
            # Size is informational only; keep the file in the inventory with 0 bytes.
            sz = 0
        files_meta2.append({"symbol": sym, "day_str": d, "year": y, "month": m, "path": str(pf), "bytes": int(sz)})

# An empty inventory must still expose the expected columns; otherwise the
# group_by/select below fail with a "column not found" error.
_inventory_schema = {
    "symbol": pl.Utf8, "day_str": pl.Utf8, "year": pl.Utf8,
    "month": pl.Utf8, "path": pl.Utf8, "bytes": pl.Int64,
}
files_all_lf = (
    pl.DataFrame(files_meta2) if files_meta2 else pl.DataFrame(schema=_inventory_schema)
).lazy()

# Catalog metrics per symbol (lazy plan, collected once).
agg = (
    files_all_lf
    .group_by("symbol")
    .agg([
        pl.len().alias("n_files"),
        pl.col("day_str").n_unique().alias("n_days"),
        # Distinct calendar months are distinct (year, month) pairs. Counting
        # "month" alone would cap the value at 12 for multi-year histories.
        pl.concat_str([pl.col("year"), pl.col("month")]).n_unique().alias("n_months"),
        pl.col("year").n_unique().alias("n_years"),
        pl.col("bytes").sum().alias("bytes"),
        pl.col("day_str").min().alias("min_day_str"),
        pl.col("day_str").max().alias("max_day_str"),
    ])
    .sort("symbol")  # deterministic catalog row order across runs
    .collect()
)

# Index (symbol, day_str) -> path
idx_lf = files_all_lf.select(["symbol", "day_str", "path"]).collect()
paths_lookup: Dict[Tuple[str, str], str] = {
    (str(s), str(day)): str(p) for s, day, p in idx_lf.iter_rows()
}

# first_ts_utc / last_ts_utc: read ONLY the oldest and the newest day file of each symbol.
first_ts_map: Dict[str, Optional[int]] = {}
last_ts_map:  Dict[str, Optional[int]] = {}
for row in agg.iter_rows(named=True):
    sym = row["symbol"]
    min_day = row["min_day_str"]
    max_day = row["max_day_str"]
    first_ts_map[sym] = None
    last_ts_map[sym]  = None
    p_min = paths_lookup.get((sym, min_day))
    if p_min and os.path.exists(p_min):
        try:
            tmin = pl.read_parquet(p_min, columns=["timestamp_utc"])
            if tmin.height > 0:
                first_ts_map[sym] = int(tmin.select(pl.col("timestamp_utc").min()).item())
        except Exception as e:
            _log("WARNING", f"{sym}: no se pudo leer first_ts de {Path(p_min).name}: {e}")
    p_max = paths_lookup.get((sym, max_day))
    if p_max and os.path.exists(p_max):
        try:
            tmax = pl.read_parquet(p_max, columns=["timestamp_utc"])
            if tmax.height > 0:
                last_ts_map[sym] = int(tmax.select(pl.col("timestamp_utc").max()).item())
        except Exception as e:
            _log("WARNING", f"{sym}: no se pudo leer last_ts de {Path(p_max).name}: {e}")

# Build the final catalog and persist it (Polars).
now_ms = int(datetime.now(timezone.utc).timestamp() * 1000)
cat_rows: List[dict] = []
for r in agg.iter_rows(named=True):
    sym = r["symbol"]
    cat_rows.append({
        "symbol": sym,
        "first_ts_utc": None if first_ts_map.get(sym) is None else int(first_ts_map[sym]),
        "last_ts_utc":  None if last_ts_map.get(sym)  is None else int(last_ts_map[sym]),
        "n_files": int(r["n_files"]),
        "n_days": int(r["n_days"]),
        "n_months": int(r["n_months"]),
        "n_years": int(r["n_years"]),
        "bytes": int(r["bytes"]),
        "last_update_ts_utc": int(now_ms),
        "notes": "",
    })

_catalog_schema = {
    "symbol": pl.Utf8,
    "first_ts_utc": pl.Int64,
    "last_ts_utc": pl.Int64,
    "n_files": pl.Int64,
    "n_days": pl.Int64,
    "n_months": pl.Int64,
    "n_years": pl.Int64,
    "bytes": pl.Int64,
    "last_update_ts_utc": pl.Int64,
    "notes": pl.Utf8,
}
if cat_rows:
    catalog_pl = pl.from_dicts(cat_rows).with_columns(
        [pl.col(name).cast(dtype) for name, dtype in _catalog_schema.items()]
    )
else:
    # pl.from_dicts cannot infer a schema from zero rows; write a typed empty catalog instead.
    catalog_pl = pl.DataFrame(schema=_catalog_schema)
catalog_pl.write_parquet(DATASET_CATALOG_PATH, compression="zstd", statistics=True)

# --------------------------------- Required printouts ---------------------------------
print("="*80)
print(f"Incremento M5 escrito en: {M5_RAW_DIR}")
print(f"Archivos (d√≠as) agregados: {total_days_added}")

# Top-5 symbols by number of days added in this run, with the timestamp range written.
ranking = sorted(per_symbol_added.items(), key=lambda item: item[1], reverse=True)[:5]
print("D√≠as a√±adidos por s√≠mbolo (top-5):")
if not ranking:
    print("  - (sin d√≠as a√±adidos en esta corrida)")
for sym, cnt in ranking:
    bounds = per_symbol_minmax.get(sym, {})
    vmin = bounds.get("min")
    vmax = bounds.get("max")
    print(f"  - {sym}: {cnt} d√≠as | {_ms_to_iso(vmin)} ‚Üí {_ms_to_iso(vmax)}")

# Summary of QA-forced reprocesses (only printed when at least one happened).
if per_symbol_reprocess_forced:
    print("-"*80)
    print("Reprocesos forzados por s√≠mbolo (QA rejilla<umbral):")
    forced_ranking = sorted(
        per_symbol_reprocess_forced.items(), key=lambda item: item[1], reverse=True
    )[:10]
    for sym, cnt in forced_ranking:
        print(f"  - {sym}: {cnt} d√≠a(s) reprocesado(s)")

# Total size added, shown in bytes, MiB and GiB.
mib = total_bytes_added / 1048576
gib = total_bytes_added / 1073741824
print("-"*80)
print(f"Tama√±o total agregado: {total_bytes_added} bytes ({mib:.2f} MiB | {gib:.2f} GiB)")

# Catalog: read it back with Polars and show its schema.
try:
    cat = pl.read_parquet(DATASET_CATALOG_PATH)
    cols = list(cat.columns)
    dtypes = list(map(str, cat.dtypes))
    print("-"*80)
    print(f"Cat√°logo actualizado: {DATASET_CATALOG_PATH} | Filas={cat.height} | Columnas={len(cols)}")
    print("Esquema cat√°logo:")
    for c, t in zip(cols, dtypes):
        print(f"  - {c}: {t}")
except Exception as e:
    print("-"*80)
    print(f"‚ö†Ô∏è  No se pudo leer el cat√°logo tras la actualizaci√≥n: {e}")

print("-"*80)
print("‚úÖ Incremental M5 completado: OK para continuar con Celda 12")
# ==============================================================================================================


Inicio Celda 11 ‚Äî Ingesta incremental diaria M5 | TZ local: America/Guayaquil
Hora local: 2025-12-02T23:39:26-05:00 | Hora UTC: 2025-12-03T04:39:26+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT: C:\Quant\MT5_Data_Extraction\data
Fuente (m5_raw): C:\Quant\MT5_Data_Extraction\data\bulk_data\m5_raw
--------------------------------------------------------------------------------------------------------------
[2025-12-03T04:39:26+00:00] [20251202_232253] [INFO] [11-IngestaM5] Verificaci√≥n TF: usando expl√≠citamente TIMEFRAME_M5.
[2025-12-03T04:39:29+00:00] [20251202_232253] [INFO] [11-IngestaM5] Conectando a MT5 con reintentos...
[2025-12-03T04:39:29+00:00] [20251202_232253] [INFO] [11-IngestaM5] Conexi√≥n a MT5 establecida.
[2025-12-03T04:39:30+00:00] [20251202_232253] [INFO] [11-IngestaM5] Conexi√≥n MT5 cerrada.
Incremento M5 escrito en: C:\Quant\MT5_Data_Extraction\data\bulk_data\m5_raw
Archivos (d√≠as) a

In [13]:
# ======================= Celda 12 — Materializaciones listas para filtrar (POLARS, ventanas y QA) =======================
# Propósito:
#   - Construir ventanas last_{30,90,180}d desde m5_clean (o fallback m5_raw→limpieza on-the-fly).
#   - Validar ANTES de escribir: duplicados=0 y nulls críticos (timestamp_utc, OHLC) = 0.
#   - Escribir en processed_data/m5_windows/window=<win>/symbol=<sym>/part=YYYYMMDD.parquet (idempotente).
#   - Imprimir métricas claras: filas/columnas, esquema, rangos UTC/GYE, nulls críticos, duplicados.
#   - Progreso (heartbeat).
# ========================================================================================================================

from __future__ import annotations
import os, re, time, shutil, json
from pathlib import Path
from datetime import datetime, timedelta, timezone, date
from typing import List, Dict, Optional, Tuple

try:
    import polars as pl
except Exception as e:
    raise RuntimeError("Se requiere 'polars'. Inst√°lalo e intenta de nuevo.") from e

# ---------------------------------- Config and paths ----------------------------------
# Label included in every log line emitted by this cell's _log.
CELL_LABEL = "12-M5-Windows"
# Reuse the RUN_ID already present in the notebook globals; otherwise derive one from the current UTC time.
RUN_ID = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))

# NOTE(review): the default argument is evaluated eagerly, so `path_contract` must
# exist even when DATA_ROOT is already defined in globals — confirm it always is.
DATA_ROOT = Path(globals().get("DATA_ROOT", str(path_contract.data_root()))).resolve()

# Input directories (raw and clean M5 day files) and the output directory for windows.
RAW_DIR   = DATA_ROOT / "bulk_data" / "m5_raw"
CLEAN_DIR = DATA_ROOT / "historical_data" / "m5_clean"
WIN_DIR   = DATA_ROOT / "processed_data" / "m5_windows"

# Metadata: column-order schema and the optional eligible-symbols filter (parquet or txt).
META_DIR  = DATA_ROOT / "metadata"
SCHEMA_JSON = META_DIR / "schema_m5.json"
FILTERS_DIR  = META_DIR / "filters"
ELIGIBLE_PARQUET = FILTERS_DIR / "eligible_symbols_by_cost.parquet"
ELIGIBLE_TXT     = FILTERS_DIR / "eligible_symbols_by_cost.txt"

# Flags
FORCE_REWRITE_WINDOWS  = bool(globals().get("FORCE_REWRITE_WINDOWS", False))
TIMEZONE_IANA          = globals().get("TIMEZONE_IANA", "America/Guayaquil")

# Windows (length in days; each window ends YESTERDAY in UTC)
WINDOWS = {"last_30d": 30, "last_90d": 90, "last_180d": 180}

# Parquet write options
PARQUET_COMP = "zstd"
PARQUET_STATS = True

# Progress heartbeat: log every N processed files or every T seconds, whichever comes first.
PROGRESS_EVERY_FILES   = int(globals().get("PROGRESS_EVERY_FILES_WIN", 200))
PROGRESS_EVERY_SECONDS = float(globals().get("PROGRESS_EVERY_SECONDS_WIN", 2.0))

# ---------------------------------- Utils ----------------------------------
def _log(level: str, msg: str):
    """Print one log line: naive system-local timestamp, run id, level, cell label, message."""
    stamp = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
    print(f"[{stamp}] [{RUN_ID}] [{level}] [{CELL_LABEL}] {msg}", flush=True)

def _now_local_iana() -> str:
    """Return the current time in TIMEZONE_IANA as ISO 8601 (seconds precision).

    If the zone cannot be resolved (or anything else fails), a fixed hint
    string is returned instead of raising.
    """
    try:
        from zoneinfo import ZoneInfo
        instant_utc = datetime.now(timezone.utc)
        target_zone = ZoneInfo(TIMEZONE_IANA)
        return instant_utc.astimezone(target_zone).isoformat(timespec="seconds")
    except Exception:
        return "(instala 'tzdata' para TZ locales)"

def _schema_column_order() -> list[str]:
    """Return the canonical M5 column order.

    Reads "column_order" from SCHEMA_JSON. Any read/parse failure, or a
    missing/empty entry, yields the built-in default order.
    """
    fallback = ["timestamp_utc","symbol","open","high","low","close","tick_volume",
                "real_volume","spread_points","broker","server_tz"]
    try:
        declared = json.loads(SCHEMA_JSON.read_text(encoding="utf-8")).get("column_order")
    except Exception:
        return fallback
    return declared if declared else fallback

def _ensure_cols(df: pl.DataFrame, cols: list[str]) -> pl.DataFrame:
    """Return `df` restricted to `cols` (in that order), adding absent ones as null literals."""
    already_there = df.columns
    absent = [name for name in cols if name not in already_there]
    if absent:
        null_fillers = [pl.lit(None).alias(name) for name in absent]
        df = df.with_columns(null_fillers)
    ordered = [name for name in cols if name in df.columns]
    return df.select(ordered)

def _clean_day(df: pl.DataFrame, symbol: str) -> pl.DataFrame:
    """Normalize one day of M5 bars to the canonical schema and drop invalid rows.

    Steps:
      1. Project onto the canonical column order (absent columns become nulls).
      2. Lenient casts: prices/spread to Float64 (NaN -> null); volumes and
         timestamp to Int64 via Float64; text columns to Utf8.
      3. Overwrite "symbol" with the `symbol` argument.
      4. Drop rows with a null timestamp or null OHLC, and rows where low > high.
      5. Sort by timestamp and keep the last row of each duplicated timestamp.
      6. Apply the final dtypes (spread_points is stored as Float32).

    Returns the cleaned frame, sorted ascending by "timestamp_utc".
    """
    cols = _schema_column_order()
    df = _ensure_cols(df, cols)

    float_cols = ("open", "high", "low", "close", "spread_points")
    int_cols = ("tick_volume", "real_volume", "timestamp_utc")
    text_cols = ("symbol", "broker", "server_tz")

    casts = []
    for c in float_cols:
        if c in df.columns:
            casts.append(pl.col(c).cast(pl.Float64, strict=False).fill_nan(None))
    for c in int_cols:
        if c in df.columns:
            # Go through Float64 so NaN can be turned into null before the integer cast.
            casts.append(pl.col(c).cast(pl.Float64, strict=False).fill_nan(None).cast(pl.Int64, strict=False))
    for c in text_cols:
        if c in df.columns:
            casts.append(pl.col(c).cast(pl.Utf8, strict=False))
    df = df.with_columns(casts)

    df = df.with_columns(pl.lit(symbol).alias("symbol"))
    df = df.filter(pl.all_horizontal(pl.col(["timestamp_utc","open","high","low","close"]).is_not_null()))
    df = df.filter(pl.col("low") <= pl.col("high"))
    # maintain_order=True on both steps is required:
    #   - a stable sort keeps the original row order among equal timestamps, so
    #     keep="last" deterministically keeps the row that appeared last;
    #   - unique() without maintain_order does not preserve the sort, which
    #     would leave the output in unspecified order.
    df = (
        df.sort("timestamp_utc", maintain_order=True)
          .unique(subset=["timestamp_utc"], keep="last", maintain_order=True)
    )

    fin_cast = {
        "spread_points": pl.Float32,
        "open": pl.Float64, "high": pl.Float64, "low": pl.Float64, "close": pl.Float64,
        "tick_volume": pl.Int64, "real_volume": pl.Int64, "timestamp_utc": pl.Int64,
        "symbol": pl.Utf8, "broker": pl.Utf8, "server_tz": pl.Utf8,
    }
    df = df.with_columns([pl.col(k).cast(v, strict=False) for k, v in fin_cast.items() if k in df.columns])
    return _ensure_cols(df, cols)

def _symbols_from_dir(base: Path) -> list[str]:
    """List, sorted, the symbol names of the `symbol=<name>` sub-directories of `base`.

    Returns an empty list when `base` does not exist; non-directory entries are ignored.
    """
    if not base.exists():
        return []
    names = [
        entry.name.partition("=")[2]
        for entry in base.glob("symbol=*")
        if entry.is_dir()
    ]
    names.sort()
    return names

def _day_from_name(name: str) -> Optional[str]:
    """Extract the 8-digit day (YYYYMMDD) following "part=" in `name`, or None if absent."""
    found = re.search(r"part=([0-9]{8})", name)
    if found is None:
        return None
    return found.group(1)

def _ymd(d: date) -> str:
    """Format a date as a zero-padded YYYYMMDD string."""
    return "{:04d}{:02d}{:02d}".format(d.year, d.month, d.day)

def _out_window_path(window_name: str, symbol: str, ymd: str) -> Path:
    """Build the output path WIN_DIR/window=<w>/symbol=<s>/part=<ymd>.parquet."""
    return WIN_DIR.joinpath(
        "window=" + window_name,
        "symbol=" + symbol,
        "part=" + ymd + ".parquet",
    )

def _clean_path(symbol: str, ymd: str) -> Path:
    """Build the clean-layer path CLEAN_DIR/symbol=<s>/year=YYYY/month=MM/part=<ymd>.parquet."""
    year, month = ymd[:4], ymd[4:6]
    return CLEAN_DIR.joinpath(
        "symbol=" + symbol,
        "year=" + year,
        "month=" + month,
        "part=" + ymd + ".parquet",
    )

def _raw_path(symbol: str, ymd: str) -> Optional[Path]:
    """Return the raw-layer day file for (symbol, ymd) if it exists on disk, else None."""
    year, month = ymd[:4], ymd[4:6]
    candidate = RAW_DIR.joinpath(
        "symbol=" + symbol,
        "year=" + year,
        "month=" + month,
        "part=" + ymd + ".parquet",
    )
    if not candidate.exists():
        return None
    return candidate

def _utc_ms_to_local_iso(ms: int) -> str:
    """Render an epoch-milliseconds value as ISO 8601 (seconds precision) in TIMEZONE_IANA.

    If the zone conversion fails, the same instant is rendered in UTC instead.
    """
    try:
        from zoneinfo import ZoneInfo
        instant = datetime.fromtimestamp(int(ms) / 1000, tz=timezone.utc)
        localized = instant.astimezone(ZoneInfo(TIMEZONE_IANA))
        return localized.isoformat(timespec="seconds")
    except Exception:
        instant_utc = datetime.fromtimestamp(int(ms) / 1000, tz=timezone.utc)
        return instant_utc.isoformat(timespec="seconds")

def _read_eligible_set() -> tuple[set[str] | None, str | None]:
    """Load the eligible-symbols filter, preferring the parquet file over the txt file.

    Returns (symbols, source) where source is "parquet" or "txt", or
    (None, None) when neither file yields a set. Any error while reading the
    parquet file falls through to the txt file.
    """
    try:
        if ELIGIBLE_PARQUET.exists():
            table = pl.read_parquet(ELIGIBLE_PARQUET)
            # Use "symbol" when present; otherwise take the first column.
            key = "symbol" if "symbol" in table.columns else table.columns[0]
            return set(table[key].cast(pl.Utf8).to_list()), "parquet"
    except Exception:
        pass
    if not ELIGIBLE_TXT.exists():
        return None, None
    # One symbol per line; blank lines are ignored.
    stripped = (line.strip() for line in ELIGIBLE_TXT.read_text(encoding="utf-8").splitlines())
    return {value for value in stripped if value}, "txt"

# ---------------------------------- Header ----------------------------------
# Banner: cell title, current local/UTC time, and the directories involved.
print("="*110)
print(f"Inicio Celda 12 ‚Äî Materializaciones listas para filtrar | TZ local: {TIMEZONE_IANA}")
print(f"Hora local: {_now_local_iana()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
print("-"*110)
print(f"DATA_ROOT: {DATA_ROOT}")
print(f"Entradas preferidas: {CLEAN_DIR}  (fallback a {RAW_DIR} si falta alg√∫n d√≠a)")
print(f"Salida: {WIN_DIR}")
print("-"*110)

# ---------------------------------- Discovery ----------------------------------
# Make sure the directories exist so the globbing below works on a fresh tree.
CLEAN_DIR.mkdir(parents=True, exist_ok=True)
WIN_DIR.mkdir(parents=True, exist_ok=True)
META_DIR.mkdir(parents=True, exist_ok=True)

# Symbols available in each layer (taken from the symbol=<name> sub-directories).
syms_clean = _symbols_from_dir(CLEAN_DIR)
syms_raw   = _symbols_from_dir(RAW_DIR)

# Optional eligibility filter; elig_set is None when no filter file is found.
elig_set, elig_src = _read_eligible_set()
if elig_set is not None:
    print(f"Filtro 3B detectado ({elig_src}): s√≠mbolos elegibles={len(elig_set)}", flush=True)
else:
    print("Filtro 3B no encontrado; ventanas se construyen para todos los s√≠mbolos presentes.", flush=True)

print(f"S√≠mbolos con clean: {len(syms_clean)} | s√≠mbolos con raw: {len(syms_raw)}", flush=True)

# ---------------------------------- Window construction ----------------------------------
now_utc = datetime.now(timezone.utc)
end_day = (now_utc - timedelta(days=1)).date()  # yesterday (UTC): last day included in every window

# Canonical column order, resolved once for the whole cell.
schema_cols = _schema_column_order()

def _build_window(window_name: str, n_days: int) -> Dict[str, object]:
    """Materialize one rolling window of daily M5 files and return its summary report.

    For every selected symbol and every day in the `n_days` days ending at the
    module-level `end_day`, the day file is read from the clean layer (falling
    back to the raw layer), validated, cleaned if needed, and written to the
    window directory. Existing outputs are skipped unless
    FORCE_REWRITE_WINDOWS is set.

    Args:
        window_name: Name used in the `window=<name>` output directory and in logs.
        n_days: Number of consecutive days in the window.

    Returns:
        A dict with completeness counters, files/bytes written in this call,
        aggregate metrics over all window files present on disk (rows, schema,
        timestamp range, duplicates, OHLC null rows) and the per-file QA flag
        counters observed before the corrective cleaning.

    Per-file errors are logged as warnings and do not abort the window.
    """
    critical_cols = ["timestamp_utc", "open", "high", "low", "close"]

    start_day = end_day - timedelta(days=n_days - 1)
    expected = [_ymd(start_day + timedelta(days=i)) for i in range(n_days)]

    # Symbol universe.
    # NOTE(review): symbols that exist only in the raw layer are skipped
    # whenever at least one clean symbol directory exists — confirm intended.
    symbols = syms_clean if syms_clean else syms_raw
    if elig_set is not None:
        symbols = [s for s in symbols if s in elig_set]

    files_written = 0
    bytes_written = 0

    qa_dups_flag_files = 0
    qa_nulls_flag_files = 0
    qa_bad_lowhigh_flag_files = 0

    t0 = time.monotonic()
    last_beat = t0
    processed = 0

    def _qa_flags(frame: pl.DataFrame) -> Tuple[bool, bool, int]:
        """Return (has_nulls, has_low_gt_high, duplicate_timestamp_count) over ALL rows."""
        # The per-row boolean must be reduced with .any(). Reading only element
        # 0 would inspect just the first row and raise on an empty frame.
        has_nulls = bool(
            frame.select(
                pl.any_horizontal([pl.col(c).is_null() for c in critical_cols]).any()
            ).item()
        )
        bad_lh = bool(frame.select((pl.col("low") > pl.col("high")).any()).item())
        dups = int(frame.height - frame.select(pl.col("timestamp_utc").n_unique()).item())
        return has_nulls, bad_lh, dups

    def _heartbeat() -> None:
        """Count one processed file and log progress every N files or T seconds."""
        nonlocal processed, last_beat
        processed += 1
        now = time.monotonic()
        if (processed % PROGRESS_EVERY_FILES == 0) or (now - last_beat >= PROGRESS_EVERY_SECONDS):
            rate = processed / max(now - t0, 1e-6)
            _log("INFO", f"[{window_name}] progreso: proc={processed} | escritos={files_written} | {rate:.1f} files/s")
            last_beat = now

    _log("INFO", f"[{window_name}] s√≠mbolos a procesar: {len(symbols)} | d√≠as esperados por s√≠mbolo: {n_days}")

    # --- Idempotent write per symbol/day, with QA before writing ---
    for symbol in symbols:
        for ymd in expected:
            src_path = _clean_path(symbol, ymd)
            src_kind = "clean"
            if not src_path.exists():
                alt = _raw_path(symbol, ymd)
                if alt is None:
                    continue  # no source for this day: not counted as processed
                src_path = alt
                src_kind = "raw"

            dst = _out_window_path(window_name, symbol, ymd)
            if dst.exists() and not FORCE_REWRITE_WINDOWS:
                # Not rewritten, but it still counts toward the final metrics.
                _heartbeat()
                continue

            try:
                df = pl.read_parquet(src_path)
                if src_kind == "raw":
                    df = _clean_day(df, symbol)  # normalizes and de-duplicates
                else:
                    df = _ensure_cols(df, schema_cols)

                # QA on the source frame.
                has_nulls, bad_lh, dups_cnt = _qa_flags(df)
                if has_nulls: qa_nulls_flag_files += 1
                if bad_lh:    qa_bad_lowhigh_flag_files += 1
                if dups_cnt:  qa_dups_flag_files += 1

                # If any check failed, clean and re-validate. A frame that
                # passed every check is unchanged, so its flags still hold.
                if has_nulls or bad_lh or dups_cnt:
                    df = _clean_day(df, symbol)
                    has_nulls2, bad_lh2, dups_cnt2 = _qa_flags(df)
                else:
                    has_nulls2, bad_lh2, dups_cnt2 = False, False, 0

                # Hard gate: only clean, non-empty frames are written.
                if (not has_nulls2) and (not bad_lh2) and (dups_cnt2 == 0) and (df.height > 0):
                    dst.parent.mkdir(parents=True, exist_ok=True)
                    df.write_parquet(dst, compression=PARQUET_COMP, statistics=PARQUET_STATS)
                    files_written += 1
                    try:
                        bytes_written += dst.stat().st_size
                    except Exception:
                        pass  # the size is informational only
                else:
                    _log("WARNING", f"[{window_name}] {symbol} {ymd}: QA no superada tras limpieza (omitido).")

            except Exception as e:
                _log("WARNING", f"[{window_name}] {symbol} {ymd}: error {e.__class__.__name__}")

            _heartbeat()

    # --- Completeness and the list of files used for metrics (includes pre-existing files) ---
    symbols_any = 0
    symbols_complete = 0
    symbols_partial = 0
    metric_paths: List[str] = []

    for symbol in symbols:
        present_after = 0
        for ymd in expected:
            p = _out_window_path(window_name, symbol, ymd)
            if p.exists():
                present_after += 1
                metric_paths.append(str(p))
        if present_after > 0:
            symbols_any += 1
            if present_after == len(expected):
                symbols_complete += 1
            else:
                symbols_partial += 1

    metric_paths = sorted(set(metric_paths))

    # --- Aggregate metrics over the WHOLE window on disk (written now + pre-existing) ---
    agg_rows = 0
    schema_dict = {}
    ts_min = None
    ts_max = None
    nulls_ohlc = 0
    dups_union = 0

    if metric_paths:
        lf = pl.scan_parquet(metric_paths, low_memory=True)

        schema_obj = lf.collect_schema()
        try:
            schema_dict = {k: str(v) for k, v in schema_obj.items()}
        except Exception:
            try:
                schema_dict = {k: str(v) for k, v in zip(schema_obj.names(), schema_obj.dtypes())}
            except Exception:
                schema_dict = {}

        # Duplicates across the union, keyed by (symbol, timestamp_utc).
        dups_union_expr = (
            pl.len() - pl.concat_str([
                pl.col("symbol"),
                pl.lit("|"),
                pl.col("timestamp_utc").cast(pl.Utf8),
            ]).n_unique()
        ).alias("dups_union")

        agg = lf.select([
            pl.len().alias("rows"),
            pl.col("timestamp_utc").min().alias("ts_min"),
            pl.col("timestamp_utc").max().alias("ts_max"),
            dups_union_expr,
            pl.any_horizontal([pl.col(c).is_null() for c in ["open","high","low","close"]]).sum().alias("null_rows_ohlc"),
        ]).collect()

        agg_rows = int(agg["rows"][0])
        ts_min = None if agg["ts_min"][0] is None else int(agg["ts_min"][0])
        ts_max = None if agg["ts_max"][0] is None else int(agg["ts_max"][0])
        dups_union = int(agg["dups_union"][0])
        nulls_ohlc = int(agg["null_rows_ohlc"][0])

    return {
        "window": window_name,
        "days": n_days,
        "symbols_any": symbols_any,
        "symbols_complete": symbols_complete,
        "symbols_partial": symbols_partial,
        "files_written": files_written,
        "bytes_written": bytes_written,
        "agg_rows": agg_rows,
        "schema": schema_dict,
        "ts_min": ts_min,
        "ts_max": ts_max,
        "dups_union": dups_union,
        "nulls_ohlc": nulls_ohlc,
        "qa_inputs_flagged": {
            "files_with_nulls": qa_nulls_flag_files,
            "files_with_low_gt_high": qa_bad_lowhigh_flag_files,
            "files_with_dups": qa_dups_flag_files,
        }
    }

# Build every configured window, in the order declared in WINDOWS.
reports = [_build_window(name, days) for name, days in WINDOWS.items()]

# ---------------------------------- Required final printouts ----------------------------------
print("-"*110)
for rep in reports:
    win = rep["window"]
    n_bytes = rep["bytes_written"]
    print(f"Ventana: {win} (√∫ltimos {rep['days']} d√≠as) ‚Üí {WIN_DIR / ('window=' + win)}")
    print(f"  - S√≠mbolos con datos: {rep['symbols_any']} | COMPLETAS: {rep['symbols_complete']} | PARCIALES: {rep['symbols_partial']}")
    print(f"  - Archivos escritos: {rep['files_written']} | Bytes: {n_bytes} ({n_bytes/1048576:.2f} MiB)")
    print(f"  - Filas totales (ventana presente): {rep['agg_rows']}")
    if rep["schema"]:
        print("  - Esquema:")
        for col_name, col_type in rep["schema"].items():
            print(f"      ‚Ä¢ {col_name}: {col_type}")
    lo_ms, hi_ms = rep["ts_min"], rep["ts_max"]
    if lo_ms is None or hi_ms is None:
        print("  - Rango: (sin filas)")
    else:
        # The same two instants, shown first in UTC and then in the configured local zone.
        utc_min = datetime.fromtimestamp(lo_ms/1000, tz=timezone.utc).isoformat(timespec="seconds")
        utc_max = datetime.fromtimestamp(hi_ms/1000, tz=timezone.utc).isoformat(timespec="seconds")
        gye_min = _utc_ms_to_local_iso(lo_ms)
        gye_max = _utc_ms_to_local_iso(hi_ms)
        print(f"  - Rango UTC: {utc_min} ‚Üí {utc_max}")
        print(f"  - Rango {TIMEZONE_IANA}: {gye_min} ‚Üí {gye_max}")
    print(f"  - QA union: duplicados (symbol,timestamp_utc) = {rep['dups_union']}  |  nulls cr√≠ticos (OHLC) = {rep['nulls_ohlc']}")
    qaf = rep["qa_inputs_flagged"]
    print(f"  - QA entrada (antes de limpiar): files con nulls={qaf['files_with_nulls']}, low>high={qaf['files_with_low_gt_high']}, dups={qaf['files_with_dups']}")
    print("-"*110)

# Totals across all windows.
total_files = sum(rep["files_written"] for rep in reports)
total_bytes = sum(rep["bytes_written"] for rep in reports)
print(f"Total ventanas escritas: {total_files} archivos | {total_bytes} bytes ({total_bytes/1048576:.2f} MiB)")
print("‚úÖ Materializaciones listas para filtrar (ventanas construidas con QA previa).")
# ========================================================================================================================
 

Inicio Celda 12 ‚Äî Materializaciones listas para filtrar | TZ local: America/Guayaquil
Hora local: 2025-12-02T23:39:34-05:00 | Hora UTC: 2025-12-03T04:39:34+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT: C:\Quant\MT5_Data_Extraction\data
Entradas preferidas: C:\Quant\MT5_Data_Extraction\data\historical_data\m5_clean  (fallback a C:\Quant\MT5_Data_Extraction\data\bulk_data\m5_raw si falta alg√∫n d√≠a)
Salida: C:\Quant\MT5_Data_Extraction\data\processed_data\m5_windows
--------------------------------------------------------------------------------------------------------------
Filtro 3B detectado (parquet): s√≠mbolos elegibles=107
S√≠mbolos con clean: 92 | s√≠mbolos con raw: 110
[2025-12-02 23:39:34] [20251202_232253] [INFO] [12-M5-Windows] [last_30d] s√≠mbolos a procesar: 89 | d√≠as esperados por s√≠mbolo: 30
[2025-12-02 23:39:34] [20251202_232253] [INFO] [12-M5-Windows] [last_30d] progreso: proc=200 | es

In [14]:
# ======================= Celda 12A — Data Quality Summary por símbolo (POLARS, ventanas M5) =======================
# Propósito:
#   - Leer ventanas M5 trading-ready (window=last_180d por defecto).
#   - Resumir por símbolo: cobertura temporal, % días presentes, % velas faltantes, gaps grandes.
#   - Construir un score homogéneo de calidad (0–100) y un flag ("OK", "WARNING", "BAD").
#   - Escribir: metadata/data_quality_summary.parquet
#   - Si algo falla en la lectura, escribir un archivo vacío y NO romper el pipeline.
#
# v2 — Ajuste por tipo de sesión (no penalizar acciones como si fueran 24h):
#   - Lee metadata/qa_operativa_summary.parquet (Celda 10).
#   - Usa expected_bars_per_day_symbol por símbolo como "velas esperadas" de referencia.
#   - Fallback limpio a EXPECTED_BARS_PER_DAY_M5 (288) si:
#       * falta expected_bars_per_day_symbol,
#       * o < 10 (valor raro).
#   - missing_candles_pct y days_with_large_gaps se calculan contra expected_bars_symbol,
#     no contra 288 fijo.
# =================================================================================================================

from __future__ import annotations
import os, json
from pathlib import Path
from datetime import datetime, timezone

try:
    import polars as pl
except Exception as e:
    raise RuntimeError("Se requiere 'polars' para la Celda 12A. Inst√°lalo e intenta de nuevo.") from e

# ---------------------------------- Config and paths ----------------------------------
# Label included in every log line emitted by this cell's _log.
CELL_LABEL = "12A-DataQuality"
# Reuse the RUN_ID already present in the notebook globals; otherwise derive one from the current UTC time.
RUN_ID = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
TIMEZONE_IANA = globals().get("TIMEZONE_IANA", "America/Guayaquil")

# To avoid time-zone pitfalls, a fixed UTC->local offset is used (Ecuador = UTC-5, no DST).
LOCAL_OFFSET_HOURS = int(globals().get("TIMEZONE_UTC_OFFSET_HOURS", -5))

# NOTE(review): the default argument is evaluated eagerly, so `path_contract` must
# exist even when DATA_ROOT is already defined in globals — confirm it always is.
DATA_ROOT = Path(globals().get("DATA_ROOT", str(path_contract.data_root()))).resolve()
WIN_DIR   = DATA_ROOT / "processed_data" / "m5_windows"
META_DIR  = DATA_ROOT / "metadata"
META_DIR.mkdir(parents=True, exist_ok=True)

DQ_PATH          = META_DIR / "data_quality_summary.parquet"
QA_SUMMARY_PATH  = META_DIR / "qa_operativa_summary.parquet"   # <- output of Cell 10

# Base window used to evaluate quality (last_180d by default).
WINDOW_BASE_NAME = globals().get("DATA_QUALITY_WINDOW_BASE", "last_180d")
WINDOW_BASE_DIR  = WIN_DIR / f"window={WINDOW_BASE_NAME}"

# Quality parameters
#   - EXPECTED_BARS_PER_DAY_M5 acts as the global 24h fallback (e.g. FX).
#   - BIG_GAP_THRESHOLD: minimum fraction of bars a day needs to NOT count as a "large gap".
EXPECTED_BARS_PER_DAY = int(globals().get("EXPECTED_BARS_PER_DAY_M5", 288))
BIG_GAP_THRESHOLD     = float(globals().get("BIG_GAP_THRESHOLD_FRACTION", 0.7))  # large gap if candles < expected_bars_symbol * 0.7

# ---------------------------------- Utils ----------------------------------
def _now_local_iana() -> str:
    """Return the current time in TIMEZONE_IANA as ISO 8601 (seconds precision).

    If the zone cannot be resolved (or anything else fails), a fixed hint
    string is returned instead of raising.
    """
    try:
        from zoneinfo import ZoneInfo
        instant_utc = datetime.now(timezone.utc)
        target_zone = ZoneInfo(TIMEZONE_IANA)
        return instant_utc.astimezone(target_zone).isoformat(timespec="seconds")
    except Exception:
        return "(instala 'tzdata' para TZ locales)"

def _log(level: str, msg: str):
    """Print one log line: local (TIMEZONE_IANA) timestamp, run id, level, cell label, message."""
    fields = (_now_local_iana(), RUN_ID, level, CELL_LABEL)
    prefix = " ".join(f"[{field}]" for field in fields)
    print(f"{prefix} {msg}", flush=True)

def _write_empty_summary(reason: str | None = None):
    """
    Write a zero-row parquet with the expected summary schema to DQ_PATH, so the
    pipeline is not broken, and print where it was written (plus `reason`, if given).
    """
    columns = [
        ("symbol", pl.String),
        ("first_date", pl.Date),
        ("last_date", pl.Date),
        ("days_with_data", pl.Int32),
        ("days_expected", pl.Int32),
        ("coverage_days", pl.Float64),
        ("total_candles", pl.Int64),
        ("days_with_large_gaps", pl.Int32),
        ("missing_candles_pct", pl.Float64),
        ("bad_days_ratio", pl.Float64),
        ("data_quality_score", pl.Float64),
        ("data_quality_flag", pl.String),
        ("comentario", pl.String),
    ]
    placeholder = pl.DataFrame(schema=dict(columns))
    placeholder.write_parquet(DQ_PATH, compression="zstd", statistics=True)
    print(f"Archivo vac√≠o escrito en: {DQ_PATH}")
    if reason:
        print(f"Motivo: {reason}")
    print("‚ö†Ô∏è Celda 12A finalizada con error en lectura/agrupaci√≥n, pero sin romper pipeline.")

def _ensure_ts_utc_datetime(df: pl.DataFrame) -> pl.DataFrame:
    """
    Normalize 'timestamp_utc' to a time-zone-naive Datetime("us") and expose it
    as a new column 'ts_utc'.

    Supported input dtypes:
      - Datetime, with or without time zone.
      - Integer/float epoch values. The unit (s / ms / us / ns) is inferred from
        the largest absolute value, because forcing microseconds would
        collapse a millisecond-epoch series into early 1970.
      - Utf8 (ISO or other formats the parser can infer); unparseable values
        become null.
      - Anything else: a direct cast to Datetime("us") is attempted.

    Raises:
        ValueError: if the frame has no 'timestamp_utc' column.
    """
    if "timestamp_utc" not in df.columns:
        raise ValueError("El parquet no contiene columna 'timestamp_utc'.")

    dt = df.schema["timestamp_utc"]

    # Case 1: already a Datetime -> drop any time zone and force microseconds.
    if isinstance(dt, pl.datatypes.Datetime):
        return df.with_columns(
            pl.col("timestamp_utc")
            .dt.replace_time_zone(None)
            .cast(pl.Datetime("us"))
            .alias("ts_utc")
        )

    # Case 2: numeric -> epoch with an inferred unit.
    if dt in (pl.Int64, pl.Int32, pl.UInt64, pl.UInt32, pl.Float64, pl.Float32):
        epoch = pl.col("timestamp_utc")
        if dt in (pl.Float64, pl.Float32):
            epoch = epoch.fill_nan(None)  # NaN cannot be represented as an integer
        epoch = epoch.cast(pl.Int64, strict=False)

        # Present-day epochs are ~1e9 s, ~1e12 ms, ~1e15 us and ~1e18 ns, so
        # thresholds two decades above each value separate the units.
        # Limitation: millisecond values before 1973 (< 1e11) are read as seconds.
        peak = df.select(epoch.abs().max()).item()
        if peak is None:
            unit = "ms"  # no data: the unit is irrelevant; use the pipeline's convention
        elif peak < 1e11:
            unit = "s"
        elif peak < 1e14:
            unit = "ms"
        elif peak < 1e17:
            unit = "us"
        else:
            unit = "ns"

        return df.with_columns(
            pl.from_epoch(epoch, time_unit=unit)
            .cast(pl.Datetime("us"))
            .alias("ts_utc")
        )

    # Case 3: text -> lenient parse. The time unit goes in the dtype, since
    # str.strptime is not given a separate time_unit argument here. Any parsed
    # offset is dropped to keep the result naive.
    if dt == pl.Utf8:
        return df.with_columns(
            pl.col("timestamp_utc")
            .str.strptime(pl.Datetime("us"), strict=False)
            .dt.replace_time_zone(None)
            .alias("ts_utc")
        )

    # Fallback: try a direct cast.
    return df.with_columns(
        pl.col("timestamp_utc").cast(pl.Datetime("us")).alias("ts_utc")
    )

def _read_qa_operativa_summary(path: Path) -> pl.DataFrame | None:
    """
    Read qa_operativa_summary.parquet (Celda 10) as a per-symbol session profile.

    Args:
        path: location of the QA summary parquet.

    Returns:
        A frame with exactly the columns
            symbol, expected_bars_per_day_symbol, session_type
        with 'symbol' stripped and upper-cased and at most one row per symbol,
        or None when the file is missing, unreadable, empty, or lacks the
        minimum columns. On None the caller falls back to the global
        EXPECTED_BARS_PER_DAY.
    """
    if not path.exists():
        _log("WARNING", f"No se encontr√≥ {path}; se usar√° EXPECTED_BARS_PER_DAY_M5={EXPECTED_BARS_PER_DAY} para todos los s√≠mbolos.")
        return None
    try:
        df = pl.read_parquet(path)
    except Exception as e:
        _log("WARNING", f"No se pudo leer {path}: {e}; se usar√° EXPECTED_BARS_PER_DAY_M5={EXPECTED_BARS_PER_DAY}.")
        return None

    if df.is_empty():
        _log("WARNING", f"{path} est√° vac√≠o; se usar√° EXPECTED_BARS_PER_DAY_M5={EXPECTED_BARS_PER_DAY}.")
        return None

    cols = df.columns

    # 'symbol' is the join key; without it the profile is unusable, so honour the
    # documented contract (return None) instead of failing later inside select().
    if "symbol" not in cols:
        _log("WARNING", "qa_operativa_summary no contiene columna 'symbol'; se usa EXPECTED_BARS_PER_DAY_M5 global.")
        return None

    # Ensure expected_bars_per_day_symbol: when absent, derive it from n_bars_p50 (Celda 10 v4)
    if "expected_bars_per_day_symbol" not in cols:
        if "n_bars_p50" in cols:
            df = df.with_columns(
                pl.col("n_bars_p50").alias("expected_bars_per_day_symbol")
            )
            _log("INFO", "Columna 'expected_bars_per_day_symbol' no encontrada en qa_operativa_summary; se deriv√≥ desde 'n_bars_p50'.")
        else:
            _log("WARNING", "qa_operativa_summary no contiene 'expected_bars_per_day_symbol' ni 'n_bars_p50'; se usar√° EXPECTED_BARS_PER_DAY_M5 global.")
            return None

    # Ensure session_type (kept for later use even though this cell does not persist it)
    if "session_type" not in cols:
        df = df.with_columns(pl.lit("UNKNOWN_SESSION").alias("session_type"))

    # The caller joins on upper-cased symbols, so normalise the key the same way;
    # one row per symbol keeps the left join from multiplying per-day rows.
    return (
        df.select(["symbol", "expected_bars_per_day_symbol", "session_type"])
          .with_columns(
              pl.col("symbol")
                .cast(pl.Utf8, strict=False)
                .str.strip_chars()
                .str.to_uppercase()
                .alias("symbol")
          )
          .filter(pl.col("symbol").is_not_null())
          .unique(subset=["symbol"], keep="first", maintain_order=True)
    )

# ---------------------------------- Header ----------------------------------
# Banner for Celda 12A: target time zone, current local/UTC time and the configured paths.
print("="*110)
print(f"Inicio Celda 12A ‚Äî Data Quality Summary por s√≠mbolo | TZ local: {TIMEZONE_IANA}")
print(f"Hora local: {_now_local_iana()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
print("-"*110)
print(f"DATA_ROOT: {DATA_ROOT}")
print(f"Ventanas M5 (origen calidad): {WIN_DIR}")
print(f"Metadata: {META_DIR}")
print("-"*110)
print(f"Ventana base seleccionada: {WINDOW_BASE_NAME} ‚Üí {WINDOW_BASE_DIR}")

# ---------------------------------- Main logic ----------------------------------
# Scans the base-window parquet files, counts candles per symbol and local trading day,
# derives coverage / completeness metrics plus a 0-100 score per symbol, and writes the
# result to DQ_PATH. Fatal conditions (missing folder, no files, unexpected exception)
# write an empty summary through _write_empty_summary instead of raising; unreadable
# individual files are logged and skipped.
if not WINDOW_BASE_DIR.exists():
    _log("ERROR", f"No existe la carpeta de ventanas base: {WINDOW_BASE_DIR}")
    _write_empty_summary(f"Carpeta de ventana base inexistente: {WINDOW_BASE_DIR}")
else:
    # Collect every part=*.parquet file under every symbol=* folder
    pattern = "symbol=*/part=*.parquet"
    all_files = sorted(WINDOW_BASE_DIR.glob(pattern))

    if not all_files:
        _log("WARNING", f"No se encontraron archivos con patr√≥n {pattern} en {WINDOW_BASE_DIR}")
        _write_empty_summary("Ventana base sin archivos de datos.")
    else:
        try:
            _log("INFO", f"N√∫mero de archivos encontrados para calidad M5 = {len(all_files)}")
            day_frames: list[pl.DataFrame] = []
            files_ok = 0
            files_skipped = 0

            for fp in all_files:
                # Symbol taken from the folder name; used below when the parquet 'symbol' value is null
                sym_from_dir = fp.parent.name
                if sym_from_dir.startswith("symbol="):
                    sym_from_dir = sym_from_dir.split("=", 1)[1]
                sym_from_dir = sym_from_dir.strip().upper()

                try:
                    # Read ONLY the columns this cell needs (symbol, timestamp_utc)
                    df = pl.read_parquet(str(fp), columns=["symbol", "timestamp_utc"])
                except Exception as e:
                    files_skipped += 1
                    _log("WARN", f"Error leyendo {fp}: {e.__class__.__name__}: {e} ‚Äî archivo omitido.")
                    continue

                if df.is_empty():
                    files_skipped += 1
                    continue

                # Normalize timestamp_utc to a naive Datetime(us) column named ts_utc
                try:
                    df = _ensure_ts_utc_datetime(df)
                except Exception as e:
                    files_skipped += 1
                    _log("WARN", f"Error normalizando timestamp_utc en {fp}: {e.__class__.__name__}: {e} ‚Äî archivo omitido.")
                    continue

                # Normalize symbol and derive trading_day (ts_utc shifted by the fixed local offset)
                df = df.with_columns([
                    pl.when(pl.col("symbol").is_null())
                      .then(pl.lit(sym_from_dir))
                      .otherwise(pl.col("symbol"))
                      .cast(pl.Utf8)
                      .str.to_uppercase()
                      .alias("symbol"),
                    (pl.col("ts_utc") + pl.duration(hours=LOCAL_OFFSET_HOURS))
                        .dt.date()
                        .alias("trading_day"),
                ])

                # Aggregate by symbol and day within this file
                df_day_file = (
                    df.group_by(["symbol", "trading_day"])
                      .agg([
                          pl.len().alias("candles"),  # pl.len() instead of the deprecated pl.count()
                      ])
                )

                if not df_day_file.is_empty():
                    day_frames.append(df_day_file)
                    files_ok += 1

            _log("INFO", f"Archivos procesados OK = {files_ok}, archivos omitidos = {files_skipped}")

            if not day_frames:
                _log("WARNING", "Despu√©s de procesar archivos no hay datos agregados por d√≠a; escribiendo resumen vac√≠o.")
                _write_empty_summary("Sin m√©tricas por s√≠mbolo (no se pudo agregar por d√≠a).")
            else:
                # Concatenate the per-file aggregates and collapse them to ONE row per
                # (symbol, trading_day). The local-offset trading day need not line up with
                # file boundaries, so the same day can arrive as several partial rows; left
                # as-is, each partial row would be judged on its own in the per-day metrics
                # below (days_with_large_gaps, avg_candles_per_day).
                df_day = (
                    pl.concat(day_frames, how="vertical_relaxed")
                      .group_by(["symbol", "trading_day"])
                      .agg(pl.col("candles").cast(pl.Int64).sum().alias("candles"))
                )

                # ---------------- Enrich with the session profile from Celda 10 ----------------
                qa_sym = _read_qa_operativa_summary(QA_SUMMARY_PATH)

                if qa_sym is None:
                    # Fallback: every symbol uses EXPECTED_BARS_PER_DAY as its expected candles per day
                    _log(
                        "INFO",
                        f"Usando EXPECTED_BARS_PER_DAY_M5={EXPECTED_BARS_PER_DAY} como expected_bars_symbol para todos los s√≠mbolos (sin perfil de sesi√≥n)."
                    )
                    df_day = df_day.with_columns(
                        pl.lit(EXPECTED_BARS_PER_DAY).alias("expected_bars_symbol")
                    )
                else:
                    _log(
                        "INFO",
                        "Usando 'expected_bars_per_day_symbol' de qa_operativa_summary como perfil de sesi√≥n por s√≠mbolo."
                    )
                    # Symbols without a profile, or with an implausibly low one (< 10),
                    # fall back to the global EXPECTED_BARS_PER_DAY.
                    df_day = (
                        df_day.join(qa_sym, on="symbol", how="left")
                              .with_columns([
                                  pl.when(
                                      pl.col("expected_bars_per_day_symbol").is_null()
                                      | (pl.col("expected_bars_per_day_symbol") < 10)
                                  )
                                  .then(pl.lit(EXPECTED_BARS_PER_DAY))
                                  .otherwise(pl.col("expected_bars_per_day_symbol"))
                                  .alias("expected_bars_symbol"),
                              ])
                    )
                # -------------------------------------------------------------------------

                # Aggregate per symbol for the final metrics (using each symbol's expected_bars_symbol)
                _log("INFO", "Resumiendo m√©tricas de calidad por s√≠mbolo (ajustadas a expected_bars_symbol)...")

                df_sym = (
                    df_day.group_by("symbol")
                    .agg([
                        pl.col("trading_day").min().alias("first_day"),
                        pl.col("trading_day").max().alias("last_day"),
                        pl.col("trading_day").n_unique().alias("days_with_data"),
                        pl.col("candles").sum().alias("total_candles"),
                        pl.col("candles").mean().alias("avg_candles_per_day"),
                        pl.col("expected_bars_symbol").first().alias("expected_bars_symbol"),
                        # A day has a "large gap" when its candle count is below
                        # expected_bars_symbol * BIG_GAP_THRESHOLD
                        (pl.col("candles") < pl.col("expected_bars_symbol") * BIG_GAP_THRESHOLD)
                            .sum()
                            .alias("days_with_large_gaps"),
                    ])
                )

                if df_sym.height == 0:
                    _log("WARNING", "Despu√©s de agrupar por s√≠mbolo no quedaron filas; escribiendo resumen vac√≠o.")
                    _write_empty_summary("Sin m√©tricas por s√≠mbolo.")
                else:
                    # Step 4: derived metrics + score
                    # days_expected = (last_day - first_day) + 1, via the integer value of each Date
                    df_sym = df_sym.with_columns([
                        (
                            (
                                pl.col("last_day").cast(pl.Date).cast(pl.Int64)
                                - pl.col("first_day").cast(pl.Date).cast(pl.Int64)
                            ) + 1
                        )
                        .clip(1, None)   # min=1, max=None
                        .cast(pl.Int32)
                        .alias("days_expected"),
                    ])

                    df_sym = df_sym.with_columns([
                        (pl.col("days_with_data") / pl.col("days_expected")).alias("coverage_days"),

                        # missing_candles_pct is measured against expected_bars_symbol (session
                        # profile); the fallback is already resolved inside expected_bars_symbol.
                        pl.when(pl.col("days_with_data") > 0)
                          .then(
                              1.0 - (
                                  pl.col("total_candles")
                                  / (pl.col("days_with_data") * pl.col("expected_bars_symbol"))
                              )
                          )
                          .otherwise(1.0)
                          .clip(0.0, 1.0)   # min=0.0, max=1.0
                          .alias("missing_candles_pct"),

                        # bad_days_ratio: days with a large gap over days with data
                        pl.when(pl.col("days_with_data") > 0)
                          .then(pl.col("days_with_large_gaps") / pl.col("days_with_data"))
                          .otherwise(1.0)
                          .clip(0.0, 1.0)   # min=0.0, max=1.0
                          .alias("bad_days_ratio"),
                    ])

                    # Weighted score: 60% day coverage, 30% candle completeness, 10% share of good days
                    df_sym = df_sym.with_columns([
                        (
                            (
                                0.6 * pl.col("coverage_days")
                                + 0.3 * (1.0 - pl.col("missing_candles_pct"))
                                + 0.1 * (1.0 - pl.col("bad_days_ratio"))
                            ) * 100.0
                        )
                        .clip(0.0, 100.0)   # min=0.0, max=100.0
                        .round(2)
                        .alias("data_quality_score")
                    ])

                    # First create data_quality_flag from the score alone
                    df_sym = df_sym.with_columns([
                        pl.when(pl.col("data_quality_score") >= 90.0)
                          .then(pl.lit("OK"))
                          .when(pl.col("data_quality_score") >= 70.0)
                          .then(pl.lit("WARNING"))
                          .otherwise(pl.lit("BAD"))
                          .alias("data_quality_flag")
                    ])

                    # Then, in a second with_columns, build the comment (data_quality_flag now exists)
                    df_sym = df_sym.with_columns([
                        pl.when(pl.col("data_quality_flag") == pl.lit("OK"))
                          .then(pl.lit("Cobertura y completitud altas; apto para trading."))
                          .when(pl.col("data_quality_flag") == pl.lit("WARNING"))
                          .then(pl.lit("Cobertura/velas con algunas carencias; revisar antes de usar con size alto."))
                          .otherwise(pl.lit("Datos con huecos importantes; usar s√≥lo para exploraci√≥n."))
                          .alias("comentario")
                    ])

                    # Cast types so they match the schema of the "empty" summary
                    df_sym = df_sym.with_columns([
                        pl.col("days_with_data").cast(pl.Int32),
                        pl.col("days_expected").cast(pl.Int32),
                        pl.col("days_with_large_gaps").cast(pl.Int32),
                        pl.col("total_candles").cast(pl.Int64),
                    ])

                    # Output: expected_bars_symbol and avg_candles_per_day are intentionally not
                    # selected, keeping the same output schema as the previous version.
                    df_out = (
                        df_sym
                        .select([
                            pl.col("symbol"),
                            pl.col("first_day").alias("first_date"),
                            pl.col("last_day").alias("last_date"),
                            pl.col("days_with_data"),
                            pl.col("days_expected"),
                            pl.col("coverage_days"),
                            pl.col("total_candles"),
                            pl.col("days_with_large_gaps"),
                            pl.col("missing_candles_pct"),
                            pl.col("bad_days_ratio"),
                            pl.col("data_quality_score"),
                            pl.col("data_quality_flag"),
                            pl.col("comentario"),
                        ])
                        .sort("symbol")
                    )

                    df_out.write_parquet(DQ_PATH, compression="zstd", statistics=True)
                    print(f"Archivo de resumen de calidad escrito en: {DQ_PATH}")
                    print(f"S√≠mbolos evaluados: {df_out.height}")
                    print("Preview (primeros 10 s√≠mbolos):")
                    print(df_out.head(10))
                    print("‚úÖ Celda 12A ‚Äî Data Quality Summary (ajustada por tipo de sesi√≥n) completada correctamente.")

        except Exception as e:
            # Best-effort cell: log the failure and still leave an (empty) summary file behind
            _log("ERROR", f"Error al leer/agrupar ventanas M5: {e.__class__.__name__}: {e}")
            _write_empty_summary(str(e))

# =================================================================================================================


Inicio Celda 12A ‚Äî Data Quality Summary por s√≠mbolo | TZ local: America/Guayaquil
Hora local: 2025-12-02T23:39:57-05:00 | Hora UTC: 2025-12-03T04:39:57+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT: C:\Quant\MT5_Data_Extraction\data
Ventanas M5 (origen calidad): C:\Quant\MT5_Data_Extraction\data\processed_data\m5_windows
Metadata: C:\Quant\MT5_Data_Extraction\data\metadata
--------------------------------------------------------------------------------------------------------------
Ventana base seleccionada: last_180d ‚Üí C:\Quant\MT5_Data_Extraction\data\processed_data\m5_windows\window=last_180d
[2025-12-02T23:39:58-05:00] [20251202_232253] [INFO] [12A-DataQuality] N√∫mero de archivos encontrados para calidad M5 = 16073
[2025-12-02T23:40:43-05:00] [20251202_232253] [INFO] [12A-DataQuality] Archivos procesados OK = 16073, archivos omitidos = 0
[2025-12-02T23:40:43-05:00] [20251202_232253] [INFO] [12A-D

In [15]:
# ======================= Celda 12B — Universe snapshot M5 (QA + DQ + 3B + QA económica) =======================
# Propósito:
#   - Combinar:
#       * metadata/qa_operativa_summary.parquet  (Celda 10, QA operativa + perfil de sesión + qa_struct_flag + qa_price_flag)
#       * metadata/data_quality_summary.parquet (Celda 12A, calidad de ventanas last_180d)
#       * filters/eligible_symbols_by_cost.*    (lista 3B de símbolos elegibles por coste)
#   - Construir un “universe snapshot” por símbolo, con:
#       * Símbolos auditados en QA (Celda 10).
#       * Campos QA (qa_operativa_flag, qa_price_flag, qa_struct_flag, session_type, expected_bars_per_day_symbol, etc.).
#       * Campos de calidad de datos (data_quality_score, data_quality_flag, coverage_days, etc.).
#       * Campo de selección final:
#             universe_flag ∈ {"IN", "OUT"}
#         basado en QA + QA económica + DQ + estructura + 3B.
#       * Copia explícita de qa_struct_flag como qa_struct_flag_qa para la Celda 12C.
#   - Escribir:
#       * metadata/universe_snapshot_{RUN_ID}.parquet
#
# Notas de diseño (v2 parchada):
#   - No rompe el pipeline si falta data_quality_summary: en ese caso se construye el universo sólo con QA,
#     marcando data_quality_flag="MISSING" y data_quality_score=0.0 (y universe_flag se puede relajar vía parámetros).
#   - Usa los mismos filtros 3B que Celda 10 y Celda 12 (eligible_symbols_by_cost.{parquet,txt}).
#   - Es totalmente compatible con Celda 12C:
#       * La columna 'symbol' siempre está presente.
#       * Incluye 'qa_struct_flag' y 'qa_struct_flag_qa'.
#       * universe_flag hoy no se usa en 12C, pero queda listo para el siguiente notebook.
#
# Nota sobre sesgo de supervivencia:
#   - Este universe_snapshot se construye "as of RUN_ID" usando:
#       * QA operativa (Celda 10, sobre todo el histórico disponible).
#       * Data Quality last_180d (Celda 12A).
#       * Filtro de costes 3B actual (Celda 05).
#   - Si usas este universe_snapshot para backtests en 2018–2021, estás testeando
#     sobre el universo que HOY sobrevive (sesgo de supervivencia aceptado por diseño).
# =================================================================================================

from __future__ import annotations
from pathlib import Path
from datetime import datetime, timezone
from typing import Tuple, Optional, Set

import polars as pl

# ---------------------------------- Identity / paths ----------------------------------
CELL_LABEL = "12B-Universe-Snapshot"
# Reuse RUN_ID when an earlier cell defined it; otherwise derive one from the current UTC time
RUN_ID     = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))

# DATA_ROOT comes from Celda 02 (same as in cells 10, 12, 12A, 12C)
if "DATA_ROOT" not in globals():
    raise RuntimeError("DATA_ROOT no est√° definido. Ejecuta la Celda 02 antes de la Celda 12B.")

DATA_ROOT = Path(globals()["DATA_ROOT"]).resolve()

META_DIR     = DATA_ROOT / "metadata"
FILTERS_DIR  = META_DIR / "filters"

QA_SUMMARY_PATH = META_DIR / "qa_operativa_summary.parquet"   # output of Celda 10
DQ_PATH         = META_DIR / "data_quality_summary.parquet"   # output of Celda 12A

# 3B list of cost-eligible symbols, in either format
ELIGIBLE_PARQUET = FILTERS_DIR / "eligible_symbols_by_cost.parquet"
ELIGIBLE_TXT     = FILTERS_DIR / "eligible_symbols_by_cost.txt"

# Outputs: one snapshot per run plus a fixed-name "latest" copy
UNIVERSE_PATH        = META_DIR / f"universe_snapshot_{RUN_ID}.parquet"
UNIVERSE_LATEST_PATH = META_DIR / "universe_snapshot_latest.parquet"

TIMEZONE_IANA = globals().get("TIMEZONE_IANA", "America/Guayaquil")

# Universe selection parameters (can be overridden from Celda 02).
# Each one is read from globals() and falls back to the default shown here.
# Operational-QA flags accepted into the universe
UNIVERSE_KEEP_QA_FLAGS      = list(globals().get("UNIVERSE_KEEP_QA_FLAGS", ["OK", "WARN"]))

# Data Quality flags accepted into the universe
UNIVERSE_KEEP_DQ_FLAGS      = list(globals().get("UNIVERSE_KEEP_DQ_FLAGS", ["OK", "WARNING"]))

# Exclude symbols with serious structural problems
UNIVERSE_EXCLUDE_BAD_STRUCT = bool(globals().get("UNIVERSE_EXCLUDE_BAD_STRUCT", True))

# Minimum Data Quality score required to be in the universe
UNIVERSE_MIN_DQ_SCORE       = float(globals().get("UNIVERSE_MIN_DQ_SCORE", 60.0))

# <<< NEW QA ECON >>> Economic-QA (price) flags accepted into the universe
UNIVERSE_KEEP_PRICE_QA_FLAGS = list(globals().get("UNIVERSE_KEEP_PRICE_QA_FLAGS", ["OK", "WARN"]))

# Compression codec passed to write_parquet for the universe files
PARQUET_COMP_UNIVERSE = globals().get("PARQUET_COMP_UNIVERSE", "zstd")

# ---------------------------------- Utils / logging ----------------------------------
def _now_local_iana() -> str:
    """Return the current time as an ISO-8601 string (second precision) in TIMEZONE_IANA.

    If the zone cannot be resolved for any reason, the current UTC time is
    returned in the same format instead.
    """
    try:
        from zoneinfo import ZoneInfo

        target_zone = ZoneInfo(TIMEZONE_IANA)
        utc_now = datetime.now(timezone.utc)
        stamp = utc_now.astimezone(target_zone)
    except Exception:
        stamp = datetime.now(timezone.utc)
    return stamp.isoformat(timespec="seconds")

def _log(level: str, msg: str):
    """Print one log line tagged with local timestamp, RUN_ID, level and CELL_LABEL.

    The line is flushed immediately so it shows up while the cell is still running.
    """
    tags = (_now_local_iana(), RUN_ID, level, CELL_LABEL)
    header = " ".join(f"[{tag}]" for tag in tags)
    print(f"{header} {msg}", flush=True)

def _read_eligible_set() -> Tuple[Optional[Set[str]], Optional[str]]:
    """
    Load the 3B list of cost-eligible symbols.

    Sources are tried in order:
      1. ELIGIBLE_PARQUET: column 'symbol' if present, otherwise the first column.
      2. ELIGIBLE_TXT: one symbol per line; blank lines are ignored.
    Symbols are stripped and upper-cased in both cases. A read error on one
    source is logged and the next source is tried.

    Returns:
        (symbols, source) with source "parquet" or "txt", or (None, None) when
        no list could be read.
    """
    # 1) Parquet
    try:
        if ELIGIBLE_PARQUET.exists():
            df = pl.read_parquet(ELIGIBLE_PARQUET)
            col = "symbol" if "symbol" in df.columns else df.columns[0]
            syms = (
                df.get_column(col)
                  .cast(pl.Utf8, strict=False)
                  .str.strip_chars()
                  .str.to_uppercase()
                  .drop_nulls()
                  .to_list()
            )
            return {s for s in syms if s}, "parquet"
    except Exception as e:
        _log("WARNING", f"No se pudo leer {ELIGIBLE_PARQUET}: {e}. Probando TXT...")

    # 2) TXT
    try:
        if ELIGIBLE_TXT.exists():
            # utf-8-sig drops a leading BOM (common in files saved by Windows editors);
            # with plain utf-8 the BOM stays attached to the first symbol, which then
            # never matches. Files without a BOM decode exactly as before.
            txt = ELIGIBLE_TXT.read_text(encoding="utf-8-sig", errors="ignore")
            stripped = (ln.strip() for ln in txt.splitlines())
            return {v.upper() for v in stripped if v}, "txt"
    except Exception as e:
        _log("WARNING", f"No se pudo leer {ELIGIBLE_TXT}: {e}.")

    return None, None

# ---------------------------------- Header ----------------------------------
# Banner for Celda 12B: time zone, current local/UTC time, input/output paths and the
# effective universe-selection parameters.
print("=" * 110)
print(f"Inicio Celda 12B ‚Äî Universe snapshot M5 | TZ local: {TIMEZONE_IANA}")
print(f"Hora local: {_now_local_iana()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
print("-" * 110)
print(f"DATA_ROOT              : {DATA_ROOT}")
print(f"QA summary (Celda 10)  : {QA_SUMMARY_PATH}")
print(f"DQ summary (Celda 12A) : {DQ_PATH}")
print(f"3B filters (eligible)  : {FILTERS_DIR}")
print(f"Universe OUT path      : {UNIVERSE_PATH}")
print("-" * 110)
print("Par√°metros de selecci√≥n de universo:")
print(f"  UNIVERSE_KEEP_QA_FLAGS        = {UNIVERSE_KEEP_QA_FLAGS}")
print(f"  UNIVERSE_KEEP_DQ_FLAGS        = {UNIVERSE_KEEP_DQ_FLAGS}")
print(f"  UNIVERSE_EXCLUDE_BAD_STRUCT   = {UNIVERSE_EXCLUDE_BAD_STRUCT}")
print(f"  UNIVERSE_MIN_DQ_SCORE         = {UNIVERSE_MIN_DQ_SCORE}")
print(f"  UNIVERSE_KEEP_PRICE_QA_FLAGS  = {UNIVERSE_KEEP_PRICE_QA_FLAGS}")  # <<< NEW QA ECON >>>
print(f"  PARQUET_COMP_UNIVERSE         = '{PARQUET_COMP_UNIVERSE}'")
print("-" * 110)

# Unlike a pure read step, this cell creates the metadata and filters folders if they are missing
META_DIR.mkdir(parents=True, exist_ok=True)
FILTERS_DIR.mkdir(parents=True, exist_ok=True)

# ---------------------------------- Basic guards ----------------------------------
# The QA summary is mandatory: without it the cell aborts with FileNotFoundError.
if not QA_SUMMARY_PATH.exists():
    msg = f"No se encontr√≥ qa_operativa_summary.parquet en: {QA_SUMMARY_PATH}. Ejecuta la Celda 10 antes de 12B."
    _log("ERROR", msg)
    raise FileNotFoundError(msg)

# DQ is desirable but not strictly required
dq_available = DQ_PATH.exists()

# ---------------------------------- Read QA and normalize ----------------------------------
qa = pl.read_parquet(QA_SUMMARY_PATH)

if qa.is_empty():
    msg = "qa_operativa_summary.parquet est√° vac√≠o; no hay s√≠mbolos auditados en QA. Universe snapshot no tiene sentido."
    _log("ERROR", msg)
    raise RuntimeError(msg)

# Normalize the symbol in QA (string, stripped, upper-case) so it matches DQ and the 3B list
qa = qa.with_columns(
    pl.col("symbol")
      .cast(pl.Utf8, strict=False)
      .str.strip_chars()
      .str.to_uppercase()
      .alias("symbol")
)

n_qa_syms = qa.get_column("symbol").n_unique()
_log("INFO", f"S√≠mbolos en QA operativa (Celda 10): {n_qa_syms}")

# Ensure the key QA columns exist (for forward compatibility).
# qa_operativa_flag is the only one that is mandatory; the rest get a default.
if "qa_operativa_flag" not in qa.columns:
    raise RuntimeError("qa_operativa_summary.parquet no contiene columna 'qa_operativa_flag' (Celda 10).")

if "qa_struct_flag" not in qa.columns:
    # Worst case: create a neutral flag so Celda 12C does not break
    _log("WARNING", "qa_operativa_summary no contiene 'qa_struct_flag'; se crea 'qa_struct_flag'='WARN_STRUCT' para todos.")
    qa = qa.with_columns(
        pl.lit("WARN_STRUCT").alias("qa_struct_flag")
    )

if "session_type" not in qa.columns:
    _log("WARNING", "qa_operativa_summary no contiene 'session_type'; se crea 'session_type'='UNKNOWN_SESSION'.")
    qa = qa.with_columns(
        pl.lit("UNKNOWN_SESSION").alias("session_type")
    )

if "expected_bars_per_day_symbol" not in qa.columns:
    _log("WARNING", "qa_operativa_summary no contiene 'expected_bars_per_day_symbol'; se crea con n_bars_p50 o fallback a 288.")
    if "n_bars_p50" in qa.columns:
        qa = qa.with_columns(
            pl.col("n_bars_p50").alias("expected_bars_per_day_symbol")
        )
    else:
        # Global default read from EXPECTED_BARS_M5 (288 when that global is not defined)
        expected_default = int(globals().get("EXPECTED_BARS_M5", 288))
        qa = qa.with_columns(
            pl.lit(expected_default).alias("expected_bars_per_day_symbol")
        )

# <<< NEW QA ECON >>> ensure the qa_price_flag column exists (backward compatibility)
if "qa_price_flag" not in qa.columns:
    _log("WARNING", "qa_operativa_summary no contiene 'qa_price_flag'; se crea 'qa_price_flag'='OK' para todos.")
    qa = qa.with_columns(
        pl.lit("OK").alias("qa_price_flag")
    )

# ---------------------------------- Read DQ (Data Quality) ----------------------------------
# `dq` stays None when the file is missing, empty or unreadable; the join step below
# then fills the Data Quality columns with placeholder values.
dq = None
if dq_available:
    try:
        dq = pl.read_parquet(DQ_PATH)
        if dq.is_empty():
            _log("WARNING", f"{DQ_PATH} est√° vac√≠o; se usar√° data_quality_flag='MISSING' y score=0.0 en todo el universo.")
            dq = None
        else:
            # Same symbol normalization as applied to QA, so the join keys match
            dq = dq.with_columns(
                pl.col("symbol")
                  .cast(pl.Utf8, strict=False)
                  .str.strip_chars()
                  .str.to_uppercase()
                  .alias("symbol")
            )
            n_dq_syms = dq.get_column("symbol").n_unique()
            _log("INFO", f"S√≠mbolos en Data Quality (Celda 12A): {n_dq_syms}")
    except Exception as e:
        _log("WARNING", f"No se pudo leer {DQ_PATH}: {e}; contin√∫o sin DQ (fallback).")
        dq = None
else:
    _log("WARNING", f"No se encontr√≥ {DQ_PATH}; contin√∫o sin DQ (fallback).")

# ---------------------------------- 3B filter (eligible_symbols_by_cost) ----------------------------------
# When a 3B list is available, QA (and DQ, if present) are restricted to the symbols that
# appear both in QA and in the list; without a list every QA symbol is kept.
elig_set, elig_src = _read_eligible_set()
if elig_set is not None:
    base_syms = set(qa.get_column("symbol").to_list())
    audited_syms = sorted(base_syms & elig_set)
    omitted_syms = sorted(base_syms - elig_set)
    _log(
        "INFO",
        f"Filtro 3B activo ({elig_src}): elegibles={len(elig_set)} | QA={len(base_syms)} ‚Üí "
        f"auditados={len(audited_syms)} | omitidos={len(omitted_syms)}"
    )
    qa = qa.filter(pl.col("symbol").is_in(audited_syms))
    if dq is not None:
        dq = dq.filter(pl.col("symbol").is_in(audited_syms))
else:
    _log("WARNING", "No se encontr√≥ lista elegible 3B; el universo se construir√° con TODOS los s√≠mbolos presentes en QA.")

# Recompute the QA symbol count after applying 3B
n_qa_syms_post = qa.get_column("symbol").n_unique()
_log("INFO", f"S√≠mbolos tras filtro 3B en QA: {n_qa_syms_post}")

# ---------------------------------- Join QA + DQ ----------------------------------
if dq is not None:
    # Left join: keep every QA symbol and attach the DQ metrics where they exist
    uni = qa.join(dq, on="symbol", how="left")
else:
    # No DQ: create placeholder columns so downstream notebooks do not break.
    # The placeholders are the worst-case values (score 0.0, flag "MISSING", ratios 1.0).
    _log("INFO", "Construyendo universe snapshot sin DQ; se crean columnas de Data Quality con valores por defecto.")
    uni = qa.with_columns([
        pl.lit(None).cast(pl.Date).alias("first_date"),
        pl.lit(None).cast(pl.Date).alias("last_date"),
        pl.lit(0).cast(pl.Int32).alias("days_with_data"),
        pl.lit(0).cast(pl.Int32).alias("days_expected"),
        pl.lit(0.0).cast(pl.Float64).alias("coverage_days"),
        pl.lit(0).cast(pl.Int64).alias("total_candles"),
        pl.lit(0).cast(pl.Int32).alias("days_with_large_gaps"),
        pl.lit(1.0).cast(pl.Float64).alias("missing_candles_pct"),
        pl.lit(1.0).cast(pl.Float64).alias("bad_days_ratio"),
        pl.lit(0.0).cast(pl.Float64).alias("data_quality_score"),
        pl.lit("MISSING").cast(pl.Utf8).alias("data_quality_flag"),
        pl.lit("Sin DQ disponible en 12A.").cast(pl.Utf8).alias("comentario"),
    ])

# ---------------------------------- Enrichment and selection flags ----------------------------------
# Explicit copy of qa_struct_flag as qa_struct_flag_qa so Celda 12C can find it unambiguously
if "qa_struct_flag" not in uni.columns:
    uni = uni.with_columns(pl.lit("WARN_STRUCT").alias("qa_struct_flag"))

uni = uni.with_columns(
    pl.col("qa_struct_flag").alias("qa_struct_flag_qa")
)

# Ensure the DQ columns exist, even if they arrived partially null
if "data_quality_flag" not in uni.columns:
    uni = uni.with_columns(pl.lit("MISSING").alias("data_quality_flag"))

if "data_quality_score" not in uni.columns:
    uni = uni.with_columns(pl.lit(0.0).cast(pl.Float64).alias("data_quality_score"))

# Normalize key fields: string flags as Utf8, missing DQ as "MISSING"/0.0, missing price QA as "OK"
uni = uni.with_columns([
    pl.col("qa_operativa_flag").cast(pl.Utf8),
    pl.col("qa_struct_flag").cast(pl.Utf8),
    pl.col("qa_struct_flag_qa").cast(pl.Utf8),
    pl.col("session_type").cast(pl.Utf8),
    pl.col("data_quality_flag").fill_null("MISSING").cast(pl.Utf8),
    pl.col("data_quality_score").fill_null(0.0).cast(pl.Float64),
    pl.col("qa_price_flag").fill_null("OK").cast(pl.Utf8),   # <<< NEW QA ECON >>> normalization
])

# Explicit 3B flag (True for every row when no 3B list was found)
if elig_set is not None:
    uni = uni.with_columns(
        pl.col("symbol").is_in(list(elig_set)).alias("is_eligible_3b")
    )
else:
    uni = uni.with_columns(
        pl.lit(True).alias("is_eligible_3b")
    )

# Intermediate selection flags, one boolean column per criterion
uni = uni.with_columns([
    pl.col("qa_operativa_flag").is_in(UNIVERSE_KEEP_QA_FLAGS).alias("keep_by_qa"),
    pl.col("data_quality_flag").is_in(UNIVERSE_KEEP_DQ_FLAGS).alias("keep_by_dq"),
    # The Python bool UNIVERSE_EXCLUDE_BAD_STRUCT is the predicate: when it is False every
    # row is kept; when True a null qa_struct_flag is treated as "BAD_STRUCT" (excluded).
    pl.when(UNIVERSE_EXCLUDE_BAD_STRUCT)
      .then(pl.col("qa_struct_flag").fill_null("BAD_STRUCT") != "BAD_STRUCT")
      .otherwise(pl.lit(True))
      .alias("keep_by_struct"),
    # <<< NEW QA ECON >>> economic QA per symbol (a missing value was already normalized to 'OK')
    pl.col("qa_price_flag").fill_null("OK").is_in(UNIVERSE_KEEP_PRICE_QA_FLAGS).alias("keep_by_price_qa"),
])

# Final universe_flag is "IN" only when ALL of these hold, otherwise "OUT":
#   - is_eligible_3b is True.
#   - qa_operativa_flag is in UNIVERSE_KEEP_QA_FLAGS (OK/WARN by default).
#   - qa_price_flag is in UNIVERSE_KEEP_PRICE_QA_FLAGS (OK/WARN by default).
#   - data_quality_flag is in UNIVERSE_KEEP_DQ_FLAGS (OK/WARNING by default).
#   - BAD_STRUCT symbols are excluded when UNIVERSE_EXCLUDE_BAD_STRUCT is set.
#   - data_quality_score >= UNIVERSE_MIN_DQ_SCORE (60.0 by default).
uni = uni.with_columns([
    pl.when(
        pl.col("is_eligible_3b") &
        pl.col("keep_by_qa") &
        pl.col("keep_by_dq") &
        pl.col("keep_by_struct") &
        pl.col("keep_by_price_qa") &
        (pl.col("data_quality_score") >= UNIVERSE_MIN_DQ_SCORE)
    )
    .then(pl.lit("IN"))
    .otherwise(pl.lit("OUT"))
    .alias("universe_flag")
])

# ---------------------------------- Column order / output ----------------------------------
# Put symbol, universe_flag and the main flags first for readability
cols_front = [
    "symbol",
    "universe_flag",
    "is_eligible_3b",
    "qa_operativa_flag",
    "qa_price_flag",              # <<< NEW QA ECON >>>
    "qa_struct_flag",
    "qa_struct_flag_qa",
    "session_type",
    "expected_bars_per_day_symbol",
    "data_quality_score",
    "data_quality_flag",
]

# Only front columns that actually exist are used; every remaining column follows in its current order
cols_existing = [c for c in cols_front if c in uni.columns]
cols_rest = [c for c in uni.columns if c not in cols_existing]

uni_out = uni.select(cols_existing + cols_rest).sort("symbol")

# ---------------------------------- Persistence ----------------------------------
uni_out.write_parquet(UNIVERSE_PATH, compression=PARQUET_COMP_UNIVERSE)
# "latest" alias so other notebooks can consume the snapshot without knowing the RUN_ID
uni_out.write_parquet(UNIVERSE_LATEST_PATH, compression=PARQUET_COMP_UNIVERSE)

# ---------------------------------- Summary ----------------------------------
# Row counts of the snapshot just written, split by universe_flag.
n_total = uni_out.height
n_in    = uni_out.filter(pl.col("universe_flag") == "IN").height
n_out   = n_total - n_in

print("=" * 110)
print(">>> Celda 12B :: Universe snapshot M5")
print(f"üìÅ QA summary (in)        ‚Üí {QA_SUMMARY_PATH}")
print(f"üìÅ DQ summary (in)        ‚Üí {DQ_PATH if dq_available else '[NO DQ / FALLBACK]'}")
print(f"üìÅ Universe snapshot (out)‚Üí {UNIVERSE_PATH}")
print(f"üìÅ Universe latest (out)  ‚Üí {UNIVERSE_LATEST_PATH}")
print("-" * 110)
print(f"Œ£ s√≠mbolos QA tras 3B         = {n_qa_syms_post}")
print(f"Œ£ s√≠mbolos en universe file   = {n_total}")
print(f"Œ£ s√≠mbolos universe_flag='IN' = {n_in}")
print(f"Œ£ s√≠mbolos universe_flag='OUT'= {n_out}")
print("-" * 110)

# Preview: the first 20 rows of the symbol-sorted snapshot (not restricted to 'IN'),
# shown only when at least one symbol made it into the universe.
if n_in > 0:
    top_preview = (
        uni_out
        .select([
            "symbol",
            "universe_flag",
            "qa_operativa_flag",
            "qa_price_flag",        # <<< NEW QA ECON >>>
            "qa_struct_flag",
            "session_type",
            "data_quality_flag",
            "data_quality_score",
        ])
        .head(20)
    )
    print("Preview (primeros 20 s√≠mbolos en universe_snapshot):")
    print(top_preview)
else:
    print("‚ö†Ô∏è  No hay s√≠mbolos con universe_flag='IN'. Revisa umbrales UNIVERSE_* o QA/DQ antes de seguir.")

print("=" * 110)
print("‚úÖ Celda 12B ‚Äî Universe snapshot M5 generado correctamente.")
# =================================================================================================


Inicio Celda 12B ‚Äî Universe snapshot M5 | TZ local: America/Guayaquil
Hora local: 2025-12-02T23:40:43-05:00 | Hora UTC: 2025-12-03T04:40:43+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT              : C:\Quant\MT5_Data_Extraction\data
QA summary (Celda 10)  : C:\Quant\MT5_Data_Extraction\data\metadata\qa_operativa_summary.parquet
DQ summary (Celda 12A) : C:\Quant\MT5_Data_Extraction\data\metadata\data_quality_summary.parquet
3B filters (eligible)  : C:\Quant\MT5_Data_Extraction\data\metadata\filters
Universe OUT path      : C:\Quant\MT5_Data_Extraction\data\metadata\universe_snapshot_20251202_232253.parquet
--------------------------------------------------------------------------------------------------------------
Par√°metros de selecci√≥n de universo:
  UNIVERSE_KEEP_QA_FLAGS        = ['OK', 'WARN']
  UNIVERSE_EXCLUDE_BAD_STRUCT   = True
  UNIVERSE_MIN_DQ_SCORE         = 60.0
  UNIVERSE_KEEP_PRICE_QA_

In [16]:
# ======================= Celda 12C — Capa GOLD M5 (m5_clean) =======================
# Propósito:
#   - Construir la capa GOLD M5 (m5_clean) a partir de:
#       * bulk_data/m5_raw                               (RAW)
#       * metadata/qa_m5_bulk.parquet                    (QA diario símbolo+fecha, Celda 10)
#       * metadata/universe_snapshot_{RUN_ID}.parquet    (universo final, Celda 12B)
#   - Sólo se incluyen en GOLD los días/símbolos que:
#       * Están en el universo snapshot.
#       * Son estructuralmente sanos:
#            - status ∈ QA_GOLD_KEEP_STATUSES (obligatorio; se define en la Celda 02, p. ej. ["OK", "WARN"]).
#            - opcionalmente: símbolo con qa_struct_flag == "OK_STRUCT" si existe en el universe.
#
# Cambios clave vs diseño antiguo:
#   - Se ELIMINA el veto global por rejilla_pct ≥ MIN_REJILLA_FOR_GOLD.
#   - GOLD = datos estructuralmente sanos (FAIL/EMPTY quedan fuera).
#   - La cobertura (rejilla_pct) se evalúa:
#        * Globalmente en Celda 10 (sanity-check M5).
#        * Por símbolo en Celda 12A (DataQuality, ya ajustado por tipo de sesión).
#   - MIN_REJILLA_FOR_GOLD se mantiene como parámetro para futuros refinamientos
#     (por ejemplo, para SESSION_24H), pero NO se aplica en el filtro base.
#   - Búsqueda robusta de RAW por símbolo+fecha usando rglob(), y escritura GOLD
#     compatible con layout particionado por year/month.
# ===================================================================================

from __future__ import annotations
from pathlib import Path
from datetime import datetime, timezone
import time, re
from typing import Dict, List, Optional

import polars as pl

# ----------------------------- Identity / base paths -----------------------------
CELL_LABEL    = "12C-GOLD-M5"
# Reuse RUN_ID / TIMEZONE_IANA from earlier cells when present; otherwise fall back
# to a fresh UTC timestamp and the project's default zone.
RUN_ID        = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
TIMEZONE_IANA = globals().get("TIMEZONE_IANA", "America/Guayaquil")

# DATA_ROOT is mandatory (Celda 02)
if "DATA_ROOT" not in globals():
    raise RuntimeError("DATA_ROOT no est√° definido. Ejecuta la Celda 02 antes de esta celda.")

DATA_ROOT = Path(globals()["DATA_ROOT"]).resolve()

# Reuse directories already present in globals; otherwise derive them from DATA_ROOT.
META_DIR = Path(globals().get("METADATA_DIR", DATA_ROOT / "metadata")).resolve()

if "M5_RAW_DIR" in globals():
    M5_RAW_DIR = Path(globals()["M5_RAW_DIR"]).resolve()
else:
    M5_RAW_DIR = DATA_ROOT / "bulk_data" / "m5_raw"

if "M5_CLEAN_DIR" in globals():
    M5_CLEAN_DIR = Path(globals()["M5_CLEAN_DIR"]).resolve()
else:
    M5_CLEAN_DIR = DATA_ROOT / "historical_data" / "m5_clean"

# Inputs: daily QA table and the universe snapshot (per-RUN_ID file plus "latest" alias).
QA_BULK_PATH         = META_DIR / "qa_m5_bulk.parquet"
UNIVERSE_PATH        = META_DIR / f"universe_snapshot_{RUN_ID}.parquet"
UNIVERSE_LATEST_PATH = META_DIR / "universe_snapshot_latest.parquet"

# ----------------------------- Tunable parameters -----------------------------
# These are only read here; they are expected to be defined in an earlier cell (Celda 02).
#   - QA_GOLD_KEEP_STATUSES -> mandatory; day statuses accepted into GOLD.
#   - MIN_REJILLA_FOR_GOLD  -> read with a default of 80.0; NOT applied as a filter in this cell.

# QA_GOLD_KEEP_STATUSES has no fallback on purpose. Fail with an explicit message
# (same style as the DATA_ROOT guard) instead of an opaque KeyError.
if "QA_GOLD_KEEP_STATUSES" not in globals():
    raise RuntimeError("QA_GOLD_KEEP_STATUSES no definido. Ejecuta la Celda 02 antes de esta celda.")

QA_GOLD_KEEP_STATUSES = list(globals()["QA_GOLD_KEEP_STATUSES"])
QA_KEEP_STATUSES      = QA_GOLD_KEEP_STATUSES  # local alias

MIN_REJILLA_FOR_GOLD  = float(globals().get("MIN_REJILLA_FOR_GOLD", 80.0))

OVERWRITE_M5_CLEAN     = bool(globals().get("OVERWRITE_M5_CLEAN", False))
PARQUET_COMP_GOLD      = globals().get("PARQUET_COMP_GOLD", "zstd")

# Clamped to >= 1: the build loop computes `files_written % PROGRESS_EVERY_FILES`,
# so a configured value of 0 would raise ZeroDivisionError.
PROGRESS_EVERY_FILES   = max(1, int(globals().get("PROGRESS_EVERY_FILES_GOLD", 2000)))
PROGRESS_EVERY_SECONDS = float(globals().get("PROGRESS_EVERY_SECONDS_GOLD", 2.0))

# ----------------------------- Logger helper --------------------------------------
def _log(level: str, msg: str):
    """
    Emit a log line for this cell.

    Delegates to log_msg(cell, level, message) when that name exists in globals;
    if it is absent or raises, falls back to a formatted print().
    """
    if "log_msg" in globals():
        try:
            log_msg(CELL_LABEL, level, msg)
        except Exception:
            # Best-effort: any logger failure drops through to the print fallback.
            pass
        else:
            return
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    print(f"[{stamp}] [{RUN_ID}] [{level.upper()}] [{CELL_LABEL}] {msg}", flush=True)

def _now_local_iana() -> str:
    """
    Return the current time as an ISO 8601 string (seconds precision).

    The time is expressed in TIMEZONE_IANA when that zone can be loaded;
    on any failure the UTC time is returned instead.
    """
    try:
        from zoneinfo import ZoneInfo
        target_zone = ZoneInfo(TIMEZONE_IANA)
        moment = datetime.now(timezone.utc).astimezone(target_zone)
    except Exception:
        moment = datetime.now(timezone.utc)
    return moment.isoformat(timespec="seconds")

# Explicitly log the effective GOLD statuses
_log("INFO", f"QA_GOLD_KEEP_STATUSES (efectivos) = {QA_GOLD_KEEP_STATUSES}")

# ----------------------------- Helpers de paths/RAW --------------------------------
def _parse_date_from_filename(name: str) -> Optional[str]:
    """
    Extract a YYYYMMDD string from names such as 'part=YYYYMMDD.parquet'.

    The compact 8-digit form is tried first; then a form whose year, month and
    day may be separated by '-', '_' or '/'. Returns None when neither matches.
    """
    candidate_patterns = (
        r"part=([0-9]{8})",
        r"part=([0-9]{4})[-_/]?([0-9]{2})[-_/]?([0-9]{2})",
    )
    for pattern in candidate_patterns:
        hit = re.search(pattern, name)
        if hit:
            # One group (compact) or three groups (Y, M, D): joining yields YYYYMMDD.
            return "".join(hit.groups())
    return None

# Per-symbol cache of RAW indexes: symbol -> {YYYYMMDD -> path}.
_RAW_INDEX: Dict[str, Dict[str, Path]] = {}

def _build_raw_index_for_symbol(sym: str) -> Dict[str, Path]:
    """
    Scan M5_RAW_DIR/symbol=<sym> recursively for 'part=*.parquet' files and
    build an index date_str (YYYYMMDD) -> Path.

    Because the scan is recursive, files are found both directly under the
    symbol directory and inside nested sub-directories. Files whose name
    yields no date are ignored. Returns an empty dict (after logging a
    warning) when the symbol directory does not exist.
    """
    root = M5_RAW_DIR / f"symbol={sym}"
    index: Dict[str, Path] = {}
    if not root.exists():
        _log("WARNING", f"Directorio RAW no existe para s√≠mbolo {sym}: {root}")
        return index

    # rglob() yields entries in arbitrary (filesystem) order. Sorting makes the
    # "first file wins" rule for duplicated days reproducible across runs.
    for candidate in sorted(root.rglob("part=*.parquet")):
        ds = _parse_date_from_filename(candidate.name)
        if ds:
            index.setdefault(ds, candidate)

    _log("INFO", f"√çndice RAW para {sym}: {len(index)} d√≠a(s) encontrados bajo {root}")
    return index

def _get_clean_paths(sym: str, d: str) -> tuple[Path, Path]:
    """
    Build the two possible GOLD locations for a symbol/day (d is YYYYMMDD).

    Returns (legacy_clean_path, clean_path):
      - legacy_clean_path: symbol=SYM/part=YYYYMMDD.parquet
      - clean_path:        symbol=SYM/year=YYYY/month=MM/part=YYYYMMDD.parquet
    both rooted at M5_CLEAN_DIR.
    """
    symbol_dir = M5_CLEAN_DIR / f"symbol={sym}"
    part_name = f"part={d}.parquet"
    flat_location = symbol_dir / part_name
    partitioned_location = symbol_dir / f"year={d[:4]}" / f"month={d[4:6]}" / part_name
    return flat_location, partitioned_location

# ----------------------------- Header ---------------------------------------------
# Banner with the current time (target zone and UTC), the resolved paths and the
# effective GOLD parameters, followed by a start-of-run log entry.
print("=" * 110)
print("Inicio Celda 12C ‚Äî Capa GOLD M5 (m5_clean)")
print(f"TZ local: {TIMEZONE_IANA}")
print(f"Hora local: {_now_local_iana()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
print("-" * 110)
print(f"DATA_ROOT       : {DATA_ROOT}")
print(f"RAW (m5_raw)    : {M5_RAW_DIR}")
print(f"GOLD (m5_clean) : {M5_CLEAN_DIR}")
print(f"QA diario       : {QA_BULK_PATH}")
print(f"Universe        : {UNIVERSE_PATH}")
print("-" * 110)
print("Par√°metros GOLD (estructura-base):")
print(f"  QA_GOLD_KEEP_STATUSES = {QA_GOLD_KEEP_STATUSES}   # status aceptados (FAIL/EMPTY fuera por defecto)")
print(f"  MIN_REJILLA_FOR_GOLD  = {MIN_REJILLA_FOR_GOLD}   # hoy NO se usa como veto global en GOLD")
print(f"  OVERWRITE_M5_CLEAN    = {OVERWRITE_M5_CLEAN}")
print(f"  PARQUET_COMP_GOLD     = '{PARQUET_COMP_GOLD}'")
print("-" * 110)

_log("INFO", "Inicio construcci√≥n de capa GOLD m5_clean (filtro estructural sin veto de rejilla).")

# ----------------------------- Existence guards ------------------------------
# RAW directory and daily QA table are hard requirements.
if not M5_RAW_DIR.exists():
    raise FileNotFoundError(f"No existe M5_RAW_DIR: {M5_RAW_DIR}. Ejecuta la Celda 08 antes de esta.")

if not QA_BULK_PATH.exists():
    raise FileNotFoundError(f"No se encontr√≥ QA diario en: {QA_BULK_PATH}. Ejecuta la Celda 10 antes de esta.")

# Universe snapshot: prefer the per-RUN_ID file; fall back to the "latest" alias
# (with a warning); fail when neither exists.
if not UNIVERSE_PATH.exists():
    if not UNIVERSE_LATEST_PATH.exists():
        raise FileNotFoundError(
            f"No se encontr√≥ universe_snapshot ni por RUN_ID ({UNIVERSE_PATH}) "
            f"ni universe_snapshot_latest.parquet."
        )
    _log(
        "WARNING",
        f"No se encontr√≥ universe_snapshot para RUN_ID={RUN_ID} en {UNIVERSE_PATH}; "
        f"usando universe_snapshot_latest.parquet en su lugar."
    )
    UNIVERSE_PATH = UNIVERSE_LATEST_PATH

# Make sure the output directories exist.
M5_CLEAN_DIR.mkdir(parents=True, exist_ok=True)
META_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------- Read universe_snapshot ------------
uni = pl.read_parquet(UNIVERSE_PATH)
if "symbol" not in uni.columns:
    raise ValueError(f"{UNIVERSE_PATH} no tiene columna 'symbol'.")

# NOTE(review): every symbol present in the snapshot is taken here; the
# 'universe_flag' column (IN/OUT) is not consulted, so symbols flagged 'OUT'
# remain candidates for GOLD. Confirm this is intended.
universe_syms = set(uni.get_column("symbol").cast(pl.Utf8).to_list())
n_universe = len(universe_syms)
_log("INFO", f"Universe snapshot cargado: {n_universe} s√≠mbolo(s) en universo base.")

# Per-symbol structural filter, using whichever struct-flag column the snapshot has
# ('qa_struct_flag_qa' takes precedence over 'qa_struct_flag').
qa_struct_col: Optional[str] = None
if "qa_struct_flag_qa" in uni.columns:
    qa_struct_col = "qa_struct_flag_qa"
elif "qa_struct_flag" in uni.columns:
    qa_struct_col = "qa_struct_flag"

candidate_syms = set(universe_syms)

if qa_struct_col is not None:
    struct_ok_syms = set(
        uni.filter(pl.col(qa_struct_col) == "OK_STRUCT")
           .get_column("symbol")
           .cast(pl.Utf8)
           .to_list()
    )
    if struct_ok_syms:
        # Keep only symbols flagged OK_STRUCT.
        before = len(candidate_syms)
        candidate_syms = candidate_syms & struct_ok_syms
        after = len(candidate_syms)
        _log(
            "INFO",
            f"Filtro estructural aplicado ({qa_struct_col} == 'OK_STRUCT'): "
            f"{before} ‚Üí {after} s√≠mbolo(s) estructuralmente OK en universo."
        )
    else:
        # No symbol is OK_STRUCT: the filter is skipped (with a warning) and the
        # full universe is kept instead of ending up with an empty candidate set.
        _log(
            "WARNING",
            f"Columna {qa_struct_col} presente en universe pero ning√∫n s√≠mbolo con 'OK_STRUCT'; "
            f"se usar√° el universo completo SIN filtro estructural por s√≠mbolo."
        )
else:
    _log("INFO", "No se encontr√≥ qa_struct_flag en universe snapshot; no se aplica filtro estructural por s√≠mbolo.")

# ----------------------------- Read and normalize daily QA --------------
qa_cols_needed = ["symbol", "date", "status", "rejilla_pct"]
qa_bulk = pl.read_parquet(QA_BULK_PATH)

missing_cols = [c for c in qa_cols_needed if c not in qa_bulk.columns]
if missing_cols:
    raise ValueError(f"{QA_BULK_PATH} no tiene columnas requeridas: {missing_cols}")

qa_bulk = qa_bulk.select(qa_cols_needed)
tot_days_qa = qa_bulk.height

# Normalize 'date' into a YYYYMMDD string column 'date_ymd', which is the key
# used later to locate RAW files.
dt_type = qa_bulk.schema["date"]
if dt_type in (pl.Date, pl.Datetime):
    # Temporal columns are formatted directly. Datetime is handled here as well:
    # casting it to text and stripping non-digits would append the time digits
    # and yield a key that matches no RAW file.
    qa_bulk = qa_bulk.with_columns(
        pl.col("date").dt.strftime("%Y%m%d").alias("date_ymd")
    )
else:
    # Lax cast to text: drop every non-digit (separators such as '-', '_', '/')
    # and keep only the first 8 digits, so values that carry a time component
    # (e.g. '2025-01-02T00:00:00') still reduce to YYYYMMDD.
    qa_bulk = qa_bulk.with_columns(
        pl.col("date")
        .cast(pl.Utf8)
        .str.replace_all(r"[^0-9]", "")
        .str.slice(0, 8)
        .alias("date_ymd")
    )

# ----------------------------- Selection of good days for GOLD ----------------
# Base criterion:
#   - symbol is in candidate_syms (universe, intersected with OK_STRUCT symbols when that filter applied)
#   - status is in QA_KEEP_STATUSES (alias of QA_GOLD_KEEP_STATUSES)
#   - rejilla_pct is NOT used as a filter here.
_good_filter = (
    pl.col("symbol").is_in(list(candidate_syms)) &
    pl.col("status").is_in(QA_KEEP_STATUSES)
)

# One row per distinct (symbol, day) pair that passes the filter.
good_days = (
    qa_bulk
    .filter(_good_filter)
    .select(["symbol", "date_ymd"])
    .unique()
)

n_good_pairs = good_days.height
_log("INFO", f"D√≠as candidatos a GOLD (s√≠mbolo+fecha) tras filtro estructural: {n_good_pairs}")

if n_good_pairs == 0:
    # ---------------------- Detailed diagnostics when there are no candidates -------
    # Nothing is written in this branch; it only reports why no day qualified.
    print("‚ö†Ô∏è  No hay d√≠as que cumplan los criterios para GOLD con los par√°metros actuales.")
    print("-" * 110)
    print("Resumen de QA diario (qa_m5_bulk) para este RUN_ID:")
    print(f"  Œ£ d√≠as auditados en QA      = {tot_days_qa}")

    if tot_days_qa > 0:
        # Count of QA rows per status value.
        by_status = (
            qa_bulk
            .group_by("status")
            .agg(pl.len().alias("n"))
            .sort("status")
        )
        counts = {r["status"]: int(r["n"]) for r in by_status.iter_rows(named=True)}
        # Share of the audited rows, as a percentage.
        _pct = lambda n: (100.0 * n / tot_days_qa) if tot_days_qa else 0.0

        n_ok    = counts.get("OK", 0)
        n_warn  = counts.get("WARN", 0)
        n_fail  = counts.get("FAIL", 0)
        n_empty = counts.get("EMPTY", 0)

        print(
            f"  Distribuci√≥n de status: "
            f"OK={n_ok} ({_pct(n_ok):.1f}%) | "
            f"WARN={n_warn} ({_pct(n_warn):.1f}%) | "
            f"FAIL={n_fail} ({_pct(n_fail):.1f}%) | "
            f"EMPTY={n_empty} ({_pct(n_empty):.1f}%)"
        )

        # Mean and median of rejilla_pct over all QA rows (informational only).
        cov_stats = (
            qa_bulk
            .select([
                pl.col("rejilla_pct").mean().alias("rejilla_mean"),
                pl.col("rejilla_pct").quantile(0.5).alias("rejilla_p50"),
            ])
            .to_dicts()[0]
        )
        # `or 0.0` maps a null statistic to 0.0 before formatting.
        rejilla_mean = float(cov_stats.get("rejilla_mean", 0.0) or 0.0)
        rejilla_p50  = float(cov_stats.get("rejilla_p50", 0.0) or 0.0)

        print(f"  Rejilla global (sobre d√≠as auditados): rejilla_mean={rejilla_mean:.2f}% | rejilla_p50={rejilla_p50:.2f}%")
    else:
        print("  ‚ö†Ô∏è  qa_m5_bulk.parquet no contiene filas para este RUN_ID.")

    print("-" * 110)
    print("Par√°metros de filtrado hacia GOLD usados en esta celda:")
    print(f"  QA_GOLD_KEEP_STATUSES = {QA_GOLD_KEEP_STATUSES}")
    print(f"  MIN_REJILLA_FOR_GOLD  = {MIN_REJILLA_FOR_GOLD}  (NO usado como veto en esta celda)")
    print(f"  Œ£ s√≠mbolos en universo base                = {n_universe}")
    print(f"  Œ£ s√≠mbolos candidatos tras filtro estructural (qa_struct_flag, si aplica) = {len(candidate_syms)}")
    print("-" * 110)
    print("Interpretaci√≥n profesional:")
    print("  - GOLD hoy descarta √∫nicamente:")
    print("       * D√≠as con status NO incluido en QA_GOLD_KEEP_STATUSES (p.ej. FAIL, EMPTY).")
    print("       * Opcionalmente, s√≠mbolos con qa_struct_flag != 'OK_STRUCT' si esa columna existe en el universe.")
    print("  - La rejilla_pct ya NO se usa como veto global en esta capa; la cobertura se eval√∫a en:")
    print("       * Celda 10 (sanity-check global de M5).")
    print("       * Celda 12A (DataQuality ajustado por tipo de sesi√≥n).")
    print("  - Si casi todos los d√≠as est√°n en FAIL/EMPTY, o ning√∫n s√≠mbolo queda con qa_struct_flag='OK_STRUCT',")
    print("    el problema NO est√° en esta celda, sino aguas arriba (extracci√≥n M5 o QA operativa).")
    print("-" * 110)
    print("La capa GOLD m5_clean NO se ha generado en este RUN_ID porque no hay d√≠as que cumplan")
    print("los criterios estructurales definidos. Ajusta QA_GOLD_KEEP_STATUSES o revisa QA/estructura")
    print("si esto no es intencionado.")
    print("=" * 110)

    _log(
        "WARNING",
        f"Sin candidatos a GOLD. tot_days_qa={tot_days_qa}, "
        f"QA_GOLD_KEEP_STATUSES={QA_GOLD_KEEP_STATUSES}, "
        f"MIN_REJILLA_FOR_GOLD={MIN_REJILLA_FOR_GOLD}, "
        f"n_universe={n_universe}, n_candidate_syms={len(candidate_syms)}"
    )

else:
    # Group by symbol so the build loop handles one symbol (and its list of days) at a time.
    grouped = (
        good_days
        .group_by("symbol")
        .agg(pl.col("date_ymd").alias("dates_ymd"))
    )

    symbols_with_candidates = grouped.height
    print(f"S√≠mbolos con al menos un d√≠a candidato a GOLD: {symbols_with_candidates}/{len(candidate_syms)}")

    # ----------------------------- GOLD build loop: counters -------------------
    # files_written counts every processed (symbol, day) pair, including skipped,
    # missing and unreadable ones; days_written_by_symbol counts only files written.
    total_candidates   = n_good_pairs
    files_written      = 0
    skipped_existing   = 0
    missing_raw_files  = 0
    days_written_by_symbol: Dict[str, int] = {}

    # Monotonic clock for throughput and for the time-based progress heartbeat.
    t0 = time.monotonic()
    last_beat = t0

    # Build GOLD files: one pass per symbol, one output parquet per candidate day.
    for row in grouped.iter_rows(named=True):
        sym = row["symbol"]
        dates: List[str] = row["dates_ymd"]  # list of YYYYMMDD strings
        dates_sorted = sorted(dates)

        # Build the RAW index once per symbol and cache it in _RAW_INDEX.
        if sym not in _RAW_INDEX:
            _RAW_INDEX[sym] = _build_raw_index_for_symbol(sym)
        sym_raw_index = _RAW_INDEX.get(sym, {})

        for d in dates_sorted:
            legacy_clean_path, clean_path = _get_clean_paths(sym, d)

            # GOLD already exists (legacy or partitioned layout) and overwrite is off -> skip.
            if (clean_path.exists() or legacy_clean_path.exists()) and not OVERWRITE_M5_CLEAN:
                skipped_existing += 1
                files_written += 1
                now = time.monotonic()
                if (files_written % PROGRESS_EVERY_FILES == 0) or (now - last_beat >= PROGRESS_EVERY_SECONDS):
                    rate = files_written / max(now - t0, 1e-6)
                    _log("INFO", f"GOLD progreso: {files_written}/{total_candidates} archivos (sym={sym}) | {rate:.1f} files/s")
                    last_beat = now
                continue

            # Direct attempt: flat layout.
            raw_path = M5_RAW_DIR / f"symbol={sym}" / f"part={d}.parquet"
            if not raw_path.exists():
                # Fall back to the recursive index (covers nested layouts).
                raw_path = sym_raw_index.get(d)

            if raw_path is None or not Path(raw_path).exists():
                missing_raw_files += 1
                files_written += 1
                _log("WARNING", f"Falta RAW para GOLD (sym={sym}, date={d}); no se encontr√≥ archivo en √≠ndice ni en layout plano.")
                now = time.monotonic()
                if (files_written % PROGRESS_EVERY_FILES == 0) or (now - last_beat >= PROGRESS_EVERY_SECONDS):
                    rate = files_written / max(now - t0, 1e-6)
                    _log("INFO", f"GOLD progreso: {files_written}/{total_candidates} archivos (sym={sym}) | {rate:.1f} files/s")
                    last_beat = now
                continue

            try:
                df = pl.read_parquet(raw_path, use_statistics=True)
            except Exception as e:
                # An unreadable RAW file is logged and skipped; the run continues.
                files_written += 1
                _log("WARNING", f"No se pudo leer RAW {raw_path}: {type(e).__name__}: {e}")
                now = time.monotonic()
                if (files_written % PROGRESS_EVERY_FILES == 0) or (now - last_beat >= PROGRESS_EVERY_SECONDS):
                    rate = files_written / max(now - t0, 1e-6)
                    _log("INFO", f"GOLD progreso: {files_written}/{total_candidates} archivos (sym={sym}) | {rate:.1f} files/s")
                    last_beat = now
                continue

            # Minimal typing; non-strict casts turn unconvertible values into nulls.
            cols = set(df.columns)

            if "timestamp_utc" in cols:
                df = df.with_columns(pl.col("timestamp_utc").cast(pl.Int64, strict=False))

            for c in ["open", "high", "low", "close", "spread_points", "real_volume"]:
                if c in cols:
                    df = df.with_columns(pl.col(c).cast(pl.Float64, strict=False))

            if "tick_volume" in cols:
                df = df.with_columns(pl.col("tick_volume").cast(pl.Int64, strict=False))

            # Minimal cleaning: drop null timestamps, deduplicate, then sort.
            # unique() does not preserve row order unless maintain_order=True, so the
            # previous sort-then-unique neither guaranteed sorted output nor a stable
            # choice among duplicates. Deduplicating first (keeping the first row in
            # file order) and sorting last makes both deterministic.
            if "timestamp_utc" in cols:
                df = (
                    df
                    .filter(pl.col("timestamp_utc").is_not_null())
                    .unique(subset=["timestamp_utc"], keep="first", maintain_order=True)
                    .sort("timestamp_utc")
                )

            # Write as GOLD (layout partitioned by year/month).
            clean_path.parent.mkdir(parents=True, exist_ok=True)
            df.write_parquet(clean_path, compression=PARQUET_COMP_GOLD)

            files_written += 1
            days_written_by_symbol[sym] = days_written_by_symbol.get(sym, 0) + 1

            # Progress heartbeat: every N processed files or every T seconds.
            now = time.monotonic()
            if (files_written % PROGRESS_EVERY_FILES == 0) or (now - last_beat >= PROGRESS_EVERY_SECONDS):
                rate = files_written / max(now - t0, 1e-6)
                _log("INFO", f"GOLD progreso: {files_written}/{total_candidates} archivos (sym={sym}) | {rate:.1f} files/s")
                last_beat = now

    # ----------------------------- Run summary ---------------------------
    elapsed = time.monotonic() - t0
    rate    = files_written / max(elapsed, 1e-6)

    # Both figures count only files written in THIS run; days skipped because a
    # GOLD file already existed are reported separately as skipped_existing.
    n_syms_gold     = len(days_written_by_symbol)
    total_days_gold = sum(days_written_by_symbol.values())

    print("=" * 110)
    print(">>> Celda 12C :: Capa GOLD M5 (m5_clean)")
    print(f"üìÅ INPUT RAW ‚Üí {M5_RAW_DIR}")
    print(f"üìÅ INPUT QA  ‚Üí {QA_BULK_PATH}")
    print(f"üìÅ INPUT UNI ‚Üí {UNIVERSE_PATH}")
    print(f"üìÅ OUTPUT    ‚Üí {M5_CLEAN_DIR}")
    print("-" * 110)
    print(f"Œ£ s√≠mbolos en universo base          = {n_universe}")
    print(f"Œ£ s√≠mbolos candidatos (estructura)   = {len(candidate_syms)}")
    print(f"Œ£ s√≠mbolos con al menos un d√≠a GOLD  = {n_syms_gold}")
    print(f"Œ£ d√≠as candidatos (QA estructural)   = {n_good_pairs}")
    print(f"Œ£ ficheros GOLD procesados           = {files_written}")
    print(f"Œ£ d√≠as GOLD escritos                  = {total_days_gold}")
    print(f"Œ£ RAW faltantes                      = {missing_raw_files}")
    print(f"Œ£ d√≠as ya existentes (skip)          = {skipped_existing}  (OVERWRITE_M5_CLEAN={OVERWRITE_M5_CLEAN})")
    print(f"Tiempo total                         = {elapsed:.1f}s ({rate:.1f} files/s)")
    print("-" * 110)

    if n_syms_gold > 0:
        # Five symbols with the most days written in this run.
        top_syms = sorted(days_written_by_symbol.items(), key=lambda kv: kv[1], reverse=True)[:5]
        print("Top-5 s√≠mbolos por n¬∫ de d√≠as en GOLD:")
        for sym, nd in top_syms:
            print(f"  - {sym}: n_days_GOLD={nd}")
    else:
        print("‚ö†Ô∏è  No se ha escrito ning√∫n d√≠a en GOLD (revisa QA, universe y correspondencia con m5_raw).")

    print("=" * 110)
    print("‚úÖ Celda 12C ‚Äî m5_clean (GOLD) generada correctamente dentro de los criterios estructurales definidos.")

    _log(
        "INFO",
        f"GOLD m5_clean completado: syms_gold={n_syms_gold}, days_gold={total_days_gold}, "
        f"missing_raw={missing_raw_files}, skipped_existing={skipped_existing}, elapsed={elapsed:.1f}s"
    )
# ===================================================================================
   

[2025-12-02 23:40:43] [20251202_232253] [INFO] [12C-GOLD-M5] QA_GOLD_KEEP_STATUSES (efectivos) = ['OK', 'WARN']
Inicio Celda 12C ‚Äî Capa GOLD M5 (m5_clean)
TZ local: America/Guayaquil
Hora local: 2025-12-02T23:40:43-05:00 | Hora UTC: 2025-12-03T04:40:43+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT       : C:\Quant\MT5_Data_Extraction\data
RAW (m5_raw)    : C:\Quant\MT5_Data_Extraction\data\bulk_data\m5_raw
GOLD (m5_clean) : C:\Quant\MT5_Data_Extraction\data\historical_data\m5_clean
QA diario       : C:\Quant\MT5_Data_Extraction\data\metadata\qa_m5_bulk.parquet
Universe        : C:\Quant\MT5_Data_Extraction\data\metadata\universe_snapshot_20251202_232253.parquet
--------------------------------------------------------------------------------------------------------------
Par√°metros GOLD (estructura-base):
  QA_GOLD_KEEP_STATUSES = ['OK', 'WARN']   # status aceptados (FAIL/EMPTY fuera por defecto)
  MIN_R

In [17]:
# ===================== Celda 13 — Backup, checksums y cierre (POLARS + QA + TZ homogénea) =====================
# Propósito:
#   - Manifest de corrida: metadata/manifest.json
#   - Checksums SHA-256 de archivos NUEVOS/MODIFICADOS en esta corrida: metadata/checksums.jsonl
#   - Backup (zip) de: historical_data/m5_clean (o bulk_data/m5_raw si no hay clean) + metadata/ + processed_data/m5_windows/
#   - Validaciones impresas: Nº archivos/bytes, tasa de hashing (files/s y MiB/s), mismatches=0, rutas de backup
#   - Cierre de sesión MT5 (si aplica)
#   - Progreso en vivo (heartbeats)
#   - Ajuste: indexar archivos a hashear desde catálogos/logs usando Polars cuando aplique
# ==============================================================================================================

from __future__ import annotations
import os, sys, json, time, platform, getpass, hashlib, zipfile
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, List, Dict, Tuple

# Polars opcional (para indexar desde cat√°logos/logs)
try:
    import polars as pl
    _HAS_PL = True
except Exception:
    _HAS_PL = False

# MetaTrader5 opcional (para shutdown)
try:
    import MetaTrader5 as mt5
    _HAS_MT5 = True
except Exception:
    _HAS_MT5 = False

# ------------------------------- Config / paths --------------------------------
CELL_LABEL = "13-Backup"
# Reuse values from earlier cells when present; otherwise use local defaults.
RUN_ID = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
TIMEZONE_IANA = globals().get("TIMEZONE_IANA", "America/Guayaquil")
HEARTBEAT_SECS = float(globals().get("BACKUP_HEARTBEAT_SECS", 2.0))
MTIME_FALLBACK_HOURS = int(globals().get("MTIME_FALLBACK_HOURS", 12))

# Consult path_contract only when DATA_ROOT is NOT already defined. Passing
# path_contract.data_root() as the default of globals().get() evaluated it
# eagerly, so the cell failed whenever path_contract was unavailable even
# though DATA_ROOT was set.
if "DATA_ROOT" in globals():
    DATA_ROOT = Path(globals()["DATA_ROOT"]).resolve()
else:
    DATA_ROOT = Path(str(path_contract.data_root())).resolve()
META_DIR  = DATA_ROOT / "metadata"
CLEAN_DIR = DATA_ROOT / "historical_data" / "m5_clean"
RAW_DIR   = DATA_ROOT / "bulk_data" / "m5_raw"
WIN_DIR   = DATA_ROOT / "processed_data" / "m5_windows"
BACKUPS_DIR = DATA_ROOT / "backups"

RUN_LOG = META_DIR / "run_log.jsonl"
CATALOG_PATH = META_DIR / "dataset_catalog.parquet"
MANIFEST_PATH = META_DIR / "manifest.json"
CHECKSUMS_PATH = META_DIR / "checksums.jsonl"

# Make sure the output directories exist before anything is written.
BACKUPS_DIR.mkdir(parents=True, exist_ok=True)
META_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------------- Utilitarios -----------------------------------
def _now_local_iana() -> str:
    """
    Return the current time in TIMEZONE_IANA as an ISO 8601 string (seconds precision).

    On any failure a fixed hint string is returned instead of raising.
    """
    try:
        from zoneinfo import ZoneInfo
        zone = ZoneInfo(TIMEZONE_IANA)
        localized = datetime.now(timezone.utc).astimezone(zone)
        return localized.isoformat(timespec="seconds")
    except Exception:
        return "(instala 'tzdata' para TZ locales)"

def _log(level: str, msg: str):
    """Print one log line prefixed with local time, RUN_ID, level and cell label."""
    fields = (_now_local_iana(), RUN_ID, level, CELL_LABEL)
    header = " ".join(f"[{field}]" for field in fields)
    print(f"{header} {msg}", flush=True)

def _iter_files(base: Path) -> Iterable[Path]:
    """Yield every regular file found under `base`, walking sub-directories recursively."""
    for dirpath, _subdirs, filenames in os.walk(base):
        parent = Path(dirpath)
        # is_file() drops entries that are not regular files (e.g. broken links).
        yield from (entry for entry in (parent / fn for fn in filenames) if entry.is_file())

def _size_bytes(p: Path) -> int:
    """Return the size of `p` in bytes, or 0 when it cannot be stat'ed."""
    try:
        info = p.stat()
    except Exception:
        return 0
    return info.st_size

def _fmt_bytes(n: int) -> str:
    """
    Format a byte count with binary units.

    Values below 1024 are shown as whole bytes ("N B"); larger values use
    KiB, MiB or GiB with two decimals. GiB is the largest unit used.
    """
    if n < 1024:
        return f"{n} B"
    scaled = n / 1024
    for unit in ("KiB", "MiB"):
        if scaled < 1024:
            return f"{scaled:.2f} {unit}"
        scaled = scaled / 1024
    return f"{scaled:.2f} GiB"

def _sha256(path: Path, chunk_size: int = 2*1024*1024) -> str:
    """
    Return the SHA-256 hex digest of the file at `path`.

    The file is read in chunks of `chunk_size` bytes so it is never loaded
    into memory in one piece.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file).
        for piece in iter(lambda: stream.read(chunk_size), b""):
            digest.update(piece)
    return digest.hexdigest()

def _has_any_parquets(base: Path) -> bool:
    """Return True as soon as one file under `base` has a '.parquet' suffix (case-insensitive)."""
    return any(entry.suffix.lower() == ".parquet" for entry in _iter_files(base))

# --------------------------- Main source for the backup ---------------------------
# Prefer the clean (GOLD) layer when it holds at least one parquet file;
# otherwise fall back to the RAW layer.
SOURCE_DIR, source_label = (
    (CLEAN_DIR, "historical_data/m5_clean")
    if _has_any_parquets(CLEAN_DIR)
    else (RAW_DIR, "bulk_data/m5_raw")
)

# --------------------------- Descubrimiento de archivos a hashear ---------------------------
def _paths_from_runlog_polars(run_id: str) -> List[Path]:
    """
    Collect existing file paths logged for `run_id` in RUN_LOG, using Polars.

    For each matching record the first non-null of 'dst' / 'path' is taken.
    Returns resolved paths of files that exist. Returns [] when Polars or the
    log is unavailable, or when anything fails while reading it (best-effort).
    """
    if not (_HAS_PL and RUN_LOG.exists()):
        return []
    try:
        records = pl.scan_ndjson(str(RUN_LOG)).filter(pl.col("run_id")==run_id)
        frame = records.select(
            pl.coalesce([pl.col("dst"), pl.col("path")]).alias("p")
        ).collect()
        resolved = []
        for raw in frame["p"].drop_nulls().to_list():
            if not raw:
                continue
            candidate = Path(raw)
            if candidate.exists():
                resolved.append(candidate.resolve())
        return resolved
    except Exception:
        return []

def _paths_from_runlog_python(run_id: str) -> List[Path]:
    """
    Collect existing file paths logged for `run_id` in RUN_LOG (pure-Python parser).

    RUN_LOG is read as JSON Lines. For every record whose 'run_id' equals
    `run_id`, the 'dst' and 'path' values are considered; those that point to
    an existing path are returned resolved, de-duplicated, in first-seen order.

    Lines that are blank, not valid JSON, or not a JSON object are skipped, and
    so are 'dst'/'path' values that are not non-empty strings. Returns [] when
    RUN_LOG does not exist.
    """
    if not RUN_LOG.exists():
        return []

    # dict used as an ordered set: keeps first-seen order while de-duplicating.
    found = {}
    with open(RUN_LOG, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except Exception:
                continue
            # A valid JSON line may still be a list/number/string; those have
            # no .get() and previously crashed the whole scan.
            if not isinstance(obj, dict):
                continue
            if str(obj.get("run_id", "")) != run_id:
                continue
            for key in ("dst", "path"):
                raw = obj.get(key)
                # Non-string values (e.g. numbers) cannot be turned into a Path.
                if not isinstance(raw, str) or not raw:
                    continue
                candidate = Path(raw)
                try:
                    if not candidate.exists():
                        continue
                except (OSError, ValueError):
                    # Malformed path strings are treated as non-existent.
                    continue
                try:
                    candidate = candidate.resolve()
                except Exception:
                    # Keep the unresolved path when resolution fails.
                    pass
                found.setdefault(candidate, None)
    return list(found)

def _paths_recent_by_mtime(hours: int) -> List[Path]:
    """List files under SOURCE_DIR, META_DIR and WIN_DIR modified within the last ``hours`` hours.

    Missing base directories are skipped and files whose stat/resolve fails are
    silently ignored. Returned paths are resolved; duplicates are not removed.
    """
    threshold = time.time() - hours * 3600
    recent = []
    for root in (SOURCE_DIR, META_DIR, WIN_DIR):
        if root.exists():
            for entry in _iter_files(root):
                try:
                    is_recent = entry.stat().st_mtime >= threshold
                    if is_recent:
                        recent.append(entry.resolve())
                except Exception:
                    pass
    return recent

def _paths_from_catalog_polars() -> List[Path]:
    """Index Parquet parts from the dataset catalog.

    Reads the ``symbol`` column of CATALOG_PATH and, for every distinct non-null
    symbol whose ``SOURCE_DIR/symbol=<symbol>`` directory exists, returns the
    resolved paths of all ``part=*.parquet`` files found recursively below it.
    Returns an empty list when Polars is unavailable, the catalog is missing,
    or any error occurs.
    """
    if not (_HAS_PL and CATALOG_PATH.exists()):
        return []
    try:
        catalog = pl.read_parquet(CATALOG_PATH, columns=["symbol"])
        symbols = catalog["symbol"].unique().drop_nulls().cast(pl.Utf8).to_list()
        symbol_dirs = (SOURCE_DIR / f"symbol={sym}" for sym in symbols)
        return [
            part.resolve()
            for sym_dir in symbol_dirs
            if sym_dir.exists()
            for part in sym_dir.rglob("part=*.parquet")
        ]
    except Exception:
        return []

# Construir set de archivos candidatos
# Build the set of candidate files to hash, trying sources in order of preference.
changed_files = set()
# 1) run_log read with Polars (preferred)
changed_files.update(_paths_from_runlog_polars(RUN_ID))
# 2) fall back to plain-Python parsing of the run_log
if not changed_files:
    changed_files.update(_paths_from_runlog_python(RUN_ID))
# 3) still empty: use files with a recent modification time
if not changed_files:
    _log("WARNING", f"No se hallaron archivos por RUN_ID en run_log; usando fallback mtime ({MTIME_FALLBACK_HOURS}h).")
    changed_files.update(_paths_recent_by_mtime(MTIME_FALLBACK_HOURS))
# 4) additionally index the parts of every symbol listed in the catalog, when available
if _HAS_PL and CATALOG_PATH.exists():
    changed_files.update(_paths_from_catalog_polars())

# Keep only regular files located under DATA_ROOT, in sorted order.
# A component-wise containment test is used instead of a raw string prefix, which
# would also accept sibling directories whose name merely starts with DATA_ROOT
# (e.g. "<DATA_ROOT>_old").
changed_files = sorted(p for p in changed_files if p.is_relative_to(DATA_ROOT) and p.is_file())

# --------------------------- Cabecera homog√©nea TZ --------------------------------
# Uniform cell header: title with the target time zone, current local and UTC
# timestamps, and the directories this cell works with.
print("="*110, flush=True)
print(f"Inicio Celda 13 ‚Äî Backup, checksums y cierre | TZ local: {TIMEZONE_IANA}", flush=True)
print(f"Hora local: {_now_local_iana()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}", flush=True)
print("-"*110, flush=True)
print(f"DATA_ROOT: {DATA_ROOT}", flush=True)
print(f"Origen principal de backup: {source_label} ‚Üí {SOURCE_DIR}", flush=True)
print(f"Ventanas (si existen): {WIN_DIR}", flush=True)
print(f"Metadata: {META_DIR}", flush=True)
print("-"*110, flush=True)

# --------------------------- Manifest --------------------------------------------
def _count_symbols_and_files(base: Path) -> Tuple[int, int, int]:
    """Return ``(distinct symbols, file count, total bytes)`` for the tree under ``base``.

    A file's symbol is taken from its first path component that starts with
    ``symbol=`` (the text after the first ``=``). Returns zeros when ``base``
    does not exist.
    """
    if not base.exists():
        return 0, 0, 0
    seen_symbols = set()
    n_files = 0
    n_bytes = 0
    for entry in _iter_files(base):
        n_files += 1
        n_bytes += _size_bytes(entry)
        tag = next((part for part in Path(entry).parts if part.startswith("symbol=")), None)
        if tag is not None:
            seen_symbols.add(tag.split("=", 1)[-1])
    return len(seen_symbols), n_files, n_bytes

def _pkg_ver(name: str) -> str | None:
    try:
        mod = __import__(name)
        return getattr(mod, "__version__", None)
    except Exception:
        return None

# Totals of the chosen source layer: distinct symbols, file count and bytes.
n_symbols_src, n_files_src, bytes_src = _count_symbols_and_files(SOURCE_DIR)

# Run manifest: execution environment, library versions, directories used and
# source-layer totals. Written as pretty-printed UTF-8 JSON to MANIFEST_PATH,
# replacing any previous content of that file.
manifest = {
    "run_id": RUN_ID,
    "platform": platform.platform(),
    "user": getpass.getuser(),
    "python": sys.version.split()[0],  # version number only, without build details
    "versions": {"polars": _pkg_ver("polars"), "pyarrow": _pkg_ver("pyarrow"), "MetaTrader5": _pkg_ver("MetaTrader5")},
    "tz_local": TIMEZONE_IANA,
    "data_root": str(DATA_ROOT),
    "paths": {"source": str(SOURCE_DIR), "windows": str(WIN_DIR), "metadata": str(META_DIR)},
    "source_used": source_label,
    "counts": {"symbols_source": n_symbols_src, "files_source": n_files_src, "bytes_source": bytes_src},
    "changed_files_in_run": len(changed_files),
    "timestamps": {"manifest_created_utc": datetime.now(timezone.utc).isoformat()}
}
MANIFEST_PATH.write_text(json.dumps(manifest, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"Manifest escrito en: {MANIFEST_PATH}", flush=True)

# --------------------------- Checksums (SHA-256) con QA ---------------------------
print("-"*110, flush=True)
print("Calculando checksums SHA-256 de archivos nuevos/modificados en esta corrida...", flush=True)

# Load the most recent previously recorded hash per path, so that files whose
# content differs from their last recorded checksum can be counted as mismatches.
prev_hash: Dict[str, str] = {}
if CHECKSUMS_PATH.exists():
    try:
        if _HAS_PL:
            df_prev = pl.scan_ndjson(str(CHECKSUMS_PATH)).select(["path","sha256"]).collect()
            # Reverse so the last line of the file comes first, then keep the first row per path.
            df_prev = df_prev.reverse().unique(subset=["path"], keep="first")
            prev_hash = {r["path"]: r["sha256"] for r in df_prev.iter_rows(named=True)}
        else:
            # Stream the file instead of loading every line into memory: later
            # records overwrite earlier ones, so the last valid record per path wins.
            with open(CHECKSUMS_PATH, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        obj = json.loads(line)
                        pth = obj.get("path"); sh = obj.get("sha256")
                    except Exception:
                        continue  # skip blank or malformed lines
                    if pth and sh:
                        prev_hash[pth] = sh
    except Exception as e:
        # Without previous hashes no mismatch can be detected; say so instead of
        # silently reporting mismatches=0 later on.
        prev_hash = {}
        _log("WARNING", f"No se pudieron cargar checksums previos ({e.__class__.__name__}); la deteccion de mismatches queda deshabilitada en esta corrida.")

# Accumulators for the hashing pass.
failures: List[Path] = []   # files that raised while being hashed or recorded
checksums_written = 0       # JSON lines appended to CHECKSUMS_PATH
mismatches = 0              # files whose hash differs from the previously loaded one
hashed_bytes = 0            # sum of sizes of successfully hashed files

# Monotonic clock for throughput figures and heartbeat pacing.
t0 = time.monotonic()
last_beat = t0

# Append mode: existing records are kept and one JSON line is added per hashed file.
with open(CHECKSUMS_PATH, "a", encoding="utf-8") as outjs:
    for i, p in enumerate(changed_files, start=1):
        try:
            sha = _sha256(p)
            sz = _size_bytes(p)
            rec = {
                "run_id": RUN_ID,
                "path": str(p),
                "size": sz,
                "sha256": sha,
                "mtime_utc": datetime.fromtimestamp(p.stat().st_mtime, tz=timezone.utc).isoformat()
            }
            outjs.write(json.dumps(rec, ensure_ascii=False) + "\n")
            checksums_written += 1
            hashed_bytes += sz

            # Mismatch: a previous hash exists for this path and differs from the new one.
            prev = prev_hash.get(str(p))
            if prev is not None and prev != sha:
                mismatches += 1

        except Exception:
            # Any failure for this file is recorded and reported after the loop.
            failures.append(p)

        # Heartbeat: every 50 files, or when HEARTBEAT_SECS elapsed since the last one.
        now = time.monotonic()
        if (i % 50 == 0) or (now - last_beat >= HEARTBEAT_SECS):
            rate_files = i / max(now - t0, 1e-6)
            rate_mib = (hashed_bytes/1048576) / max(now - t0, 1e-6)
            _log("INFO", f"Checksums: {i}/{len(changed_files)} archivos | {rate_files:.1f} files/s | {rate_mib:.2f} MiB/s")
            last_beat = now

# Final rates, based on files actually written (the heartbeat rates above use files attempted).
rate_files = (checksums_written / max(time.monotonic() - t0, 1e-6))
rate_mib = ((hashed_bytes/1048576) / max(time.monotonic() - t0, 1e-6))

# Report the hashing results: output file, totals and throughput.
print(f"Checksums escritos en: {CHECKSUMS_PATH} (l√≠neas nuevas={checksums_written})", flush=True)
print(f"Hashing: archivos={checksums_written} | bytes={hashed_bytes} ({_fmt_bytes(hashed_bytes)}) | tasa={rate_files:.1f} files/s, {rate_mib:.2f} MiB/s", flush=True)

# List at most the first 20 files that failed, then a count of the remainder.
if failures:
    print("‚ö†Ô∏è  Archivos con fallo de checksum/lectura:", flush=True)
    for p in failures[:20]:
        print("  -", str(p), flush=True)
    if len(failures) > 20:
        print(f"  ... y {len(failures)-20} m√°s", flush=True)

# QA verdict on mismatches against the previously recorded hashes.
print(f"Mismatches detectados respecto a √∫ltimo registro previo: {mismatches}", flush=True)
if mismatches == 0:
    print("‚úÖ QA checksums: mismatches=0", flush=True)
else:
    print("‚ö†Ô∏è  QA checksums: existen mismatches (verificar cambios esperados o integridad).", flush=True)

# --------------------------- Backup (ZIP) -----------------------------------------
print("-"*110, flush=True)
# The archive is named after the run and placed in BACKUPS_DIR.
backup_name = f"backup_m5_{RUN_ID}.zip"
backup_path = BACKUPS_DIR / backup_name

def _zip_add_dir(zf: zipfile.ZipFile, base: Path):
    """Add every file under ``base`` to ``zf``, stored under its path relative to DATA_ROOT.

    Returns ``(files_added, total_bytes)``, or ``(0, 0)`` when ``base`` does not
    exist. ``relative_to`` raises ValueError for a file that is not located
    under DATA_ROOT.
    """
    root = base.resolve()
    if not root.exists():
        return 0, 0
    n_added = 0
    n_bytes = 0
    for n_added, member in enumerate(_iter_files(root), start=1):
        arc = member.resolve().relative_to(DATA_ROOT)
        zf.write(member, arcname=str(arc))
        n_bytes += _size_bytes(member)
    return n_added, n_bytes

# Create the ZIP (deflate, level 6) containing the source layer, the metadata
# directory and, when present, the windows directory.
print("Creando backup (zip)...", flush=True)
files_in_zip = 0
bytes_uncompressed = 0
with zipfile.ZipFile(backup_path, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=6) as zf:
    _log("INFO", f"A√±adiendo {source_label} ...")
    a, b = _zip_add_dir(zf, SOURCE_DIR); files_in_zip += a; bytes_uncompressed += b
    _log("INFO", "A√±adiendo metadata/ ...")
    a, b = _zip_add_dir(zf, META_DIR);   files_in_zip += a; bytes_uncompressed += b
    if WIN_DIR.exists():
        _log("INFO", "A√±adiendo processed_data/m5_windows/ ...")
        a, b = _zip_add_dir(zf, WIN_DIR); files_in_zip += a; bytes_uncompressed += b

# Size of the finished archive on disk; _size_bytes yields 0 if it cannot be stat'ed.
zip_size = _size_bytes(backup_path)
print(f"Backup creado en: {backup_path}", flush=True)
print(f"  - Archivos incluidos: {files_in_zip} | Bytes sin comprimir (aprox): {_fmt_bytes(bytes_uncompressed)}", flush=True)
print(f"  - Tama√±o ZIP: {_fmt_bytes(zip_size)}", flush=True)
if zip_size <= 0:
    print("‚ö†Ô∏è  Backup con tama√±o 0. Revisa permisos/espacio en disco.", flush=True)

# --------------------------- Cierre de MT5 -----------------------------------------
print("-"*110, flush=True)
# Close the MT5 connection when the module is available; the status is
# "NO-OP" when it is not, "OK" on success, or "ERROR: <exception class>".
if not _HAS_MT5:
    mt5_status = "NO-OP"
else:
    try:
        mt5.shutdown()
    except Exception as err:
        mt5_status = f"ERROR: {err.__class__.__name__}"
    else:
        mt5_status = "OK"
print(f"Estado de desconexi√≥n MT5: {mt5_status}", flush=True)

# --------------------------- Resumen final -----------------------------------------
# Final summary of the cell: hashing totals, mismatch count and backup location.
# NOTE(review): the closing "OK" line is printed unconditionally, regardless of
# `failures` or `mismatches` — confirm this is intended.
print("-"*110, flush=True)
print("Resumen:", flush=True)
print(f"  ‚Ä¢ Archivos hasheados: {checksums_written} | Bytes: {_fmt_bytes(hashed_bytes)} | Tasa: {rate_files:.1f} files/s, {rate_mib:.2f} MiB/s", flush=True)
print(f"  ‚Ä¢ Mismatches: {mismatches}", flush=True)
print(f"  ‚Ä¢ Backup ZIP: {backup_path}", flush=True)
print("‚úÖ Cierre completo del pipeline de extracci√≥n M5: OK", flush=True)
# ==============================================================================================================


Inicio Celda 13 ‚Äî Backup, checksums y cierre | TZ local: America/Guayaquil
Hora local: 2025-12-02T23:41:05-05:00 | Hora UTC: 2025-12-03T04:41:05+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT: C:\Quant\MT5_Data_Extraction\data
Origen principal de backup: historical_data/m5_clean ‚Üí C:\Quant\MT5_Data_Extraction\data\historical_data\m5_clean
Ventanas (si existen): C:\Quant\MT5_Data_Extraction\data\processed_data\m5_windows
Metadata: C:\Quant\MT5_Data_Extraction\data\metadata
--------------------------------------------------------------------------------------------------------------
Manifest escrito en: C:\Quant\MT5_Data_Extraction\data\metadata\manifest.json
--------------------------------------------------------------------------------------------------------------
Calculando checksums SHA-256 de archivos nuevos/modificados en esta corrida...
[2025-12-02T23:41:10-05:00] [20251202_232253] [INFO] [13-Bac

In [18]:
# ======================= Celda 14 — Restore y verificación cruzada (POLARS-only) =======================
# Propósito:
#   1) Restaurar el ZIP de backup más reciente (o por RUN_ID) en DATA_ROOT/restore/restore_<RUN_ID>/
#   2) Verificar integridad con checksums (SHA-256) usando el checksums.jsonl del backup restaurado
#   3) Verificar Parquet con Polars (columnas/orden contra schema_m5.json; filas y rangos)
#   4) Comparar recuentos de archivos/bytes y filas entre ORIGINAL (DATA_ROOT) y RESTORE (restore_<RUN_ID>)
#   5) Imprimir tasas de hashing y heartbeats de progreso
# Notas:
#   - 100% Polars (sin fallback a pandas).
#   - No borra nada en el DATA_ROOT original.
#   - Si FORCE_FULL_CHECKSUM=False, se verifica por muestra; en True, se verifican todos los archivos hashados.
# =======================================================================================================

from __future__ import annotations
import os, json, time, hashlib, zipfile, random
from pathlib import Path
from datetime import datetime, timezone
from typing import Iterable, List, Dict, Tuple, Optional

# ------------------------------------ Config ------------------------------------
# Label printed in every log line of this cell.
CELL_LABEL = "14-Restore-Verify"
RUN_ID = globals().get("RUN_ID", None)  # when None, the most recent backup is chosen
# NOTE: the default argument is evaluated before .get() runs, so `path_contract`
# must be defined even when DATA_ROOT already exists in globals().
DATA_ROOT = Path(globals().get("DATA_ROOT", str(path_contract.data_root()))).resolve()

# Verification parameters; each can be overridden by a global of the same name.
TIMEZONE_IANA         = globals().get("TIMEZONE_IANA", "America/Guayaquil")
FORCE_FULL_CHECKSUM   = bool(globals().get("FORCE_FULL_CHECKSUM", False))   # True: hash every recorded file
CHECKSUM_SAMPLE_FILES = int(globals().get("CHECKSUM_SAMPLE_FILES", 1500))   # sample size when not forcing full
PARQUET_SAMPLE_FILES  = int(globals().get("PARQUET_SAMPLE_FILES", 24))      # Parquet parts read for schema checks
SHOW_SAMPLE_LIST      = bool(globals().get("SHOW_SAMPLE_LIST", True))       # print sampled Parquet paths
SHA256_CHUNK          = int(globals().get("SHA256_CHUNK", 8 * 1024 * 1024))  # 8 MiB
HEARTBEAT_SECS        = float(globals().get("HEARTBEAT_SECS", 2.0))         # max seconds between progress logs

# Backups live in DATA_ROOT/backups and are named "<BACKUP_PREFIX><run_id>.zip".
BACKUPS_DIR = DATA_ROOT / "backups"
BACKUP_PREFIX = "backup_m5_"

# Polars (requerido)
try:
    import polars as pl
except Exception as e:
    raise RuntimeError("Se requiere 'polars'. Inst√°lalo e intenta de nuevo.") from e

# ---------------------------------- Utilidades -----------------------------------
def _now_local_iana() -> str:
    try:
        from zoneinfo import ZoneInfo
        return datetime.now(timezone.utc).astimezone(ZoneInfo(TIMEZONE_IANA)).isoformat(timespec="seconds")
    except Exception:
        return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _log(level: str, msg: str):
    """Print one flushed log line: ``[local time] [RUN_ID or NA] [level] [CELL_LABEL] msg``."""
    run_tag = RUN_ID if RUN_ID else "NA"
    stamp = _now_local_iana()
    print(f"[{stamp}] [{run_tag}] [{level}] [{CELL_LABEL}] {msg}", flush=True)

def _fmt_bytes(n: int) -> str:
    if n < 1024: return f"{n} B"
    kib = n/1024
    if kib < 1024: return f"{kib:.2f} KiB"
    mib = kib/1024
    if mib < 1024: return f"{mib:.2f} MiB"
    gib = mib/1024
    if gib < 1024: return f"{gib:.2f} GiB"
    tib = gib/1024
    return f"{tib:.2f} TiB"

def _iter_files(base: Path) -> Iterable[Path]:
    for root, _, files in os.walk(base):
        for f in files:
            p = Path(root) / f
            if p.is_file():
                yield p

def _size_bytes(p: Path) -> int:
    try: return p.stat().st_size
    except Exception: return 0

def _sha256_file(path: Path, chunk: int = SHA256_CHUNK) -> str:
    """Return the hex SHA-256 digest of the file at ``path``, read in blocks of ``chunk`` bytes."""
    digest = hashlib.sha256()
    with open(path, "rb") as fh:
        # iter(callable, sentinel) keeps reading until read() returns empty bytes (EOF).
        for piece in iter(lambda: fh.read(chunk), b""):
            digest.update(piece)
    return digest.hexdigest()

# --------------------------- Selecci√≥n del backup a usar -------------------------
def _find_backup_zip(run_id: Optional[str]) -> Tuple[Path, str]:
    """Pick the backup ZIP to restore and return ``(zip_path, run_id)``.

    With a ``run_id``: the exact file ``<BACKUP_PREFIX><run_id>.zip`` is used
    when present; otherwise the most recently modified ZIP whose name contains
    ``run_id``. Without a ``run_id``: the most recently modified ZIP whose name
    starts with BACKUP_PREFIX. When the ZIP is found by search, the returned
    run id is the file stem with BACKUP_PREFIX removed.

    BACKUPS_DIR is created if it does not exist.

    Raises:
        FileNotFoundError: no suitable ZIP exists in BACKUPS_DIR.
    """
    BACKUPS_DIR.mkdir(parents=True, exist_ok=True)

    def _newest_first(pattern: str) -> List[Path]:
        # Matching ZIPs ordered by modification time, most recent first.
        return sorted(BACKUPS_DIR.glob(pattern), key=lambda z: z.stat().st_mtime, reverse=True)

    if run_id:
        exact = BACKUPS_DIR / f"{BACKUP_PREFIX}{run_id}.zip"
        if exact.exists():
            return exact, run_id
        matches = _newest_first(f"*{run_id}*.zip")
        if not matches:
            raise FileNotFoundError(f"No se encontr√≥ ZIP para RUN_ID={run_id} en {BACKUPS_DIR}")
    else:
        matches = _newest_first(f"{BACKUP_PREFIX}*.zip")
        if not matches:
            raise FileNotFoundError(f"No hay backups en {BACKUPS_DIR}")

    newest = matches[0]
    return newest, newest.stem.replace(BACKUP_PREFIX, "")

# ------------------------------------ Restore ------------------------------------
def _restore_zip(zip_path: Path, rid: str) -> Path:
    """Extract every entry of ``zip_path`` into ``DATA_ROOT/restore/restore_<rid>`` and return that directory.

    The target directory is created if needed. Progress is logged every 1000
    entries, whenever HEARTBEAT_SECS elapsed since the last report, and after
    the final entry.
    """
    target_dir = DATA_ROOT / "restore" / f"restore_{rid}"
    target_dir.mkdir(parents=True, exist_ok=True)
    _log("INFO", f"Restaurando ZIP: {zip_path.name}")
    with zipfile.ZipFile(zip_path, "r") as archive:
        members = archive.infolist()
        n_members = len(members)
        _log("INFO", f"Entradas en ZIP: {n_members}")
        started = time.monotonic()
        last_report = started
        for done, member in enumerate(members, 1):
            archive.extract(member, path=target_dir)
            tick = time.monotonic()
            due = done % 1000 == 0 or tick - last_report >= HEARTBEAT_SECS or done == n_members
            if not due:
                continue
            speed = done / max(tick - started, 1e-6)
            _log("INFO", f"Restore progreso: {done}/{n_members} | {speed:.1f} files/s")
            last_report = tick
    return target_dir

# ---------------------- Cargar manifest/checksums del RESTORE --------------------
def _load_restored_manifest(restore_root: Path) -> dict:
    mp = restore_root / "metadata" / "manifest.json"
    if not mp.exists():
        return {}
    try:
        return json.loads(mp.read_text(encoding="utf-8"))
    except Exception:
        return {}

def _load_restored_checksums_df(restore_root: Path, filter_run_id: Optional[str]) -> pl.DataFrame:
    """Load ``metadata/checksums.jsonl`` from the restored tree as a DataFrame.

    Args:
        restore_root: Root directory of the restored backup.
        filter_run_id: When truthy, only records with this ``run_id`` are kept.

    Returns:
        A DataFrame with columns ``path``, ``size`` and ``sha256`` holding, for
        each path, the record that appears last in the file. Rows are ordered
        by ``path``.

    Raises:
        FileNotFoundError: the checksums file is not present in the restore.
    """
    chk_path = restore_root / "metadata" / "checksums.jsonl"
    if not chk_path.exists():
        raise FileNotFoundError(f"No existe {chk_path} dentro del RESTORE.")
    lf = pl.scan_ndjson(str(chk_path))
    if filter_run_id:
        lf = lf.filter(pl.col("run_id") == filter_run_id)
    # `with_row_count` is deprecated in favour of `with_row_index`; use the new
    # name when the installed Polars provides it and fall back otherwise.
    add_row_index = getattr(lf, "with_row_index", None) or lf.with_row_count
    # Several entries may exist per path: sort each path's records newest-first
    # (highest row index) and keep the first one. maintain_order makes "first"
    # refer unambiguously to the sorted order.
    df = (
        add_row_index("rc")
        .sort(["path", "rc"], descending=[False, True])
        .unique(subset=["path"], keep="first", maintain_order=True)
        .select(["path", "size", "sha256"])
    )
    return df.collect()

def _derive_rel_from_abs(abs_path: str, orig_root: str) -> str:
    p = abs_path.replace("\\", "/")
    root = orig_root.replace("\\", "/").rstrip("/")
    if p.lower().startswith(root.lower()):
        rel = p[len(root):].lstrip("/")
        return rel
    # ya ser√≠a relativo
    return p.lstrip("/")

# ---------------------------- Verificaci√≥n de checksums --------------------------
def _verify_checksums_restored(restore_root: Path, df_chk: pl.DataFrame, orig_data_root: Optional[str]) -> Tuple[int,int,int,List[str],float,float]:
    """Re-hash restored files and compare them with the recorded SHA-256 values.

    Args:
        restore_root: Root directory of the restored backup.
        df_chk: Checksum records; the ``path`` and ``sha256`` columns are used.
        orig_data_root: Data root recorded at backup time. When given, each
            recorded path is made relative to it before being looked up under
            ``restore_root``; otherwise the recorded path is used as is.

    Returns:
        ``(ok, total, mismatches_count, mism_list, rate_files, rate_mib)`` where
        ``total`` is the number of records checked, ``mismatches_count`` is
        ``total - ok`` (hash mismatches, missing files and errors) and
        ``mism_list`` holds at most 20 descriptions.

    When FORCE_FULL_CHECKSUM is false and there are more than
    CHECKSUM_SAMPLE_FILES records, only a random sample of that size is checked.
    """
    rows = df_chk.to_dicts()

    # Sample mode vs. full mode.
    if not FORCE_FULL_CHECKSUM and len(rows) > CHECKSUM_SAMPLE_FILES:
        random.shuffle(rows)
        rows = rows[:CHECKSUM_SAMPLE_FILES]
    total = len(rows)

    ok = 0
    mism: List[str] = []
    hashed_bytes = 0
    t0 = time.monotonic()
    lastb = t0

    def _note(rel, reason: str) -> None:
        # Keep only the first 20 problem descriptions; the count comes from total - ok.
        if len(mism) < 20:
            mism.append(f"{rel} ({reason})")

    for i, r in enumerate(rows, 1):
        # Derive the relative path in plain Python, and only for the rows that
        # are actually checked, instead of a Polars UDF over the whole frame.
        raw = r["path"]
        rel = _derive_rel_from_abs(raw, orig_data_root) if (orig_data_root and raw is not None) else raw
        try:
            # Built inside the try so a null or malformed path is reported as an
            # error for this record instead of aborting the whole verification.
            target = restore_root / rel
            if not target.exists():
                _note(rel, "missing")
            else:
                digest = _sha256_file(target, SHA256_CHUNK)
                hashed_bytes += _size_bytes(target)
                if digest == r["sha256"]:
                    ok += 1
                else:
                    _note(rel, "mismatch")
        except Exception as e:
            _note(rel, f"error: {e.__class__.__name__}")

        # Heartbeat: every 50 files, after HEARTBEAT_SECS, and on the last file.
        now = time.monotonic()
        if (i % 50 == 0) or (now - lastb >= HEARTBEAT_SECS) or (i == total):
            rate_files = i / max(now - t0, 1e-6)
            rate_mib   = (hashed_bytes/1048576) / max(now - t0, 1e-6)
            _log("INFO", f"Checksums: {i}/{total} | {rate_files:.1f} files/s | {rate_mib:.2f} MiB/s")
            lastb = now

    rate_files = total / max(time.monotonic() - t0, 1e-6)
    rate_mib   = (hashed_bytes/1048576) / max(time.monotonic() - t0, 1e-6)
    return ok, total, (total - ok), mism, rate_files, rate_mib

# -------------------------- Verificaci√≥n Parquet (Polars) ------------------------
def _schema_columns_from_restore(restore_root: Path) -> List[str]:
    schema_path = restore_root / "metadata" / "schema_m5.json"
    if not schema_path.exists():
        # fallback: del original (por si el backup no lo conten√≠a)
        schema_path = DATA_ROOT / "metadata" / "schema_m5.json"
    if schema_path.exists():
        try:
            d = json.loads(schema_path.read_text(encoding="utf-8"))
            cols = d.get("column_order", None)
            if isinstance(cols, list) and cols:
                return cols
        except Exception:
            pass
    # fallback razonable
    return ["timestamp_utc","symbol","open","high","low","close",
            "tick_volume","real_volume","spread_points","broker","server_tz"]

def _pick_parquet_samples(restore_root: Path, k: int) -> List[Path]:
    bases = []
    cand1 = restore_root / "historical_data" / "m5_clean"
    cand2 = restore_root / "bulk_data" / "m5_raw"
    cand3 = restore_root / "processed_data" / "m5_windows"
    for c in (cand1, cand2, cand3):
        if c.exists():
            bases.append(c)
    files = []
    for b in bases:
        files += list(b.rglob("part=*.parquet"))
    random.shuffle(files)
    return files[:min(k, len(files))]

def _verify_schema_and_rows(paths: List[Path], schema_cols: List[str]) -> Tuple[int,int,int,int,Optional[int],Optional[int]]:
    """Read each Parquet file with Polars and check column order, emptiness and timestamps.

    Returns ``(col_mismatch, empty_parts, dupes_files, files_read, ts_min, ts_max)``:
      - col_mismatch: files whose leading columns differ from ``schema_cols``,
        plus files that raised while being read or inspected;
      - empty_parts: files with zero rows;
      - dupes_files: files with repeated ``timestamp_utc`` values;
      - files_read: files successfully loaded;
      - ts_min / ts_max: overall minimum and maximum of ``timestamp_utc`` as
        integers across non-empty files (None when no file provided one).
    """
    n_col_mismatch = n_empty = n_dupes = n_read = 0
    lo: Optional[int] = None
    hi: Optional[int] = None
    n_paths = len(paths)

    started = time.monotonic()
    last_report = started

    for idx, parquet_path in enumerate(paths, 1):
        try:
            frame = pl.read_parquet(parquet_path)
            n_read += 1

            # Column order: the leading columns must equal schema_cols.
            if schema_cols and frame.columns[:len(schema_cols)] != schema_cols:
                n_col_mismatch += 1

            n_rows = frame.height
            if n_rows == 0:
                n_empty += 1
            elif "timestamp_utc" in frame.columns:
                ts = pl.col("timestamp_utc")
                file_lo = int(frame.select(ts.min()).item())
                file_hi = int(frame.select(ts.max()).item())
                lo = file_lo if lo is None else min(lo, file_lo)
                hi = file_hi if hi is None else max(hi, file_hi)

                # Duplicates within the file: rows minus distinct timestamps.
                n_dups = int(n_rows - int(frame.select(ts.n_unique()).item()))
                if n_dups > 0:
                    n_dupes += 1

        except Exception:
            n_col_mismatch += 1  # counted as a schema/read problem

        tick = time.monotonic()
        if (idx % 6 == 0) or (tick - last_report >= HEARTBEAT_SECS) or (idx == n_paths):
            speed = idx / max(tick - started, 1e-6)
            _log("INFO", f"Verif Parquet (muestra): {idx}/{n_paths} | {speed:.2f} files/s")
            last_report = tick

    return n_col_mismatch, n_empty, n_dupes, n_read, lo, hi

# ---------------------- Conteos/bytes y filas (ORIG vs RESTORE) -------------------
def _count_files_bytes(base: Path) -> Tuple[int,int]:
    """Return ``(file count, total bytes)`` for the tree under ``base``; ``(0, 0)`` when it does not exist."""
    if not base.exists():
        return 0, 0
    sizes = [_size_bytes(entry) for entry in _iter_files(base)]
    return len(sizes), sum(sizes)

def _row_count_glob(paths: List[Path]) -> int:
    if not paths:
        return 0
    # usamos scan_parquet para sumar filas sin cargar todo a memoria
    lf = pl.scan_parquet([str(p) for p in paths])
    return int(lf.select(pl.len()).collect().item())

def _gather_parquet_paths(base: Path) -> List[Path]:
    if not base.exists():
        return []
    return list(base.rglob("part=*.parquet"))

# =================================== EJECUCI√ìN ===================================
# 1) Selecci√≥n y cabecera
# Choose the ZIP to restore; RUN_ID is then replaced by the run id of the chosen
# backup so that later log lines and paths refer to it.
zip_path, chosen_rid = _find_backup_zip(RUN_ID)
RUN_ID = chosen_rid  # normalise
print("="*110)
print(f"Inicio Celda 14 ‚Äî Restore y verificaci√≥n cruzada | TZ local: {TIMEZONE_IANA}")
print(f"Hora local: {_now_local_iana()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
print("-"*110)
print(f"DATA_ROOT: {DATA_ROOT}")
print(f"ZIP elegido: {zip_path} | tama√±o={_fmt_bytes(zip_path.stat().st_size)}")
print("-"*110)

# 2) Restore ZIP
# Extract the chosen ZIP into DATA_ROOT/restore/restore_<RUN_ID>.
restore_root = _restore_zip(zip_path, RUN_ID)
print("-"*110)
print(f"Restaurado en: {restore_root}")

# 3) Load manifest/checksums from the RESTORE
manifest = _load_restored_manifest(restore_root)
# Prefer the values recorded in the restored manifest; fall back to this
# session's RUN_ID / DATA_ROOT when the manifest is missing or lacks the key.
backup_run_id = manifest.get("run_id", RUN_ID)
orig_data_root = manifest.get("data_root", str(DATA_ROOT))

# Checksum records of the backup's run only (latest record per path).
df_chk = _load_restored_checksums_df(restore_root, backup_run_id)
print(f"Checksums encontrados (restored, run_id={backup_run_id}): {len(df_chk)}")

# 4) Verificar checksums (restored) con heartbeats
# Re-hash restored files (sampled or full, per FORCE_FULL_CHECKSUM) and report
# totals, up to 20 problem entries and the hashing throughput.
ok, total, mism_count, mism_list, rate_files, rate_mib = _verify_checksums_restored(
    restore_root, df_chk, orig_data_root
)
print("-"*110)
print(f"Checksums verificados: {total} | OK={ok} | mismatches={mism_count}")
if mism_list:
    print("Mismatches (primeros 20):")
    for x in mism_list:
        print("  -", x)
print(f"Tasa de hashing: {rate_files:.1f} files/s | {rate_mib:.2f} MiB/s")

# 5) Verificaci√≥n Parquet (muestra con Polars)
# Expected column order plus a random sample of restored Parquet parts to inspect.
schema_cols = _schema_columns_from_restore(restore_root)
samples = _pick_parquet_samples(restore_root, PARQUET_SAMPLE_FILES)
print("-"*110)
print(f"Sample Parquet para verificaci√≥n: {len(samples)} archivos")
if SHOW_SAMPLE_LIST:
    # Only the first 12 sampled paths are listed.
    for p in samples[:min(12, len(samples))]:
        print("  ¬∑", str(p))

# Check column order, empty parts, duplicated timestamps and the timestamp range.
col_mismatch, empty_parts, dupes_files, files_read, ts_min, ts_max = _verify_schema_and_rows(samples, schema_cols)

# 6) Comparaciones ORIG vs RESTORE (files/bytes y filas)
# Directory layers compared between the original tree and the restored tree.
orig_sources = [
    DATA_ROOT / "historical_data" / "m5_clean",
    DATA_ROOT / "bulk_data" / "m5_raw",
    DATA_ROOT / "processed_data" / "m5_windows",
    DATA_ROOT / "metadata",
]
rest_sources = [
    restore_root / "historical_data" / "m5_clean",
    restore_root / "bulk_data" / "m5_raw",
    restore_root / "processed_data" / "m5_windows",
    restore_root / "metadata",
]

# Walk every tree once and reuse the (files, bytes) pair for both totals,
# instead of traversing each directory twice.
_orig_counts = [_count_files_bytes(p) for p in orig_sources]
_rest_counts = [_count_files_bytes(p) for p in rest_sources]
orig_files = sum(c[0] for c in _orig_counts)
orig_bytes = sum(c[1] for c in _orig_counts)
rest_files = sum(c[0] for c in _rest_counts)
rest_bytes = sum(c[1] for c in _rest_counts)

# Rows: counted over the Parquet parts of the main layer (m5_clean, or m5_raw
# when m5_clean has no parts) plus the windows layer.
orig_parquets = _gather_parquet_paths(DATA_ROOT / "historical_data" / "m5_clean")
if not orig_parquets:
    orig_parquets = _gather_parquet_paths(DATA_ROOT / "bulk_data" / "m5_raw")
orig_parquets += _gather_parquet_paths(DATA_ROOT / "processed_data" / "m5_windows")

rest_parquets = _gather_parquet_paths(restore_root / "historical_data" / "m5_clean")
if not rest_parquets:
    rest_parquets = _gather_parquet_paths(restore_root / "bulk_data" / "m5_raw")
rest_parquets += _gather_parquet_paths(restore_root / "processed_data" / "m5_windows")

# Time both row counts together; the rate is expressed in Parquet files per second.
t_rows0 = time.monotonic()
orig_rows = _row_count_glob(orig_parquets)
rest_rows = _row_count_glob(rest_parquets)
rows_rate = (len(rest_parquets) + len(orig_parquets)) / max(time.monotonic() - t_rows0, 1e-6)

# 7) Impresiones finales
# Final report: backup used, checksum results, Parquet sample results and the
# ORIG vs RESTORE comparison of files, bytes and rows.
print("="*110)
print("RESUMEN DE VERIFICACI√ìN")
print(f"Backup usado               : {zip_path.name}")
print(f"Restaurado en              : {restore_root}")
print("-"*110)
print("Checksums:")
print(f"  ‚Ä¢ Verificados            : {total}")
print(f"  ‚Ä¢ OK                     : {ok}")
print(f"  ‚Ä¢ Mismatches             : {mism_count}")
print(f"  ‚Ä¢ Tasa de hashing        : {rate_files:.1f} files/s | {rate_mib:.2f} MiB/s")
print("-"*110)
print("Verificaci√≥n Parquet (muestra):")
print(f"  ‚Ä¢ Archivos le√≠dos        : {files_read}")
print(f"  ‚Ä¢ Column order mismatches: {col_mismatch}")
print(f"  ‚Ä¢ Partes vac√≠as          : {empty_parts}")
print(f"  ‚Ä¢ Archivos con dups (ts) : {dupes_files}")
if ts_min is not None and ts_max is not None:
    # The values are divided by 1000 before conversion, i.e. they are
    # interpreted as epoch milliseconds.
    utc_min = datetime.fromtimestamp(ts_min/1000, tz=timezone.utc).isoformat(timespec="seconds")
    utc_max = datetime.fromtimestamp(ts_max/1000, tz=timezone.utc).isoformat(timespec="seconds")
    print(f"  ‚Ä¢ Rango UTC (muestra)    : {utc_min} ‚Üí {utc_max}")
print("-"*110)
print("Comparaci√≥n ORIG vs RESTORE:")
print(f"  ‚Ä¢ ORIG  archivos/bytes   : {orig_files} / {_fmt_bytes(orig_bytes)}")
print(f"  ‚Ä¢ REST  archivos/bytes   : {rest_files} / {_fmt_bytes(rest_bytes)}")
# Deltas are RESTORE minus ORIG and can therefore be negative.
print(f"  ‚Ä¢ Œî archivos/bytes       : {rest_files-orig_files} / {_fmt_bytes(rest_bytes-orig_bytes)}")
print(f"  ‚Ä¢ ORIG  filas (Parquet)  : {orig_rows:,}")
print(f"  ‚Ä¢ REST  filas (Parquet)  : {rest_rows:,}")
print(f"  ‚Ä¢ Œî filas                : {rest_rows - orig_rows:,}")
print(f"  ‚Ä¢ Tasa conteo filas      : {rows_rate:.2f} files/s (scan_parquet)")
print("-"*110)

# Sampling only limits how many files are hashed: any mismatch, missing file or
# read error among the verified files must fail the check. Accepting the run
# because at least one sampled file matched would report integrity as verified
# while mismatches are listed above.
ok_checksums = (mism_count == 0)
ok_schema    = (col_mismatch == 0)
ok_restore   = (rest_files > 0 and rest_bytes > 0)

if ok_checksums and ok_schema and ok_restore:
    print("‚úÖ Restore y verificaci√≥n cruzada: OK (integridad comprobada, Polars-only).")
else:
    print("‚ö†Ô∏è  Restore/verificaci√≥n con observaciones. Revisa mismatches/column order/partes vac√≠as antes de continuar.")
# =======================================================================================================


Inicio Celda 14 ‚Äî Restore y verificaci√≥n cruzada | TZ local: America/Guayaquil
Hora local: 2025-12-03T05:26:21-05:00 | Hora UTC: 2025-12-03T10:26:21+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT: C:\Quant\MT5_Data_Extraction\data
ZIP elegido: C:\Quant\MT5_Data_Extraction\data\backups\backup_m5_20251202_232253.zip | tama√±o=1.01 GiB
--------------------------------------------------------------------------------------------------------------
[2025-12-03T05:26:21-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Restaurando ZIP: backup_m5_20251202_232253.zip
[2025-12-03T05:26:21-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Entradas en ZIP: 132007
[2025-12-03T05:26:22-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Restore progreso: 1000/132007 | 4000.0 files/s
[2025-12-03T05:26:22-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Restore progreso: 2000/132007 | 4000.0 files/s
[2025-12-03T05

  df = (lf.with_row_count("rc")


[2025-12-03T05:27:08-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Checksums: 50/1500 | 91.4 files/s | 0.92 MiB/s
[2025-12-03T05:27:08-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Checksums: 100/1500 | 88.9 files/s | 0.89 MiB/s
[2025-12-03T05:27:09-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Checksums: 150/1500 | 89.7 files/s | 0.90 MiB/s
[2025-12-03T05:27:09-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Checksums: 200/1500 | 92.1 files/s | 0.93 MiB/s
[2025-12-03T05:27:10-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Checksums: 250/1500 | 95.2 files/s | 0.95 MiB/s
[2025-12-03T05:27:10-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Checksums: 300/1500 | 97.0 files/s | 0.96 MiB/s
[2025-12-03T05:27:11-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Checksums: 350/1500 | 94.9 files/s | 0.94 MiB/s
[2025-12-03T05:27:11-05:00] [20251202_232253] [INFO] [14-Restore-Verify] Checksums: 400/1500 | 95.9 files/s | 0.94 MiB/s
[2025-12-03T05:27:12-05:00] [2025

In [19]:
# ======================= Celda 15 — Índices de lectura y catálogo (M5, POLARS-only + fallback ventanas) =======================
# Propósito:
#   - Construir índices auxiliares y catálogo de ventanas:
#       1) metadata/day_index_m5.parquet
#       2) metadata/symbol_index_m5.parquet
#       3) metadata/window_catalog_m5.parquet
#   - Orden de preferencia para day/symbol index: m5_clean > m5_raw > m5_windows (desdup por día/símbolo).
#   - Lectura con scan_parquet (proyección tardía), heartbeats y sin abortar el notebook.
# ============================================================================================================================

from __future__ import annotations
import os, time
from pathlib import Path
from datetime import datetime, timezone
from typing import List, Dict, Any, Tuple, Optional

try:
    import polars as pl
except Exception as e:
    raise RuntimeError("Se requiere 'polars'. Inst√°lalo e int√©ntalo de nuevo.") from e

# Identity of this cell and of the current run (RUN_ID is reused when an earlier cell defined it).
CELL_LABEL = "15-Index-Catalog"
RUN_ID = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))

# DATA_ROOT: reuse the notebook-level value when present. path_contract is only touched when
# no DATA_ROOT global exists; a dict.get() default would be evaluated eagerly and would raise
# NameError whenever path_contract is undefined, even if DATA_ROOT had already been set.
if "DATA_ROOT" in globals():
    DATA_ROOT = Path(globals()["DATA_ROOT"]).resolve()
else:
    DATA_ROOT = Path(str(path_contract.data_root())).resolve()

# Data layers, in the order of preference used for the day/symbol index.
HIST_DIR  = DATA_ROOT / "historical_data" / "m5_clean"
RAW_DIR   = DATA_ROOT / "bulk_data" / "m5_raw"
PROC_DIR  = DATA_ROOT / "processed_data" / "m5_windows"
META_DIR  = DATA_ROOT / "metadata"

# Artifacts written by this cell.
DAY_INDEX_PATH    = META_DIR / "day_index_m5.parquet"
SYMBOL_INDEX_PATH = META_DIR / "symbol_index_m5.parquet"
WINDOW_CATALOG    = META_DIR / "window_catalog_m5.parquet"

# Tunables; each one can be overridden by defining the same name as a global before running the cell.
TIMEZONE_IANA          = globals().get("TIMEZONE_IANA", "America/Guayaquil")
COMPUTE_DUPES          = bool(globals().get("COMPUTE_DUPES", False))
PROGRESS_EVERY_FILES   = int(globals().get("PROGRESS_EVERY_FILES", 500))
PROGRESS_EVERY_SECONDS = float(globals().get("PROGRESS_EVERY_SECONDS", 1.5))
PARQUET_COMP           = "zstd"
PARQUET_STATS          = True

def _now_local_iana() -> str:
    """Return the current instant as an ISO 8601 string with seconds precision.

    The instant is expressed in the ``TIMEZONE_IANA`` zone; when that zone
    cannot be loaded for any reason, the UTC representation is returned.
    """
    utc_now = datetime.now(timezone.utc)
    try:
        from zoneinfo import ZoneInfo
        localized = utc_now.astimezone(ZoneInfo(TIMEZONE_IANA))
    except Exception:
        localized = utc_now
    return localized.isoformat(timespec="seconds")

def _log(level: str, msg: str):
    """Print one flushed log line tagged with local time, RUN_ID, upper-cased level and cell label."""
    prefix = f"[{_now_local_iana()}] [{RUN_ID}] [{level.upper()}] [{CELL_LABEL}]"
    print(f"{prefix} {msg}", flush=True)

def _fmt_bytes(n: int) -> str:
    if n < 1024: return f"{n} B"
    kib = n/1024
    if kib < 1024: return f"{kib:.2f} KiB"
    mib = kib/1024
    if mib < 1024: return f"{mib:.2f} MiB"
    gib = mib/1024
    if gib < 1024: return f"{gib:.2f} GiB"
    tib = gib/1024
    return f"{tib:.2f} TiB"

def _schema_str(df: pl.DataFrame):
    """Return one bullet-prefixed 'name: dtype' line per column of ``df``, in column order."""
    lines = []
    for name, dtype in zip(df.columns, df.dtypes):
        lines.append(f"  ‚Ä¢ {name}: {str(dtype)}")
    return lines

def _safe_file_size(p: Path) -> int:
    try: return p.stat().st_size
    except Exception: return 0

def _count_parts_quick(base: Path, limit: int = 1) -> int:
    if not base.exists(): return 0
    c = 0
    for _ in base.rglob("part=*.parquet"):
        c += 1
        if c >= limit: break
    return c

def _collect_hist_or_raw_parts(base: Path) -> List[Path]:
    parts: List[Path] = []
    for sdir in sorted(p for p in base.glob("symbol=*") if p.is_dir()):
        parts.extend(sorted(sdir.rglob("part=*.parquet")))
    return parts

def _collect_window_unique_parts(proc_dir: Path) -> List[Path]:
    """Devuelve partes √∫nicas (symbol, ymd) priorizando last_180d > last_90d > last_30d."""
    win_order = ["last_180d", "last_90d", "last_30d"]
    chosen: Dict[Tuple[str,str], Path] = {}
    for w in win_order:
        base = proc_dir / f"window={w}"
        if not base.exists(): continue
        for sdir in sorted(p for p in base.glob("symbol=*") if p.is_dir()):
            sym = sdir.name.split("=",1)[-1]
            for pf in sorted(sdir.rglob("part=*.parquet")):
                ymd = pf.name.split("=")[-1].split(".")[0]
                key = (sym, ymd)
                if key not in chosen:
                    chosen[key] = pf
    return list(chosen.values())

# Execution banner: target time zone, local/UTC timestamps and the resolved data root.
print("="*110)
print(f"Inicio Celda 15 ‚Äî √çndices de lectura y cat√°logo (M5) | TZ local: {TIMEZONE_IANA}")
print(f"Hora local: {_now_local_iana()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
print("-"*110)
print(f"DATA_ROOT: {DATA_ROOT}")
META_DIR.mkdir(parents=True, exist_ok=True)

# ------------------------ Source layer selection for the day/symbol index ------------------------
# The first layer that contains at least one part file wins: clean > raw > windows.
source_kind = None
part_files: List[Path] = []

if _count_parts_quick(HIST_DIR, 1) > 0:
    source_kind = "clean"
    part_files = _collect_hist_or_raw_parts(HIST_DIR)
elif _count_parts_quick(RAW_DIR, 1) > 0:
    source_kind = "raw"
    part_files = _collect_hist_or_raw_parts(RAW_DIR)
elif _count_parts_quick(PROC_DIR, 1) > 0:
    source_kind = "windows"
    part_files = _collect_window_unique_parts(PROC_DIR)

if source_kind is None:
    print("‚ö†Ô∏è  No hay archivos en m5_clean, m5_raw ni m5_windows. Escribiendo √≠ndices vac√≠os.")
    # The empty indexes are written with explicit dtypes. Building them from bare empty
    # lists leaves the dtypes to inference, so an empty index would not share the schema
    # of a populated one (the day index is cast to Utf8/Int64 further down in this cell).
    pl.DataFrame(schema={
        "symbol": pl.Utf8, "date": pl.Utf8, "path": pl.Utf8,
        "bytes": pl.Int64, "n_rows": pl.Int64,
        "min_ts": pl.Int64, "max_ts": pl.Int64, "dups": pl.Int64,
    }).write_parquet(DAY_INDEX_PATH, compression=PARQUET_COMP, statistics=PARQUET_STATS)
    pl.DataFrame(schema={
        "symbol": pl.Utf8,
        # NOTE(review): "days" comes from pl.len() in the populated path; UInt32 is assumed
        # to be its dtype in a default polars build — confirm against the installed version.
        "days": pl.UInt32,
        "rows_total": pl.Int64, "bytes_total": pl.Int64,
        "first_day": pl.Int32, "last_day": pl.Int32,
        "min_ts": pl.Int64, "max_ts": pl.Int64, "dups_total": pl.Int64,
    }).write_parquet(SYMBOL_INDEX_PATH, compression=PARQUET_COMP, statistics=PARQUET_STATS)
else:
    where = {"clean": HIST_DIR, "raw": RAW_DIR, "windows": PROC_DIR}[source_kind]
    label = {"clean": "historical_data/m5_clean",
             "raw": "bulk_data/m5_raw",
             "windows": "processed_data/m5_windows (canonicalizado)"}[source_kind]
    print(f"Origen indexado (auto): {where}  [{label}]")
    print("-"*110)

# ----------------------------- Cat√°logo de ventanas (siempre) -----------------------------
def _window_catalog(proc_dir: Path) -> pl.DataFrame:
    """Summarize the last_30d / last_90d / last_180d window folders under ``proc_dir``.

    Returns one row per window with the columns ``window``, ``symbols``
    (number of 'symbol=*' directories), ``files`` (number of 'part=*.parquet'
    files, searched recursively), ``bytes`` (sum of their sizes) and
    ``from_ts`` / ``to_ts`` (min / max of the ``timestamp_utc`` column, or
    null when the window is missing, empty or unreadable).

    The frame is built with an explicit schema so ``from_ts`` / ``to_ts`` stay
    Int64 even when every window is empty. A failure while reading the
    timestamps of one window is logged as a warning and leaves its range
    null instead of aborting the whole cell.
    """
    schema = {
        "window": pl.Utf8, "symbols": pl.Int64, "files": pl.Int64,
        "bytes": pl.Int64, "from_ts": pl.Int64, "to_ts": pl.Int64,
    }
    rows = []
    for wname in ("last_30d", "last_90d", "last_180d"):
        base = proc_dir / f"window={wname}"
        if not base.exists():
            rows.append({"window": wname, "symbols": 0, "files": 0, "bytes": 0, "from_ts": None, "to_ts": None})
            continue
        files = sorted(base.rglob("part=*.parquet"))
        nfiles = len(files)
        bytes_sum = sum(_safe_file_size(p) for p in files)
        sym_count = len([p for p in base.glob("symbol=*") if p.is_dir()])
        ts_min = None
        ts_max = None
        if files:
            try:
                lf = pl.scan_parquet([str(p) for p in files], low_memory=True).select(pl.col("timestamp_utc"))
                ag = lf.select([pl.min("timestamp_utc").alias("min_ts"),
                                pl.max("timestamp_utc").alias("max_ts")]).collect()
                ts_min = None if ag["min_ts"][0] is None else int(ag["min_ts"][0])
                ts_max = None if ag["max_ts"][0] is None else int(ag["max_ts"][0])
            except Exception as e:
                # Same policy as the per-file loop of the day index: warn and continue.
                _log("WARNING", f"window_catalog: error leyendo rango temporal de {wname}: {e!r}")
                ts_min = None
                ts_max = None
        rows.append({"window": wname, "symbols": int(sym_count), "files": int(nfiles),
                     "bytes": int(bytes_sum), "from_ts": ts_min, "to_ts": ts_max})
    return pl.DataFrame(rows, schema=schema).select(["window", "symbols", "files", "bytes", "from_ts", "to_ts"])

# The window catalog is always written, whether or not a source layer was found.
win_df = _window_catalog(PROC_DIR)
win_df.write_parquet(WINDOW_CATALOG, compression=PARQUET_COMP, statistics=PARQUET_STATS)

# ----------------------------- Index computation when a source layer exists -----------------------------
if source_kind is None:
    print(f"Cat√°logo de ventanas ‚Üí {WINDOW_CATALOG} | filas={win_df.height}")
    print("‚úÖ √çndices vac√≠os escritos. Finaliza sin abortar.")
else:
    n_files = len(part_files)

    def _sym_from_parts(path: Path) -> Optional[str]:
        """Return the value of the first 'symbol=<value>' path component, or None if absent."""
        for part in path.parts:
            if part.startswith("symbol="):
                return part.split("=",1)[-1]
        return None

    # Set of symbols detected across the selected part files.
    symbols = sorted(set(s for s in (_sym_from_parts(p) for p in part_files) if s))
    print(f"S√≠mbolos detectados: {len(symbols)} | Archivos can√≥nicos: {n_files}")

    def _stats_from_part(path: Path) -> Tuple[int, Optional[int], Optional[int], Optional[int]]:
        """Return (n_rows, min_ts, max_ts, dups) for the ``timestamp_utc`` column of one file.

        ``dups`` is rows minus distinct timestamps when COMPUTE_DUPES is set and
        None otherwise (0 for an empty file with COMPUTE_DUPES set). All
        aggregates are computed in a single collect() per file.
        """
        aggs = [
            pl.len().alias("n_rows"),
            pl.min("timestamp_utc").alias("min_ts"),
            pl.max("timestamp_utc").alias("max_ts"),
        ]
        if COMPUTE_DUPES:
            aggs.append((pl.len() - pl.col("timestamp_utc").n_unique()).alias("dups"))
        ag = (
            pl.scan_parquet(str(path), low_memory=True)
              .select(pl.col("timestamp_utc"))
              .select(aggs)
              .collect()
        )
        n_rows = int(ag["n_rows"][0])
        if n_rows == 0:
            return 0, None, None, (0 if COMPUTE_DUPES else None)
        min_ts = ag["min_ts"][0]; max_ts = ag["max_ts"][0]
        dups   = ag["dups"][0] if COMPUTE_DUPES else None
        return n_rows, (None if min_ts is None else int(min_ts)), (None if max_ts is None else int(max_ts)), (None if dups is None else int(dups))

    # -------- day_index --------
    t0_day = time.monotonic()
    last_beat = t0_day
    day_rows: List[Dict[str, Any]] = []
    # Guard the modulo below against PROGRESS_EVERY_FILES <= 0.
    beat_every = max(1, PROGRESS_EVERY_FILES)

    for i, f in enumerate(part_files, 1):
        try:
            sym = _sym_from_parts(f) or ""
            ymd = f.name.split("=")[-1].split(".")[0]
            fsize = _safe_file_size(f)
            n_rows, min_ts, max_ts, dups = _stats_from_part(f)
            day_rows.append({
                "symbol": sym, "date": ymd, "path": str(f), "bytes": int(fsize),
                "n_rows": int(n_rows),
                "min_ts": None if min_ts is None else int(min_ts),
                "max_ts": None if max_ts is None else int(max_ts),
                "dups":   None if dups   is None else int(dups),
            })
        except Exception as e:
            # A file that cannot be read is skipped; the index is built from the rest.
            _log("WARNING", f"Error leyendo {f.name}: {e!r}")
        now = time.monotonic()
        # Heartbeat every N files, every PROGRESS_EVERY_SECONDS, and on the last file.
        if (i % beat_every == 0) or (now - last_beat >= PROGRESS_EVERY_SECONDS) or (i == n_files):
            rate = i / max(now - t0_day, 1e-6)
            _log("INFO", f"day_index: {i}/{n_files} archivos | {rate:.1f} files/s")
            last_beat = now

    # Explicit schema: the frame is well-formed even when every file failed (day_rows empty),
    # and column dtypes do not depend on inference over the first rows.
    day_schema = {
        "symbol": pl.Utf8, "date": pl.Utf8, "path": pl.Utf8,
        "bytes": pl.Int64, "n_rows": pl.Int64,
        "min_ts": pl.Int64, "max_ts": pl.Int64, "dups": pl.Int64,
    }
    if day_rows:
        day_df = pl.DataFrame(day_rows, schema=day_schema)
    else:
        day_df = pl.DataFrame(schema=day_schema)
    day_df = day_df.select(["symbol","date","path","bytes","n_rows","min_ts","max_ts","dups"])
    day_df.write_parquet(DAY_INDEX_PATH, compression=PARQUET_COMP, statistics=PARQUET_STATS)
    t_day = time.monotonic() - t0_day

    # -------- symbol_index --------
    t0_sym = time.monotonic()
    sym_agg = (
        # The first 8 characters of "date" are parsed as an integer day key.
        day_df.with_columns(pl.col("date").str.slice(0,8).cast(pl.Int32).alias("date_int"))
              .group_by("symbol")
              .agg([
                  pl.len().alias("days"),
                  pl.col("n_rows").sum().alias("rows_total"),
                  pl.col("bytes").sum().alias("bytes_total"),
                  pl.col("date_int").min().alias("first_day"),
                  pl.col("date_int").max().alias("last_day"),
                  pl.col("min_ts").min().alias("min_ts"),
                  pl.col("max_ts").max().alias("max_ts"),
                  # Cast the placeholder so dups_total is Int64 with or without COMPUTE_DUPES.
                  (pl.col("dups").sum().alias("dups_total") if COMPUTE_DUPES
                   else pl.lit(None).cast(pl.Int64).alias("dups_total")),
              ])
              .sort("symbol")
    )
    sym_agg.write_parquet(SYMBOL_INDEX_PATH, compression=PARQUET_COMP, statistics=PARQUET_STATS)
    t_sym = time.monotonic() - t0_sym

    # -------- final printout --------
    print("-"*110)
    print(f"√çndice diario ‚Üí {DAY_INDEX_PATH}")
    print(f"  - Filas: {day_df.height} | Columnas: {len(day_df.columns)}")
    print("  - Esquema:")
    for line in _schema_str(day_df): print(line)
    print(f"  - Tiempo construcci√≥n: {t_day:.2f} s")

    print("-"*110)
    print(f"√çndice por s√≠mbolo ‚Üí {SYMBOL_INDEX_PATH}")
    print(f"  - Filas (s√≠mbolos): {sym_agg.height} | Columnas: {len(sym_agg.columns)}")
    print("  - Esquema:")
    for line in _schema_str(sym_agg): print(line)
    print(f"  - Tiempo construcci√≥n: {t_sym:.2f} s")

# -------- Window catalog (printout) --------
# Prints the catalog location, its shape and schema, then one summary line per window.
print("-"*110)
print(f"Cat√°logo de ventanas ‚Üí {WINDOW_CATALOG}")
print(f"  - Filas (ventanas): {win_df.height} | Columnas: {len(win_df.columns)}")
for line in _schema_str(win_df): print(line)
for r in win_df.iter_rows(named=True):
    fr, to = r["from_ts"], r["to_ts"]
    if (fr is not None) and (to is not None):
        # from_ts / to_ts are divided by 1000 before conversion, i.e. they are treated as epoch milliseconds.
        utc_min = datetime.fromtimestamp(fr/1000, tz=timezone.utc).isoformat(timespec="seconds")
        utc_max = datetime.fromtimestamp(to/1000, tz=timezone.utc).isoformat(timespec="seconds")
        print(f"Ventana {r['window']}: symbols={r['symbols']} | files={r['files']} | bytes={_fmt_bytes(int(r['bytes']))} | rango UTC: {utc_min} ‚Üí {utc_max}")
    else:
        # No timestamp range available for this window.
        print(f"Ventana {r['window']}: symbols={r['symbols']} | files={r['files']} | bytes={_fmt_bytes(int(r['bytes']))} | rango: (sin filas)")

print("-"*110)
print("‚úÖ √çndices y cat√°logo listos (incluye fallback a ventanas cuando GOLD/RAW est√°n vac√≠os).")
# ============================================================================================================================
 

Inicio Celda 15 ‚Äî √çndices de lectura y cat√°logo (M5) | TZ local: America/Guayaquil
Hora local: 2025-12-03T05:29:45-05:00 | Hora UTC: 2025-12-03T10:29:45+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT: C:\Quant\MT5_Data_Extraction\data
Origen indexado (auto): C:\Quant\MT5_Data_Extraction\data\historical_data\m5_clean  [historical_data/m5_clean]
--------------------------------------------------------------------------------------------------------------
S√≠mbolos detectados: 92 | Archivos can√≥nicos: 103917
[2025-12-03T05:29:56-05:00] [20251202_232253] [INFO] [15-Index-Catalog] day_index: 1090/103917 archivos | 545.0 files/s
[2025-12-03T05:29:58-05:00] [20251202_232253] [INFO] [15-Index-Catalog] day_index: 2000/103917 archivos | 559.0 files/s
[2025-12-03T05:30:00-05:00] [20251202_232253] [INFO] [15-Index-Catalog] day_index: 3164/103917 archivos | 567.2 files/s
[2025-12-03T05:30:01-05:00] [20251202_232253

In [20]:
# ======================= Cell 16 — QA Trading-Ready (executive summary) =======================
# Purpose:
#   - Build an executive summary of whether the current M5 dataset is "TRADING_READY" or not.
#   - Merges information from:
#       * 3B filters / base universe (eligible_symbols_by_cost.*)
#       * GOLD layer (historical_data/m5_clean)
#       * recent windows (processed_data/m5_windows)
#       * per-symbol costs (costs_summary.parquet, Cell 05)
#       * per-symbol operational QA (qa_operativa_summary.parquet, Cell 10)
#       * universe_snapshot_{RUN_ID}.parquet (Cell 12B)
#   - Writes a single JSON artifact:
#       * metadata/qa_trading_ready_summary.json
#
# Design notes:
#   - Does NOT recompute QA or rebuild data; it only reads existing artifacts.
#   - Includes rich metrics for the next notebook (research / backtests).
#   - The "gate" (TRADING_READY vs NOT) is configurable:
#       * ENFORCE_TRADING_GATE = False → status="EMPTY" (informational, non-blocking).
#       * ENFORCE_TRADING_GATE = True  → status="PASS"/"FAIL" according to the defined criteria.
#   - By default, REQUIRED_WINDOW="last_30d" (research mode, not production).
# ===============================================================================================

from __future__ import annotations
import os, json, math, time
from pathlib import Path
from datetime import datetime, timedelta, timezone, date
from typing import Any, Dict, List, Optional, Tuple

try:
    import polars as pl
except Exception as e:
    raise RuntimeError("Se requiere 'polars'. Inst√°lalo e int√©ntalo de nuevo.") from e

# ----------------------------- Identity / base config -----------------------------
# RUN_ID and TIMEZONE_IANA are reused from the notebook globals when already defined.
CELL_LABEL    = "16-QA-Trading-Ready"
RUN_ID        = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
TIMEZONE_IANA = globals().get("TIMEZONE_IANA", "America/Guayaquil")

# DATA_ROOT: prefer the value defined in Cell 02; otherwise env M5_DATA_ROOT; otherwise ./data
DATA_ROOT = Path(
    globals().get("DATA_ROOT", os.environ.get("M5_DATA_ROOT", "./data"))
).resolve()

META_DIR = DATA_ROOT / "metadata"
HIST_DIR = DATA_ROOT / "historical_data" / "m5_clean"
PROC_DIR = DATA_ROOT / "processed_data"  / "m5_windows"

# Side effect: the metadata directory is created if it does not exist yet.
META_DIR.mkdir(parents=True, exist_ok=True)

# Input artifacts read by this cell and the JSON summary it writes.
COSTS_SUMMARY_PATH     = META_DIR / "costs_summary.parquet"                # Cell 05
QA_OPER_SUMMARY_PATH   = META_DIR / "qa_operativa_summary.parquet"        # Cell 10
UNIVERSE_SNAPSHOT_PATH = META_DIR / f"universe_snapshot_{RUN_ID}.parquet" # Cell 12B
UNIVERSE_LATEST_PATH   = META_DIR / "universe_snapshot_latest.parquet"    # fallback
QA_TRADING_JSON        = META_DIR / "qa_trading_ready_summary.json"

# Gate parameters (adjustable via globals() before running this cell)
# WINDOWS maps each window name to its length in days.
WINDOWS = {"last_30d": 30, "last_90d": 90, "last_180d": 180}
REQUIRED_WINDOW           = str(globals().get("REQUIRED_WINDOW", "last_30d"))
GATE_BASE_MIN_FLOOR       = int(globals().get("GATE_BASE_MIN_FLOOR", 50))
GATE_BASE_PCT             = float(globals().get("GATE_BASE_PCT", 0.70))    # 70% of the base universe
MIN_COSTS_OK_RATIO_PCT    = float(globals().get("MIN_COSTS_OK_RATIO_PCT", 70.0))
MIN_QA_OK_RATIO_PCT       = float(globals().get("MIN_QA_OK_RATIO_PCT", 70.0))
READY_COST_FLAGS          = list(globals().get("READY_COST_FLAGS", ["OK"]))
READY_QA_FLAGS            = list(globals().get("READY_QA_FLAGS", ["OK"]))
ENFORCE_TRADING_GATE      = bool(globals().get("ENFORCE_TRADING_GATE", False))

# ----------------------------- Utils ---------------------------------------------
def _now_local_iana() -> str:
    """Current time as ISO 8601 (seconds precision) in ``TIMEZONE_IANA``, or in UTC if the zone cannot be loaded."""
    moment = datetime.now(timezone.utc)
    try:
        from zoneinfo import ZoneInfo
        moment = moment.astimezone(ZoneInfo(TIMEZONE_IANA))
    except Exception:
        # Zone unavailable: keep the UTC timestamp.
        moment = datetime.now(timezone.utc)
    return moment.isoformat(timespec="seconds")

def _log(level: str, msg: str):
    """
    Emit one log message for this cell.

    When a global ``log_msg(cell, level, message)`` exists (Cell 03) it is used;
    if it is missing or raises, the message is printed with a local timestamp.
    """
    delegated = False
    if "log_msg" in globals():
        try:
            log_msg(CELL_LABEL, level, msg)
            delegated = True
        except Exception:
            # Fall through to the plain print below.
            delegated = False
    if not delegated:
        print(f"[{_now_local_iana()}] [{RUN_ID}] [{level.upper()}] [{CELL_LABEL}] {msg}", flush=True)

def _fmt_bytes(n: int) -> str:
    if n < 1024: return f"{n} B"
    kib = n/1024
    if kib < 1024: return f"{kib:.2f} KiB"
    mib = kib/1024
    if mib < 1024: return f"{mib:.2f} MiB"
    gib = mib/1024
    if gib < 1024: return f"{gib:.2f} GiB"
    tib = gib/1024
    return f"{tib:.2f} TiB"

def _hist_symbols() -> set[str]:
    """Symbols that have a 'symbol=<name>' directory directly under HIST_DIR (GOLD layer, m5_clean)."""
    if not HIST_DIR.exists():
        return set()
    return {
        entry.name.split("=", 1)[-1]
        for entry in HIST_DIR.glob("symbol=*")
        if entry.is_dir()
    }

def _expected_ymd(d: date) -> str:
    return f"{d.year:04d}{d.month:02d}{d.day:02d}"

def _expected_ymds(n_days: int) -> set[str]:
    """
    Conjunto de fechas esperadas (YYYYMMDD) para una ventana de n_days,
    terminando en el d√≠a de ayer (UTC).
    """
    end_day = (datetime.now(timezone.utc) - timedelta(days=1)).date()
    start_day = end_day - timedelta(days=n_days-1)
    out: List[str] = []
    cur = start_day
    while cur <= end_day:
        out.append(_expected_ymd(cur))
        cur += timedelta(days=1)
    return set(out)

def _window_days_for_symbol(wname: str, sym: str) -> set[str]:
    """
    Return the set of YYYYMMDD days available in m5_windows for the given
    window and symbol, taken from the 'part=*.parquet' file names found
    directly inside PROC_DIR/window=<wname>/symbol=<sym>.
    """
    symbol_dir = PROC_DIR / f"window={wname}" / f"symbol={sym}"
    days: set[str] = set()
    if symbol_dir.exists():
        for part_file in symbol_dir.glob("part=*.parquet"):
            days.add(part_file.name.split("=")[-1].split(".")[0])
    return days

def _read_eligible_set() -> tuple[Optional[set[str]], str]:
    """
    Read the 3B base universe (by cost) from metadata/filters/, if present.

    Lookup order:
      1) eligible_symbols_by_cost.parquet — uses the 'symbol' column, or the
         first column when 'symbol' is absent.
      2) eligible_symbols_by_cost.txt — one symbol per line, blank lines ignored.

    Returns:
        (symbols, label): the symbol set and a label naming the source, or
        (None, "eligible_missing") when neither file yields a result.

    A failure while reading the parquet file is logged as a warning (it was
    previously swallowed silently) and the text file is tried next.
    """
    filt_dir = META_DIR / "filters"
    p_parq = filt_dir / "eligible_symbols_by_cost.parquet"
    p_txt  = filt_dir / "eligible_symbols_by_cost.txt"
    try:
        if p_parq.exists():
            df = pl.read_parquet(p_parq)
            col = "symbol" if "symbol" in df.columns else df.columns[0]
            return set(df[col].cast(pl.Utf8).to_list()), "eligible_3B(parquet)"
    except Exception as e:
        # Best-effort: keep going with the .txt fallback, but leave a trace of the failure.
        _log("WARNING", f"eligible_symbols_by_cost.parquet ilegible: {e!r}")
    if p_txt.exists():
        s: set[str] = set()
        for ln in p_txt.read_text(encoding="utf-8").splitlines():
            v = ln.strip()
            if v:
                s.add(v)
        return s, "eligible_3B(txt)"
    return None, "eligible_missing"

# ----------------------------- Execution header -----------------------------
# Prints the resolved paths and the effective gate parameters for traceability.
print("="*110)
print("Celda 16 ‚Äî QA Trading-Ready (s√≠ntesis ejecutiva del Data Engine M5)")
print(f"TZ local: {TIMEZONE_IANA}")
print(f"Hora local: {_now_local_iana()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
print("-"*110)
print(f"DATA_ROOT : {DATA_ROOT}")
print(f"Metadata  : {META_DIR}")
print(f"GOLD (m5_clean)  : {HIST_DIR}")
print(f"WINDOWS (m5_win) : {PROC_DIR}")
print(f"Universe snapshot esperado: {UNIVERSE_SNAPSHOT_PATH}")
print(f"Resumen QA trading ‚Üí {QA_TRADING_JSON}")
print("-"*110)
print(f"REQUIRED_WINDOW        = '{REQUIRED_WINDOW}'")
print(f"GATE_BASE_MIN_FLOOR    = {GATE_BASE_MIN_FLOOR}")
print(f"GATE_BASE_PCT          = {GATE_BASE_PCT:.2f}")
print(f"MIN_COSTS_OK_RATIO_PCT = {MIN_COSTS_OK_RATIO_PCT:.1f}")
print(f"MIN_QA_OK_RATIO_PCT    = {MIN_QA_OK_RATIO_PCT:.1f}")
print(f"READY_COST_FLAGS       = {READY_COST_FLAGS}")
print(f"READY_QA_FLAGS         = {READY_QA_FLAGS}")
print(f"ENFORCE_TRADING_GATE   = {ENFORCE_TRADING_GATE}")
print("-"*110)

# Human-readable findings accumulated by every section below.
issues: List[str] = []

# ----------------------------- 1) GOLD summary -----------------------------------
# File count, total size and symbol count of the GOLD layer (all zero/empty when HIST_DIR is missing).
hist_syms  = _hist_symbols()
hist_parts = sorted(HIST_DIR.rglob("part=*.parquet")) if HIST_DIR.exists() else []
hist_files = len(hist_parts)
hist_bytes = sum(p.stat().st_size for p in hist_parts)

print(f"üì¶ GOLD (m5_clean): files={hist_files:,} | bytes={_fmt_bytes(hist_bytes)} | symbols={len(hist_syms)}")

# ----------------------------- 2) Base universe (3B) -----------------------------
# Without an eligible_symbols_by_cost.* file, the GOLD symbols are used as the base universe.
eligible_set, elig_label = _read_eligible_set()
if eligible_set is None:
    base_universe = set(hist_syms)
    base_label = "clean_fallback"
    issues.append("eligible_symbols_by_cost.* no encontrado; se usa fallback al universo GOLD.")
else:
    base_universe = set(eligible_set)
    base_label = elig_label

print(f"üîé Universo base (costes): {base_label} ‚Üí {len(base_universe)} s√≠mbolo(s)")

# Set algebra between the base universe and the symbols present in GOLD.
intersect_base_clean = base_universe & hist_syms
missing_in_clean     = base_universe - hist_syms
outside_but_clean    = hist_syms - base_universe

print(f"    ‚à© base ‚à© GOLD = {len(intersect_base_clean)} | missing_in_clean={len(missing_in_clean)} | outside_but_clean={len(outside_but_clean)}")

# ----------------------------- 3) Universe snapshot (Cell 12B) ------------------
# Loads the symbol set from the snapshot of this RUN_ID; the "latest" snapshot is only
# tried when the RUN_ID file does not exist (not when it exists but cannot be read).
universe_syms: Optional[set[str]] = None
universe_n = 0
universe_path_used = None

if UNIVERSE_SNAPSHOT_PATH.exists():
    try:
        df_univ = pl.read_parquet(UNIVERSE_SNAPSHOT_PATH)
        if "symbol" not in df_univ.columns:
            raise ValueError("universe_snapshot no tiene columna 'symbol'.")
        universe_syms = set(df_univ.get_column("symbol").cast(pl.Utf8, strict=False).to_list())
        universe_n = len(universe_syms)
        universe_path_used = str(UNIVERSE_SNAPSHOT_PATH)
        _log("INFO", f"Universe snapshot cargado ({universe_n} s√≠mbolos) desde {UNIVERSE_SNAPSHOT_PATH}")
    except Exception as e:
        # Unreadable snapshot: record the issue and continue without a universe set.
        issues.append(f"universe_snapshot ilegible: {e}")
        _log("WARNING", f"universe_snapshot ilegible: {e}")
elif UNIVERSE_LATEST_PATH.exists():
    try:
        df_univ = pl.read_parquet(UNIVERSE_LATEST_PATH)
        if "symbol" not in df_univ.columns:
            raise ValueError("universe_snapshot_latest no tiene columna 'symbol'.")
        universe_syms = set(df_univ.get_column("symbol").cast(pl.Utf8, strict=False).to_list())
        universe_n = len(universe_syms)
        universe_path_used = str(UNIVERSE_LATEST_PATH)
        _log("WARNING", f"No se encontr√≥ universe_snapshot para RUN_ID={RUN_ID}; usando universe_snapshot_latest.parquet.")
    except Exception as e:
        issues.append(f"universe_snapshot_latest ilegible: {e}")
        _log("WARNING", f"universe_snapshot_latest ilegible: {e}")
else:
    # Neither snapshot exists: record the issue and continue.
    issues.append("No se encontr√≥ universe_snapshot (ni por RUN_ID ni latest).")
    _log("WARNING", "Universe snapshot no encontrado; la auditor√≠a usar√° s√≥lo costes + GOLD.")

# ----------------------------- 4) Costs summary (Cell 05) ----------------------
# Reads costs_summary.parquet and derives:
#   - syms_costs:         symbols present in the file
#   - costs_flag_dist:    number of rows per cost_flag value (None when the column is absent)
#   - costs_ok_ratio_pct: percentage of rows flagged "OK" (None only when no flags are available)
costs_exists = COSTS_SUMMARY_PATH.exists()
syms_costs: set[str] = set()
costs_flag_dist: Optional[Dict[str, int]] = None
costs_ok_ratio_pct: Optional[float] = None

if costs_exists:
    try:
        df_costs = pl.read_parquet(COSTS_SUMMARY_PATH)
        if "symbol" not in df_costs.columns:
            raise ValueError("costs_summary.parquet no tiene columna 'symbol'.")
        syms_costs = set(df_costs.get_column("symbol").cast(pl.Utf8, strict=False).to_list())

        if "cost_flag" in df_costs.columns:
            flag_counts = (
                df_costs
                .group_by("cost_flag")
                .agg(pl.len().alias("n"))
                .to_dicts()
            )
            costs_flag_dist = {r["cost_flag"]: int(r["n"]) for r in flag_counts}
            total_flags = sum(costs_flag_dist.values())
            if total_flags > 0:
                # Flags exist: a missing "OK" bucket means 0% OK. Leaving the ratio as None here
                # would make the gate treat it as "not available" and pass it.
                costs_ok_ratio_pct = round(100.0 * costs_flag_dist.get("OK", 0) / total_flags, 1)
        _log("INFO", f"costs_summary.parquet cargado ({len(syms_costs)} s√≠mbolos).")
    except Exception as e:
        # Unreadable file: behave as if it did not exist and record the issue.
        costs_exists = False
        issues.append(f"costs_summary ilegible: {e}")
        _log("WARNING", f"costs_summary ilegible: {e}")
else:
    issues.append("costs_summary.parquet no encontrado (Celda 05 no ejecutada o en otra ruta).")
    _log("WARNING", "costs_summary.parquet no encontrado.")

# ----------------------------- 5) Operational QA summary (Cell 10) ----------------
# Reads qa_operativa_summary.parquet and derives:
#   - syms_qa:         symbols present in the file
#   - qa_flag_dist:    number of rows per qa_operativa_flag value (None when the column is absent)
#   - qa_ok_ratio_pct: percentage of rows flagged "OK" (None only when no flags are available)
qa_exists = QA_OPER_SUMMARY_PATH.exists()
syms_qa: set[str] = set()
qa_flag_dist: Optional[Dict[str, int]] = None
qa_ok_ratio_pct: Optional[float] = None

if qa_exists:
    try:
        df_qa = pl.read_parquet(QA_OPER_SUMMARY_PATH)
        if "symbol" not in df_qa.columns:
            raise ValueError("qa_operativa_summary.parquet no tiene columna 'symbol'.")
        syms_qa = set(df_qa.get_column("symbol").cast(pl.Utf8, strict=False).to_list())

        if "qa_operativa_flag" in df_qa.columns:
            flag_counts = (
                df_qa
                .group_by("qa_operativa_flag")
                .agg(pl.len().alias("n"))
                .to_dicts()
            )
            qa_flag_dist = {r["qa_operativa_flag"]: int(r["n"]) for r in flag_counts}
            total_flags = sum(qa_flag_dist.values())
            if total_flags > 0:
                # Flags exist: a missing "OK" bucket means 0% OK. Leaving the ratio as None here
                # would make the gate treat it as "not available" and pass it.
                qa_ok_ratio_pct = round(100.0 * qa_flag_dist.get("OK", 0) / total_flags, 1)
        _log("INFO", f"qa_operativa_summary.parquet cargado ({len(syms_qa)} s√≠mbolos).")
    except Exception as e:
        # Unreadable file: behave as if it did not exist and record the issue.
        qa_exists = False
        issues.append(f"qa_operativa_summary ilegible: {e}")
        _log("WARNING", f"qa_operativa_summary ilegible: {e}")
else:
    issues.append("qa_operativa_summary.parquet no encontrado (Celda 10 no ejecutada o en otra ruta).")
    _log("WARNING", "qa_operativa_summary.parquet no encontrado.")

# ----------------------------- 6) Window coverage (m5_windows) ------------
# Symbols considered for windows: intersection of the base universe and GOLD
symbols_for_windows = sorted(intersect_base_clean)
# Expected day set per window, and the symbols whose available days cover it completely.
expected_by_win: Dict[str, set[str]] = {w: _expected_ymds(n) for w, n in WINDOWS.items()}
passed_by_window: Dict[str, set[str]] = {w: set() for w in WINDOWS}

t0 = time.monotonic()
last = t0

for k, sym in enumerate(symbols_for_windows, 1):
    for wname, exp_set in expected_by_win.items():
        have = _window_days_for_symbol(wname, sym)
        # A symbol passes a window only when every expected calendar day has a part file.
        # NOTE(review): the expected set contains every calendar day, weekends included —
        # confirm that m5_windows holds a part file for non-trading days.
        if exp_set.issubset(have):
            passed_by_window[wname].add(sym)
    now = time.monotonic()
    # Heartbeat at most once every 2 seconds.
    if now - last >= 2.0:
        _log("INFO", f"Ventanas: s√≠mbolos evaluados {k}/{len(symbols_for_windows)}")
        last = now

print("-"*110)
for wname in ("last_30d", "last_90d", "last_180d"):
    if wname in passed_by_window:
        n_pass = len(passed_by_window[wname])
        base_n = len(symbols_for_windows)
        pct = (100.0 * n_pass / base_n) if base_n else 0.0
        print(f"ü™ü {wname}: pasan={n_pass} / {base_n} ({pct:.1f}%)")

# ----------------------------- 7) Building the lists of ready symbols -------
# Consolidated per-symbol base
symbols_all = sorted(set(symbols_for_windows))

# rows_detail: one dict per symbol with its flags and window coverage.
# ready_by_window: symbols considered "ready" for each window.
rows_detail: List[Dict[str, Any]] = []
ready_by_window: Dict[str, List[str]] = {w: [] for w in WINDOWS}

for sym in symbols_all:
    cost_flag = None
    qa_flag   = None

    # Flags are taken from the first matching row; any lookup error leaves the flag as None.
    if costs_exists:
        try:
            row = df_costs.filter(pl.col("symbol") == sym)
            if row.height > 0 and "cost_flag" in row.columns:
                cost_flag = row["cost_flag"][0]
        except Exception:
            cost_flag = None

    if qa_exists:
        try:
            row = df_qa.filter(pl.col("symbol") == sym)
            if row.height > 0 and "qa_operativa_flag" in row.columns:
                qa_flag = row["qa_operativa_flag"][0]
        except Exception:
            qa_flag = None

    has_full_30  = sym in passed_by_window.get("last_30d", set())
    has_full_90  = sym in passed_by_window.get("last_90d", set())
    has_full_180 = sym in passed_by_window.get("last_180d", set())

    # Best window = the longest one that passes, in order 180 > 90 > 30
    best_window = None
    if has_full_180:
        best_window = "last_180d"
    elif has_full_90:
        best_window = "last_90d"
    elif has_full_30:
        best_window = "last_30d"

    # "Ready" condition per window: the window passes AND the flags are acceptable
    # NOTE(review): a flag that is None (symbol absent from the summary, or summary missing)
    # does not exclude the symbol here — confirm this leniency is intended.
    for wname, has_full in [("last_30d", has_full_30),
                            ("last_90d", has_full_90),
                            ("last_180d", has_full_180)]:
        if not has_full:
            continue
        if (cost_flag is not None) and (READY_COST_FLAGS and cost_flag not in READY_COST_FLAGS):
            continue
        if (qa_flag is not None) and (READY_QA_FLAGS and qa_flag not in READY_QA_FLAGS):
            continue
        ready_by_window[wname].append(sym)

    rows_detail.append({
        "symbol": sym,
        "best_window": best_window,
        "cost_flag": cost_flag,
        "qa_operativa_flag": qa_flag,
        "has_full_last_30d": has_full_30,
        "has_full_last_90d": has_full_90,
        "has_full_last_180d": has_full_180,
    })

# Remove duplicates while preserving order
for wname, lst in ready_by_window.items():
    seen: set[str] = set()
    uniq: List[str] = []
    for s in lst:
        if s not in seen:
            seen.add(s)
            uniq.append(s)
    ready_by_window[wname] = uniq

print("-"*110)
for wname in ("last_30d", "last_90d", "last_180d"):
    lst = ready_by_window.get(wname, [])
    print(f"‚úÖ S√≠mbolos 'ready' para {wname} (ventana completa + flags costes/QA aceptables): {len(lst)}")

# ----------------------------- 8) Global gate evaluation -------------------------
# Dynamic symbol threshold: the larger of the fixed floor and a share of the base
# universe; an empty base universe yields a threshold of 0.
base_size = len(base_universe)
if base_size > 0:
    min_symbols_hist = max(GATE_BASE_MIN_FLOOR, math.ceil(GATE_BASE_PCT * base_size))
else:
    min_symbols_hist = 0

# Symbols that are "ready" for the required window.
approved_syms_required = set(ready_by_window.get(REQUIRED_WINDOW, []))
approved_count = len(approved_syms_required)

# A ratio that is None (not available) never fails its condition.
cond_symbols = (approved_count >= min_symbols_hist)
cond_costs   = True if costs_ok_ratio_pct is None else (costs_ok_ratio_pct >= MIN_COSTS_OK_RATIO_PCT)
cond_qa      = True if qa_ok_ratio_pct is None else (qa_ok_ratio_pct >= MIN_QA_OK_RATIO_PCT)

computed_status: str
if not symbols_for_windows:
    # Nothing to evaluate at all.
    computed_status = "EMPTY"
    issues.append("No hay s√≠mbolos en la intersecci√≥n base ‚à© GOLD para evaluar ventanas.")
elif cond_symbols and cond_costs and cond_qa:
    computed_status = "PASS"
else:
    computed_status = "FAIL"
    # Record one issue per failed condition.
    if not cond_symbols:
        issues.append(
            f"Gate de ventanas no cumplido: {approved_count} < MIN_SYMBOLS_HIST={min_symbols_hist} "
            f"para REQUIRED_WINDOW='{REQUIRED_WINDOW}'."
        )
    # A failed ratio condition implies its ratio is not None, so it can be formatted.
    if not cond_costs:
        issues.append(
            f"Ratio de costes OK insuficiente: {costs_ok_ratio_pct:.1f}% < MIN_COSTS_OK_RATIO_PCT={MIN_COSTS_OK_RATIO_PCT:.1f}%."
        )
    if not cond_qa:
        issues.append(
            f"Ratio de QA operativa OK insuficiente: {qa_ok_ratio_pct:.1f}% < MIN_QA_OK_RATIO_PCT={MIN_QA_OK_RATIO_PCT:.1f}%."
        )

# Effective status exposed to the rest of the pipeline.
if ENFORCE_TRADING_GATE:
    final_status = computed_status
else:
    # Informative mode: the Data Engine does not block, it only reports, so the
    # published status is always "EMPTY" regardless of computed_status.
    final_status = "EMPTY"

# round() instead of int(): a product such as 0.57 * 100 evaluates to
# 56.99999999999999 and int() would display the rule as 56%.
rule_pct = round(GATE_BASE_PCT * 100)

print("-"*110)
print(f"üìè Umbral din√°mico MIN_SYMBOLS_HIST = {min_symbols_hist} (base={base_size}, regla={rule_pct}%)")
print(f"üß∞ Ventana requerida  : {REQUIRED_WINDOW}")
print(f"üß∞ S√≠mbolos aprobados : {approved_count}")
print(f"üìä Ratio costes OK     : {costs_ok_ratio_pct}%")
print(f"üìä Ratio QA op. OK     : {qa_ok_ratio_pct}%")
print(f"üü¶ computed_status     : {computed_status}")
print(f"üü© final status (JSON) : {final_status}  (ENFORCE_TRADING_GATE={ENFORCE_TRADING_GATE})")

# ----------------------------- 9) Build the output JSON --------------------------
# Consolidated Celda 16 summary. It is serialized to QA_TRADING_JSON right below.
# NOTE(review): WINDOWS, READY_COST_FLAGS and READY_QA_FLAGS are embedded as-is, so they
# must be JSON-serializable (e.g. not a set) — confirm where they are defined.
qa_summary: Dict[str, Any] = {
    "run_id": RUN_ID,
    "ts_utc": datetime.now(timezone.utc).isoformat(),
    "data_root": str(DATA_ROOT),
    "status": final_status,  # what Celda 18 will see
    # Thresholds and switches that were in effect for this evaluation.
    "gate_config": {
        "required_window": REQUIRED_WINDOW,
        "windows": WINDOWS,
        "base_min_floor": GATE_BASE_MIN_FLOOR,
        "base_pct": GATE_BASE_PCT,
        "min_costs_ok_ratio_pct": MIN_COSTS_OK_RATIO_PCT,
        "min_qa_ok_ratio_pct": MIN_QA_OK_RATIO_PCT,
        "ready_cost_flags": READY_COST_FLAGS,
        "ready_qa_flags": READY_QA_FLAGS,
        "enforce_trading_gate": ENFORCE_TRADING_GATE,
    },
    # File/byte/symbol counts for the GOLD dataset.
    "gold": {
        "files": hist_files,
        "bytes": hist_bytes,
        "bytes_human": _fmt_bytes(hist_bytes),
        "symbols": len(hist_syms),
    },
    # Sizes of the base universe, the GOLD symbol set and their overlap/differences.
    "universe_summary": {
        "base_label": base_label,
        "eligible_3b_symbols": len(base_universe),
        "gold_symbols": len(hist_syms),
        "intersection_base_vs_gold": len(intersect_base_clean),
        "missing_in_clean": len(missing_in_clean),
        "outside_but_clean": len(outside_but_clean),
    },
    "universe_snapshot": {
        "path_used": universe_path_used,
        "n_symbols": universe_n,
    },
    "costs_summary": {
        "path": str(COSTS_SUMMARY_PATH),
        "exists": costs_exists,
        "n_symbols_costs": len(syms_costs),
        "cost_flag_distribution": costs_flag_dist,
        "cost_flag_ok_ratio_pct": costs_ok_ratio_pct,
    },
    "qa_operativa_summary": {
        "path": str(QA_OPER_SUMMARY_PATH),
        "exists": qa_exists,
        "n_symbols_qa_operativa": len(syms_qa),
        "qa_operativa_flag_distribution": qa_flag_dist,
        "qa_operativa_flag_ok_ratio_pct": qa_ok_ratio_pct,
    },
    # Per-window coverage: symbols passing vs. the evaluated intersection.
    "windows_coverage": {
        w: {
            "symbols_passing": len(passed_by_window.get(w, set())),
            "base_intersection": len(symbols_for_windows),
            # Percentage is reported as 0.0 when there are no symbols to evaluate,
            # which also avoids a division by zero.
            "pct_intersection_passing": (
                round(100.0 * len(passed_by_window.get(w, set())) / len(symbols_for_windows), 1)
                if symbols_for_windows else 0.0
            ),
        }
        for w in WINDOWS
    },
    # Both the computed status and the status actually published are recorded.
    "gate_result": {
        "computed_status": computed_status,
        "final_status": final_status,
        "required_window": REQUIRED_WINDOW,
        "approved_symbols_required_window": approved_count,
        "min_symbols_hist": min_symbols_hist,
    },
    "ready_symbols": ready_by_window,
    "ready_symbols_detail": rows_detail,
    "issues": issues,
}

# Persist the summary as UTF-8 JSON (non-ASCII characters kept verbatim, 2-space indent).
QA_TRADING_JSON.write_text(json.dumps(qa_summary, ensure_ascii=False, indent=2), encoding="utf-8")

# Closing report for this cell.
print("-"*110)
print(f"üìÑ qa_trading_ready_summary.json escrito en: {QA_TRADING_JSON}")
print(f"    status={final_status} | computed_status={computed_status} | issues={len(issues)}")
print("="*110)
print("‚úÖ Celda 16 ‚Äî QA Trading-Ready completada (resumen ejecutivo disponible para el siguiente notebook).")
  

Celda 16 — QA Trading-Ready (síntesis ejecutiva del Data Engine M5)
TZ local: America/Guayaquil
Hora local: 2025-12-03T05:32:30-05:00 | Hora UTC: 2025-12-03T10:32:30+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT : C:\Quant\MT5_Data_Extraction\data
Metadata  : C:\Quant\MT5_Data_Extraction\data\metadata
GOLD (m5_clean)  : C:\Quant\MT5_Data_Extraction\data\historical_data\m5_clean
WINDOWS (m5_win) : C:\Quant\MT5_Data_Extraction\data\processed_data\m5_windows
Universe snapshot esperado: C:\Quant\MT5_Data_Extraction\data\metadata\universe_snapshot_20251202_232253.parquet
Resumen QA trading → C:\Quant\MT5_Data_Extraction\data\metadata\qa_trading_ready_summary.json
--------------------------------------------------------------------------------------------------------------
REQUIRED_WINDOW        = 'last_30d'
GATE_BASE_MIN_FLOOR    = 50
GATE_BASE_PCT          = 0.70
MIN_COSTS_OK_RATIO_PCT = 70.0
MIN_QA_OK_RA

In [21]:
# ======================= Celda 18 ‚Äî Auditor√≠a integral de PADs (opcional, NO reemplazo) =======================
# Prop√≥sito:
#   - Auditor√≠a de salud del pipeline M5 leyendo artefactos existentes (Celdas 10/12/15/16/18).
#   - NO reemplaza pasos previos; NO reconstruye nada. S√≥lo verifica y resume estado.
#   - Sin hardcode de rutas: usa DATA_ROOT de globals() o env M5_DATA_ROOT (fallback ./data).
#   - Polars-only + scan_parquet con proyecci√≥n/predicado tard√≠o (homog√©neo).
# Salidas:
#   - metadata/pipeline_health_report.json  (reporte consolidado, legible por m√°quinas)
# Prints:
#   - Recuentos, rangos, tasas de verificaci√≥n, heartbeats suaves.
# Flags:
#   - Por defecto NO crea flags TRADING_READY/NOT_READY. Se puede habilitar con WRITE_FLAGS=True.
# ==============================================================================================================

from __future__ import annotations
import os, json, time, math, re
from pathlib import Path
from datetime import datetime, timedelta, timezone, date
from typing import Any, Dict, List, Optional, Tuple

# ----------------------------- Dependencia requerida -----------------------------
try:
    import polars as pl
except Exception as e:
    raise RuntimeError("Se requiere 'polars'. Inst√°lalo e int√©ntalo de nuevo.") from e

# ----------------------------- Basic configuration -------------------------------
CELL_LABEL = "18-Health-Audit"
# Reuse RUN_ID / TIMEZONE_IANA when already present in globals(); otherwise RUN_ID is a
# fresh UTC timestamp (YYYYMMDD_HHMMSS) and the zone defaults to America/Guayaquil.
RUN_ID = globals().get("RUN_ID", datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S"))
TIMEZONE_IANA = globals().get("TIMEZONE_IANA", "America/Guayaquil")

# DATA_ROOT resolution order: globals() -> env M5_DATA_ROOT -> ./data
DATA_ROOT = Path(
    globals().get("DATA_ROOT", os.environ.get("M5_DATA_ROOT", "./data"))
).resolve()

HIST_DIR    = DATA_ROOT / "historical_data" / "m5_clean"         # produced by Celda 10
PROC_DIR    = DATA_ROOT / "processed_data"  / "m5_windows"       # produced by Celda 12
META_DIR    = DATA_ROOT / "metadata"                             # indices/QA from Celdas 15/16
BACKUPS_DIR = DATA_ROOT / "backups"                              # from Celda 13 (backup)

DAY_INDEX_PATH    = META_DIR / "day_index_m5.parquet"            # Celda 15
SYMBOL_INDEX_PATH = META_DIR / "symbol_index_m5.parquet"         # Celda 15
WINDOW_CATALOG    = META_DIR / "window_catalog_m5.parquet"       # Celda 15
QA_SUMMARY_JSON   = META_DIR / "qa_trading_ready_summary.json"   # Celda 16
HEALTH_JSON       = META_DIR / "pipeline_health_report.json"     # output of this cell

# Newer artifacts (Celdas 05, 10, 12B)
COSTS_SUMMARY_PATH        = META_DIR / "costs_summary.parquet"                   # Celda 05
QA_OPER_SUMMARY_PATH      = META_DIR / "qa_operativa_summary.parquet"           # Celda 10
# The snapshot file name embeds RUN_ID, so it only matches a snapshot written under the same RUN_ID.
UNIVERSE_SNAPSHOT_PATH    = META_DIR / f"universe_snapshot_{RUN_ID}.parquet"    # Celda 12B

# Flags (opt-in): the TRADING_READY / TRADING_NOT_READY flag files are written only when
# WRITE_FLAGS is explicitly set to a truthy value in globals(). The default is False,
# matching the cell header ("by default does NOT create flags") and the message printed
# when flags are skipped.
WRITE_FLAGS = bool(globals().get("WRITE_FLAGS", False))
FLAG_READY  = META_DIR / "TRADING_READY.flag"
FLAG_NOT    = META_DIR / "TRADING_NOT_READY.flag"

# Windows and thresholds (each threshold can be overridden through globals())
# Window name -> number of days it spans.
WINDOWS = {"last_30d": 30, "last_90d": 90, "last_180d": 180}
REQUIRED_WINDOW = str(globals().get("REQUIRED_WINDOW", "last_180d"))
BASE_MIN_FLOOR  = int(globals().get("GATE_BASE_MIN_FLOOR", 50))
BASE_PCT        = float(globals().get("GATE_BASE_PCT", 0.70))   # 70% of the base universe
MIN_DAY_INDEX_ROWS  = int(globals().get("MIN_DAY_INDEX_ROWS", 50_000))
# Despite the MIN_ prefix this is used as the maximum accepted age (days) of the QA summary.
MIN_QA_MAX_AGE_DAYS = int(globals().get("MIN_QA_MAX_AGE_DAYS", 3))
MIN_BACKUP_SIZE_MB  = int(globals().get("MIN_BACKUP_SIZE_MB", 100))

# Heartbeats: minimum seconds between progress log lines in the window loop
HEARTBEAT_SECS = float(globals().get("HEALTH_HEARTBEAT_SECS", 2.0))

# ----------------------------- Utils --------------------------------------------
def _now_local_iana() -> str:
    """Return the current instant as an ISO-8601 string with seconds precision.

    The instant is rendered in the zone named by ``TIMEZONE_IANA``. If that zone
    cannot be resolved for any reason, the UTC rendering is returned instead.
    """
    try:
        from zoneinfo import ZoneInfo
        target_zone = ZoneInfo(TIMEZONE_IANA)
    except Exception:
        # Best-effort fallback: keep the timestamp usable even without tz data.
        target_zone = timezone.utc
    current = datetime.now(timezone.utc)
    return current.astimezone(target_zone).isoformat(timespec="seconds")

def _log(level: str, msg: str):
    """Emit one log line for this cell.

    Delegates to ``log_msg(CELL_LABEL, level, msg)`` when a ``log_msg`` name is
    present in the module globals. If it is absent, or the call raises, the line
    is printed instead with timestamp, RUN_ID, upper-cased level and cell label.
    """
    external_logger = globals().get("log_msg")
    if external_logger is not None:
        try:
            external_logger(CELL_LABEL, level, msg)
        except Exception:
            # The external logger failed: fall through to the print below.
            pass
        else:
            return
    stamp = _now_local_iana()
    print(f"[{stamp}] [{RUN_ID}] [{level.upper()}] [{CELL_LABEL}] {msg}", flush=True)

def _fmt_bytes(n: int) -> str:
    """Format a byte count using binary units (B, KiB, MiB, GiB, TiB).

    Values below 1024 are shown verbatim with a ``B`` suffix; larger values are
    scaled by 1024 per unit and shown with two decimals. TiB is the largest unit,
    so bigger values keep growing in TiB.
    """
    if n < 1024:
        return f"{n} B"
    scaled = n / 1024
    for unit in ("KiB", "MiB", "GiB"):
        if scaled < 1024:
            return f"{scaled:.2f} {unit}"
        scaled = scaled / 1024
    return f"{scaled:.2f} TiB"

def _safe_size(p: Path) -> int:
    """Return the size of ``p`` in bytes, or 0 when it cannot be stat'ed."""
    size = 0
    try:
        size = p.stat().st_size
    except Exception:
        # Best-effort: unreadable or missing files count as 0 bytes.
        pass
    return size

def _scan_len(paths: List[str]) -> int:
    """Return the total row count of the given parquet files (0 for an empty list).

    The count is obtained through a lazy polars scan rather than by loading the data.
    """
    if paths:
        lazy_frame = pl.scan_parquet(paths, low_memory=True)
        row_count = lazy_frame.select(pl.len()).collect().item()
        return int(row_count)
    return 0

def _ymd(d: date) -> str:
    """Render a date as a zero-padded ``YYYYMMDD`` string."""
    return "{:04d}{:02d}{:02d}".format(d.year, d.month, d.day)

def _expected_ymds(n_days: int) -> set[str]:
    """Return the ``YYYYMMDD`` keys of the ``n_days`` days ending yesterday (UTC).

    Today is excluded: the range ends on the previous UTC day. A non-positive
    ``n_days`` yields an empty set.
    """
    last_day = (datetime.now(timezone.utc) - timedelta(days=1)).date()
    first_day = last_day - timedelta(days=n_days - 1)
    # Number of calendar days from first_day to last_day inclusive (<= 0 when empty).
    span = (last_day - first_day).days + 1
    return {_ymd(first_day + timedelta(days=offset)) for offset in range(span)}

def _read_eligible_set() -> tuple[Optional[set[str]], str]:
    """Load the cost-eligible symbol universe from ``META_DIR / "filters"``.

    Sources are tried in order:
      1. ``eligible_symbols_by_cost.parquet`` — uses the ``symbol`` column, or the
         first column when ``symbol`` is absent. Null entries are dropped so they
         do not count as a symbol.
      2. ``eligible_symbols_by_cost.txt`` — one symbol per line, surrounding
         whitespace stripped, blank lines skipped.

    A parquet file that exists but cannot be read is reported through ``_log`` and
    the txt source is tried next, instead of failing silently.

    Returns:
        ``(symbols, label)`` where ``label`` names the source that was used, or
        ``(None, "eligible_missing")`` when neither source is usable.
    """
    filt_dir = META_DIR / "filters"
    p_parq = filt_dir / "eligible_symbols_by_cost.parquet"
    p_txt = filt_dir / "eligible_symbols_by_cost.txt"
    try:
        if p_parq.exists():
            df = pl.read_parquet(p_parq)
            col = "symbol" if "symbol" in df.columns else df.columns[0]
            values = df[col].cast(pl.Utf8).to_list()
            return {v for v in values if v is not None}, "eligible_3B(parquet)"
    except Exception as exc:
        # Best-effort: keep going with the txt list, but leave a trace of the failure
        # because it changes which universe the gate is evaluated against.
        _log("WARNING", f"No se pudo leer {p_parq}: {exc}; se intenta el listado .txt")
    if p_txt.exists():
        lines = p_txt.read_text(encoding="utf-8").splitlines()
        return {ln.strip() for ln in lines if ln.strip()}, "eligible_3B(txt)"
    return None, "eligible_missing"

def _hist_symbols() -> set[str]:
    """Return the symbols that have a ``symbol=<name>`` directory under ``HIST_DIR``.

    Only directories are considered; a missing ``HIST_DIR`` yields an empty set.
    """
    if HIST_DIR.exists():
        return {
            entry.name.split("=", 1)[-1]
            for entry in HIST_DIR.glob("symbol=*")
            if entry.is_dir()
        }
    return set()

def _window_days_for_symbol(wname: str, sym: str) -> set[str]:
    """Return the day keys present for ``sym`` in window ``wname``.

    Looks at ``PROC_DIR/window=<wname>/symbol=<sym>/part=*.parquet`` and, for each
    file, takes the text after the last ``=`` up to the first ``.``. A missing
    directory yields an empty set.
    """
    sym_dir = PROC_DIR / f"window={wname}" / f"symbol={sym}"
    found = set()
    if sym_dir.exists():
        for part_file in sym_dir.glob("part=*.parquet"):
            after_equals = part_file.name.split("=")[-1]
            found.add(after_equals.split(".")[0])
    return found

# ----------------------------- Header -------------------------------------------
# Banner with the time context and the artifact paths this audit expects to find.
print("="*110)
print("Celda 18 ‚Äî Auditor√≠a integral de PADs (opcional). NO reemplaza 10/12/15.")
print(f"TZ local: {TIMEZONE_IANA}")
print(f"Hora local: {_now_local_iana()} | Hora UTC: {datetime.now(timezone.utc).isoformat(timespec='seconds')}")
print("-"*110)
print(f"DATA_ROOT: {DATA_ROOT}")
print(f"Metadata:  {META_DIR}")
print("Artefactos clave esperados:")
print(f"  - GOLD (m5_clean)        : {HIST_DIR}")
print(f"  - WINDOWS (m5_windows)   : {PROC_DIR}")
print(f"  - √çndices (Celda 15)     : {DAY_INDEX_PATH}, {SYMBOL_INDEX_PATH}, {WINDOW_CATALOG}")
print(f"  - QA trading ready (16)  : {QA_SUMMARY_JSON}")
print(f"  - Costes summary (05)    : {COSTS_SUMMARY_PATH}")
print(f"  - QA operativa (10)      : {QA_OPER_SUMMARY_PATH}")
print(f"  - Universe snapshot (12B): {UNIVERSE_SNAPSHOT_PATH}")
print("-"*110)
# The metadata directory is created if missing, since the report is written into it.
META_DIR.mkdir(parents=True, exist_ok=True)

# Accumulates every problem found by the checks below; also feeds the flag decision.
issues: List[str] = []

# ----------------------------- 1) Overall GOLD state ----------------------------
# Recursively collect every part=*.parquet file under HIST_DIR (empty when it is missing).
hist_parts = sorted(HIST_DIR.rglob("part=*.parquet")) if HIST_DIR.exists() else []
hist_files = len(hist_parts)
# Files that cannot be stat'ed contribute 0 bytes (see _safe_size).
hist_bytes = sum(_safe_size(p) for p in hist_parts)
hist_syms  = _hist_symbols()

print(f"üì¶ GOLD (m5_clean): files={hist_files:,} | bytes={_fmt_bytes(hist_bytes)} | symbols={len(hist_syms)}")

# ----------------------------- 2) √çndices (Celda 15) ----------------------------
def _rows_or_none(p: Path) -> Optional[int]:
    """Return the row count of the parquet file ``p``.

    Returns ``None`` when the file does not exist or cannot be scanned.
    """
    if p.exists():
        try:
            lazy_frame = pl.scan_parquet(str(p), low_memory=True)
            return int(lazy_frame.select(pl.len()).collect().item())
        except Exception:
            # Unreadable file: reported to the caller as None.
            pass
    return None

# Row counts of the three index files; None means missing or unreadable.
rows_day   = _rows_or_none(DAY_INDEX_PATH)
rows_sym   = _rows_or_none(SYMBOL_INDEX_PATH)
rows_win   = _rows_or_none(WINDOW_CATALOG)

print(f"üß≠ day_index_m5.parquet       : filas={rows_day}")
print(f"üß≠ symbol_index_m5.parquet    : filas={rows_sym}")
print(f"üß≠ window_catalog_m5.parquet  : filas={rows_win}")

# The day index must reach MIN_DAY_INDEX_ROWS; the other two only need to be non-empty.
if rows_day is None or rows_day < MIN_DAY_INDEX_ROWS:
    issues.append(f"day_index_m5.parquet inv√°lido o con pocas filas ({rows_day}).")
if rows_sym is None or rows_sym == 0:
    issues.append("symbol_index_m5.parquet inexistente o vac√≠o.")
if rows_win is None or rows_win == 0:
    issues.append("window_catalog_m5.parquet inexistente o vac√≠o.")

# ----------------------------- 3) QA summary (Celda 16) -------------------------
# qa_status stays "MISSING" when the file is absent and becomes "UNREADABLE" when
# reading or parsing it fails.
qa_status = "MISSING"
qa_age_days = None
if QA_SUMMARY_JSON.exists():
    try:
        obj = json.loads(QA_SUMMARY_JSON.read_text(encoding="utf-8"))
        qa_status = str(obj.get("status", "UNKNOWN"))
        # Age is derived from the file's modification time (whole days), not from
        # any timestamp stored inside the JSON.
        mtime = QA_SUMMARY_JSON.stat().st_mtime
        qa_age_days = (datetime.now() - datetime.fromtimestamp(mtime)).days
    except Exception:
        qa_status = "UNREADABLE"

print(f"üß™ QA summary: status={qa_status} | age_days={qa_age_days} | path={QA_SUMMARY_JSON}")
# Only "PASS" and "EMPTY" are accepted; any other status (including MISSING) is an issue.
if qa_status not in ("PASS", "EMPTY"):
    issues.append("√öltimo QA no es PASS/EMPTY.")
if qa_age_days is not None and qa_age_days > MIN_QA_MAX_AGE_DAYS:
    issues.append(f"QA summary demasiado antiguo ({qa_age_days} d√≠as).")

# ----------------------------- 4) Most recent backup (Celda 13) -----------------
# ZIP files in BACKUPS_DIR ordered newest first by modification time; a file that
# disappears while sorting is given mtime 0 so it sorts last.
zips = sorted(
    BACKUPS_DIR.glob("*.zip"),
    key=lambda p: p.stat().st_mtime if p.exists() else 0,
    reverse=True
) if BACKUPS_DIR.exists() else []
if zips:
    latest_zip = zips[0]
    size_mb = latest_zip.stat().st_size / (1024 * 1024)
    print(f"üíæ Backup m√°s reciente: {latest_zip.name} | ~{size_mb:.2f} MiB")
    # A suspiciously small archive is recorded as an issue.
    if size_mb < MIN_BACKUP_SIZE_MB:
        issues.append(f"Backup m√°s reciente demasiado peque√±o ({size_mb:.2f} MiB).")
else:
    # latest_zip is still defined (as None) because the health report reads it later.
    latest_zip = None
    print("üíæ Backup m√°s reciente: (no encontrado)")
    issues.append("No hay ZIP de backup en backups/.")

# ----------------------------- 5) Optional gate by windows ----------------------
# Base universe: the cost-eligible set when available, otherwise every GOLD symbol.
eligible_set, elig_label = _read_eligible_set()
base_universe = eligible_set if eligible_set is not None else set(hist_syms)
base_label = elig_label if eligible_set is not None else "clean_fallback"

# Overlap and differences between the base universe and the GOLD symbols.
intersect = base_universe & hist_syms
missing_clean = base_universe - hist_syms
outside_base  = hist_syms - base_universe

print("-"*110)
print(f"üîé Universo base: {base_label} ‚Üí {len(base_universe)} s√≠mbolo(s)")
print(f"‚à© Intersecci√≥n (base ‚à© clean): {len(intersect)} | missing_in_clean={len(missing_clean)} | outside_but_clean={len(outside_base)}")

# An unknown window name is replaced by 'last_180d' after printing a warning.
if REQUIRED_WINDOW not in WINDOWS:
    print(f"‚ö†Ô∏è  REQUIRED_WINDOW='{REQUIRED_WINDOW}' inv√°lida. Usando 'last_180d'.")
    REQUIRED_WINDOW = "last_180d"

# Expected day keys per window, and the set of symbols that fully cover each window.
expected_by_win = {w: _expected_ymds(n) for w, n in WINDOWS.items()}
passed_by_window: Dict[str, set[str]] = {w: set() for w in WINDOWS}
t0 = time.monotonic()
last = t0  # time of the last heartbeat log line

for k, sym in enumerate(sorted(intersect), 1):
    for wname, exp_set in expected_by_win.items():
        have = _window_days_for_symbol(wname, sym)
        # A symbol passes a window only when every expected day is present on disk.
        if exp_set.issubset(have):
            passed_by_window[wname].add(sym)
    # Progress heartbeat, at most once every HEARTBEAT_SECS seconds.
    if (time.monotonic() - last) >= HEARTBEAT_SECS:
        _log("INFO", f"Ventanas: s√≠mbolos evaluados {k}/{len(intersect)}")
        last = time.monotonic()

# Per-window coverage report.
print("-"*110)
for wname in ("last_30d", "last_90d", "last_180d"):
    if wname in passed_by_window:
        print(f"ü™ü {wname}: pasan={len(passed_by_window[wname])} / {len(intersect)}")

# Dynamic threshold: the larger of the fixed floor and a share of the base universe.
# (len() is never negative, so no extra clamping is needed.)
min_symbols_hist = max(BASE_MIN_FLOOR, math.ceil(BASE_PCT * len(base_universe)))
approved_syms = passed_by_window.get(REQUIRED_WINDOW, set())
approved_count = len(approved_syms)

# round() instead of int(): a product such as 0.57 * 100 evaluates to
# 56.99999999999999 and int() would display the rule as 56%.
rule_pct = round(BASE_PCT * 100)

print(f"üìè Umbral din√°mico MIN_SYMBOLS_HIST={min_symbols_hist} (base={len(base_universe)}, regla={rule_pct}%)")
print(f"üß∞ Ventana requerida: {REQUIRED_WINDOW} ‚Üí aprobados={approved_count}/{len(intersect)}")

# ----------------------------- 6) Universe snapshot + costs + operational QA ----
print("-"*110)
print("üìä Verificando universe_snapshot, costs_summary y qa_operativa_summary...")

# Universe snapshot (Celda 12B)
# universe_syms stays None unless the snapshot is read successfully.
universe_syms: Optional[set[str]] = None
universe_exists = False
universe_n = 0

if UNIVERSE_SNAPSHOT_PATH.exists():
    try:
        df_univ = pl.read_parquet(UNIVERSE_SNAPSHOT_PATH)
        # The missing-column error is raised inside the try so it is handled below
        # like any other read failure.
        if "symbol" not in df_univ.columns:
            raise ValueError("universe_snapshot no tiene columna 'symbol'.")
        universe_syms = set(
            df_univ.get_column("symbol").cast(pl.Utf8, strict=False).to_list()
        )
        universe_n = len(universe_syms)
        universe_exists = True
        _log("INFO", f"Universe snapshot cargado ({universe_n} s√≠mbolos) desde {UNIVERSE_SNAPSHOT_PATH}")
    except Exception as e:
        _log("WARNING", f"No se pudo leer universe snapshot {UNIVERSE_SNAPSHOT_PATH}: {e}")
        issues.append("universe_snapshot ilegible o sin columna 'symbol'.")
else:
    # The path embeds RUN_ID, so a snapshot written under another RUN_ID is not found.
    _log("WARNING", f"Universe snapshot no encontrado para RUN_ID={RUN_ID}: {UNIVERSE_SNAPSHOT_PATH}")
    issues.append("universe_snapshot inexistente (ejecuta Celda 12B).")

# Costs summary (Celda 05)
# All metrics default to None and are only filled when the file is read successfully.
costs_exists = COSTS_SUMMARY_PATH.exists()
costs_n_syms = None
costs_n_universe = None
costs_pct_universe = None
costs_flag_dist: Optional[Dict[str, int]] = None
costs_ok_ratio = None

if costs_exists:
    try:
        df_costs = pl.read_parquet(COSTS_SUMMARY_PATH)
        if "symbol" not in df_costs.columns:
            raise ValueError("costs_summary.parquet no tiene columna 'symbol'.")
        syms_costs = set(
            df_costs.get_column("symbol").cast(pl.Utf8, strict=False).to_list()
        )
        costs_n_syms = len(syms_costs)
        _log("INFO", f"costs_summary.parquet cargado ({costs_n_syms} s√≠mbolos)")

        # Coverage of the universe snapshot, only when a non-empty snapshot was loaded.
        if universe_syms is not None and len(universe_syms) > 0:
            inter_costs_univ = syms_costs & universe_syms
            costs_n_universe = len(inter_costs_univ)
            costs_pct_universe = round(
                100.0 * costs_n_universe / len(universe_syms), 1
            )

        # Distribution of cost_flag values and the share of rows flagged "OK".
        if "cost_flag" in df_costs.columns:
            flag_counts = (
                df_costs
                .group_by("cost_flag")
                .agg(pl.len().alias("n"))
                .to_dicts()
            )
            costs_flag_dist = {
                r["cost_flag"]: int(r["n"]) for r in flag_counts
            }
            total_flags = sum(costs_flag_dist.values())
            # The ratio stays None when no row carries the "OK" flag.
            if total_flags > 0 and "OK" in costs_flag_dist:
                costs_ok_ratio = round(
                    100.0 * costs_flag_dist["OK"] / total_flags, 1
                )
    except Exception as e:
        _log("WARNING", f"No se pudo leer costs_summary.parquet: {e}")
        issues.append("costs_summary ilegible o sin columna 'symbol'.")
        # An unreadable file is reported as not existing in the health report.
        costs_exists = False
else:
    # A missing file only produces a warning; no issue is recorded for it.
    _log("WARNING", f"costs_summary.parquet no encontrado en {COSTS_SUMMARY_PATH}")

# Operational QA summary (Celda 10)
# All metrics default to None and are only filled when the file is read successfully.
qa_oper_exists = QA_OPER_SUMMARY_PATH.exists()
qa_n_syms = None
qa_n_universe = None
qa_pct_universe = None
qa_flag_dist: Optional[Dict[str, int]] = None
qa_ok_ratio = None

if qa_oper_exists:
    try:
        df_qa = pl.read_parquet(QA_OPER_SUMMARY_PATH)
        if "symbol" not in df_qa.columns:
            raise ValueError("qa_operativa_summary.parquet no tiene columna 'symbol'.")
        syms_qa = set(
            df_qa.get_column("symbol").cast(pl.Utf8, strict=False).to_list()
        )
        qa_n_syms = len(syms_qa)
        _log("INFO", f"qa_operativa_summary.parquet cargado ({qa_n_syms} s√≠mbolos)")

        # Coverage of the universe snapshot, only when a non-empty snapshot was loaded.
        if universe_syms is not None and len(universe_syms) > 0:
            inter_qa_univ = syms_qa & universe_syms
            qa_n_universe = len(inter_qa_univ)
            qa_pct_universe = round(
                100.0 * qa_n_universe / len(universe_syms), 1
            )

        # Distribution of qa_operativa_flag values and the share of rows flagged "OK".
        if "qa_operativa_flag" in df_qa.columns:
            flag_counts = (
                df_qa
                .group_by("qa_operativa_flag")
                .agg(pl.len().alias("n"))
                .to_dicts()
            )
            qa_flag_dist = {
                r["qa_operativa_flag"]: int(r["n"]) for r in flag_counts
            }
            total_flags_qa = sum(qa_flag_dist.values())
            # The ratio stays None when no row carries the "OK" flag.
            if total_flags_qa > 0 and "OK" in qa_flag_dist:
                qa_ok_ratio = round(
                    100.0 * qa_flag_dist["OK"] / total_flags_qa, 1
                )
    except Exception as e:
        _log("WARNING", f"No se pudo leer qa_operativa_summary.parquet: {e}")
        issues.append("qa_operativa_summary ilegible o sin columna 'symbol'.")
        # An unreadable file is reported as not existing in the health report.
        qa_oper_exists = False
else:
    # A missing file only produces a warning; no issue is recorded for it.
    _log("WARNING", f"qa_operativa_summary.parquet no encontrado en {QA_OPER_SUMMARY_PATH}")

# Console summary of the three artifacts checked above.
print("Resumen costes / QA operativa:")
print(f"  - Universe snapshot: exists={universe_exists} | n_symbols={universe_n}")
print(f"  - costs_summary.parquet: exists={costs_exists} | n_symbols={costs_n_syms}")
print(f"  - qa_operativa_summary.parquet: exists={qa_oper_exists} | n_symbols={qa_n_syms}")
# Universe coverage lines are printed only when a non-empty snapshot was loaded.
if universe_syms is not None and len(universe_syms) > 0:
    # The two adjacent f-strings are concatenated first and together form the
    # "true" branch of the conditional expression passed to print().
    print(
        f"  - Universo con costes: {costs_n_universe}/{len(universe_syms)} "
        f"({costs_pct_universe}%)"
        if costs_n_universe is not None else
        "  - Universo con costes: (no disponible)"
    )
    print(
        f"  - Universo con QA op.: {qa_n_universe}/{len(universe_syms)} "
        f"({qa_pct_universe}%)"
        if qa_n_universe is not None else
        "  - Universo con QA op.: (no disponible)"
    )

# ----------------------------- 7) Result & persistence --------------------------
# Consolidated health report; serialized to HEALTH_JSON right below.
health = {
    "run_id": RUN_ID,
    "ts_utc": datetime.now(timezone.utc).isoformat(),
    "data_root": str(DATA_ROOT),
    "gold": {
        "files": hist_files,
        "bytes": hist_bytes,
        "symbols": len(hist_syms),
    },
    # Row counts of the index files (None when missing or unreadable).
    "indices": {
        "day_rows": rows_day,
        "symbol_rows": rows_sym,
        "window_rows": rows_win,
    },
    "qa": {
        "status": qa_status,
        "age_days": qa_age_days,
        "path": str(QA_SUMMARY_JSON),
    },
    # Both fields are None when no backup ZIP was found.
    "backup": {
        "latest": (latest_zip.name if latest_zip else None),
        "size_mb": (
            round(latest_zip.stat().st_size / 1024 / 1024, 2)
            if latest_zip else None
        ),
    },
    "universe": {
        "label": base_label,
        "size": len(base_universe),
    },
    "intersection": {
        "size": len(intersect),
    },
    # Number of symbols fully covering each window.
    "windows": {
        w: len(passed_by_window.get(w, set())) for w in WINDOWS
    },
    "required_window": {
        "name": REQUIRED_WINDOW,
        "approved": approved_count,
        "min_symbols_hist": min_symbols_hist,
    },
    "universe_snapshot": {
        "path": str(UNIVERSE_SNAPSHOT_PATH),
        "exists": universe_exists,
        "n_symbols": universe_n,
    },
    # "has_costs_summary" and "exists" carry the same value.
    "costs": {
        "path": str(COSTS_SUMMARY_PATH),
        "has_costs_summary": costs_exists,
        "exists": costs_exists,
        "n_symbols_costs": costs_n_syms,
        "n_symbols_universe_with_costs": costs_n_universe,
        "pct_universe_with_costs": costs_pct_universe,
        "cost_flag_distribution": costs_flag_dist,
        "cost_flag_ok_ratio_pct": costs_ok_ratio,
    },
    # "has_qa_operativa_summary" and "exists" carry the same value.
    "qa_operativa": {
        "path": str(QA_OPER_SUMMARY_PATH),
        "has_qa_operativa_summary": qa_oper_exists,
        "exists": qa_oper_exists,
        "n_symbols_qa_operativa": qa_n_syms,
        "n_symbols_universe_with_qa_operativa": qa_n_universe,
        "pct_universe_with_qa_operativa": qa_pct_universe,
        "qa_operativa_flag_distribution": qa_flag_dist,
        "qa_operativa_flag_ok_ratio_pct": qa_ok_ratio,
    },
    "issues": issues,
}

# Persist the report as UTF-8 JSON (non-ASCII characters kept verbatim, 2-space indent).
HEALTH_JSON.write_text(json.dumps(health, ensure_ascii=False, indent=2), encoding="utf-8")

print("-"*110)
print(f"üìÑ Reporte de salud escrito en: {HEALTH_JSON}")

# Optional flag files: written only when WRITE_FLAGS is truthy.
if WRITE_FLAGS:
    # The gate passes only when enough symbols cover the required window AND no
    # issue at all was recorded by the checks above.
    gate_ok = (approved_count >= min_symbols_hist) and (len(issues) == 0)
    if gate_ok:
        FLAG_READY.write_text(f"[{RUN_ID}] TRADING_READY\n", encoding="utf-8")
        # Remove the opposite flag so both files never coexist (best-effort).
        if FLAG_NOT.exists():
            try:
                FLAG_NOT.unlink()
            except Exception:
                pass
        print("‚úÖ Gate OK ‚Üí TRADING_READY.flag escrito (WRITE_FLAGS=True).")
    else:
        # The NOT_READY flag lists the recorded issues, one per line.
        FLAG_NOT.write_text(
            f"[{RUN_ID}] TRADING_NOT_READY\n" + "\n".join(issues),
            encoding="utf-8"
        )
        # Remove the opposite flag so both files never coexist (best-effort).
        if FLAG_READY.exists():
            try:
                FLAG_READY.unlink()
            except Exception:
                pass
        print("‚ùå Gate NO-OK ‚Üí TRADING_NOT_READY.flag escrito (WRITE_FLAGS=True).")
else:
    print("‚ÑπÔ∏è  WRITE_FLAGS=False (no se escriben flags). Usa WRITE_FLAGS=True para emitir banderas.")

# Closing banner and final log line.
print("="*110)
print("‚úÖ Auditor√≠a integral completada (lectura-only, sin reemplazar pasos del pipeline).")
_log("INFO", "Auditor√≠a integral completada y pipeline_health_report.json actualizado")
# ==============================================================================================================
 

Celda 18 — Auditoría integral de PADs (opcional). NO reemplaza 10/12/15.
TZ local: America/Guayaquil
Hora local: 2025-12-03T05:32:33-05:00 | Hora UTC: 2025-12-03T10:32:33+00:00
--------------------------------------------------------------------------------------------------------------
DATA_ROOT: C:\Quant\MT5_Data_Extraction\data
Metadata:  C:\Quant\MT5_Data_Extraction\data\metadata
Artefactos clave esperados:
  - GOLD (m5_clean)        : C:\Quant\MT5_Data_Extraction\data\historical_data\m5_clean
  - WINDOWS (m5_windows)   : C:\Quant\MT5_Data_Extraction\data\processed_data\m5_windows
  - √çndices (Celda 15)     : C:\Quant\MT5_Data_Extraction\data\metadata\day_index_m5.parquet, C:\Quant\MT5_Data_Extraction\data\metadata\symbol_index_m5.parquet, C:\Quant\MT5_Data_Extraction\data\metadata\window_catalog_m5.parquet
  - QA trading ready (16)  : C:\Quant\MT5_Data_Extraction\data\metadata\qa_trading_ready_summary.json
  - Costes summary (05)    : C:\Quant\MT5_Data_Extraction\data\metadata

Muestreo ligero de archivos PARQUET
DATA_ROOT : C:\Quant\MT5_Data_Extraction\data
LOG_PATH  : C:\Quant\MT5_Data_Extraction\data\logs\parquet_sample_20251203_231541.txt
Total de archivos .parquet encontrados: 1326898
----------------------------------------------------------------------------------------------------
Resumen por carpeta raíz:
  - bulk_data           : 121457 parquets
  - historical_data     : 103917 parquets
  - metadata            :     22 parquets
  - processed_data      :  28054 parquets
  - restore             : 1073448 parquets
----------------------------------------------------------------------------------------------------
Archivos 'interesantes' por nombre (cost/param/fee/...): 40
  → costs_summary.parquet encontrado : True
  → ea_params.parquet encontrado     : False
  → ea_params.csv encontrado         : False
Muestras seleccionadas para el log: 40 archivos
[1/40] metadata\costs_summary.parquet
   columnas: ['symbol', 'asset_class', 'spread_bps', 'comm

In [31]:
# ========== Celda ‚Äî Construir ea_params.parquet con comisi√≥n rellena (valores t√≠picos) ==========
from pathlib import Path
import polars as pl

# -----------------------------------------------------------------------------------
# 0) Default commission parameters (expressed in bps of notional).
#    Tune these numbers to something reasonable for your broker.
# -----------------------------------------------------------------------------------
# Global fallback commission, used when nothing better is available.
GLOBAL_DEFAULT_COMMISSION_BPS = 0.5

# Default commission per asset_class label.
ASSET_CLASS_DEFAULTS = dict(
    FX_MAJOR=0.2,
    FX_MINOR=0.3,
    FX_EXOTIC=0.5,
    INDEX=0.2,
    STOCK_US=0.5,
    STOCK_EU=0.5,
    STOCK=0.5,
    CRYPTO=1.0,
    COMMODITY=0.4,
)

# 1) Paths
# Reuse DATA_ROOT when it is already present in the notebook globals. The
# path_contract fallback is resolved lazily, so it is only evaluated (and can
# only fail) when DATA_ROOT is missing or None.
# NOTE(review): path_contract is not defined in this cell; presumably an
# earlier cell provides it -- confirm.
_data_root = globals().get("DATA_ROOT")
if _data_root is None:
    _data_root = str(path_contract.data_root())
DATA_ROOT   = Path(_data_root).resolve()
META_DIR    = DATA_ROOT / "metadata"
FILTERS_DIR = META_DIR / "filters"

COSTS_PATH    = META_DIR / "costs_summary.parquet"
ELIGIBLE_PATH = FILTERS_DIR / "eligible_symbols_by_cost.parquet"
EA_PARAMS_OUT = DATA_ROOT / "ea_params.parquet"

# Banner with the effective configuration, for traceability in the cell output.
print("=" * 90)
print("Construyendo ea_params.parquet a partir de costs_summary (rellenando comisi√≥n)...")
print(f"DATA_ROOT        : {DATA_ROOT}")
print(f"COSTS_PATH       : {COSTS_PATH}")
print(f"ELIGIBLE_PATH    : {ELIGIBLE_PATH}")
print(f"EA_PARAMS_OUT    : {EA_PARAMS_OUT}")
print(f"GLOBAL_DEFAULT_COMMISSION_BPS = {GLOBAL_DEFAULT_COMMISSION_BPS}")
print("=" * 90)

# 2) Load the main cost table; fail early when the file has not been produced yet.
if not COSTS_PATH.exists():
    raise FileNotFoundError(f"No existe {COSTS_PATH}. Ejecuta antes la celda de costes (Celda 05).")

costs = pl.read_parquet(str(COSTS_PATH))
# Normalise column names to lower case so the checks below are case-insensitive.
costs = costs.rename(dict(zip(costs.columns, map(str.lower, costs.columns))))

required_cols = {"symbol", "asset_class", "spread_bps", "commission_bps", "cost_bps"}
missing = required_cols - set(costs.columns)
if missing:
    raise RuntimeError(f"costs_summary.parquet NO tiene columnas requeridas: {missing}")

print(f"costs_summary: {costs.height} filas, columnas = {costs.columns}")

# 3) (Optional) load the list of cost-eligible symbols.
#    It is only loaded here (hook for future use); nothing below reads it yet.
elig = None
if ELIGIBLE_PATH.exists():
    elig = pl.read_parquet(str(ELIGIBLE_PATH))
    elig = elig.rename(dict(zip(elig.columns, map(str.lower, elig.columns))))

# Restrict to eligible symbols when costs_summary carries an 'eligible' column.
if "eligible" not in costs.columns:
    costs_valid = costs
    print("  -> columna 'eligible' no encontrada, se usan todos los s√≠mbolos.")
else:
    costs_valid = costs.filter(pl.col("eligible").eq(True))
    print(f"  -> usando solo s√≠mbolos elegibles de costs_summary: {costs_valid.height} filas.")

# 4) Typical commission per asset_class, computed only from commission_bps > 0.
print("-" * 90)
print("Calculando comisi√≥n t√≠pica por asset_class (solo commission_bps > 0)...")

non_zero = costs_valid.filter(pl.col("commission_bps").gt(0))

if non_zero.is_empty():
    print("‚ö† No hay ninguna commission_bps > 0. No se puede calcular mediana por asset_class.")
    # No positive commissions at all: build one row per asset_class with a null
    # Float64 value, so the later join still finds the expected column.
    typical_by_class = (
        costs_valid
        .select("asset_class")
        .unique()
        .with_columns(typical_commission_bps=pl.lit(None).cast(pl.Float64))
    )
else:
    # Median of the positive commissions within each asset_class.
    typical_by_class = (
        non_zero
        .group_by("asset_class")
        .agg(typical_commission_bps=pl.col("commission_bps").median())
    )

print("Tabla de comisiones t√≠picas por asset_class:")
print(typical_by_class)

# 5) Fill non-positive commission_bps with the typical value per asset_class,
#    falling back to the per-class default (or the global default).
#
# Precedence for commission_eff_bps:
#   1. the row's own commission_bps when it is > 0
#   2. typical_commission_bps (from typical_by_class) when it is not null
#   3. asset_class_default_commission (ASSET_CLASS_DEFAULTS, else GLOBAL_DEFAULT_COMMISSION_BPS)
costs_filled = (
    costs_valid
    .join(typical_by_class, on="asset_class", how="left")
    .with_columns(
        # Map asset_class -> default commission. `Expr.replace(..., default=...)`
        # is deprecated since polars 1.0.0; `replace_strict` is its replacement
        # and the return dtype is pinned so the column is always Float64.
        pl.col("asset_class")
        .replace_strict(
            ASSET_CLASS_DEFAULTS,
            default=GLOBAL_DEFAULT_COMMISSION_BPS,
            return_dtype=pl.Float64,
        )
        .alias("asset_class_default_commission")
    )
    .with_columns(
        pl.when(pl.col("commission_bps") > 0)
          .then(pl.col("commission_bps"))
          .when(pl.col("typical_commission_bps").is_not_null())
          .then(pl.col("typical_commission_bps"))
          .otherwise(pl.col("asset_class_default_commission"))
          .alias("commission_eff_bps")
    )
)

# Small before/after summary (diagnostic only).
n_zero_before = costs_valid.filter(pl.col("commission_bps") == 0).height
n_zero_after  = costs_filled.filter(pl.col("commission_eff_bps") == 0).height

print("-" * 90)
print(f"Filas con commission_bps == 0 ANTES del relleno : {n_zero_before}")
print(f"Filas con commission_eff_bps == 0 DESPU√âS      : {n_zero_after}")
print("Ejemplo de commission_eff_bps por s√≠mbolo:")
print(
    costs_filled
    .select(["symbol", "asset_class", "commission_bps",
             "typical_commission_bps", "asset_class_default_commission",
             "commission_eff_bps"])
    .head(10)
)

# 6) Build the parameter table (the original note names ER_FILTER_5M as its consumer).
#    Output columns: symbol, commission (= commission_eff_bps), spread_rule
#    (constant tag) and spread_est (= spread_bps).
params_df = (
    costs_filled
    .select(
        pl.col("symbol").cast(pl.Utf8),
        pl.col("commission_eff_bps").cast(pl.Float64).alias("commission"),
        pl.lit("from_costs_summary_bps_filled").alias("spread_rule"),
        pl.col("spread_bps").cast(pl.Float64).alias("spread_est"),
    )
    # The default keep="any" / maintain_order=False gives no guarantee about
    # which duplicate survives or about row order; pin both so the written
    # file is reproducible across runs.
    .unique(subset=["symbol"], keep="first", maintain_order=True)
)

print("-" * 90)
print(f"ea_params a escribir: {params_df.height} s√≠mbolos √∫nicos.")
print(params_df.head(5))

# 7) Write the table to EA_PARAMS_OUT (data/ea_params.parquet).
params_df.write_parquet(str(EA_PARAMS_OUT))
print("=" * 90)
print(f"‚úÖ ea_params.parquet generado en: {EA_PARAMS_OUT}")
print("   Columnas: ", params_df.columns)
print("=" * 90)


Construyendo ea_params.parquet a partir de costs_summary (rellenando comisión)...
DATA_ROOT        : C:\Quant\MT5_Data_Extraction\data
COSTS_PATH       : C:\Quant\MT5_Data_Extraction\data\metadata\costs_summary.parquet
ELIGIBLE_PATH    : C:\Quant\MT5_Data_Extraction\data\metadata\filters\eligible_symbols_by_cost.parquet
EA_PARAMS_OUT    : C:\Quant\MT5_Data_Extraction\data\ea_params.parquet
GLOBAL_DEFAULT_COMMISSION_BPS = 0.5
costs_summary: 131 filas, columnas = ['symbol', 'asset_class', 'spread_bps', 'commission_bps', 'slippage_bps', 'cost_bps', 'threshold_bps', 'eligible', 'reason', 'cost_flag']
  -> usando solo s√≠mbolos elegibles de costs_summary: 107 filas.
------------------------------------------------------------------------------------------
Calculando comisión típica por asset_class (solo commission_bps > 0)...
⚠ No hay ninguna commission_bps > 0. No se puede calcular mediana por asset_class.
Tabla de comisiones t√≠picas por asset_class:
shape: (5, 2)
‚îå‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚

(Deprecated in version 1.0.0)
  .replace(ASSET_CLASS_DEFAULTS, default=GLOBAL_DEFAULT_COMMISSION_BPS)


In [32]:
# ===================== Celda — Muestreo ligero de PARQUET + Conclusiones =====================
# - Resume cu√°ntos parquets hay por carpeta de primer nivel.
# - Busca archivos "interesantes" por nombre: cost, fee, param, summary, ea_, watchlist.
# - Toma SOLO unas pocas muestras y genera un log peque√±o con esquemas y 3 filas de ejemplo.
# =============================================================================================

from __future__ import annotations

from pathlib import Path
from datetime import datetime
import polars as pl

# ---------------------------- Configuration ----------------------------------

# Reuse DATA_ROOT when it is already present in the notebook globals. The
# path_contract fallback is resolved lazily, so it is only evaluated (and can
# only fail) when DATA_ROOT is missing or None.
# NOTE(review): path_contract is not defined in this cell; presumably an
# earlier cell provides it -- confirm.
_data_root = globals().get("DATA_ROOT")
if _data_root is None:
    _data_root = str(path_contract.data_root())
DATA_ROOT = Path(_data_root).resolve()

MAX_ROWS_PER_FILE       = 3   # example rows read per file
MAX_SAMPLES_PER_FOLDER  = 2   # max parquet files sampled per top-level folder
MAX_RANDOM_SAMPLES      = 10  # max extra (non-"interesting") parquet files

# Substrings that mark a file name as "interesting" (matched on the lower-cased name).
INTERESTING_KEYWORDS = ["cost", "fee", "param", "summary", "ea_", "watchlist"]

# The log goes to <DATA_ROOT>/logs with a local-time timestamp in its file name.
LOGS_DIR = DATA_ROOT / "logs"
LOGS_DIR.mkdir(parents=True, exist_ok=True)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
LOG_PATH = LOGS_DIR / f"parquet_sample_{timestamp}.txt"

print("=" * 100)
print("Muestreo ligero de archivos PARQUET")
print(f"DATA_ROOT : {DATA_ROOT}")
print(f"LOG_PATH  : {LOG_PATH}")
print("=" * 100)

# ---------------------------- Locate parquet files ------------------------------

# Recursive scan of DATA_ROOT, materialised so it can be counted and reused.
all_parquets = [*DATA_ROOT.rglob("*.parquet")]
total_files  = len(all_parquets)

print(f"Total de archivos .parquet encontrados: {total_files}")
if not total_files:
    print("‚ö†Ô∏è  No hay parquets bajo DATA_ROOT, paro aqu√≠.")
    raise SystemExit

# Group by first-level folder (bulk_data, metadata, etc.).
from collections import defaultdict
by_root = defaultdict(list)

for p in all_parquets:
    parts = p.relative_to(DATA_ROOT).parts
    # A file sitting directly in DATA_ROOT has a single component -> bucket ".".
    root = "." if len(parts) <= 1 else parts[0]
    by_root[root].append(p)

print("-" * 100)
print("Resumen por carpeta ra√≠z:")
for root in sorted(by_root):
    files = by_root[root]
    print(f"  - {root:<20s}: {len(files):6d} parquets")

print("-" * 100)

# ---------------------------- Find "interesting" files ------------------

# Keep every parquet whose lower-cased file name contains one of the keywords.
interesting = [
    p for p in all_parquets
    if any(k in p.name.lower() for k in INTERESTING_KEYWORDS)
]

# Drop duplicates while preserving order.
interesting_unique = list(dict.fromkeys(interesting))

print(f"Archivos 'interesantes' por nombre (cost/param/fee/...): {len(interesting_unique)}")

# Flag whether the key files are present.
found_costs_summary = any(p.name.lower() == "costs_summary.parquet" for p in interesting_unique)
found_ea_params_parquet = any(p.name.lower() == "ea_params.parquet" for p in interesting_unique)
# all_parquets only holds *.parquet paths, so a .csv could never be found in it
# (the flag was always False). Scan for CSV files directly instead; this walks
# DATA_ROOT once more but stops at the first match.
found_ea_params_csv     = any(
    p.name.lower() == "ea_params.csv" for p in DATA_ROOT.rglob("*.csv")
)

print(f"  ‚Üí costs_summary.parquet encontrado : {found_costs_summary}")
print(f"  ‚Üí ea_params.parquet encontrado     : {found_ea_params_parquet}")
print(f"  ‚Üí ea_params.csv encontrado         : {found_ea_params_csv}")

# ---------------------------- Select samples ----------------------------

import heapq

# 1) First, the interesting files (capped at MAX_INTERESTING).
MAX_INTERESTING = 30
samples = list(interesting_unique[:MAX_INTERESTING])
chosen = set(samples)  # O(1) membership checks instead of scanning the list

# Hard cap on the total number of samples.
max_samples = MAX_INTERESTING + MAX_RANDOM_SAMPLES

# 2) Then up to MAX_SAMPLES_PER_FOLDER per top-level folder, in folder-name order.
for root, files in sorted(by_root.items(), key=lambda kv: kv[0]):
    if len(samples) >= max_samples:
        break
    # Only the smallest paths of the folder are needed, so avoid sorting the
    # whole folder (some hold over a million files). At most len(chosen) of the
    # smallest paths can already be selected, so this many candidates suffice.
    candidates = heapq.nsmallest(MAX_SAMPLES_PER_FOLDER + len(chosen), files)
    count_for_root = 0
    for p in candidates:
        # Enforce both limits per file, so the total cap cannot be overshot
        # in the middle of a folder.
        if count_for_root >= MAX_SAMPLES_PER_FOLDER or len(samples) >= max_samples:
            break
        if p in chosen:
            continue
        samples.append(p)
        chosen.add(p)
        count_for_root += 1

print(f"Muestras seleccionadas para el log: {len(samples)} archivos")

# ---------------------------- Dump schemas to the log -------------------------

with open(LOG_PATH, "w", encoding="utf-8") as logf:
    # Log header: timestamp, root and counters, followed by a separator line.
    logf.writelines([
        f"PARQUET SAMPLE - {datetime.now().isoformat(timespec='seconds')}\n",
        f"DATA_ROOT = {DATA_ROOT}\n",
        f"TOTAL PARQUETS  = {total_files}\n",
        f"SAMPLED PARQUETS= {len(samples)}\n",
        "=" * 100 + "\n\n",
    ])

    for idx, path in enumerate(samples, start=1):
        rel = path.relative_to(DATA_ROOT)
        hdr = f"[{idx}/{len(samples)}] {rel}"
        print(hdr)  # short on-screen summary
        logf.write(f"{'=' * 100}\n{hdr}\n")

        # Best effort per file: a read failure is logged and the loop continues.
        try:
            df_head = pl.read_parquet(str(path), n_rows=MAX_ROWS_PER_FILE)
            schema = df_head.schema
            column_names = list(schema.keys())
            column_dtypes = list(schema.values())
            logf.write(f"Columns ({len(schema)}): {column_names}\n")
            logf.write(f"DTypes : {column_dtypes}\n\n")
            logf.write("Sample rows:\n")
            logf.write(f"{df_head!r}\n\n")

            # Echo the column names in the notebook output as well.
            print(f"   columnas: {column_names}")
        except Exception as e:
            msg = f"ERROR leyendo parquet: {e!r}"
            logf.write(f"{msg}\n\n")
            print(f"   ‚ùå {msg}")

print("-" * 100)
print("‚úÖ Muestreo completado.")
print(f"Log (ligero) generado en: {LOG_PATH}")
print("Sube ese TXT si quieres que lo revisemos aqu√≠.")
print("=" * 100)

# ---------------------------- Quick conclusion --------------------------------

print("\nCONCLUSI√ìN R√ÅPIDA SOBRE COSTES / PARAMS")
# One line per finding, driven by the flags computed earlier in this cell.
print(
    "  ‚úî Existe un costs_summary.parquet (costes por s√≠mbolo)."
    if found_costs_summary
    else "  ‚úñ NO se encontr√≥ costs_summary.parquet por nombre. Puede estar con otro nombre."
)

if not (found_ea_params_parquet or found_ea_params_csv):
    print("  ‚úñ No hay ea_params.parquet ni ea_params.csv ‚Üí el ER_FILTER_5M seguir√° en AUTOGEN.")
else:
    print("  ‚úî Existe alg√∫n ea_params.* ‚Üí el ER_FILTER_5M deber√≠a usarlo en vez de autogen.")
print("=" * 100)


Muestreo ligero de archivos PARQUET
DATA_ROOT : C:\Quant\MT5_Data_Extraction\data
LOG_PATH  : C:\Quant\MT5_Data_Extraction\data\logs\parquet_sample_20251203_235118.txt
Total de archivos .parquet encontrados: 1326899
----------------------------------------------------------------------------------------------------
Resumen por carpeta raíz:
  - .                   :      1 parquets
  - bulk_data           : 121457 parquets
  - historical_data     : 103917 parquets
  - metadata            :     22 parquets
  - processed_data      :  28054 parquets
  - restore             : 1073448 parquets
----------------------------------------------------------------------------------------------------
Archivos 'interesantes' por nombre (cost/param/fee/...): 41
  → costs_summary.parquet encontrado : True
  → ea_params.parquet encontrado     : True
  → ea_params.csv encontrado         : False
Muestras seleccionadas para el log: 40 archivos
[1/40] ea_params.parquet
   columnas: ['symbol', 'commi