In [1]:
# ======================================================================================
# Celda 00 v2.0.3 — Run Manifest + Paths + Canonical Schema
# Política institucional (FINAL):
#   - Por defecto SIEMPRE crea un run nuevo (NEW_RUN_DEFAULT).
#   - Solo reutiliza run si:
#       A) TREND_M5_RUN_ID está seteado (FORCED_RUN_ID)
#       B) TREND_M5_RESUME_LATEST=1 y existe _latest_run.txt (RESUME_LATEST)
#
# Nota:
#   - "outputs" NO es "cargar corridas anteriores": es el directorio de salida del run.
# ======================================================================================

from __future__ import annotations

import os
import json
import sys
import platform
import hashlib
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, Optional

# --- Unified path contract (single source of truth) ---
for _p in [Path.cwd().resolve()] + list(Path.cwd().resolve().parents):
    _contract = _p / "shared" / "contracts" / "path_contract.py"
    if _contract.exists():
        if str(_contract.parent) not in sys.path:
            sys.path.insert(0, str(_contract.parent))
        break
import path_contract  # noqa: E402

# ---------------------------
# Helpers
# ---------------------------
def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

def _safe_mkdir(p: Path) -> None:
    p.mkdir(parents=True, exist_ok=True)

def _write_json(path: Path, obj: Dict[str, Any]) -> None:
    _safe_mkdir(path.parent)
    path.write_text(json.dumps(obj, indent=2, ensure_ascii=False), encoding="utf-8")

def _read_json(path: Path) -> Dict[str, Any]:
    return json.loads(path.read_text(encoding="utf-8"))

def _write_text(path: Path, text: str) -> None:
    _safe_mkdir(path.parent)
    path.write_text(text, encoding="utf-8")

def _read_text(path: Path) -> str:
    return path.read_text(encoding="utf-8").strip()

def _sha1(s: str) -> str:
    return hashlib.sha1(s.encode("utf-8")).hexdigest()

def _env(name: str, default: Optional[str] = None) -> Optional[str]:
    v = os.getenv(name)
    return v if v not in (None, "") else default

def _env_bool(name: str, default: bool = False) -> bool:
    v = os.getenv(name, "")
    if v is None or v.strip() == "":
        return default
    return v.strip().lower() in ("1", "true", "yes", "y")

# ---------------------------
# Detectar PROJECT_ROOT (repo raíz) — determinístico
# ---------------------------
def _detect_project_root() -> Path:
    forced = _env("TREND_M5_ROOT")
    if forced:
        return Path(forced).resolve()
    return path_contract.detect_project_root()

PROJECT_ROOT = _detect_project_root()

# ---------------------------
# OUTPUTS_ROOT (salida del strategy notebook)
# ---------------------------
WORKDIR = Path.cwd().resolve()  # normalmente ...\ER_STRATEGY_LAB\notebooks
OUTPUTS_ROOT = Path(_env("TREND_M5_OUTPUTS_ROOT", str(path_contract.trend_outputs_dir(PROJECT_ROOT)))).resolve()
LATEST_RUN_MARKER = OUTPUTS_ROOT / "_latest_run.txt"

# ---------------------------
# RUN_ID policy (FINAL)
# ---------------------------
FORCED_RUN_ID = (_env("TREND_M5_RUN_ID") or "").strip() or None
RESUME_LATEST = _env_bool("TREND_M5_RESUME_LATEST", default=False)

def _new_run_id() -> str:
    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    salt = _sha1(f"{ts}|{platform.node()}|{os.getpid()}")[:8]
    return f"{ts}_{salt}"

if FORCED_RUN_ID:
    RUN_MODE = "FORCED_RUN_ID"
    RUN_ID = FORCED_RUN_ID
elif RESUME_LATEST and LATEST_RUN_MARKER.exists():
    RUN_MODE = "RESUME_LATEST"
    RUN_ID = _read_text(LATEST_RUN_MARKER) or _new_run_id()
else:
    RUN_MODE = "NEW_RUN_DEFAULT"
    RUN_ID = _new_run_id()

RUN_DIR = OUTPUTS_ROOT / f"run_{RUN_ID}"
RUN_MANIFEST_PATH = RUN_DIR / "run_manifest_v2.json"
RUN_MANIFEST_LATEST_PATH = OUTPUTS_ROOT / "run_manifest_v2_latest.json"

# ---------------------------
# Versionado
# ---------------------------
SCHEMA_VERSION = "v2.0.3"
ENGINE_VERSION = "v2.0.3"
COST_MODEL_VERSION = "v2.0.3"
WFO_VERSION = "v2.0.3"

# ---------------------------
# Canonical Schema (contrato)
# ---------------------------
CANONICAL_SCHEMA = {
    "ohlcv_m5": {
        "required_columns": ["time_utc", "open", "high", "low", "close", "volume", "spread", "symbol"],
        "notes": "UTC. time_utc monotónico por símbolo. M5 = 300s."
    },
    "engine_trades": {
        "required_columns": [
            "symbol","fold_id","segment","side",
            "signal_time_utc","entry_time_utc","exit_time_utc",
            "entry_price","exit_price",
            "gross_pnl","net_pnl_base","net_pnl_stress",
            "hold_bars","exit_reason"
        ],
        "notes": "Mon–Fri se aplica sobre entry_time_utc (t+1)."
    }
}

# ---------------------------
# Artifacts (salidas del run)
# ---------------------------
def _build_artifacts(run_dir: Path) -> Dict[str, str]:
    return {
        "instrument_specs": str(run_dir / "instrument_specs_v2.parquet"),
        "instrument_specs_snapshot": str(run_dir / "instrument_specs_snapshot_v2.json"),

        "ohlcv_clean": str(run_dir / "ohlcv_clean_m5.parquet"),
        "data_qa_report": str(run_dir / "data_qa_report_v2.json"),

        "cost_model_snapshot": str(run_dir / "cost_model_snapshot_v2.json"),
        "wfo_folds": str(run_dir / "wfo_folds_v2.parquet"),
        "wfo_folds_snapshot": str(run_dir / "wfo_folds_snapshot_v2.json"),

        "features_m5": str(run_dir / "features_m5_v2.parquet"),
        "regime_params_by_fold": str(run_dir / "regime_params_by_fold_v2.parquet"),
        "regime_params_snapshot": str(run_dir / "regime_params_snapshot_v2.json"),

        "trades_baseline": str(run_dir / "trades_baseline_v2.parquet"),
        "summary_baseline": str(run_dir / "summary_baseline_v2.parquet"),

        "alpha_multi_horizon_report": str(run_dir / "alpha_multi_horizon_report_v2.parquet"),
        "alpha_multi_horizon_snapshot": str(run_dir / "alpha_multi_horizon_snapshot_v2.json"),

        "trades_engine": str(run_dir / "trades_engine_v2.parquet"),
        "summary_engine": str(run_dir / "summary_engine_v2.parquet"),
        "equity_engine": str(run_dir / "equity_curve_engine_v2.parquet"),
        "engine_qa_report": str(run_dir / "engine_qa_report_v2.json"),
        "engine_report_snapshot": str(run_dir / "engine_report_snapshot_v2.json"),
        "signals_all": str(run_dir / "signals_all_v2.parquet"),
        "signals_snapshot": str(run_dir / "signals_snapshot_v2.json"),
        "qa_timing": str(run_dir / "qa_timing_v2.parquet"),
        "tuning_results": str(run_dir / "tuning_results_v2.parquet"),
        "tuning_best_params": str(run_dir / "tuning_best_params_v2.parquet"),
        "tuning_snapshot": str(run_dir / "tuning_snapshot_v2.json"),
        "alpha_design": str(run_dir / "alpha_design_v2.parquet"),
        "alpha_design_snapshot": str(run_dir / "alpha_design_snapshot_v2.json"),
        "selection": str(run_dir / "selection_v2.parquet"),
        "selection_snapshot": str(run_dir / "selection_snapshot_v2.json"),
        "overlay_trades": str(run_dir / "overlay_trades_v2.parquet"),
        "overlay_summary": str(run_dir / "overlay_summary_v2.parquet"),
        "overlay_snapshot": str(run_dir / "overlay_snapshot_v2.json"),
        "deploy_pack": str(run_dir / "deploy_pack_v2.parquet"),
        "deploy_pack_json": str(run_dir / "deploy_pack_v2.json"),
        "qa_alignment": str(run_dir / "qa_alignment_v2.parquet"),
        "qa_alignment_snapshot": str(run_dir / "qa_alignment_snapshot_v2.json"),
        "diagnostics": str(run_dir / "diagnostics_v2.parquet"),
        "diagnostics_snapshot": str(run_dir / "diagnostics_snapshot_v2.json"),
    }

def _build_manifest() -> Dict[str, Any]:
    return {
        "schema_version": SCHEMA_VERSION,
        "engine_version": ENGINE_VERSION,
        "cost_model_version": COST_MODEL_VERSION,
        "wfo_version": WFO_VERSION,
        "run_mode": RUN_MODE,
        "run_id": RUN_ID,
        "created_utc": _now_utc_iso(),
        "project_root": str(PROJECT_ROOT),
        "workdir": str(WORKDIR),
        "outputs_root": str(OUTPUTS_ROOT),
        "run_dir": str(RUN_DIR),
        "artifacts": _build_artifacts(RUN_DIR),
        "canonical_schema": CANONICAL_SCHEMA,
        "runtime": {
            "python": sys.version.replace("\n", " "),
            "platform": platform.platform(),
            "node": platform.node(),
            "pid": os.getpid(),
        },
    }

# ---------------------------
# Guardado (solo carga manifest si el modo es RESUME/forced y existe)
# ---------------------------
_safe_mkdir(RUN_DIR)
_safe_mkdir(OUTPUTS_ROOT)

manifest: Dict[str, Any]
if RUN_MANIFEST_PATH.exists() and RUN_MODE in ("RESUME_LATEST", "FORCED_RUN_ID"):
    manifest = _read_json(RUN_MANIFEST_PATH)
    # normaliza/bump versiones
    manifest["schema_version"] = SCHEMA_VERSION
    manifest["engine_version"] = ENGINE_VERSION
    manifest["cost_model_version"] = COST_MODEL_VERSION
    manifest["wfo_version"] = WFO_VERSION
    manifest["run_mode"] = RUN_MODE
    manifest["project_root"] = str(PROJECT_ROOT)
    manifest["workdir"] = str(WORKDIR)
    manifest["outputs_root"] = str(OUTPUTS_ROOT)
    manifest["canonical_schema"] = CANONICAL_SCHEMA
    manifest["artifacts"] = _build_artifacts(Path(manifest.get("run_dir", str(RUN_DIR))))
    _write_json(RUN_MANIFEST_PATH, manifest)
    print(f"[Celda 00 v2.0.3] Manifest CARGADO (resume/forced) y normalizado: {RUN_MANIFEST_PATH}")
else:
    manifest = _build_manifest()
    _write_json(RUN_MANIFEST_PATH, manifest)
    print(f"[Celda 00 v2.0.3] Manifest CREADO (nuevo run): {RUN_MANIFEST_PATH}")

# latest pointers
_write_text(LATEST_RUN_MARKER, RUN_ID)
_write_json(RUN_MANIFEST_LATEST_PATH, manifest)

# RUN object (downstream)
RUN: Dict[str, Any] = {
    "RUN_ID": manifest["run_id"],
    "RUN_MODE": manifest["run_mode"],
    "RUN_DIR": Path(manifest["run_dir"]),
    "PROJECT_ROOT": Path(manifest["project_root"]),
    "WORKDIR": Path(manifest["workdir"]),
    "OUTPUTS_ROOT": Path(manifest["outputs_root"]),
    "ARTIFACTS": {k: Path(v) for k, v in manifest["artifacts"].items()},
    "SCHEMA_VERSION": manifest["schema_version"],
    "ENGINE_VERSION": manifest["engine_version"],
    "CANONICAL_SCHEMA": manifest["canonical_schema"],
}

# ---------------------------
# PRINTS exhaustivos
# ---------------------------
print("\n--- Celda 00 v2.0.3 | Estado final ---")
print("RUN_MODE             :", RUN["RUN_MODE"])
print("PROJECT_ROOT         :", RUN["PROJECT_ROOT"])
print("WORKDIR              :", RUN["WORKDIR"])
print("OUTPUTS_ROOT         :", RUN["OUTPUTS_ROOT"])
print("RUN_ID               :", RUN["RUN_ID"])
print("RUN_DIR              :", RUN["RUN_DIR"])
print("RUN_MANIFEST_PATH    :", RUN_MANIFEST_PATH)
print("RUN_MANIFEST_LATEST  :", RUN_MANIFEST_LATEST_PATH)
print("LATEST_RUN_MARKER    :", LATEST_RUN_MARKER)
print("SCHEMA_VERSION       :", RUN["SCHEMA_VERSION"])
print("ENGINE_VERSION       :", RUN["ENGINE_VERSION"])

keys = sorted(RUN["ARTIFACTS"].keys())
print("\n--- ARTIFACTS keys ---")
print("N_KEYS:", len(keys))
print(keys)

critical = ["instrument_specs","instrument_specs_snapshot","ohlcv_clean","data_qa_report"]
missing_critical = [k for k in critical if k not in RUN["ARTIFACTS"]]
if missing_critical:
    raise RuntimeError(f"[Celda 00 v2.0.3] ERROR: faltan artifacts críticos: {missing_critical}")

print("\n--- Dependencias ---")
import polars as pl
print("polars:", pl.__version__)
try:
    import pandas as pd
    print("pandas:", pd.__version__)
except Exception as e:
    print("pandas: no disponible:", e)

print("\n[Celda 00 v2.0.3] OK — NEW_RUN por defecto + RUN listo.")


[Celda 00 v2.0.3] Manifest CREADO (nuevo run): C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\run_manifest_v2.json

--- Celda 00 v2.0.3 | Estado final ---
RUN_MODE             : NEW_RUN_DEFAULT
PROJECT_ROOT         : C:\Quant\projects\MT5_Data_Extraction
WORKDIR              : C:\Quant\projects\MT5_Data_Extraction\03_STRATEGY_LAB\notebooks
OUTPUTS_ROOT         : C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2
RUN_ID               : 20260218_000143_164d8480
RUN_DIR              : C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480
RUN_MANIFEST_PATH    : C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\run_manifest_v2.json
RUN_MANIFEST_LATEST  : C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_manifest_v2_latest.json
LATEST_RUN_MARKER    : C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\_latest_run.txt
SCHEMA_VERSION       : v2.0.3
ENGINE_VERSION       : v2.0.3


In [None]:
# ======================================================================================
# Celda 01 v2.1.0 — Universe & Instrument Specs (por simbolo)
# CAMBIO v2.1.0: Lee universe desde NB2 basket_trend_core.parquet via path_contract.
# Fallback: FALLBACK_UNIVERSE hardcodeado (legacy).
#
# Inputs:  RUN (Celda 00)
# Outputs: instrument_specs parquet + snapshot JSON
# ======================================================================================

from __future__ import annotations

import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional

import polars as pl

if "RUN" not in globals():
    raise RuntimeError("[Celda 01 v2.1.0] ERROR: No existe RUN. Ejecuta Celda 00 primero.")

ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

print("\n--- Celda 01 v2.1.0 | Basket Integration ---")
print("RUN_ID   :", RUN["RUN_ID"])
print("RUN_MODE :", RUN["RUN_MODE"])
print("RUN_DIR  :", RUN["RUN_DIR"])

OUT_SPECS_PARQUET = ARTIFACTS["instrument_specs"]
OUT_SPECS_SNAPSHOT = ARTIFACTS["instrument_specs_snapshot"]

# --- Cargar universe desde NB2 basket ---
FALLBACK_UNIVERSE = ["BNBUSD", "BTCUSD", "LVMH", "XAUAUD"]

basket_path = path_contract.nb2_basket("trend", PROJECT_ROOT)

# Fallback: legacy location
if basket_path is None:
    _legacy = path_contract.nb2_outputs_dir(PROJECT_ROOT) / "basket_trend_core.parquet"
    if _legacy.exists():
        basket_path = _legacy

if basket_path is not None and basket_path.exists():
    basket_df = pl.read_parquet(basket_path)
    if "symbol" in basket_df.columns:
        symbols = basket_df.get_column("symbol").unique().sort().to_list()
        universe_source = "nb2_basket_trend"
    else:
        symbols = FALLBACK_UNIVERSE
        universe_source = "fallback_basket_no_symbol_col"
        print("[Celda 01] WARNING: basket sin columna 'symbol', usando fallback.")
else:
    symbols = FALLBACK_UNIVERSE
    universe_source = "fallback_hardcoded"
    basket_path = None
    print(f"[Celda 01] WARNING: basket_trend_core no encontrado, usando fallback: {symbols}")

symbols = [s.upper().strip() for s in symbols]

print(f"[Celda 01] UNIVERSE_SOURCE   : {universe_source}")
print(f"[Celda 01] UNIVERSE_EFFECTIVE : {symbols}")
print(f"[Celda 01] basket_path       : {basket_path}")

# --- Guardar en RUN ---
RUN["UNIVERSE_EFFECTIVE"] = symbols
RUN["UNIVERSE_SOURCE"] = universe_source
RUN["INPUTS"] = RUN.get("INPUTS", {})
RUN["INPUTS"]["basket_path"] = str(basket_path) if basket_path else None

# --- Instrument Specs (defaults + overrides por simbolo) ---
def _t(h: int, m: int = 0) -> str:
    return f"{h:02d}:{m:02d}"

DEFAULT_SPEC: Dict[str, Any] = {
    "asset_class": "forex",
    "base_cost_bps": 3.0,
    "stress_cost_bps": 6.0,
    "entry_weekdays_only": True,
    "flatten_before_weekend": False,
    "session_weekdays_only": True,
    "session_windows_utc_json": "[]",
    "research_only": False,
    "research_reason": None,
    "tick_size_hint": None,
    "contract_hint": None,
}

OVERRIDES: Dict[str, Dict[str, Any]] = {
    "BNBUSD": {"asset_class": "crypto", "base_cost_bps": 8.0, "stress_cost_bps": 16.0},
    "BTCUSD": {"asset_class": "crypto", "base_cost_bps": 8.0, "stress_cost_bps": 16.0},
    "ETHUSD": {"asset_class": "crypto", "base_cost_bps": 3.0, "stress_cost_bps": 6.0},
    "LVMH":   {"asset_class": "equity", "base_cost_bps": 12.0, "stress_cost_bps": 25.0,
               "flatten_before_weekend": True,
               "session_windows_utc_json": json.dumps([{"start": _t(8, 0), "end": _t(16, 30)}])},
    "XAUAUD": {"asset_class": "fx_metal", "base_cost_bps": 4.0, "stress_cost_bps": 8.0,
               "flatten_before_weekend": True},
    "XAUUSD": {"asset_class": "fx_metal", "base_cost_bps": 5.0, "stress_cost_bps": 10.0,
               "flatten_before_weekend": True},
}

rows: List[Dict[str, Any]] = []
for sym in symbols:
    spec = dict(DEFAULT_SPEC)
    spec.update(OVERRIDES.get(sym, {}))
    spec["symbol"] = sym
    rows.append(spec)

specs = pl.DataFrame(rows)

print(f"\n[Celda 01] specs shape: {specs.shape}")
print(specs)

# Gates duros
bad_costs = specs.filter(
    (pl.col("base_cost_bps") <= 0) |
    (pl.col("stress_cost_bps") <= 0) |
    (pl.col("stress_cost_bps") < pl.col("base_cost_bps"))
)
if bad_costs.height > 0:
    raise RuntimeError(f"[Celda 01] ERROR: costos invalidos:\n{bad_costs}")

n_unique = specs.select(pl.col("symbol").n_unique()).item()
if n_unique != specs.height:
    raise RuntimeError("[Celda 01] ERROR: simbolos duplicados en instrument_specs.")

# Persistencia
OUT_SPECS_PARQUET.parent.mkdir(parents=True, exist_ok=True)
specs.write_parquet(OUT_SPECS_PARQUET)

snapshot = {
    "cell": "01 v2.1.0",
    "created_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
    "run_id": RUN["RUN_ID"],
    "run_mode": RUN["RUN_MODE"],
    "universe_source": universe_source,
    "universe_effective": symbols,
    "basket_path": str(basket_path) if basket_path else None,
    "n_symbols": len(symbols),
    "rows": specs.to_dicts(),
}
OUT_SPECS_SNAPSHOT.write_text(
    json.dumps(snapshot, indent=2, ensure_ascii=False), encoding="utf-8"
)

print(f"\n[Celda 01 v2.1.0] OK — {len(symbols)} symbols desde {universe_source}")
print(f"  parquet : {OUT_SPECS_PARQUET}")
print(f"  snapshot: {OUT_SPECS_SNAPSHOT}")

In [3]:
# ======================================================================================
# Celda 02 v2.0.4 — Load M5 (m5_clean) + Canonicalize + QA (AUTO-RUTAS estilo v1)
# Propósito:
#   - Construir ohlcv_clean_m5.parquet (schema canónico) desde tu M5 limpio REAL (v1).
#   - QA mínimo institucional: dedup, monotonic, gaps total + intraday, share_300s.
#
# Inputs:
#   - RUN (Celda 00 v2.0.3)
#   - instrument_specs (Celda 01)
#
# Política de rutas (FINAL):
#   - Si defines TREND_M5_M5_CLEAN_DIR -> usa esa (prioridad absoluta).
#   - Si no, autodetecta en candidatos reales (como v1):
#       * <PROJECT_ROOT>/data/historical_data/m5_clean
#       * <PROJECT_ROOT>/data/rates_5m
#       * <PROJECT_ROOT>/data/historical_data/rates_5m
#       * <PROJECT_ROOT>/data/bulk_data/m5_raw   (último fallback)
#
# Outputs:
#   - RUN["ARTIFACTS"]["ohlcv_clean"]
#   - RUN["ARTIFACTS"]["data_qa_report"]
# ======================================================================================

from __future__ import annotations

import os
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
import itertools

import polars as pl

if "RUN" not in globals():
    raise RuntimeError("[Celda 02 v2.0.4] ERROR: No existe RUN. Ejecuta Celda 00 v2.0.3 primero.")

ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]
PROJECT_ROOT: Path = RUN["PROJECT_ROOT"]
RUN_DIR: Path = RUN["RUN_DIR"]

SPECS_PATH = ARTIFACTS["instrument_specs"]
OUT_OHLCV = ARTIFACTS["ohlcv_clean"]
OUT_QA = ARTIFACTS["data_qa_report"]

print("\n--- Celda 02 v2.0.4 | Preflight ---")
print("RUN_ID     :", RUN["RUN_ID"])
print("RUN_MODE   :", RUN.get("RUN_MODE"))
print("PROJECT_ROOT:", PROJECT_ROOT)
print("RUN_DIR    :", RUN_DIR)
print("SPECS_PATH :", SPECS_PATH)
print("OUT_OHLCV  :", OUT_OHLCV)
print("OUT_QA     :", OUT_QA)

if not SPECS_PATH.exists():
    raise RuntimeError(f"[Celda 02 v2.0.4] ERROR: Falta instrument_specs: {SPECS_PATH}. Ejecuta Celda 01 primero.")

# -----------------------------
# Config / constants
# -----------------------------
EXPECTED_BAR_SECONDS = 300  # M5
FORCED_M5_DIR = os.getenv("TREND_M5_M5_CLEAN_DIR", "").strip()

# -----------------------------
# Helpers
# -----------------------------
TIME_CANDS = ["time_utc", "timestamp_utc", "datetime", "timestamp", "time", "date"]
O_CANDS = ["open", "o"]
H_CANDS = ["high", "h"]
L_CANDS = ["low", "l"]
C_CANDS = ["close", "c"]
V_CANDS = ["volume", "vol", "tick_volume"]

def _pick_col(cols: List[str], cands: List[str]) -> Optional[str]:
    m = {c.lower(): c for c in cols}
    for x in cands:
        if x.lower() in m:
            return m[x.lower()]
    return None

def _count_parquets_quick(p: Path, limit: int = 2000) -> int:
    if not p.exists():
        return 0
    it = p.rglob("*.parquet")
    return sum(1 for _ in itertools.islice(it, limit))

def _detect_m5_clean_dir() -> Tuple[Path, str]:
    """
    Retorna: (dir, mode)
      mode:
        - FORCED_ENV
        - AUTO_CANDIDATE
    """
    if FORCED_M5_DIR:
        d = Path(FORCED_M5_DIR).resolve()
        return d, "FORCED_ENV"

    candidates = [
        path_contract.m5_clean_dir(PROJECT_ROOT),   # data/historical_data/m5_clean
        path_contract.m5_raw_dir(PROJECT_ROOT),      # data/bulk_data/m5_raw
    ]

    print("\n[Celda 02 v2.0.4] Candidatos M5 probados (exist/parquets):")
    best = None
    best_n = -1
    for c in candidates:
        n = _count_parquets_quick(c)
        print(f"  - {c} | exists={c.exists()} | parquet_count~={n}")
        if c.exists() and n > best_n:
            best = c
            best_n = n

    if best is None or not best.exists() or best_n <= 0:
        raise RuntimeError(
            "[Celda 02 v2.0.4] ERROR: No pude detectar el directorio real de datos M5.\n"
            "Solución: define TREND_M5_M5_CLEAN_DIR apuntando a tu carpeta m5_clean.\n"
            "Ejemplo (PowerShell):\n"
            "  $env:TREND_M5_M5_CLEAN_DIR = '<PROJECT_ROOT>\\data\\historical_data\\m5_clean'\n"
        )
    return best, "AUTO_CANDIDATE"

def _coerce_time_expr(col: str, dtype: pl.DataType) -> pl.Expr:
    if dtype in (pl.Int64, pl.Int32, pl.UInt64, pl.UInt32):
        # epoch detect (best-effort)
        return pl.when(pl.col(col) > 10**17).then(pl.from_epoch(pl.col(col), time_unit="ns")) \
                 .when(pl.col(col) > 10**14).then(pl.from_epoch(pl.col(col), time_unit="us")) \
                 .when(pl.col(col) > 10**11).then(pl.from_epoch(pl.col(col), time_unit="ms")) \
                 .otherwise(pl.from_epoch(pl.col(col), time_unit="s")) \
                 .alias("time_utc")
    return pl.col(col).cast(pl.Datetime, strict=False).alias("time_utc")

def _gap_stats_total(times: pl.Series) -> Dict[str, Any]:
    if times.len() < 2:
        return {"gap_count": 0, "gap_rate": 0.0, "missing_bars_est": 0, "max_gap_seconds": 0}
    dt = times.diff().dt.total_seconds().drop_nulls()
    if dt.len() == 0:
        return {"gap_count": 0, "gap_rate": 0.0, "missing_bars_est": 0, "max_gap_seconds": 0}
    gap_count = int((dt > EXPECTED_BAR_SECONDS).sum())
    max_gap = int(dt.max() or 0)
    missing = ((dt // EXPECTED_BAR_SECONDS) - 1).clip(lower_bound=0)
    missing_est = int(missing.sum() or 0)
    gap_rate = float(gap_count / max(dt.len(), 1))
    return {"gap_count": gap_count, "gap_rate": gap_rate, "missing_bars_est": missing_est, "max_gap_seconds": max_gap}

def _gap_stats_intraday(df_times: pl.DataFrame) -> Dict[str, Any]:
    if df_times.height < 2:
        return {"gap_count": 0, "gap_rate": 0.0, "missing_bars_est": 0, "max_gap_seconds": 0, "share_300s": 1.0}
    tmp = (
        df_times
        .with_columns(pl.col("time_utc").dt.truncate("1d").alias("_day"))
        .select(((pl.col("time_utc").diff().dt.total_seconds()).over("_day")).alias("dt_sec"))
        .drop_nulls()
    )
    if tmp.height == 0:
        return {"gap_count": 0, "gap_rate": 0.0, "missing_bars_est": 0, "max_gap_seconds": 0, "share_300s": 1.0}
    dt = tmp["dt_sec"]
    gap_count = int((dt > EXPECTED_BAR_SECONDS).sum())
    max_gap = int(dt.max() or 0)
    missing = ((dt // EXPECTED_BAR_SECONDS) - 1).clip(lower_bound=0)
    missing_est = int(missing.sum() or 0)
    gap_rate = float(gap_count / max(dt.len(), 1))
    share_300s = float((dt == EXPECTED_BAR_SECONDS).sum() / max(dt.len(), 1))
    return {"gap_count": gap_count, "gap_rate": gap_rate, "missing_bars_est": missing_est, "max_gap_seconds": max_gap, "share_300s": share_300s}

def _expected_bars_data_driven(df_times: pl.DataFrame) -> int:
    if df_times.height == 0:
        return 0
    per_day = (
        df_times
        .with_columns(pl.col("time_utc").dt.truncate("1d").alias("_day"))
        .group_by("_day")
        .agg([pl.min("time_utc").alias("t0"), pl.max("time_utc").alias("t1")])
        .with_columns(((pl.col("t1") - pl.col("t0")).dt.total_seconds() // EXPECTED_BAR_SECONDS + 1).cast(pl.Int64).alias("exp"))
    )
    return int(per_day.select(pl.col("exp").sum()).item() or 0)

# -----------------------------
# Detect M5 dir
# -----------------------------
M5_DIR, M5_DIR_MODE = _detect_m5_clean_dir()
print("\n[Celda 02 v2.0.4] M5_DIR seleccionado:", M5_DIR)
print("[Celda 02 v2.0.4] M5_DIR_MODE       :", M5_DIR_MODE)

# -----------------------------
# Universe
# -----------------------------
specs = pl.read_parquet(SPECS_PATH)
universe = specs.select("symbol").to_series().to_list()
print("\n[Celda 02 v2.0.4] Universe:", universe)

# -----------------------------
# Detect layout: hive partition symbol=...
# -----------------------------
symbol_partitions = [d for d in M5_DIR.iterdir() if d.is_dir() and d.name.lower().startswith("symbol=")]
IS_HIVE = len(symbol_partitions) > 0
print("\n[Celda 02 v2.0.4] Layout detectado:")
print("  IS_HIVE(symbol=...):", IS_HIVE)
if IS_HIVE:
    print("  sample partitions:", [d.name for d in symbol_partitions[:5]])

# -----------------------------
# Cargar por símbolo + canonicalizar
# -----------------------------
required_cols = ["time_utc","open","high","low","close","volume","spread","symbol"]
dfs: List[pl.DataFrame] = []
qa_rows: List[Dict[str, Any]] = []

for sym in universe:
    print("\n" + "-"*100)
    print(f"[Celda 02 v2.0.4] Loading symbol={sym}")

    if IS_HIVE:
        sym_dir = M5_DIR / f"symbol={sym}"
        if not sym_dir.exists():
            raise RuntimeError(f"[Celda 02 v2.0.4] ERROR: No existe partición {sym_dir}")
        glob = str(sym_dir / "**" / "*.parquet")
        lf = pl.scan_parquet(glob)
    else:
        glob = str(M5_DIR / "**" / "*.parquet")
        lf = pl.scan_parquet(glob)

    schema = lf.collect_schema()
    cols = schema.names()
    print("[Celda 02 v2.0.4] Columns(sample):", cols[:20], ("..." if len(cols) > 20 else ""))

    tcol = _pick_col(cols, TIME_CANDS)
    ocol = _pick_col(cols, O_CANDS)
    hcol = _pick_col(cols, H_CANDS)
    lcol = _pick_col(cols, L_CANDS)
    ccol = _pick_col(cols, C_CANDS)
    vcol = _pick_col(cols, V_CANDS)

    if tcol is None or ocol is None or hcol is None or lcol is None or ccol is None:
        raise RuntimeError(
            f"[Celda 02 v2.0.4] ERROR: columnas OHLCV faltantes en {sym}. "
            f"time={tcol}, open={ocol}, high={hcol}, low={lcol}, close={ccol}. cols={cols}"
        )

    spread_col = next((c for c in cols if c.lower() in ("spread", "spread_points")), None)
    sym_col = next((c for c in cols if c.lower() == "symbol"), None)

    # construir select canónico
    time_expr = _coerce_time_expr(tcol, schema[tcol])

    volume_expr = (pl.col(vcol).cast(pl.Float64).alias("volume")) if vcol else pl.lit(0.0).cast(pl.Float64).alias("volume")

    if spread_col:
        spread_expr = pl.col(spread_col).cast(pl.Float64).alias("spread")
    else:
        spread_expr = pl.lit(0.0).cast(pl.Float64).alias("spread")

    if IS_HIVE:
        # en hive particionado, puede no existir 'symbol' dentro del parquet
        sym_expr = (pl.col(sym_col).cast(pl.Utf8).alias("symbol")) if sym_col else pl.lit(sym).cast(pl.Utf8).alias("symbol")
    else:
        # si no es hive, debería existir symbol o filtramos por columna si existe
        sym_expr = (pl.col(sym_col).cast(pl.Utf8).alias("symbol")) if sym_col else pl.lit(sym).cast(pl.Utf8).alias("symbol")

    lf2 = lf.select([
        time_expr,
        pl.col(ocol).cast(pl.Float64).alias("open"),
        pl.col(hcol).cast(pl.Float64).alias("high"),
        pl.col(lcol).cast(pl.Float64).alias("low"),
        pl.col(ccol).cast(pl.Float64).alias("close"),
        volume_expr,
        spread_expr,
        sym_expr,
    ]).drop_nulls(["time_utc"])

    if (not IS_HIVE) and sym_col:
        lf2 = lf2.filter(pl.col("symbol") == sym)
    elif IS_HIVE:
        lf2 = lf2.with_columns(pl.lit(sym).alias("symbol"))

    df = lf2.collect()
    print("[Celda 02 v2.0.4] Loaded rows (raw):", df.height)

    # sort + dedup
    n_before = df.height
    df = df.sort("time_utc").unique(subset=["time_utc"], keep="last")
    n_after = df.height
    dup_removed = n_before - n_after
    print("[Celda 02 v2.0.4] After dedup rows:", n_after, "| dup_removed:", dup_removed)

    if n_after == 0:
        raise RuntimeError(f"[Celda 02 v2.0.4] ERROR: {sym} quedó vacío tras limpiar.")

    # monotonic sanity
    min_dt = df.select(pl.col("time_utc").diff().dt.total_seconds().min()).item()
    if min_dt is not None and float(min_dt) < 0:
        raise RuntimeError(f"[Celda 02 v2.0.4] ERROR: {sym} no es monotónico (dt_min={min_dt}).")

    times = df["time_utc"]
    start = times.min()
    end = times.max()

    total_gaps = _gap_stats_total(times)
    intraday_gaps = _gap_stats_intraday(df.select(["time_utc"]))

    expected_intraday = _expected_bars_data_driven(df.select(["time_utc"]))
    coverage_intraday_pct = float(n_after / expected_intraday * 100.0) if expected_intraday > 0 else 0.0

    print("[Celda 02 v2.0.4] start_utc:", start)
    print("[Celda 02 v2.0.4] end_utc  :", end)
    print("[Celda 02 v2.0.4] intraday share_300s:", intraday_gaps["share_300s"])
    print("[Celda 02 v2.0.4] coverage_intraday_pct:", f"{coverage_intraday_pct:.2f}%")
    print("[Celda 02 v2.0.4] gaps_total   :", total_gaps)
    print("[Celda 02 v2.0.4] gaps_intraday:", intraday_gaps)

    # Gate duro: intraday M5 consistente
    if float(intraday_gaps["share_300s"]) < 0.90:
        raise RuntimeError(
            f"[Celda 02 v2.0.4] ERROR: {sym} intraday share_300s={intraday_gaps['share_300s']:.3f} < 0.90. "
            "Tu dataset NO es M5 consistente intradía."
        )

    qa_rows.append({
        "symbol": sym,
        "m5_dir": str(M5_DIR),
        "layout_hive": bool(IS_HIVE),
        "rows": int(n_after),
        "dup_removed": int(dup_removed),
        "start_utc": str(start),
        "end_utc": str(end),
        "coverage_intraday_pct": float(coverage_intraday_pct),
        "gaps_total": total_gaps,
        "gaps_intraday": intraday_gaps,
    })

    dfs.append(df.select(required_cols))

# Concatenar todo (schema canónico)
ohlcv = pl.concat(dfs, how="vertical").sort(["symbol", "time_utc"])

missing_cols = [c for c in required_cols if c not in ohlcv.columns]
if missing_cols:
    raise RuntimeError(f"[Celda 02 v2.0.4] ERROR: dataset no canónico, faltan columnas: {missing_cols}")

# Persistir
OUT_OHLCV.parent.mkdir(parents=True, exist_ok=True)
ohlcv.write_parquet(OUT_OHLCV)

qa_report = {
    "cell": "02 v2.0.4",
    "created_utc": datetime.now(timezone.utc).isoformat(timespec="seconds"),
    "run_id": RUN["RUN_ID"],
    "run_dir": str(RUN_DIR),
    "project_root": str(PROJECT_ROOT),
    "m5_dir": str(M5_DIR),
    "m5_dir_mode": M5_DIR_MODE,
    "layout_hive": bool(IS_HIVE),
    "expected_bar_seconds": EXPECTED_BAR_SECONDS,
    "n_rows_total": int(ohlcv.height),
    "n_symbols": int(ohlcv.select(pl.col("symbol").n_unique()).item()),
    "per_symbol": qa_rows,
    "notes": [
        "QA intraday evita penalizar overnight/weekend.",
        "Gate duro: share_300s >= 0.90 para consistencia M5 intradía.",
        "Si no detecta M5_DIR, setea TREND_M5_M5_CLEAN_DIR explícitamente."
    ],
}
OUT_QA.write_text(json.dumps(qa_report, indent=2, ensure_ascii=False), encoding="utf-8")

print(f"\n[Celda 02 v2.0.4] OK — ohlcv_clean creado: {OUT_OHLCV} | exists: {OUT_OHLCV.exists()}")
print(f"[Celda 02 v2.0.4] OK — data_qa_report creado: {OUT_QA} | exists: {OUT_QA.exists()}")

print("\n--- Preview (head) ---")
print(ohlcv.head(5))

print("\n--- QA (resumen) ---")
print(pl.DataFrame(qa_rows).select(["symbol","rows","dup_removed","coverage_intraday_pct"]))



--- Celda 02 v2.0.4 | Preflight ---
RUN_ID     : 20260218_000143_164d8480
RUN_MODE   : NEW_RUN_DEFAULT
PROJECT_ROOT: C:\Quant\projects\MT5_Data_Extraction
RUN_DIR    : C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480
SPECS_PATH : C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\instrument_specs_v2.parquet
OUT_OHLCV  : C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\ohlcv_clean_m5.parquet
OUT_QA     : C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\data_qa_report_v2.json

[Celda 02 v2.0.4] Candidatos M5 probados (exist/parquets):
  - C:\Quant\projects\MT5_Data_Extraction\data\historical_data\m5_clean | exists=True | parquet_count~=2000
  - C:\Quant\projects\MT5_Data_Extraction\data\bulk_data\m5_raw | exists=True | parquet_count~=2000

[Celda 02 v2.0.4] M5_DIR seleccionado: C:\Quant\projects\MT5_Data_Extraction\data\historical_data\m5_clean
[Celda

In [4]:
# ======================================================================================
# Celda 03 v2.0.1 — Cost Model (base/stress + slippage proxy + gap proxy) [RETURNS POR SÍMBOLO OK]
# Propósito:
#   - Costos reproducibles net-of-costs:
#       * base_cost_bps / stress_cost_bps (instrument_specs)
#       * slippage proxy: spread (si existe) o proxy por volatilidad (abs-return) POR SÍMBOLO
#       * gap proxy (equity/fx) como add-on conservador
#   - Prints explícitos por símbolo (componentes y totales).
#
# Inputs:
#   - RUN (Celda 00)
#   - instrument_specs_v2.parquet (Celda 01)
#   - ohlcv_clean_m5.parquet (Celda 02)
#   - data_qa_report.json (Celda 02) [preferible v2.0.4]
#
# Outputs:
#   - cost_model_snapshot.json
#   - cost_model_v2.parquet
#
# ENV:
#   - TREND_M5_FORCE_REBUILD_COST_MODEL=1 => fuerza rebuild
# ======================================================================================

from __future__ import annotations

import os
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Any, Dict, List

import polars as pl

# -----------------------------
# Preflight
# -----------------------------
if "RUN" not in globals():
    raise RuntimeError("[Celda 03 v2.0.1] ERROR: No existe RUN en memoria. Ejecuta primero Celda 00 v2.0.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

INSTRUMENT_SPECS_PATH = RUN_DIR / "instrument_specs_v2.parquet"
OHLCV_CLEAN_PATH = ARTIFACTS["ohlcv_clean"]
QA_REPORT_PATH = ARTIFACTS["data_qa_report"]

if not INSTRUMENT_SPECS_PATH.exists():
    raise RuntimeError("[Celda 03 v2.0.1] ERROR: Falta instrument_specs_v2.parquet. Ejecuta Celda 01 v2.0.")
if not OHLCV_CLEAN_PATH.exists():
    raise RuntimeError("[Celda 03 v2.0.1] ERROR: Falta ohlcv_clean_m5.parquet. Ejecuta Celda 02.")

OUT_COST_SNAPSHOT = ARTIFACTS["cost_model_snapshot"]
OUT_COST_TABLE = RUN_DIR / "cost_model_v2.parquet"

FORCE_REBUILD_COST = os.getenv("TREND_M5_FORCE_REBUILD_COST_MODEL", "").strip().lower() in ("1", "true", "yes")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

# -----------------------------
# Cache
# -----------------------------
if OUT_COST_SNAPSHOT.exists() and OUT_COST_TABLE.exists() and (not FORCE_REBUILD_COST):
    print(f"[Celda 03 v2.0.1] Cache detectado. Usando cost model existente:\n  - {OUT_COST_SNAPSHOT}\n  - {OUT_COST_TABLE}")
    snap = json.loads(OUT_COST_SNAPSHOT.read_text(encoding="utf-8"))
    print("\n--- Cost Model Snapshot (resumen) ---")
    for r in snap.get("per_symbol", []):
        print(f"  {r['symbol']}: total_base_bps={r['total_base_bps']:.2f}, total_stress_bps={r['total_stress_bps']:.2f} "
              f"(slip_base={r['slippage_base_bps']:.2f}, slip_stress={r['slippage_stress_bps']:.2f}, gap_base={r['gap_base_bps']:.2f}, gap_stress={r['gap_stress_bps']:.2f})")
    print("\n[Celda 03 v2.0.1] OK — cost model listo.")
else:
    specs = pl.read_parquet(INSTRUMENT_SPECS_PATH)

    # QA flag (no bloquea, solo imprime)
    qa_session_aware = False
    if QA_REPORT_PATH.exists():
        try:
            qa = json.loads(QA_REPORT_PATH.read_text(encoding="utf-8"))
            qa_session_aware = bool(qa.get("cell", "").startswith("02 v2."))
        except Exception:
            qa_session_aware = False
    print(f"[Celda 03 v2.0.1] QA detected: {qa_session_aware} | QA path: {QA_REPORT_PATH}")

    # -----------------------------
    # Microstructure proxies (POR SÍMBOLO)
    # -----------------------------
    # Nota institucional:
    # - spread_bps proxy depende de que spread exista.
    # - abs_ret_bps proxy usa retornos close-to-close POR SÍMBOLO.
    # - Si el archivo no estuviera ordenado por symbol/time, esto debería fallar en QA previo. Aquí imprimimos sanity.
    sanity = (
        pl.scan_parquet(OHLCV_CLEAN_PATH)
        .select(["symbol", "time_utc"])
        .group_by("symbol")
        .agg([
            (pl.col("time_utc").diff().dt.total_seconds().min()).alias("min_dt_sec"),
            (pl.col("time_utc").diff().dt.total_seconds().max()).alias("max_dt_sec"),
            pl.len().alias("n_rows"),
        ])
        .collect()
        .sort("symbol")
    )
    print("\n--- Sanity order/spacing (time_utc diff stats) ---")
    print(sanity)
    bad_order = sanity.filter(pl.col("min_dt_sec") < 0)
    if bad_order.height > 0:
        raise RuntimeError(f"[Celda 03 v2.0.1] ERROR: time_utc no está ordenado (min_dt_sec<0) en: {bad_order.select('symbol').to_series().to_list()}")

    df_stats = (
        pl.scan_parquet(OHLCV_CLEAN_PATH)
        .select(["symbol", "time_utc", "close", "spread"])
        .with_columns([
            pl.col("close").shift(1).over("symbol").alias("close_prev"),
        ])
        .with_columns([
            pl.when(pl.col("close_prev").is_not_null() & (pl.col("close_prev") > 0))
              .then((pl.col("close") / pl.col("close_prev") - 1.0).abs())
              .otherwise(None)
              .alias("abs_ret"),
        ])
        .with_columns([
            (pl.col("abs_ret") * 10_000).alias("abs_ret_bps"),
            pl.when(pl.col("spread").is_not_null() & (pl.col("close") > 0))
              .then((pl.col("spread") / pl.col("close")) * 10_000)
              .otherwise(None)
              .alias("spread_bps"),
        ])
        .group_by("symbol")
        .agg([
            pl.len().alias("n_rows"),
            pl.col("spread_bps").drop_nulls().len().alias("n_spread_nonnull"),
            pl.col("spread_bps").median().alias("spread_med_bps"),
            pl.col("spread_bps").quantile(0.95, "nearest").alias("spread_p95_bps"),
            pl.col("abs_ret_bps").median().alias("vol_med_absret_bps"),
            pl.col("abs_ret_bps").quantile(0.95, "nearest").alias("vol_p95_absret_bps"),
        ])
        .collect()
        .sort("symbol")
    )

    print("\n--- Microstructure proxies (from OHLCV clean) ---")
    print(df_stats)

    # -----------------------------
    # Construir cost model por símbolo (con prints explícitos)
    # -----------------------------
    rows: List[Dict[str, Any]] = []

    for r in specs.to_dicts():
        sym = r["symbol"]
        asset_class = r["asset_class"]
        base_bps = float(r["base_cost_bps"])
        stress_bps = float(r["stress_cost_bps"])

        s = df_stats.filter(pl.col("symbol") == sym)
        if s.height != 1:
            raise RuntimeError(f"[Celda 03 v2.0.1] ERROR: no encuentro stats para {sym} en OHLCV clean.")
        srow = s.row(0, named=True)

        n_spread_nonnull = int(srow["n_spread_nonnull"])
        n_rows = int(srow["n_rows"])
        spread_med = srow["spread_med_bps"]
        spread_p95 = srow["spread_p95_bps"]
        vol_med = srow["vol_med_absret_bps"]
        vol_p95 = srow["vol_p95_absret_bps"]

        spread_usable = (n_spread_nonnull / max(n_rows, 1)) >= 0.10

        if spread_usable and (spread_med is not None):
            slip_base = float(max(spread_med, 0.0)) * 0.50
            slip_stress = float(max(spread_p95 or spread_med, 0.0)) * 0.75
            slip_method = "spread_bps_proxy"
        else:
            slip_base = float(max(vol_med or 0.0, 0.0)) * 0.10
            slip_stress = float(max(vol_p95 or vol_med or 0.0, 0.0)) * 0.15
            slip_method = "vol_absret_bps_proxy"

        if asset_class == "equity":
            gap_base = 2.0
            gap_stress = 6.0
        elif asset_class == "fx_metal":
            gap_base = 1.0
            gap_stress = 3.0
        else:
            gap_base = 0.0
            gap_stress = 0.0

        total_base = base_bps + slip_base + gap_base
        total_stress = stress_bps + slip_stress + gap_stress

        # Print explícito por símbolo
        print("\n" + "-" * 100)
        print(f"[Celda 03 v2.0.1] {sym} | asset_class={asset_class}")
        print(f"  base_cost_bps={base_bps:.2f} | stress_cost_bps={stress_bps:.2f}")
        print(f"  slippage_method={slip_method} | spread_coverage_pct={(n_spread_nonnull/max(n_rows,1))*100.0:.2f}%")
        print(f"  spread_med_bps={spread_med} | spread_p95_bps={spread_p95}")
        print(f"  vol_med_absret_bps={vol_med} | vol_p95_absret_bps={vol_p95}")
        print(f"  slippage_base_bps={slip_base:.3f} | slippage_stress_bps={slip_stress:.3f}")
        print(f"  gap_base_bps={gap_base:.2f} | gap_stress_bps={gap_stress:.2f}")
        print(f"  >>> TOTAL_BASE_BPS={total_base:.3f} | TOTAL_STRESS_BPS={total_stress:.3f}")

        rows.append({
            "symbol": sym,
            "asset_class": asset_class,
            "base_cost_bps": base_bps,
            "stress_cost_bps": stress_bps,
            "slippage_base_bps": slip_base,
            "slippage_stress_bps": slip_stress,
            "gap_base_bps": gap_base,
            "gap_stress_bps": gap_stress,
            "total_base_bps": total_base,
            "total_stress_bps": total_stress,
            "slippage_method": slip_method,
            "spread_med_bps": float(spread_med) if spread_med is not None else None,
            "spread_p95_bps": float(spread_p95) if spread_p95 is not None else None,
            "vol_med_absret_bps": float(vol_med) if vol_med is not None else None,
            "vol_p95_absret_bps": float(vol_p95) if vol_p95 is not None else None,
            "spread_coverage_pct": float(n_spread_nonnull / max(n_rows, 1) * 100.0),
        })

    cost_table = pl.DataFrame(rows).sort("symbol")

    # Gates
    bad = cost_table.filter(
        (pl.col("total_base_bps") <= 0) |
        (pl.col("total_stress_bps") <= 0) |
        (pl.col("total_stress_bps") < pl.col("total_base_bps"))
    )
    if bad.height > 0:
        raise RuntimeError(f"[Celda 03 v2.0.1] ERROR: cost model inválido:\n{bad}")

    OUT_COST_TABLE.parent.mkdir(parents=True, exist_ok=True)
    cost_table.write_parquet(OUT_COST_TABLE)

    snapshot = {
        "cell": "03 v2.0.1",
        "created_utc": _now_utc_iso(),
        "qa_detected": qa_session_aware,
        "notes": [
            "total_*_bps = base/stress (spec) + slippage proxy + gap proxy.",
            "slippage proxy: usa spread si existe; caso contrario, proxy por abs-return POR SÍMBOLO.",
            "gap proxy es add-on conservador; se refina más adelante si se requiere.",
        ],
        "per_symbol": cost_table.to_dicts(),
    }

    OUT_COST_SNAPSHOT.parent.mkdir(parents=True, exist_ok=True)
    OUT_COST_SNAPSHOT.write_text(json.dumps(snapshot, indent=2, ensure_ascii=False), encoding="utf-8")

    print(f"\n[Celda 03 v2.0.1] OK — cost model guardado:")
    print(f"  - {OUT_COST_SNAPSHOT}")
    print(f"  - {OUT_COST_TABLE}")

    print("\n--- Cost Model Table (v2.0.1) ---")
    print(cost_table)

    warn = cost_table.filter(pl.col("slippage_method") == "vol_absret_bps_proxy")
    if warn.height > 0:
        print("\n[Celda 03 v2.0.1] AVISO: spread no utilizable; slippage estimado por proxy de volatilidad:")
        print(warn.select(["symbol", "slippage_method", "spread_coverage_pct", "vol_med_absret_bps", "slippage_base_bps", "slippage_stress_bps", "total_base_bps", "total_stress_bps"]))

    print("\n[Celda 03 v2.0.1] OK — costos listos para net-of-costs en baseline/alpha/engine.")
 

[Celda 03 v2.0.1] QA detected: True | QA path: C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\data_qa_report_v2.json

--- Sanity order/spacing (time_utc diff stats) ---
shape: (4, 4)
┌────────┬────────────┬────────────┬────────┐
│ symbol ┆ min_dt_sec ┆ max_dt_sec ┆ n_rows │
│ ---    ┆ ---        ┆ ---        ┆ ---    │
│ str    ┆ i64        ┆ i64        ┆ u32    │
╞════════╪════════════╪════════════╪════════╡
│ BNBUSD ┆ 300        ┆ 125100     ┆ 430323 │
│ BTCUSD ┆ 300        ┆ 172800     ┆ 358028 │
│ LVMH   ┆ 300        ┆ 488400     ┆ 109289 │
│ XAUAUD ┆ 300        ┆ 264000     ┆ 297135 │
└────────┴────────────┴────────────┴────────┘

--- Microstructure proxies (from OHLCV clean) ---
shape: (4, 7)
┌────────┬────────┬────────────────┬───────────────┬───────────────┬───────────────┬───────────────┐
│ symbol ┆ n_rows ┆ n_spread_nonnu ┆ spread_med_bp ┆ spread_p95_bp ┆ vol_med_absre ┆ vol_p95_absre │
│ ---    ┆ ---    ┆ ll             ┆ s             ┆ 

In [5]:
# ======================================================================================
# Celda 04 v2.0.1 — WFO Builder (≥6 folds + embargo/purge) [SESSION-AWARE GATES]
# Fix vs v2.0:
#   - Gate de tamaño ahora es por asset_class y además por "trading days" (más defendible).
#   - Evita bloquear equities session-only (LVMH) con thresholds 24/7.
# ======================================================================================

from __future__ import annotations

import os
import json
import calendar
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from pathlib import Path
from typing import Any, Dict, List, Tuple, Optional

import polars as pl

# -----------------------------
# Preflight
# -----------------------------
if "RUN" not in globals():
    raise RuntimeError("[Celda 04 v2.0.1] ERROR: No existe RUN en memoria. Ejecuta primero Celda 00 v2.0.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

INSTRUMENT_SPECS_PATH = RUN_DIR / "instrument_specs_v2.parquet"
OHLCV_CLEAN_PATH = ARTIFACTS["ohlcv_clean"]
QA_REPORT_PATH = ARTIFACTS["data_qa_report"]

if not INSTRUMENT_SPECS_PATH.exists():
    raise RuntimeError("[Celda 04 v2.0.1] ERROR: Falta instrument_specs_v2.parquet. Ejecuta Celda 01 v2.0.")
if not OHLCV_CLEAN_PATH.exists():
    raise RuntimeError("[Celda 04 v2.0.1] ERROR: Falta ohlcv_clean_m5.parquet. Ejecuta Celda 02.")

OUT_WFO_FOLDS = ARTIFACTS["wfo_folds"]
OUT_WFO_SNAPSHOT = ARTIFACTS["wfo_folds_snapshot"]

FORCE_REBUILD = os.getenv("TREND_M5_FORCE_REBUILD_WFO", "1").strip().lower() in ("1", "true", "yes")  # default=1 aquí
WFO_MODE = os.getenv("TREND_M5_WFO_MODE", "expanding").strip().lower()
if WFO_MODE not in ("expanding", "rolling"):
    raise ValueError("[Celda 04 v2.0.1] ERROR: TREND_M5_WFO_MODE debe ser 'expanding' o 'rolling'.")

MIN_FOLDS = int(os.getenv("TREND_M5_MIN_FOLDS", "6"))
MAX_HOLD_BARS = int(os.getenv("TREND_M5_MAX_HOLD_BARS", "2016"))  # ~1 semana
EXPECTED_BAR_SECONDS = 300  # M5
EMBARGO = timedelta(seconds=MAX_HOLD_BARS * EXPECTED_BAR_SECONDS)

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

def add_months(dt: datetime, months: int) -> datetime:
    y = dt.year
    m = dt.month + months
    while m > 12:
        y += 1
        m -= 12
    while m < 1:
        y -= 1
        m += 12
    last_day = calendar.monthrange(y, m)[1]
    d = min(dt.day, last_day)
    return dt.replace(year=y, month=m, day=d)

@dataclass
class WFOConfig:
    is_months: int
    oos_months: int
    step_months: int

CANDIDATE_CONFIGS_BY_ASSET = {
    "crypto": [
        WFOConfig(18, 3, 3),
        WFOConfig(12, 3, 3),
        WFOConfig(12, 2, 2),
    ],
    "equity": [
        WFOConfig(18, 3, 3),  # se mantiene; el gate ahora es session-aware
        WFOConfig(12, 3, 3),
        WFOConfig(12, 2, 2),
    ],
    "fx_metal": [
        WFOConfig(18, 3, 3),
        WFOConfig(12, 3, 3),
        WFOConfig(12, 2, 2),
    ],
}

# -----------------------------
# Gates session-aware (por asset_class)
# -----------------------------
# Días son el gate principal (defendible). Barras son sanity.
GATES = {
    "crypto":   {"min_is_days": 365, "min_oos_days": 60, "min_is_bars": 70_000, "min_oos_bars": 20_000},
    "fx_metal": {"min_is_days": 365, "min_oos_days": 60, "min_is_bars": 80_000, "min_oos_bars": 12_000},
    "equity":   {"min_is_days": 250, "min_oos_days": 60, "min_is_bars": 35_000, "min_oos_bars": 6_000},
    "unknown":  {"min_is_days": 250, "min_oos_days": 60, "min_is_bars": 35_000, "min_oos_bars": 6_000},
}

# -----------------------------
# Construcción (sin cache por defecto)
# -----------------------------
specs = pl.read_parquet(INSTRUMENT_SPECS_PATH)
universe = specs.select("symbol").to_series().to_list()
spec_map = {r["symbol"]: r for r in specs.to_dicts()}

df_ranges = (
    pl.scan_parquet(OHLCV_CLEAN_PATH)
    .group_by("symbol")
    .agg([
        pl.min("time_utc").alias("start_utc"),
        pl.max("time_utc").alias("end_utc"),
        pl.len().alias("n_rows"),
    ])
    .collect()
    .sort("symbol")
)

print("[Celda 04 v2.0.1] Universe:", universe)
print("\n--- Data ranges (ohlcv_clean) ---")
print(df_ranges)

qa_cell = None
if QA_REPORT_PATH.exists():
    try:
        qa_cell = json.loads(QA_REPORT_PATH.read_text(encoding="utf-8")).get("cell")
    except Exception:
        qa_cell = None

# Precompute daily calendar por símbolo (más eficiente que re-scan por fold)
daily = (
    pl.scan_parquet(OHLCV_CLEAN_PATH)
    .select([
        pl.col("symbol"),
        pl.col("time_utc").dt.truncate("1d").alias("day"),
    ])
    .unique()
    .collect()
)

def count_bars(sym: str, t0: datetime, t1: datetime) -> int:
    return int(
        pl.scan_parquet(OHLCV_CLEAN_PATH)
        .filter((pl.col("symbol") == sym) & (pl.col("time_utc") >= t0) & (pl.col("time_utc") < t1))
        .select(pl.len())
        .collect()
        .item()
    )

def count_days(sym: str, t0: datetime, t1: datetime) -> int:
    return int(
        daily.filter((pl.col("symbol") == sym) & (pl.col("day") >= t0.replace(hour=0, minute=0, second=0, microsecond=0))
                     & (pl.col("day") <  t1.replace(hour=0, minute=0, second=0, microsecond=0)))
        .select(pl.len())
        .item()
    )

def possible_fold_count(start: datetime, end: datetime, cfg: WFOConfig, embargo: timedelta) -> int:
    is_start = start
    is_end = add_months(is_start, cfg.is_months)
    n = 0
    while True:
        oos_start = is_end + embargo
        oos_end = add_months(oos_start, cfg.oos_months)
        if oos_end > end:
            break
        n += 1
        is_end = add_months(is_end, cfg.step_months)
        if is_end >= end:
            break
    return n

all_folds_rows: List[Dict[str, Any]] = []
snapshot_per_symbol: List[Dict[str, Any]] = []

for sym in universe:
    r = df_ranges.filter(pl.col("symbol") == sym)
    start_dt = r.select("start_utc").item()
    end_dt = r.select("end_utc").item()
    n_rows = int(r.select("n_rows").item())

    asset_class = spec_map[sym].get("asset_class", "unknown")
    cfg_candidates = CANDIDATE_CONFIGS_BY_ASSET.get(asset_class, [WFOConfig(12, 3, 3)])

    chosen_cfg: Optional[WFOConfig] = None
    chosen_count = 0

    for cfg in cfg_candidates:
        cnt = possible_fold_count(start_dt, end_dt, cfg, EMBARGO)
        if cnt >= MIN_FOLDS:
            chosen_cfg = cfg
            chosen_count = cnt
            break
        if cnt > chosen_count:
            chosen_cfg = cfg
            chosen_count = cnt

    if chosen_cfg is None or chosen_count < 3:
        raise RuntimeError(f"[Celda 04 v2.0.1] ERROR: WFO indefendible para {sym}. folds_possible={chosen_count}")

    print("\n" + "-" * 100)
    print(f"[Celda 04 v2.0.1] {sym} | asset_class={asset_class}")
    print(f"  data: start={start_dt}  end={end_dt}  n_rows={n_rows:,}")
    print(f"  chosen_cfg: IS={chosen_cfg.is_months}m  OOS={chosen_cfg.oos_months}m  STEP={chosen_cfg.step_months}m")
    print(f"  embargo_days={EMBARGO.total_seconds()/86400.0:.2f} | folds_possible={chosen_count} | mode={WFO_MODE}")

    fold_id = 1
    is_start = start_dt
    is_end = add_months(is_start, chosen_cfg.is_months)

    while True:
        embargo_start = is_end
        embargo_end = is_end + EMBARGO
        oos_start = embargo_end
        oos_end = add_months(oos_start, chosen_cfg.oos_months)

        if oos_end > end_dt:
            break

        if WFO_MODE == "rolling":
            is_start_eff = add_months(is_start, (fold_id - 1) * chosen_cfg.step_months)
            is_end_eff = is_end
        else:
            is_start_eff = is_start
            is_end_eff = is_end

        is_bars = count_bars(sym, is_start_eff, is_end_eff)
        oos_bars = count_bars(sym, oos_start, oos_end)

        is_days = count_days(sym, is_start_eff, is_end_eff)
        oos_days = count_days(sym, oos_start, oos_end)

        all_folds_rows.append({
            "symbol": sym,
            "fold_id": int(fold_id),
            "asset_class": asset_class,
            "wfo_mode": WFO_MODE,
            "is_start_utc": is_start_eff,
            "is_end_utc": is_end_eff,
            "embargo_start_utc": embargo_start,
            "embargo_end_utc": embargo_end,
            "oos_start_utc": oos_start,
            "oos_end_utc": oos_end,
            "is_bars": int(is_bars),
            "oos_bars": int(oos_bars),
            "is_days": int(is_days),
            "oos_days": int(oos_days),
            "cfg_is_months": int(chosen_cfg.is_months),
            "cfg_oos_months": int(chosen_cfg.oos_months),
            "cfg_step_months": int(chosen_cfg.step_months),
            "embargo_bars": int(MAX_HOLD_BARS),
        })

        print(f"  fold={fold_id:02d} | IS bars={is_bars:,} days={is_days} | OOS bars={oos_bars:,} days={oos_days}")

        fold_id += 1
        is_end = add_months(is_end, chosen_cfg.step_months)
        if is_end >= end_dt:
            break

    snapshot_per_symbol.append({
        "symbol": sym,
        "asset_class": asset_class,
        "config": {"is_months": chosen_cfg.is_months, "oos_months": chosen_cfg.oos_months, "step_months": chosen_cfg.step_months},
        "wfo_mode": WFO_MODE,
        "n_folds": int(fold_id - 1),
        "embargo_bars": int(MAX_HOLD_BARS),
        "embargo_days": float(EMBARGO.total_seconds() / 86400.0),
        "data_start_utc": str(start_dt),
        "data_end_utc": str(end_dt),
        "n_rows": int(n_rows),
    })

wfo_df = pl.DataFrame(all_folds_rows).sort(["symbol", "fold_id"])

# Gate A: folds por símbolo
folds_by_sym = wfo_df.group_by("symbol").agg(pl.len().alias("n_folds")).sort("symbol")
print("\n--- Folds por símbolo ---")
print(folds_by_sym)

too_few = folds_by_sym.filter(pl.col("n_folds") < 3)
if too_few.height > 0:
    raise RuntimeError(f"[Celda 04 v2.0.1] ERROR: símbolos con <3 folds: {too_few}")

# Gate B: session-aware (por asset_class)
def gate_row(asset_class: str) -> Dict[str, int]:
    g = GATES.get(asset_class, GATES["unknown"])
    return {k: int(v) for k, v in g.items()}

bad_rows = []
for row in wfo_df.iter_rows(named=True):
    g = gate_row(row["asset_class"])
    if (row["is_days"] < g["min_is_days"]) or (row["oos_days"] < g["min_oos_days"]) or (row["is_bars"] < g["min_is_bars"]) or (row["oos_bars"] < g["min_oos_bars"]):
        bad_rows.append({
            "symbol": row["symbol"],
            "fold_id": row["fold_id"],
            "asset_class": row["asset_class"],
            "is_bars": row["is_bars"], "oos_bars": row["oos_bars"],
            "is_days": row["is_days"], "oos_days": row["oos_days"],
            **{f"gate_{k}": v for k, v in g.items()}
        })

if bad_rows:
    bad_df = pl.DataFrame(bad_rows).sort(["symbol", "fold_id"])
    print("\n[Celda 04 v2.0.1] Detalle folds que NO pasan gates session-aware (se detiene):")
    print(bad_df)
    raise RuntimeError(
        "[Celda 04 v2.0.1] ERROR: Hay folds que no cumplen mínimos day-aware y bar-aware por asset_class. "
        "Si esto pasa, tu data o tus ventanas son insuficientes para WFO defendible."
    )

# Gate C: no OOS overlap por símbolo
bad_overlap = []
for sym in wfo_df.select("symbol").unique().to_series().to_list():
    s = wfo_df.filter(pl.col("symbol") == sym).sort("fold_id")
    prev_end = None
    for row in s.iter_rows(named=True):
        if prev_end is not None and row["oos_start_utc"] < prev_end:
            bad_overlap.append((sym, row["fold_id"]))
        prev_end = row["oos_end_utc"]
if bad_overlap:
    raise RuntimeError(f"[Celda 04 v2.0.1] ERROR: OOS overlap detectado: {bad_overlap}")

# Persistir
OUT_WFO_FOLDS.parent.mkdir(parents=True, exist_ok=True)
wfo_df.write_parquet(OUT_WFO_FOLDS)

snapshot = {
    "cell": "04 v2.0.1",
    "created_utc": _now_utc_iso(),
    "qa_cell_detected": qa_cell,
    "wfo_mode": WFO_MODE,
    "min_folds_target": MIN_FOLDS,
    "max_hold_bars": MAX_HOLD_BARS,
    "embargo_days": float(EMBARGO.total_seconds() / 86400.0),
    "gates": GATES,
    "notes": [
        "Gates cambiados a day-aware + bar-aware por asset_class (session-aware).",
        "Esto evita bloquear equities session-only con umbrales 24/7.",
    ],
    "per_symbol": snapshot_per_symbol,
    "folds_path": str(OUT_WFO_FOLDS),
}

OUT_WFO_SNAPSHOT.parent.mkdir(parents=True, exist_ok=True)
OUT_WFO_SNAPSHOT.write_text(json.dumps(snapshot, indent=2, ensure_ascii=False), encoding="utf-8")

print(f"\n[Celda 04 v2.0.1] OK — WFO folds guardados:")
print(f"  - {OUT_WFO_FOLDS}")
print(f"  - {OUT_WFO_SNAPSHOT}")

print("\n--- WFO Folds (preview) ---")
print(wfo_df.head(12))

print("\n[Celda 04 v2.0.1] OK — Se permite avanzar a Celda 05.")
 

[Celda 04 v2.0.1] Universe: ['BNBUSD', 'BTCUSD', 'LVMH', 'XAUAUD']

--- Data ranges (ohlcv_clean) ---
shape: (4, 4)
┌────────┬─────────────────────┬─────────────────────┬────────┐
│ symbol ┆ start_utc           ┆ end_utc             ┆ n_rows │
│ ---    ┆ ---                 ┆ ---                 ┆ ---    │
│ str    ┆ datetime[ms]        ┆ datetime[ms]        ┆ u32    │
╞════════╪═════════════════════╪═════════════════════╪════════╡
│ BNBUSD ┆ 2021-11-19 00:00:00 ┆ 2026-02-16 23:50:00 ┆ 430323 │
│ BTCUSD ┆ 2021-11-19 00:00:00 ┆ 2026-02-16 23:50:00 ┆ 358028 │
│ LVMH   ┆ 2021-11-19 10:00:00 ┆ 2026-02-16 18:25:00 ┆ 109289 │
│ XAUAUD ┆ 2021-11-19 01:05:00 ┆ 2026-02-16 21:25:00 ┆ 297135 │
└────────┴─────────────────────┴─────────────────────┴────────┘

----------------------------------------------------------------------------------------------------
[Celda 04 v2.0.1] BNBUSD | asset_class=crypto
  data: start=2021-11-19 00:00:00  end=2026-02-16 23:50:00  n_rows=430,323
  chosen_cfg: IS=18m 

In [6]:
# ======================================================================================
# Celda 05 v2.0.3 — Feature Set (Causal): Trendiness + Direction [FIX nested windows]
# Fix vs v2.0.2:
#   - Elimina "window dentro de rolling/window": primero materializa columnas base (ret, true_range, abs_diff),
#     luego aplica rollings/EMAs usando pl.col("...") (sin anidar ventanas).
#   - Mantiene Lazy-friendly y sin ColumnNotFound.
# ======================================================================================

from __future__ import annotations

import os
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict

import polars as pl

# -----------------------------
# Preflight
# -----------------------------
if "RUN" not in globals():
    raise RuntimeError("[Celda 05 v2.0.3] ERROR: No existe RUN en memoria. Ejecuta primero Celda 00 v2.0.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

OHLCV_CLEAN_PATH = ARTIFACTS.get("ohlcv_clean", RUN_DIR / "ohlcv_clean_m5.parquet")
QA_REPORT_PATH = ARTIFACTS.get("data_qa_report", RUN_DIR / "data_qa_report.json")
WFO_FOLDS_PATH = ARTIFACTS.get("wfo_folds", RUN_DIR / "wfo_folds.parquet")

if not OHLCV_CLEAN_PATH.exists():
    raise RuntimeError("[Celda 05 v2.0.3] ERROR: Falta ohlcv_clean_m5.parquet. Ejecuta Celda 02.")
if not WFO_FOLDS_PATH.exists():
    raise RuntimeError("[Celda 05 v2.0.3] ERROR: Falta wfo_folds.parquet. Ejecuta Celda 04.")

OUT_FEATURES = RUN_DIR / "features_m5_v2.parquet"
OUT_SNAPSHOT = RUN_DIR / "features_snapshot_v2.json"

RUN["ARTIFACTS"]["features_m5"] = OUT_FEATURES
RUN["ARTIFACTS"]["features_snapshot"] = OUT_SNAPSHOT

FORCE_REBUILD = os.getenv("TREND_M5_FORCE_REBUILD_FEATURES", "").strip().lower() in ("1", "true", "yes")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

# -----------------------------
# Parámetros
# -----------------------------
EXPECTED_BAR_SECONDS = 300

ER_WIN = int(os.getenv("TREND_M5_ER_WIN", "288"))
VOL_WIN = int(os.getenv("TREND_M5_VOL_WIN", "288"))
MOM_WIN = int(os.getenv("TREND_M5_MOM_WIN", "288"))
ATR_WIN = int(os.getenv("TREND_M5_ATR_WIN", "96"))

EMA_FAST = int(os.getenv("TREND_M5_EMA_FAST", "200"))
EMA_SLOW = int(os.getenv("TREND_M5_EMA_SLOW", "600"))
SLOPE_WIN = int(os.getenv("TREND_M5_SLOPE_WIN", "50"))

EPS = 1e-12

# -----------------------------
# Cache
# -----------------------------
if OUT_FEATURES.exists() and OUT_SNAPSHOT.exists() and (not FORCE_REBUILD):
    print(f"[Celda 05 v2.0.3] Cache detectado. Usando features existentes:\n  - {OUT_FEATURES}\n  - {OUT_SNAPSHOT}")
    snap = json.loads(OUT_SNAPSHOT.read_text(encoding="utf-8"))
    print("\n--- Features Snapshot (resumen) ---")
    print("  params:", snap.get("params", {}))
    print("  symbols:", snap.get("symbols", []))
    print("  schema_cols(sample):", snap.get("schema_cols", [])[:20], "...")
    print("\n[Celda 05 v2.0.3] OK — features listos.")
else:
    lf0 = (
        pl.scan_parquet(OHLCV_CLEAN_PATH)
        .select(["symbol", "time_utc", "open", "high", "low", "close", "volume", "spread"])
        .sort(["symbol", "time_utc"])
    )

    cols = lf0.collect_schema().names()
    required = ["symbol", "time_utc", "open", "high", "low", "close"]
    missing = [c for c in required if c not in cols]
    if missing:
        raise RuntimeError(f"[Celda 05 v2.0.3] ERROR: faltan columnas en ohlcv_clean: {missing}")

    # ============================================================
    # Stage 1: columnas base (permitido usar .over aquí)
    #   - IMPORTANT: no usar estas expresiones dentro de rolling posteriormente.
    # ============================================================
    close_prev = pl.col("close").shift(1).over("symbol")

    ret_expr = (
        pl.when(close_prev.is_not_null() & (close_prev > 0))
        .then(pl.col("close") / close_prev - 1.0)
        .otherwise(None)
    )

    # abs_diff por símbolo (evita diff().over, y evita nested windows)
    abs_diff_expr = (pl.col("close") - pl.col("close").shift(1).over("symbol")).abs()

    # true range base (por símbolo)
    tr_expr = pl.max_horizontal([
        (pl.col("high") - pl.col("low")),
        (pl.col("high") - pl.col("close").shift(1).over("symbol")).abs(),
        (pl.col("low")  - pl.col("close").shift(1).over("symbol")).abs(),
    ])

    lf1 = (
        lf0.with_columns([
            close_prev.alias("close_prev"),
            ret_expr.alias("ret"),
            ret_expr.abs().alias("abs_ret"),
            abs_diff_expr.alias("abs_diff"),
            tr_expr.alias("true_range"),
        ])
    )

    # ============================================================
    # Stage 2: rollings sobre columnas materializadas (NO nested windows)
    # ============================================================
    lf2 = (
        lf1.with_columns([
            (pl.col("ret").rolling_std(window_size=VOL_WIN, min_samples=VOL_WIN).over("symbol") * 10_000)
                .alias(f"vol_bps_{VOL_WIN}"),

            (pl.col("true_range").rolling_mean(window_size=ATR_WIN, min_samples=ATR_WIN).over("symbol") / pl.col("close") * 10_000)
                .alias(f"atr_bps_{ATR_WIN}"),

            ((pl.col("close") / pl.col("close").shift(MOM_WIN).over("symbol") - 1.0) * 10_000)
                .alias(f"mom_bps_{MOM_WIN}"),
        ])
    )

    lf3 = (
        lf2.with_columns([
            (pl.col(f"mom_bps_{MOM_WIN}").abs() / (pl.col(f"vol_bps_{VOL_WIN}") + EPS))
                .alias(f"mom_eff_{MOM_WIN}"),

            ((pl.col("close") - pl.col("close").shift(ER_WIN).over("symbol")).abs() /
             (pl.col("abs_diff").rolling_sum(window_size=ER_WIN, min_samples=ER_WIN).over("symbol") + EPS))
                .alias(f"er_{ER_WIN}"),
        ])
    )

    # ============================================================
    # Stage 3: EMAs y dirección (columnas materializadas => combinaciones seguras)
    # ============================================================
    lf4 = (
        lf3.with_columns([
            pl.col("close").ewm_mean(span=EMA_FAST, adjust=False).over("symbol").alias(f"ema_{EMA_FAST}"),
            pl.col("close").ewm_mean(span=EMA_SLOW, adjust=False).over("symbol").alias(f"ema_{EMA_SLOW}"),
        ])
        .with_columns([
            pl.when(pl.col(f"ema_{EMA_FAST}") > pl.col(f"ema_{EMA_SLOW}")).then(1)
              .when(pl.col(f"ema_{EMA_FAST}") < pl.col(f"ema_{EMA_SLOW}")).then(-1)
              .otherwise(0)
              .alias("trend_dir"),

            (((pl.col(f"ema_{EMA_FAST}") - pl.col(f"ema_{EMA_SLOW}")).abs() / pl.col("close")) * 10_000)
              .alias("trend_strength_bps"),

            (((pl.col(f"ema_{EMA_SLOW}") / pl.col(f"ema_{EMA_SLOW}").shift(SLOPE_WIN).over("symbol")) - 1.0) * 10_000)
              .alias(f"trend_slope_bps_{SLOPE_WIN}"),
        ])
    )

    # -----------------------------
    # Final select (canónico)
    # -----------------------------
    lf_feat = (
        lf4.select([
            "symbol",
            "time_utc",
            "open", "high", "low", "close",
            "volume", "spread",
            "ret",
            f"vol_bps_{VOL_WIN}",
            f"atr_bps_{ATR_WIN}",
            f"mom_bps_{MOM_WIN}",
            f"mom_eff_{MOM_WIN}",
            f"er_{ER_WIN}",
            f"ema_{EMA_FAST}",
            f"ema_{EMA_SLOW}",
            "trend_dir",
            "trend_strength_bps",
            f"trend_slope_bps_{SLOPE_WIN}",
        ])
        .sort(["symbol", "time_utc"])
    )

    df_feat = lf_feat.collect()

    # -----------------------------
    # QA / sanity
    # -----------------------------
    mono = (
        df_feat.group_by("symbol")
        .agg((pl.col("time_utc").diff().drop_nulls().min() >= 0).alias("is_sorted"))
        .sort("symbol")
    )
    if mono.filter(pl.col("is_sorted") == False).height > 0:
        raise RuntimeError(f"[Celda 05 v2.0.3] ERROR: time_utc no está ordenado en features:\n{mono}")

    key_cols = [
        f"er_{ER_WIN}",
        f"vol_bps_{VOL_WIN}",
        f"atr_bps_{ATR_WIN}",
        f"mom_bps_{MOM_WIN}",
        f"ema_{EMA_FAST}",
        f"ema_{EMA_SLOW}",
    ]
    null_report = (
        df_feat.group_by("symbol")
        .agg([(pl.col(c).is_null().mean() * 100.0).alias(f"null_pct_{c}") for c in key_cols])
        .sort("symbol")
    )

    OUT_FEATURES.parent.mkdir(parents=True, exist_ok=True)
    df_feat.write_parquet(OUT_FEATURES)

    qa_cell = None
    if QA_REPORT_PATH.exists():
        try:
            qa_cell = json.loads(QA_REPORT_PATH.read_text(encoding="utf-8")).get("cell")
        except Exception:
            qa_cell = None

    snapshot = {
        "cell": "05 v2.0.3",
        "created_utc": _now_utc_iso(),
        "qa_cell_detected": qa_cell,
        "params": {
            "ER_WIN": ER_WIN,
            "VOL_WIN": VOL_WIN,
            "MOM_WIN": MOM_WIN,
            "ATR_WIN": ATR_WIN,
            "EMA_FAST": EMA_FAST,
            "EMA_SLOW": EMA_SLOW,
            "SLOPE_WIN": SLOPE_WIN,
            "EXPECTED_BAR_SECONDS": EXPECTED_BAR_SECONDS,
        },
        "schema_cols": df_feat.columns,
        "symbols": df_feat.select("symbol").unique().to_series().to_list(),
        "notes": [
            "Fix nested windows: primero columnas base, luego rollings/EMAs sobre pl.col(...) materializadas.",
            "Features causales (<=t) alineadas con entrada t+1.",
        ],
    }
    OUT_SNAPSHOT.write_text(json.dumps(snapshot, indent=2, ensure_ascii=False), encoding="utf-8")

    print(f"[Celda 05 v2.0.3] OK — features guardados:")
    print(f"  - {OUT_FEATURES}")
    print(f"  - {OUT_SNAPSHOT}")

    print("\n--- Features preview ---")
    print(df_feat.head(8))

    print("\n--- Monotonicidad time_utc por símbolo ---")
    print(mono)

    print("\n--- Null% (warmup) en features clave ---")
    print(null_report)

    dist = (
        df_feat.group_by(["symbol", "trend_dir"])
        .agg(pl.len().alias("n"))
        .sort(["symbol", "trend_dir"])
    )
    print("\n--- Distribución trend_dir (sanity) ---")
    print(dist)

    print("\n[Celda 05 v2.0.3] OK — Se permite avanzar a Celda 06 (Regime Gate ON/OFF + hysteresis).")


[Celda 05 v2.0.3] OK — features guardados:
  - C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\features_m5_v2.parquet
  - C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\features_snapshot_v2.json

--- Features preview ---
shape: (8, 19)
┌────────┬──────────────┬────────┬────────┬───┬────────────┬───────────┬─────────────┬─────────────┐
│ symbol ┆ time_utc     ┆ open   ┆ high   ┆ … ┆ ema_600    ┆ trend_dir ┆ trend_stren ┆ trend_slope │
│ ---    ┆ ---          ┆ ---    ┆ ---    ┆   ┆ ---        ┆ ---       ┆ gth_bps     ┆ _bps_50     │
│ str    ┆ datetime[ms] ┆ f64    ┆ f64    ┆   ┆ f64        ┆ i32       ┆ ---         ┆ ---         │
│        ┆              ┆        ┆        ┆   ┆            ┆           ┆ f64         ┆ f64         │
╞════════╪══════════════╪════════╪════════╪═══╪════════════╪═══════════╪═════════════╪═════════════╡
│ BNBUSD ┆ 2021-11-19   ┆ 537.88 ┆ 540.08 ┆ … ┆ 539.68     ┆ 0         ┆ 0.0         

In [7]:
# ======================================================================================
# Celda 06 v2.0.1 — Regime Gate por Fold (TREND, M5) [IS-only, no leakage]
# BUG FIX vs v1: SHORT gate calibrado independientemente (thr_mom_short separado)
# Guardrails de cobertura: 5% <= coverage_IS <= 80% (cada side por separado)
# ======================================================================================

from __future__ import annotations
import json, math
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 06 v2.0.1 :: Regime Gate por Fold (TREND, M5)")

# ---------- Preflight ----------
if "RUN" not in globals():
    raise RuntimeError("[Celda 06] ERROR: RUN no existe. Ejecuta Celda 00.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

FEATURES_PATH = ARTIFACTS.get("features_m5", RUN_DIR / "features_m5_v2.parquet")
WFO_FOLDS_PATH = ARTIFACTS.get("wfo_folds", RUN_DIR / "wfo_folds_v2.parquet")

if not FEATURES_PATH.exists():
    raise RuntimeError(f"[Celda 06] ERROR: features no encontradas: {FEATURES_PATH}")
if not WFO_FOLDS_PATH.exists():
    raise RuntimeError(f"[Celda 06] ERROR: wfo_folds no encontrados: {WFO_FOLDS_PATH}")

OUT_REGIME = ARTIFACTS.get("regime_params_by_fold", RUN_DIR / "regime_params_by_fold_v2.parquet")
OUT_SNAP = ARTIFACTS.get("regime_params_snapshot", RUN_DIR / "regime_params_snapshot_v2.json")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

# ---------- Parametros ----------
ER_COL = "er_288"
MOM_COL = "mom_bps_288"
VOL_COL = "vol_bps_288"

Q_SCHEMES = [
    {"name": "BASE",   "q_er": 0.60, "q_mom": 0.55, "q_vol": 0.90},
    {"name": "RELAX1", "q_er": 0.50, "q_mom": 0.50, "q_vol": 0.95},
    {"name": "RELAX2", "q_er": 0.40, "q_mom": 0.50, "q_vol": 0.99},
    {"name": "TIGHT1", "q_er": 0.70, "q_mom": 0.60, "q_vol": 0.85},
]
COV_IS_MIN = 0.05
COV_IS_MAX = 0.80
MIN_IS_ROWS = 5_000

# ---------- Helpers ----------
def _q_safe(s: pl.Series, q: float):
    s2 = s.drop_nulls()
    if s2.len() == 0:
        return None
    v = s2.quantile(q, interpolation="nearest")
    if v is None:
        return None
    fv = float(v)
    return fv if math.isfinite(fv) else None

def _calibrate_side(df_is: pl.DataFrame, side: str) -> dict:
    """Calibrar thresholds para un side (LONG o SHORT) independientemente."""
    er_s = df_is.get_column(ER_COL)
    mom_s = df_is.get_column(MOM_COL)
    vol_s = df_is.get_column(VOL_COL)

    best = None
    for sch in Q_SCHEMES:
        thr_er = _q_safe(er_s, sch["q_er"])
        thr_vol = _q_safe(vol_s, sch["q_vol"])
        if thr_er is None or thr_vol is None:
            continue

        # BUG FIX: LONG usa percentil positivo de mom, SHORT usa percentil negativo
        if side == "LONG":
            thr_mom = _q_safe(mom_s, sch["q_mom"])
            if thr_mom is None:
                continue
            thr_mom = max(0.0, thr_mom)
            gate = (
                (pl.col(ER_COL) >= thr_er) &
                (pl.col(MOM_COL) >= thr_mom) &
                (pl.col(VOL_COL) <= thr_vol)
            )
        else:  # SHORT
            # percentil bajo de momentum (valores negativos)
            thr_mom_short = _q_safe(mom_s, 1.0 - sch["q_mom"])
            if thr_mom_short is None:
                continue
            thr_mom_short = min(0.0, thr_mom_short)
            thr_mom = thr_mom_short
            gate = (
                (pl.col(ER_COL) >= thr_er) &
                (pl.col(MOM_COL) <= thr_mom) &
                (pl.col(VOL_COL) <= thr_vol)
            )

        cov = float(df_is.select(gate.mean()).item())
        payload = {
            "scheme": sch["name"], "side": side,
            "thr_er": float(thr_er), "thr_mom": float(thr_mom), "thr_vol": float(thr_vol),
            "cov_is": float(cov),
        }
        if COV_IS_MIN <= cov <= COV_IS_MAX:
            return payload
        score = abs(cov - 0.30)
        if best is None or score < best[0]:
            best = (score, payload)

    if best is not None:
        return best[1]
    return {"scheme": "FAIL", "side": side, "thr_er": None, "thr_mom": None, "thr_vol": None, "cov_is": 0.0}

# ---------- Main ----------
df_feat = pl.read_parquet(FEATURES_PATH)
df_folds = pl.read_parquet(WFO_FOLDS_PATH)

symbols = df_feat.get_column("symbol").unique().sort().to_list()
fold_ids = df_folds.get_column("fold_id").unique().sort().to_list()

rows = []
for sym in symbols:
    df_sym = df_feat.filter(pl.col("symbol") == sym).sort("time_utc")
    for fid in fold_ids:
        fold_row = df_folds.filter(pl.col("fold_id") == fid).row(0, named=True)
        is_s = fold_row["is_start_utc"]
        is_e = fold_row["is_end_utc"]
        oos_s = fold_row["oos_start_utc"]
        oos_e = fold_row["oos_end_utc"]

        df_is = df_sym.filter(
            (pl.col("time_utc") >= is_s) & (pl.col("time_utc") <= is_e)
        ).drop_nulls([ER_COL, MOM_COL, VOL_COL])

        df_oos = df_sym.filter(
            (pl.col("time_utc") >= oos_s) & (pl.col("time_utc") <= oos_e)
        ).drop_nulls([ER_COL, MOM_COL, VOL_COL])

        for side in ("LONG", "SHORT"):
            cal = _calibrate_side(df_is, side) if df_is.height >= MIN_IS_ROWS else {
                "scheme": "SKIP", "side": side, "thr_er": None, "thr_mom": None, "thr_vol": None, "cov_is": 0.0
            }

            # OOS coverage
            cov_oos = 0.0
            if cal["thr_er"] is not None:
                if side == "LONG":
                    g = (pl.col(ER_COL) >= cal["thr_er"]) & (pl.col(MOM_COL) >= cal["thr_mom"]) & (pl.col(VOL_COL) <= cal["thr_vol"])
                else:
                    g = (pl.col(ER_COL) >= cal["thr_er"]) & (pl.col(MOM_COL) <= cal["thr_mom"]) & (pl.col(VOL_COL) <= cal["thr_vol"])
                if df_oos.height > 0:
                    cov_oos = float(df_oos.select(g.mean()).item())

            rows.append({
                "symbol": sym, "fold_id": fid, "side": side,
                "scheme": cal["scheme"],
                "thr_er": cal["thr_er"], "thr_mom": cal["thr_mom"], "thr_vol": cal["thr_vol"],
                "cov_is": cal["cov_is"], "cov_oos": cov_oos,
                "n_is": df_is.height, "n_oos": df_oos.height,
            })
            print(f"[Celda 06] {sym} fold={fid} {side} :: scheme={cal['scheme']} cov_IS={cal['cov_is']:.3f} cov_OOS={cov_oos:.3f}")

gate_df = pl.DataFrame(rows).sort(["symbol", "fold_id", "side"])
gate_df.write_parquet(str(OUT_REGIME), compression="zstd")

snap = {
    "created_utc": _now_utc_iso(),
    "version": "v2.0.1",
    "symbols": symbols,
    "fold_ids": [str(f) for f in fold_ids],
    "params": {"ER_COL": ER_COL, "MOM_COL": MOM_COL, "VOL_COL": VOL_COL, "Q_SCHEMES": Q_SCHEMES},
    "bug_fix": "SHORT gate calibrado independientemente con percentil negativo de momentum",
}
Path(OUT_SNAP).write_text(json.dumps(snap, indent=2, ensure_ascii=False, default=str), encoding="utf-8")

print(f"\n[Celda 06] OUT: {OUT_REGIME} ({gate_df.height} rows)")
print(f"[Celda 06] OUT: {OUT_SNAP}")
print(">>> Celda 06 v2.0.1 :: OK")


>>> Celda 06 v2.0.1 :: Regime Gate por Fold (TREND, M5)
[Celda 06] BNBUSD fold=1 LONG :: scheme=BASE cov_IS=0.179 cov_OOS=0.129
[Celda 06] BNBUSD fold=1 SHORT :: scheme=BASE cov_IS=0.166 cov_OOS=0.189
[Celda 06] BNBUSD fold=2 LONG :: scheme=BASE cov_IS=0.173 cov_OOS=0.176
[Celda 06] BNBUSD fold=2 SHORT :: scheme=BASE cov_IS=0.171 cov_OOS=0.134
[Celda 06] BNBUSD fold=3 LONG :: scheme=BASE cov_IS=0.175 cov_OOS=0.206
[Celda 06] BNBUSD fold=3 SHORT :: scheme=BASE cov_IS=0.168 cov_OOS=0.131
[Celda 06] BNBUSD fold=4 LONG :: scheme=BASE cov_IS=0.178 cov_OOS=0.207
[Celda 06] BNBUSD fold=4 SHORT :: scheme=BASE cov_IS=0.165 cov_OOS=0.089
[Celda 06] BNBUSD fold=5 LONG :: scheme=BASE cov_IS=0.185 cov_OOS=0.203
[Celda 06] BNBUSD fold=5 SHORT :: scheme=BASE cov_IS=0.159 cov_OOS=0.174
[Celda 06] BNBUSD fold=6 LONG :: scheme=BASE cov_IS=0.184 cov_OOS=0.233
[Celda 06] BNBUSD fold=6 SHORT :: scheme=BASE cov_IS=0.160 cov_OOS=0.150
[Celda 06] BNBUSD fold=7 LONG :: scheme=BASE cov_IS=0.187 cov_OOS=0.176
[C

In [8]:
# ======================================================================================
# Celda 07 v2.0.1 — Senales TREND + Ejecucion t+1 + Costos (BASE/STRESS)
# Entry en open(t+1), exit en open(t+2). Segmento IS/OOS por entry_time.
# ======================================================================================

from __future__ import annotations
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 07 v2.0.1 :: Senales + Ejecucion t+1 + Costos")

if "RUN" not in globals():
    raise RuntimeError("[Celda 07] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

FEATURES_PATH = ARTIFACTS["features_m5"]
WFO_PATH = ARTIFACTS["wfo_folds"]
REGIME_PATH = ARTIFACTS["regime_params_by_fold"]
COST_SNAP_PATH = ARTIFACTS.get("cost_model_snapshot", RUN_DIR / "cost_model_snapshot_v2.json")

for p, label in [(FEATURES_PATH, "features"), (WFO_PATH, "wfo_folds"), (REGIME_PATH, "regime_params")]:
    if not Path(p).exists():
        raise RuntimeError(f"[Celda 07] ERROR: falta {label}: {p}")

OUT_SIGNALS = ARTIFACTS.get("signals_all", RUN_DIR / "signals_all_v2.parquet")
OUT_SNAP = ARTIFACTS.get("signals_snapshot", RUN_DIR / "signals_snapshot_v2.json")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

# ---------- Parametros ----------
ER_COL = "er_288"
MOM_COL = "mom_bps_288"
VOL_COL = "vol_bps_288"

# ---------- Load ----------
df_feat = pl.read_parquet(FEATURES_PATH)
df_folds = pl.read_parquet(WFO_PATH)
df_regime = pl.read_parquet(REGIME_PATH)
cost_snap = json.loads(Path(COST_SNAP_PATH).read_text(encoding="utf-8"))
costs_by_sym = cost_snap.get("costs_by_symbol", {})

symbols = df_feat.get_column("symbol").unique().sort().to_list()
fold_ids = df_folds.get_column("fold_id").unique().sort().to_list()

all_trades = []
for sym in symbols:
    df_sym = df_feat.filter(pl.col("symbol") == sym).sort("time_utc")
    cinfo = costs_by_sym.get(sym, {})
    cost_base_bps = float(cinfo.get("cost_base_bps", cinfo.get("COST_BASE_BPS", 3.0)))
    cost_stress_bps = float(cinfo.get("cost_stress_bps", cinfo.get("COST_STRESS_BPS", 6.0)))
    cost_base_rt = cost_base_bps / 10_000
    cost_stress_rt = cost_stress_bps / 10_000

    for fid in fold_ids:
        fold_row = df_folds.filter(pl.col("fold_id") == fid).row(0, named=True)
        is_s, is_e = fold_row["is_start_utc"], fold_row["is_end_utc"]
        oos_s, oos_e = fold_row["oos_start_utc"], fold_row["oos_end_utc"]

        for side in ("LONG", "SHORT"):
            rg = df_regime.filter(
                (pl.col("symbol") == sym) & (pl.col("fold_id") == fid) & (pl.col("side") == side)
            )
            if rg.is_empty():
                continue
            rg_row = rg.row(0, named=True)
            if rg_row["thr_er"] is None:
                continue

            thr_er = float(rg_row["thr_er"])
            thr_mom = float(rg_row["thr_mom"])
            thr_vol = float(rg_row["thr_vol"])

            if side == "LONG":
                gate_expr = (
                    (pl.col(ER_COL) >= thr_er) &
                    (pl.col(MOM_COL) >= thr_mom) &
                    (pl.col(VOL_COL) <= thr_vol)
                )
            else:
                gate_expr = (
                    (pl.col(ER_COL) >= thr_er) &
                    (pl.col(MOM_COL) <= thr_mom) &
                    (pl.col(VOL_COL) <= thr_vol)
                )

            dfx = (
                df_sym
                .with_columns(gate_expr.alias("signal_gate"))
                .with_columns([
                    pl.col("time_utc").shift(-1).alias("entry_time"),
                    pl.col("time_utc").shift(-2).alias("exit_time"),
                    pl.col("open").shift(-1).alias("entry_price"),
                    pl.col("open").shift(-2).alias("exit_price"),
                ])
                .filter(pl.col("signal_gate"))
                .filter(pl.col("entry_price").is_not_null() & pl.col("exit_price").is_not_null())
                .filter((pl.col("entry_price") > 0) & (pl.col("exit_price") > 0))
            )

            # Segment by entry_time
            seg_expr = (
                pl.when((pl.col("entry_time") >= is_s) & (pl.col("entry_time") <= is_e)).then(pl.lit("IS"))
                .when((pl.col("entry_time") >= oos_s) & (pl.col("entry_time") <= oos_e)).then(pl.lit("OOS"))
                .otherwise(pl.lit(None))
            )

            sign = 1.0 if side == "LONG" else -1.0
            dfx = (
                dfx
                .with_columns([
                    seg_expr.alias("segment"),
                    pl.lit(sym).alias("symbol_col"),
                    pl.lit(fid).alias("fold_id_col"),
                    pl.lit(side).alias("side_col"),
                    (sign * (pl.col("exit_price") / pl.col("entry_price") - 1.0)).alias("gross_ret"),
                ])
                .filter(pl.col("segment").is_not_null())
                .with_columns([
                    (pl.col("gross_ret") - cost_base_rt).alias("net_ret_base"),
                    (pl.col("gross_ret") - cost_stress_rt).alias("net_ret_stress"),
                ])
                .select([
                    pl.col("symbol_col").alias("symbol"),
                    pl.col("fold_id_col").alias("fold_id"),
                    "segment",
                    pl.col("side_col").alias("side"),
                    pl.col("time_utc").alias("signal_time"),
                    "entry_time", "exit_time",
                    "entry_price", "exit_price",
                    "gross_ret", "net_ret_base", "net_ret_stress",
                    ER_COL, MOM_COL, VOL_COL,
                ])
            )
            if dfx.height > 0:
                all_trades.append(dfx)
                print(f"[Celda 07] {sym} fold={fid} {side}: {dfx.height} trades")

if not all_trades:
    raise RuntimeError("[Celda 07] GATE FAIL: 0 trades generados.")

signals_df = pl.concat(all_trades, how="vertical_relaxed").sort(["symbol", "fold_id", "signal_time"])
signals_df.write_parquet(str(OUT_SIGNALS), compression="zstd")

snap = {"created_utc": _now_utc_iso(), "version": "v2.0.1", "n_trades": signals_df.height,
        "symbols": symbols, "sides": ["LONG", "SHORT"], "convention": "entry=open(t+1), exit=open(t+2)"}
Path(OUT_SNAP).write_text(json.dumps(snap, indent=2, ensure_ascii=False, default=str), encoding="utf-8")

print(f"\n[Celda 07] OUT: {OUT_SIGNALS} ({signals_df.height} rows)")
print(">>> Celda 07 v2.0.1 :: OK")


>>> Celda 07 v2.0.1 :: Senales + Ejecucion t+1 + Costos
[Celda 07] BNBUSD fold=1 LONG: 30479 trades
[Celda 07] BNBUSD fold=1 SHORT: 29975 trades
[Celda 07] BNBUSD fold=2 LONG: 35172 trades
[Celda 07] BNBUSD fold=2 SHORT: 33627 trades
[Celda 07] BNBUSD fold=3 LONG: 40682 trades
[Celda 07] BNBUSD fold=3 SHORT: 37459 trades
[Celda 07] BNBUSD fold=4 LONG: 45750 trades
[Celda 07] BNBUSD fold=4 SHORT: 39866 trades
[Celda 07] BNBUSD fold=5 LONG: 52125 trades
[Celda 07] BNBUSD fold=5 SHORT: 44683 trades
[Celda 07] BNBUSD fold=6 LONG: 57370 trades
[Celda 07] BNBUSD fold=6 SHORT: 48356 trades
[Celda 07] BNBUSD fold=7 LONG: 61413 trades
[Celda 07] BNBUSD fold=7 SHORT: 51858 trades
[Celda 07] BNBUSD fold=8 LONG: 67073 trades
[Celda 07] BNBUSD fold=8 SHORT: 55839 trades
[Celda 07] BNBUSD fold=9 LONG: 73506 trades
[Celda 07] BNBUSD fold=9 SHORT: 59774 trades
[Celda 07] BNBUSD fold=10 LONG: 77828 trades
[Celda 07] BNBUSD fold=10 SHORT: 62265 trades
[Celda 07] BTCUSD fold=1 LONG: 18715 trades
[Celda 0

In [9]:
# ======================================================================================
# Celda 08 v2.0.1 — QA Timing Trades (gap-aware diagnostics)
# ======================================================================================

from __future__ import annotations
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 08 v2.0.1 :: QA Timing Trades")

if "RUN" not in globals():
    raise RuntimeError("[Celda 08] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

SIGNALS_PATH = ARTIFACTS.get("signals_all", RUN_DIR / "signals_all_v2.parquet")
if not SIGNALS_PATH.exists():
    raise RuntimeError(f"[Celda 08] ERROR: falta signals: {SIGNALS_PATH}")

OUT_QA = ARTIFACTS.get("qa_timing", RUN_DIR / "qa_timing_v2.parquet")

df = pl.read_parquet(SIGNALS_PATH)

df = df.with_columns([
    ((pl.col("entry_time") - pl.col("signal_time")).dt.total_seconds()).alias("dt_signal_to_entry_s"),
    ((pl.col("exit_time") - pl.col("entry_time")).dt.total_seconds()).alias("dt_hold_s"),
])

THRESHOLDS = [900, 3600, 86400]

qa = (
    df.group_by(["symbol", "segment"])
    .agg([
        pl.len().alias("n_trades"),
        pl.col("dt_signal_to_entry_s").median().alias("dt_entry_median_s"),
        pl.col("dt_signal_to_entry_s").quantile(0.90, interpolation="nearest").alias("dt_entry_p90_s"),
        pl.col("dt_signal_to_entry_s").max().alias("dt_entry_max_s"),
        pl.col("dt_hold_s").median().alias("dt_hold_median_s"),
        pl.col("dt_hold_s").quantile(0.90, interpolation="nearest").alias("dt_hold_p90_s"),
        pl.col("dt_hold_s").quantile(0.99, interpolation="nearest").alias("dt_hold_p99_s"),
        pl.col("dt_hold_s").max().alias("dt_hold_max_s"),
        *[(pl.col("dt_hold_s") > t).mean().alias(f"share_hold_gt_{t}s") for t in THRESHOLDS],
    ])
    .sort(["symbol", "segment"])
)

qa.write_parquet(str(OUT_QA), compression="zstd")
print(qa)
print(f"\n[Celda 08] OUT: {OUT_QA} ({qa.height} rows)")
print(">>> Celda 08 v2.0.1 :: OK")


>>> Celda 08 v2.0.1 :: QA Timing Trades
shape: (8, 13)
┌────────┬─────────┬──────────┬────────────┬───┬────────────┬────────────┬────────────┬────────────┐
│ symbol ┆ segment ┆ n_trades ┆ dt_entry_m ┆ … ┆ dt_hold_ma ┆ share_hold ┆ share_hold ┆ share_hold │
│ ---    ┆ ---     ┆ ---      ┆ edian_s    ┆   ┆ x_s        ┆ _gt_900s   ┆ _gt_3600s  ┆ _gt_86400s │
│ str    ┆ str     ┆ u32      ┆ ---        ┆   ┆ ---        ┆ ---        ┆ ---        ┆ ---        │
│        ┆         ┆          ┆ f64        ┆   ┆ i64        ┆ f64        ┆ f64        ┆ f64        │
╞════════╪═════════╪══════════╪════════════╪═══╪════════════╪════════════╪════════════╪════════════╡
│ BNBUSD ┆ IS      ┆ 916838   ┆ 300.0      ┆ … ┆ 125100     ┆ 0.000215   ┆ 0.000149   ┆ 0.000002   │
│ BNBUSD ┆ OOS     ┆ 88262    ┆ 300.0      ┆ … ┆ 125100     ┆ 0.000351   ┆ 0.000295   ┆ 0.000023   │
│ BTCUSD ┆ IS      ┆ 676767   ┆ 300.0      ┆ … ┆ 90900      ┆ 0.000598   ┆ 0.000585   ┆ 0.000001   │
│ BTCUSD ┆ OOS     ┆ 91551    ┆ 300.

In [10]:
# ======================================================================================
# Celda 09 v2.0.1 — Alpha Multi-Horizon Report (LONG/SHORT) + Costs + Mon-Fri
# Horizontes: [1, 3, 6, 12, 24, 48, 96, 288] bars
# ======================================================================================

from __future__ import annotations
import json, math
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 09 v2.0.1 :: Alpha Multi-Horizon Report")

if "RUN" not in globals():
    raise RuntimeError("[Celda 09] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

FEATURES_PATH = ARTIFACTS["features_m5"]
WFO_PATH = ARTIFACTS["wfo_folds"]
REGIME_PATH = ARTIFACTS["regime_params_by_fold"]
COST_SNAP_PATH = ARTIFACTS.get("cost_model_snapshot", RUN_DIR / "cost_model_snapshot_v2.json")

OUT_ALPHA = ARTIFACTS.get("alpha_multi_horizon_report", RUN_DIR / "alpha_multi_horizon_report_v2.parquet")
OUT_SNAP = ARTIFACTS.get("alpha_multi_horizon_snapshot", RUN_DIR / "alpha_multi_horizon_snapshot_v2.json")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

HORIZONS = [1, 3, 6, 12, 24, 48, 96, 288]
ER_COL = "er_288"
MOM_COL = "mom_bps_288"
VOL_COL = "vol_bps_288"

df_feat = pl.read_parquet(FEATURES_PATH)
df_folds = pl.read_parquet(WFO_PATH)
df_regime = pl.read_parquet(REGIME_PATH)
cost_snap = json.loads(Path(COST_SNAP_PATH).read_text(encoding="utf-8"))
costs_by_sym = cost_snap.get("costs_by_symbol", {})

symbols = df_feat.get_column("symbol").unique().sort().to_list()
fold_ids = df_folds.get_column("fold_id").unique().sort().to_list()

rows = []
for sym in symbols:
    df_sym = df_feat.filter(pl.col("symbol") == sym).sort("time_utc")
    cinfo = costs_by_sym.get(sym, {})
    cost_base_rt = float(cinfo.get("cost_base_bps", cinfo.get("COST_BASE_BPS", 3.0))) / 10_000
    cost_stress_rt = float(cinfo.get("cost_stress_bps", cinfo.get("COST_STRESS_BPS", 6.0))) / 10_000

    # Precompute forward returns for all horizons
    fwd_cols = []
    for h in HORIZONS:
        df_sym = df_sym.with_columns(
            (pl.col("close").shift(-h) / pl.col("close") - 1.0).alias(f"fwd_ret_{h}")
        )

    # weekday filter (Mon-Fri)
    df_sym = df_sym.with_columns(pl.col("time_utc").dt.weekday().alias("_dow"))
    # Polars weekday: 1=Mon..7=Sun
    df_sym = df_sym.filter(pl.col("_dow") <= 5)

    for fid in fold_ids:
        fold_row = df_folds.filter(pl.col("fold_id") == fid).row(0, named=True)
        is_s, is_e = fold_row["is_start_utc"], fold_row["is_end_utc"]
        oos_s, oos_e = fold_row["oos_start_utc"], fold_row["oos_end_utc"]

        for side in ("LONG", "SHORT"):
            rg = df_regime.filter(
                (pl.col("symbol") == sym) & (pl.col("fold_id") == fid) & (pl.col("side") == side)
            )
            if rg.is_empty() or rg.row(0, named=True)["thr_er"] is None:
                continue
            rg_row = rg.row(0, named=True)
            thr_er, thr_mom, thr_vol = float(rg_row["thr_er"]), float(rg_row["thr_mom"]), float(rg_row["thr_vol"])

            if side == "LONG":
                gate = (pl.col(ER_COL) >= thr_er) & (pl.col(MOM_COL) >= thr_mom) & (pl.col(VOL_COL) <= thr_vol)
            else:
                gate = (pl.col(ER_COL) >= thr_er) & (pl.col(MOM_COL) <= thr_mom) & (pl.col(VOL_COL) <= thr_vol)

            for seg_name, seg_s, seg_e in [("IS", is_s, is_e), ("OOS", oos_s, oos_e)]:
                df_seg = df_sym.filter(
                    (pl.col("time_utc") >= seg_s) & (pl.col("time_utc") <= seg_e)
                ).filter(gate)

                if df_seg.height == 0:
                    continue

                for h in HORIZONS:
                    col = f"fwd_ret_{h}"
                    vals = df_seg.get_column(col).drop_nulls()
                    if vals.len() < 5:
                        continue
                    sign = 1.0 if side == "LONG" else -1.0
                    rets = vals.to_list()
                    rets_signed = [sign * r for r in rets]
                    n = len(rets_signed)
                    mean_r = sum(rets_signed) / n
                    std_r = (sum((r - mean_r)**2 for r in rets_signed) / max(1, n - 1)) ** 0.5
                    sharpe = mean_r / std_r if std_r > 1e-12 else 0.0
                    wr = sum(1 for r in rets_signed if r > 0) / n

                    rows.append({
                        "symbol": sym, "fold_id": fid, "side": side, "segment": seg_name,
                        "horizon_bars": h, "n_trades": n,
                        "gross_mean": mean_r, "gross_std": std_r,
                        "net_base_mean": mean_r - cost_base_rt,
                        "net_stress_mean": mean_r - cost_stress_rt,
                        "sharpe_like": sharpe, "win_rate": wr,
                    })

alpha_df = pl.DataFrame(rows).sort(["symbol", "fold_id", "side", "segment", "horizon_bars"])
alpha_df.write_parquet(str(OUT_ALPHA), compression="zstd")

snap = {"created_utc": _now_utc_iso(), "version": "v2.0.1", "horizons": HORIZONS,
        "n_rows": alpha_df.height, "symbols": symbols}
Path(OUT_SNAP).write_text(json.dumps(snap, indent=2, ensure_ascii=False, default=str), encoding="utf-8")

print(f"\n[Celda 09] OUT: {OUT_ALPHA} ({alpha_df.height} rows)")
print(">>> Celda 09 v2.0.1 :: OK")


>>> Celda 09 v2.0.1 :: Alpha Multi-Horizon Report

[Celda 09] OUT: C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\alpha_multi_horizon_report_v2.parquet (1280 rows)
>>> Celda 09 v2.0.1 :: OK


In [None]:
# ======================================================================================
# Celda 10 v2.2.0 — Backtest Engine (TREND, M5)
# CAMBIO v2.2.0: Weekend gate valida bar de EJECUCIÓN (idx+1), no solo señal (idx).
# CAMBIO v2.1.0: _simulate() acepta kwargs (sl_atr, tp_atr, trail_atr, time_stop,
#   min_hold) para permitir tuning real en Celda 14.
# BUG FIXES previos: Trail>SL, SHORT gate, dedup keep="last"
# ======================================================================================

from __future__ import annotations
import json, math
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional
import polars as pl

print(">>> Celda 10 v2.2.0 :: Backtest Engine (TREND) [weekend exec-bar fix]")

if "RUN" not in globals():
    raise RuntimeError("[Celda 10] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

FEATURES_PATH = ARTIFACTS["features_m5"]
WFO_PATH = ARTIFACTS["wfo_folds"]
REGIME_PATH = ARTIFACTS["regime_params_by_fold"]
COST_SNAP_PATH = ARTIFACTS.get("cost_model_snapshot", RUN_DIR / "cost_model_snapshot_v2.json")

OUT_TRADES = ARTIFACTS.get("trades_engine", RUN_DIR / "trades_engine_v2.parquet")
OUT_SUMMARY = ARTIFACTS.get("summary_engine", RUN_DIR / "summary_engine_v2.parquet")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

# ---------- Parametros por defecto ----------
SL_ATR     = 3.0
TP_ATR     = 14.0
TRAIL_ATR  = 0
TIME_STOP  = 1440
ENTRY_CONFIRM = 28
EXIT_GATE_OFF = 72
MIN_HOLD   = 72
COOLDOWN   = 48
MON_FRI    = True
EMA_FILTER = True
EMA_FAST   = 48
EMA_SLOW   = 288
RISK_PER_TRADE = 0.01
MIN_POS_SIZE = 0.25
MAX_POS_SIZE = 3.00

ER_COL = "er_288"
MOM_COL = "mom_bps_288"
VOL_COL = "vol_bps_288"
ATR_COL = "atr_bps_96"

print(f"[Celda 10] SL_ATR={SL_ATR} TP_ATR={TP_ATR} TRAIL_ATR={TRAIL_ATR} TIME_STOP={TIME_STOP}")

# ---------- Load ----------
df_feat = pl.read_parquet(FEATURES_PATH)
df_folds = pl.read_parquet(WFO_PATH)
df_regime = pl.read_parquet(REGIME_PATH)
cost_snap = json.loads(Path(COST_SNAP_PATH).read_text(encoding="utf-8"))
costs_by_sym = {e["symbol"]: e for e in cost_snap.get("per_symbol", [])}

symbols = df_feat.get_column("symbol").unique().sort().to_list()
fold_ids = df_folds.get_column("fold_id").unique().sort().to_list()

# ---------- Helpers ----------
def _is_finite(x) -> bool:
    if x is None:
        return False
    try:
        return math.isfinite(float(x))
    except Exception:
        return False

def _simulate(sym, df_j, fold_row, thr_er, thr_mom_long, thr_mom_short, thr_vol,
              cost_base_dec, cost_stress_dec,
              *, sl_atr=None, tp_atr=None, trail_atr=None, time_stop=None, min_hold=None, entry_confirm_bars=None):
    """Bar-by-bar simulation for one symbol/fold. Accepts optional engine param overrides."""
    # Parametros locales (overridable por tuning)
    _SL    = sl_atr    if sl_atr    is not None else SL_ATR
    _TP    = tp_atr    if tp_atr    is not None else TP_ATR
    _TRAIL = trail_atr if trail_atr is not None else TRAIL_ATR
    _TRAIL = None if _TRAIL == 0 else _TRAIL   # TRAIL=0 -> sin trailing stop
    _TSTOP = time_stop if time_stop is not None else TIME_STOP
    _MHOLD = min_hold  if min_hold  is not None else MIN_HOLD
    _EC    = entry_confirm_bars if entry_confirm_bars is not None else ENTRY_CONFIRM

    is_s = fold_row["is_start_utc"]
    is_e = fold_row["is_end_utc"]
    oos_s = fold_row["oos_start_utc"]
    oos_e = fold_row["oos_end_utc"]
    fid = fold_row["fold_id"]

    df_j = df_j.unique(subset=["time_utc"], keep="last").sort("time_utc")

    # EMA filter
    if EMA_FILTER:
        df_j = df_j.with_columns([
            pl.col("close").ewm_mean(span=EMA_FAST, adjust=False).alias("_ema_f"),
            pl.col("close").ewm_mean(span=EMA_SLOW, adjust=False).alias("_ema_s"),
        ])

    # Gates
    long_gate = (pl.col(ER_COL) >= thr_er) & (pl.col(MOM_COL) >= thr_mom_long) & (pl.col(VOL_COL) <= thr_vol)
    short_gate = (pl.col(ER_COL) >= thr_er) & (pl.col(MOM_COL) <= thr_mom_short) & (pl.col(VOL_COL) <= thr_vol)
    if EMA_FILTER:
        long_gate = long_gate & (pl.col("_ema_f") > pl.col("_ema_s"))
        short_gate = short_gate & (pl.col("_ema_f") < pl.col("_ema_s"))

    df_j = df_j.with_columns([long_gate.alias("_gL"), short_gate.alias("_gS")])
    df_j = df_j.with_columns(pl.col("time_utc").dt.weekday().alias("_dow"))
    df_j = df_j.with_columns((pl.col("_dow") >= 6).alias("_is_wk"))

    # Confirm
    df_j = df_j.with_columns([
        (pl.col("_gL").cast(pl.Int8).rolling_sum(_EC, min_samples=_EC).eq(_EC))
            .fill_null(False).alias("_confL"),
        (pl.col("_gS").cast(pl.Int8).rolling_sum(_EC, min_samples=_EC).eq(_EC))
            .fill_null(False).alias("_confS"),
    ])

    # Extract lists
    t_list   = df_j.get_column("time_utc").to_list()
    o_list   = df_j.get_column("open").to_list()
    h_list   = df_j.get_column("high").to_list()
    l_list   = df_j.get_column("low").to_list()
    c_list   = df_j.get_column("close").to_list()
    atr_list = df_j.get_column(ATR_COL).to_list() if ATR_COL in df_j.columns else [None]*df_j.height
    gL_list  = df_j.get_column("_gL").to_list()
    gS_list  = df_j.get_column("_gS").to_list()
    cfL_list = df_j.get_column("_confL").to_list()
    cfS_list = df_j.get_column("_confS").to_list()
    wk_list  = df_j.get_column("_is_wk").to_list()

    n = len(t_list)
    trades = []

    pos = 0; side_str = None; entry_idx = None; entry_price = None
    stop = None; tp_price = None; trail_stop = None; best_price = None
    sl_dist = None; trail_dist = None; pos_size = 1.0
    gate_off_streak = 0; cooldown_cnt = 0

    def _seg(et):
        if is_s <= et <= is_e: return "IS"
        if oos_s <= et <= oos_e: return "OOS"
        return None

    for idx in range(n):
        # --- EXIT LOGIC ---
        if pos != 0 and entry_idx is not None:
            bars_held = idx - entry_idx
            gn = bool(gL_list[idx]) if pos == 1 else bool(gS_list[idx])
            gate_off_streak = 0 if gn else gate_off_streak + 1

            hi = float(h_list[idx]) if _is_finite(h_list[idx]) else float(c_list[idx])
            lo = float(l_list[idx]) if _is_finite(l_list[idx]) else float(c_list[idx])

            exit_reason = None; exit_price = None

            if pos == 1:
                if best_price is None: best_price = float(entry_price)
                best_price = max(best_price, hi)
                if trail_dist is not None:
                    ts = best_price - trail_dist
                    trail_stop = ts if trail_stop is None else max(trail_stop, ts)
                if stop is not None and lo <= stop:
                    exit_reason, exit_price = "SL", stop
                elif trail_stop is not None and lo <= trail_stop:
                    exit_reason, exit_price = "TRAIL", trail_stop
                elif tp_price is not None and hi >= tp_price:
                    exit_reason, exit_price = "TP", tp_price
            else:
                if best_price is None: best_price = float(entry_price)
                best_price = min(best_price, lo)
                if trail_dist is not None:
                    ts = best_price + trail_dist
                    trail_stop = ts if trail_stop is None else min(trail_stop, ts)
                if stop is not None and hi >= stop:
                    exit_reason, exit_price = "SL", stop
                elif trail_stop is not None and hi >= trail_stop:
                    exit_reason, exit_price = "TRAIL", trail_stop
                elif tp_price is not None and lo <= tp_price:
                    exit_reason, exit_price = "TP", tp_price

            if exit_reason is None and bars_held >= _TSTOP:
                exit_reason, exit_price = "TIME", float(c_list[idx])
            if exit_reason is None and bars_held >= _MHOLD and gate_off_streak >= EXIT_GATE_OFF:
                exit_reason, exit_price = "REGIME_OFF", float(c_list[idx])
            if exit_reason is None and MON_FRI and bool(wk_list[idx]):
                exit_reason, exit_price = "WEEKEND", float(c_list[idx])

            if exit_reason is not None:
                sign = 1.0 if pos == 1 else -1.0
                gross_pnl = sign * (exit_price / entry_price - 1.0)
                seg = _seg(t_list[entry_idx])
                trades.append({
                    "symbol": sym, "fold_id": fid, "segment": seg,
                    "side": "LONG" if pos == 1 else "SHORT",
                    "signal_time_utc": t_list[entry_idx],
                    "entry_time_utc": t_list[min(entry_idx + 1, n - 1)],
                    "exit_time_utc": t_list[idx],
                    "entry_price": entry_price, "exit_price": exit_price,
                    "gross_pnl": gross_pnl,
                    "net_pnl_base": gross_pnl - cost_base_dec,
                    "net_pnl_stress": gross_pnl - cost_stress_dec,
                    "hold_bars": bars_held, "exit_reason": exit_reason,
                    "pos_size": pos_size,
                })
                pos = 0; side_str = None; entry_idx = None; entry_price = None
                stop = None; tp_price = None; trail_stop = None; best_price = None
                cooldown_cnt = COOLDOWN
                continue

        # --- COOLDOWN ---
        if cooldown_cnt > 0:
            cooldown_cnt -= 1
            continue

        # --- ENTRY LOGIC ---
        if pos == 0 and idx < n - 2:
            if MON_FRI and bool(wk_list[idx]):
                continue
            # v2.2.0: también validar bar de ejecución (idx+1)
            exec_i = min(idx + 1, n - 1)
            if MON_FRI and bool(wk_list[exec_i]):
                continue

            atr_val = float(atr_list[idx]) / 10_000 * float(c_list[idx]) if _is_finite(atr_list[idx]) else float(c_list[idx]) * 0.005
            if atr_val <= 0:
                continue

            # LONG entry
            if bool(cfL_list[idx]):
                entry_price = float(o_list[idx + 1]) if _is_finite(o_list[idx + 1]) else float(c_list[idx])
                sl_dist = _SL * atr_val
                trail_dist = _TRAIL * atr_val if _TRAIL is not None else None
                stop = entry_price - sl_dist
                tp_price = entry_price + _TP * atr_val
                trail_stop = None; best_price = entry_price
                pos_size = min(MAX_POS_SIZE, max(MIN_POS_SIZE, RISK_PER_TRADE / (sl_dist / entry_price)))
                pos = 1; side_str = "LONG"; entry_idx = idx
                gate_off_streak = 0
            elif bool(cfS_list[idx]):
                entry_price = float(o_list[idx + 1]) if _is_finite(o_list[idx + 1]) else float(c_list[idx])
                sl_dist = _SL * atr_val
                trail_dist = _TRAIL * atr_val if _TRAIL is not None else None
                stop = entry_price + sl_dist
                tp_price = entry_price - _TP * atr_val
                trail_stop = None; best_price = entry_price
                pos_size = min(MAX_POS_SIZE, max(MIN_POS_SIZE, RISK_PER_TRADE / (sl_dist / entry_price)))
                pos = -1; side_str = "SHORT"; entry_idx = idx
                gate_off_streak = 0

    return trades

# ---------- Main ----------
all_trades = []
for sym in symbols:
    df_sym = df_feat.filter(pl.col("symbol") == sym).sort("time_utc")
    cinfo = costs_by_sym.get(sym, {})
    cost_base_bps = float(cinfo.get("base_cost_bps", 8.0))
    cost_stress_bps = float(cinfo.get("stress_cost_bps", 16.0))
    cost_base_dec = cost_base_bps / 10_000
    cost_stress_dec = cost_stress_bps / 10_000
    print(f"  [{sym}] cost_base={cost_base_bps:.1f}bps, cost_stress={cost_stress_bps:.1f}bps "
          f"(from={'snapshot' if cinfo else 'default'})")

    for fid in fold_ids:
        fold_row = df_folds.filter(pl.col("fold_id") == fid).row(0, named=True)

        rg_long = df_regime.filter(
            (pl.col("symbol") == sym) & (pl.col("fold_id") == fid) & (pl.col("side") == "LONG")
        )
        rg_short = df_regime.filter(
            (pl.col("symbol") == sym) & (pl.col("fold_id") == fid) & (pl.col("side") == "SHORT")
        )

        thr_er = None; thr_mom_long = 0.0; thr_mom_short = 0.0; thr_vol = None
        if not rg_long.is_empty():
            rl = rg_long.row(0, named=True)
            thr_er = rl["thr_er"]; thr_mom_long = rl["thr_mom"]; thr_vol = rl["thr_vol"]
        if not rg_short.is_empty():
            rs = rg_short.row(0, named=True)
            thr_mom_short = rs["thr_mom"]
            if thr_er is None: thr_er = rs["thr_er"]
            if thr_vol is None: thr_vol = rs["thr_vol"]

        if thr_er is None:
            continue

        trades = _simulate(sym, df_sym, fold_row,
                           float(thr_er), float(thr_mom_long), float(thr_mom_short), float(thr_vol),
                           cost_base_dec, cost_stress_dec)
        if trades:
            all_trades.extend(trades)
            n_is = sum(1 for t in trades if t["segment"] == "IS")
            n_oos = sum(1 for t in trades if t["segment"] == "OOS")
            print(f"[Celda 10] {sym} fold={fid}: {len(trades)} trades (IS={n_is} OOS={n_oos})")

if not all_trades:
    print("[Celda 10] WARNING: 0 trades generados por el engine.")
    trades_df = pl.DataFrame()
else:
    trades_df = pl.DataFrame(all_trades).sort(["symbol", "fold_id", "signal_time_utc"])

trades_df.write_parquet(str(OUT_TRADES), compression="zstd")

# Summary
if trades_df.height > 0:
    summary = (
        trades_df
        .group_by(["symbol", "fold_id", "segment", "side"])
        .agg([
            pl.len().alias("n_trades"),
            pl.col("gross_pnl").mean().alias("gross_mean"),
            pl.col("net_pnl_base").mean().alias("net_base_mean"),
            pl.col("net_pnl_base").std().alias("net_base_std"),
            (pl.col("net_pnl_base") > 0).mean().alias("win_rate"),
            pl.col("hold_bars").median().alias("hold_bars_median"),
        ])
        .sort(["symbol", "fold_id", "segment"])
    )
else:
    summary = pl.DataFrame()

summary.write_parquet(str(OUT_SUMMARY), compression="zstd")

print(f"\n[Celda 10] OUT: {OUT_TRADES} ({trades_df.height} trades)")
print(f"[Celda 10] OUT: {OUT_SUMMARY} ({summary.height} rows)")
print(">>> Celda 10 v2.2.0 :: OK")

In [None]:
# ======================================================================================
# Celda 11 v2.1.0 — QA: No-Lookahead + Weekend Entries
# CAMBIO v2.1.0: Verifica entry_time_utc > signal_time_utc (no-lookahead)
# ======================================================================================

from __future__ import annotations
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 11 v2.1.0 :: QA No-Lookahead + Weekend Entries")

if "RUN" not in globals():
    raise RuntimeError("[Celda 11] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

TRADES_PATH = ARTIFACTS.get("trades_engine", RUN_DIR / "trades_engine_v2.parquet")
OUT_QA = ARTIFACTS.get("engine_qa_report", RUN_DIR / "engine_qa_report_v2.json")

if not TRADES_PATH.exists():
    print("[Celda 11] WARNING: trades_engine no existe, skip.")
    qa = {"status": "SKIPPED", "reason": "no trades file"}
else:
    df = pl.read_parquet(TRADES_PATH)
    if df.height == 0:
        qa = {"status": "PASS", "reason": "0 trades", "weekend_entries": 0, "lookahead_violations": 0}
    else:
        # Weekend entries check
        df = df.with_columns(pl.col("entry_time_utc").dt.weekday().alias("_dow"))
        wk_entries = df.filter(pl.col("_dow") >= 6).height

        # No-lookahead check: entry_time_utc must be > signal_time_utc
        lookahead_violations = df.filter(
            pl.col("entry_time_utc") <= pl.col("signal_time_utc")
        ).height

        status = "PASS"
        issues = []
        if wk_entries > 0:
            status = "FAIL"
            issues.append(f"{wk_entries} weekend entries")
            print(f"[Celda 11] FAIL: {wk_entries} weekend entries detectadas!")
        if lookahead_violations > 0:
            status = "FAIL"
            issues.append(f"{lookahead_violations} lookahead violations")
            print(f"[Celda 11] FAIL: {lookahead_violations} lookahead violations (entry <= signal)!")

        qa = {
            "status": status,
            "total_trades": df.height,
            "weekend_entries": wk_entries,
            "lookahead_violations": lookahead_violations,
            "issues": issues if issues else None,
        }

        if status == "PASS":
            print(f"[Celda 11] PASS: {df.height} trades, 0 weekend, 0 lookahead")

Path(OUT_QA).write_text(json.dumps(qa, indent=2), encoding="utf-8")
print(f"[Celda 11] OUT: {OUT_QA} :: status={qa['status']}")
print(">>> Celda 11 v2.1.0 :: OK")

In [13]:
# ======================================================================================
# Celda 12 v2.0.1 — Engine Report: Equity Curve + KPIs + Exit Reasons
# ======================================================================================

from __future__ import annotations
import json, math
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 12 v2.0.1 :: Engine Report")

if "RUN" not in globals():
    raise RuntimeError("[Celda 12] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

TRADES_PATH = ARTIFACTS.get("trades_engine", RUN_DIR / "trades_engine_v2.parquet")
OUT_EQUITY = ARTIFACTS.get("equity_engine", RUN_DIR / "equity_curve_engine_v2.parquet")
OUT_SNAP = ARTIFACTS.get("engine_report_snapshot", RUN_DIR / "engine_report_snapshot_v2.json")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

if not TRADES_PATH.exists():
    print("[Celda 12] WARNING: trades_engine no existe, skip.")
else:
    df = pl.read_parquet(TRADES_PATH)
    if df.height == 0:
        print("[Celda 12] WARNING: 0 trades.")
        pl.DataFrame().write_parquet(str(OUT_EQUITY))
        snap = {"created_utc": _now_utc_iso(), "status": "EMPTY"}
        Path(OUT_SNAP).write_text(json.dumps(snap, indent=2), encoding="utf-8")
    else:
        # Equity curve (cum log returns)
        eq = (
            df.sort("exit_time_utc")
            .with_columns([
                pl.col("net_pnl_base").cum_sum().alias("cum_ret"),
            ])
            .with_columns([
                pl.col("cum_ret").cum_max().alias("peak"),
            ])
            .with_columns([
                (pl.col("cum_ret") - pl.col("peak")).alias("drawdown"),
            ])
            .select(["symbol", "fold_id", "segment", "side", "exit_time_utc",
                      "net_pnl_base", "cum_ret", "peak", "drawdown"])
        )
        eq.write_parquet(str(OUT_EQUITY), compression="zstd")

        # KPIs
        tot_ret = float(df.get_column("net_pnl_base").sum())
        mdd = float(eq.get_column("drawdown").min())
        n_trades = df.height
        mean_ret = float(df.get_column("net_pnl_base").mean())
        std_ret = float(df.get_column("net_pnl_base").std())
        sharpe = mean_ret / std_ret if std_ret > 1e-12 else 0.0
        wr = float((df.get_column("net_pnl_base") > 0).mean())

        # Exit reasons
        exit_counts = df.group_by("exit_reason").agg(pl.len().alias("count")).sort("count", descending=True)
        exit_dict = {r["exit_reason"]: r["count"] for r in exit_counts.to_dicts()}

        snap = {
            "created_utc": _now_utc_iso(), "version": "v2.0.1",
            "kpis": {
                "total_return": tot_ret, "mdd": mdd, "n_trades": n_trades,
                "sharpe_like": sharpe, "win_rate": wr, "mean_ret": mean_ret,
            },
            "exit_reasons": exit_dict,
        }
        Path(OUT_SNAP).write_text(json.dumps(snap, indent=2, default=str), encoding="utf-8")

        print(f"[Celda 12] total_ret={tot_ret:.4f} MDD={mdd:.4f} sharpe={sharpe:.3f} WR={wr:.3f} n={n_trades}")
        print(f"[Celda 12] exit_reasons: {exit_dict}")
        print(f"[Celda 12] OUT: {OUT_EQUITY} ({eq.height} rows)")

print(">>> Celda 12 v2.0.1 :: OK")


>>> Celda 12 v2.0.1 :: Engine Report
[Celda 12] total_ret=-32.2199 MDD=-32.9071 sharpe=-0.077 WR=0.318 n=83385
[Celda 12] exit_reasons: {'TRAIL': 36533, 'SL': 33983, 'TP': 9923, 'REGIME_OFF': 2226, 'WEEKEND': 720}
[Celda 12] OUT: C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\equity_curve_engine_v2.parquet (83385 rows)
>>> Celda 12 v2.0.1 :: OK


In [14]:
# ======================================================================================
# Celda 13 v2.0.1 — Diagnostico de Rentabilidad + Edge Alignment (alpha<->motor)
# ======================================================================================

from __future__ import annotations
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 13 v2.0.1 :: Diagnostico + Edge Alignment")

if "RUN" not in globals():
    raise RuntimeError("[Celda 13] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

ALPHA_PATH = ARTIFACTS.get("alpha_multi_horizon_report", RUN_DIR / "alpha_multi_horizon_report_v2.parquet")
TRADES_PATH = ARTIFACTS.get("trades_engine", RUN_DIR / "trades_engine_v2.parquet")

OUT_DIAG = ARTIFACTS.get("diagnostics", RUN_DIR / "diagnostics_v2.parquet")
OUT_SNAP = ARTIFACTS.get("diagnostics_snapshot", RUN_DIR / "diagnostics_snapshot_v2.json")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

if not ALPHA_PATH.exists() or not TRADES_PATH.exists():
    print("[Celda 13] WARNING: faltan alpha_report o trades_engine, skip.")
    snap = {"created_utc": _now_utc_iso(), "status": "SKIPPED"}
    Path(OUT_SNAP).write_text(json.dumps(snap, indent=2), encoding="utf-8")
else:
    alpha = pl.read_parquet(ALPHA_PATH)
    trades = pl.read_parquet(TRADES_PATH)

    diag_rows = []
    if trades.height > 0 and alpha.height > 0:
        for sym in trades.get_column("symbol").unique().sort().to_list():
            t_sym = trades.filter(pl.col("symbol") == sym)
            a_sym = alpha.filter(pl.col("symbol") == sym)

            # Best alpha side/horizon in IS
            a_is = a_sym.filter(pl.col("segment") == "IS")
            if a_is.height > 0:
                best_alpha = a_is.sort("sharpe_like", descending=True).row(0, named=True)
            else:
                best_alpha = None

            # Engine hold time distribution
            hold_p50 = float(t_sym.get_column("hold_bars").median()) if t_sym.height > 0 else 0
            hold_p90 = float(t_sym.get_column("hold_bars").quantile(0.90, interpolation="nearest")) if t_sym.height > 0 else 0

            # Trail kill analysis: fraction of trades exited by TRAIL
            trail_share = float(t_sym.filter(pl.col("exit_reason") == "TRAIL").height / max(1, t_sym.height))

            diag_rows.append({
                "symbol": sym,
                "best_alpha_side_IS": best_alpha["side"] if best_alpha else None,
                "best_alpha_horizon_IS": best_alpha["horizon_bars"] if best_alpha else None,
                "best_alpha_sharpe_IS": best_alpha["sharpe_like"] if best_alpha else None,
                "engine_hold_p50": hold_p50,
                "engine_hold_p90": hold_p90,
                "trail_exit_share": trail_share,
                "hold_vs_alpha_ratio": hold_p90 / best_alpha["horizon_bars"] if best_alpha and best_alpha["horizon_bars"] > 0 else None,
                "trail_kills_alpha": trail_share > 0.40 and (hold_p90 < (best_alpha["horizon_bars"] * 0.5 if best_alpha else 999)),
            })

    diag_df = pl.DataFrame(diag_rows) if diag_rows else pl.DataFrame()
    diag_df.write_parquet(str(OUT_DIAG), compression="zstd")

    snap = {
        "created_utc": _now_utc_iso(), "version": "v2.0.1",
        "n_symbols": len(diag_rows),
        "diagnostics": diag_rows,
    }
    Path(OUT_SNAP).write_text(json.dumps(snap, indent=2, default=str), encoding="utf-8")

    print(f"[Celda 13] OUT: {OUT_DIAG} ({diag_df.height} rows)")
    if diag_rows:
        for d in diag_rows:
            print(f"  {d['symbol']}: best_alpha={d['best_alpha_side_IS']}/H{d['best_alpha_horizon_IS']} "
                  f"hold_p90={d['engine_hold_p90']:.0f} trail_share={d['trail_exit_share']:.2f} "
                  f"kills={d['trail_kills_alpha']}")

print(">>> Celda 13 v2.0.1 :: OK")


>>> Celda 13 v2.0.1 :: Diagnostico + Edge Alignment
[Celda 13] OUT: C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\diagnostics_v2.parquet (4 rows)
  BNBUSD: best_alpha=SHORT/H96 hold_p90=36 trail_share=0.44 kills=True
  BTCUSD: best_alpha=LONG/H288 hold_p90=38 trail_share=0.47 kills=True
  LVMH: best_alpha=SHORT/H288 hold_p90=36 trail_share=0.36 kills=False
  XAUAUD: best_alpha=LONG/H96 hold_p90=44 trail_share=0.43 kills=True
>>> Celda 13 v2.0.1 :: OK


In [None]:
# ======================================================================================
# Celda 14 v2.1.0 — Engine Tuning REAL (IS-only)
# CAMBIO v2.1.0: Re-ejecuta _simulate() con cada combinacion de parametros.
#   Score = sum(net_pnl_base) / max(1e-12, std(net_pnl_base))  (Sharpe-like)
#   Anti-placebo: assert que scores varian entre combos.
#
# Grid: SL=[1.5-3.0] TP=[7.0-14.0] Trail=[0] time_stop=[288-576]
#   min_hold=[3,6,12].  Enforce Trail > SL or Trail=0 (no trail).
# ======================================================================================

from __future__ import annotations
import json, math, itertools, time
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 14 v2.1.0 :: Engine Tuning REAL (IS-only)")

if "RUN" not in globals():
    raise RuntimeError("[Celda 14] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

FEATURES_PATH = ARTIFACTS["features_m5"]
WFO_PATH = ARTIFACTS["wfo_folds"]
REGIME_PATH = ARTIFACTS["regime_params_by_fold"]
COST_SNAP_PATH = ARTIFACTS.get("cost_model_snapshot", RUN_DIR / "cost_model_snapshot_v2.json")

OUT_TUNING = ARTIFACTS.get("tuning_results", RUN_DIR / "tuning_results_v2.parquet")
OUT_BEST = ARTIFACTS.get("tuning_best_params", RUN_DIR / "tuning_best_params_v2.parquet")
OUT_SNAP = ARTIFACTS.get("tuning_snapshot", RUN_DIR / "tuning_snapshot_v2.json")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

# Grid
SL_ATR_GRID    = [2.5, 3.0, 3.5, 4.0]
TP_ATR_GRID    = [10.0, 14.0, 20.0]
TRAIL_ATR_GRID = [0]
TIME_STOP_GRID = [576, 1440, 2880]
MIN_HOLD_GRID  = [12, 48, 72]
ENTRY_CONFIRM_GRID = [12, 28, 48]
MAX_COMBOS = 200
MIN_TRADES_SCORE = 20

# Enforce Trail > SL
combos = [(sl, tp, tr, ts, mh, ec)
          for sl, tp, tr, ts, mh, ec in itertools.product(
              SL_ATR_GRID, TP_ATR_GRID, TRAIL_ATR_GRID, TIME_STOP_GRID, MIN_HOLD_GRID, ENTRY_CONFIRM_GRID)
          if tr == 0 or tr > sl][:MAX_COMBOS]

print(f"[Celda 14] {len(combos)} valid combos (Trail > SL enforced)")

# --- Load data (same as Cell 10) ---
df_feat_tuning = pl.read_parquet(FEATURES_PATH)
df_folds_tuning = pl.read_parquet(WFO_PATH)
df_regime_tuning = pl.read_parquet(REGIME_PATH)
cost_snap_tuning = json.loads(Path(COST_SNAP_PATH).read_text(encoding="utf-8"))
costs_by_sym_tuning = {e["symbol"]: e for e in cost_snap_tuning.get("per_symbol", [])}

symbols_tuning = df_feat_tuning.get_column("symbol").unique().sort().to_list()
fold_ids_tuning = df_folds_tuning.get_column("fold_id").unique().sort().to_list()

# --- Tuning loop: re-execute engine per combo ---
t0 = time.time()
results = []

for sym in symbols_tuning:
    df_sym = df_feat_tuning.filter(pl.col("symbol") == sym).sort("time_utc")
    cinfo = costs_by_sym_tuning.get(sym, {})
    cost_base_dec = float(cinfo.get("base_cost_bps", 8.0)) / 10_000
    cost_stress_dec = float(cinfo.get("stress_cost_bps", 16.0)) / 10_000
    print(f"  [{sym}] cost_base={cost_base_dec*10_000:.1f}bps, cost_stress={cost_stress_dec*10_000:.1f}bps "
          f"(from={'snapshot' if cinfo else 'default'})")

    for fid in fold_ids_tuning:
        fold_row = df_folds_tuning.filter(pl.col("fold_id") == fid).row(0, named=True)

        # Regime params
        rg_long = df_regime_tuning.filter(
            (pl.col("symbol") == sym) & (pl.col("fold_id") == fid) & (pl.col("side") == "LONG"))
        rg_short = df_regime_tuning.filter(
            (pl.col("symbol") == sym) & (pl.col("fold_id") == fid) & (pl.col("side") == "SHORT"))

        thr_er = None; thr_mom_long = 0.0; thr_mom_short = 0.0; thr_vol = None
        if not rg_long.is_empty():
            rl = rg_long.row(0, named=True)
            thr_er = rl["thr_er"]; thr_mom_long = rl["thr_mom"]; thr_vol = rl["thr_vol"]
        if not rg_short.is_empty():
            rs = rg_short.row(0, named=True)
            thr_mom_short = rs["thr_mom"]
            if thr_er is None: thr_er = rs["thr_er"]
            if thr_vol is None: thr_vol = rs["thr_vol"]

        if thr_er is None:
            continue

        for sl, tp, tr, ts, mh, ec in combos:
            # RE-RUN ENGINE with this param combo (IS-only trades)
            trades = _simulate(sym, df_sym, fold_row,
                               float(thr_er), float(thr_mom_long), float(thr_mom_short), float(thr_vol),
                               cost_base_dec, cost_stress_dec,
                               sl_atr=sl, tp_atr=tp, trail_atr=tr, time_stop=ts, min_hold=mh, entry_confirm_bars=ec)

            # Filter IS-only trades
            is_trades = [t for t in trades if t.get("segment") == "IS"]
            n = len(is_trades)
            if n < MIN_TRADES_SCORE:
                continue

            rets = [t["net_pnl_base"] for t in is_trades]
            mean_r = sum(rets) / n
            std_r = (sum((r - mean_r)**2 for r in rets) / max(1, n - 1)) ** 0.5
            score = sum(rets) / max(1e-12, std_r)

            results.append({
                "symbol": sym, "fold_id": fid,
                "sl_atr": sl, "tp_atr": tp, "trail_atr": tr,
                "time_stop": ts, "min_hold": mh, "entry_confirm": ec,
                "n_trades": n, "sum_ret": sum(rets), "std_ret": std_r,
                "mean_ret": mean_r, "score": score,
            })

    print(f"[Celda 14] {sym}: {sum(1 for r in results if r['symbol'] == sym)} results")

elapsed = time.time() - t0
print(f"[Celda 14] Tuning completado en {elapsed:.1f}s")

# --- Build results DataFrame ---
if results:
    tuning_df = pl.DataFrame(results).sort(["symbol", "fold_id", "score"], descending=[False, False, True])
else:
    tuning_df = pl.DataFrame()
    print("[Celda 14] WARNING: 0 results (no trades above MIN_TRADES_SCORE)")

tuning_df.write_parquet(str(OUT_TUNING), compression="zstd")

# Best per symbol/fold
if tuning_df.height > 0:
    best = tuning_df.group_by(["symbol", "fold_id"]).first().sort(["symbol", "fold_id"])
else:
    best = pl.DataFrame()
best.write_parquet(str(OUT_BEST), compression="zstd")

# --- Anti-placebo assertion ---
n_unique_scores = 0
if tuning_df.height > 0:
    for sym in tuning_df.get_column("symbol").unique().to_list():
        for fid in tuning_df.filter(pl.col("symbol") == sym).get_column("fold_id").unique().to_list():
            grp = tuning_df.filter((pl.col("symbol") == sym) & (pl.col("fold_id") == fid))
            unique_scores = grp.get_column("score").n_unique()
            n_unique_scores = max(n_unique_scores, unique_scores)
            if unique_scores <= 1 and grp.height > 1:
                print(f"[Celda 14] WARNING PLACEBO: {sym} fold={fid} tiene {grp.height} combos pero solo {unique_scores} score unico")

print(f"[Celda 14] Anti-placebo: max unique scores per (sym,fold) = {n_unique_scores}")
if n_unique_scores > 1:
    print("[Celda 14] PASS: Tuning es REAL (scores varian entre combos)")
else:
    print("[Celda 14] FAIL: Tuning puede ser placebo (scores identicos)")

# --- Snapshot ---
top5 = tuning_df.head(5).to_dicts() if tuning_df.height > 0 else []
snap = {
    "created_utc": _now_utc_iso(), "version": "v2.1.0",
    "grid": {"SL": SL_ATR_GRID, "TP": TP_ATR_GRID, "TRAIL": TRAIL_ATR_GRID,
             "TIME_STOP": TIME_STOP_GRID, "MIN_HOLD": MIN_HOLD_GRID, "ENTRY_CONFIRM": ENTRY_CONFIRM_GRID},
    "n_combos": len(combos), "n_results": tuning_df.height, "n_best": best.height,
    "max_unique_scores_per_group": n_unique_scores,
    "anti_placebo": "PASS" if n_unique_scores > 1 else "FAIL",
    "elapsed_seconds": round(elapsed, 1),
    "top5_results": top5,
}
Path(OUT_SNAP).write_text(json.dumps(snap, indent=2, default=str), encoding="utf-8")

print(f"\n[Celda 14] OUT: {OUT_TUNING} ({tuning_df.height} rows)")
print(f"[Celda 14] OUT: {OUT_BEST} ({best.height} rows)")
print(f"[Celda 14] OUT: {OUT_SNAP}")
print(">>> Celda 14 v2.1.0 :: OK")

In [16]:
# ======================================================================================
# Celda 15 v2.0.1 — Alpha Design (IS-only) [side + horizon selection -> motor targets]
# Gates: n_trades >= 80, net_base_mean >= 0
# Score: sharpe_like * sqrt(n_trades)
# ======================================================================================

from __future__ import annotations
import json, math
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 15 v2.0.1 :: Alpha Design (IS-only)")

if "RUN" not in globals():
    raise RuntimeError("[Celda 15] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

ALPHA_PATH = ARTIFACTS.get("alpha_multi_horizon_report", RUN_DIR / "alpha_multi_horizon_report_v2.parquet")
OUT_DESIGN = ARTIFACTS.get("alpha_design", RUN_DIR / "alpha_design_v2.parquet")
OUT_SNAP = ARTIFACTS.get("alpha_design_snapshot", RUN_DIR / "alpha_design_snapshot_v2.json")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

MIN_TRADES = 80
MIN_NET_MEAN = 0.0

if not ALPHA_PATH.exists():
    print("[Celda 15] WARNING: alpha_report no existe, skip.")
    snap = {"created_utc": _now_utc_iso(), "status": "SKIPPED"}
    Path(OUT_SNAP).write_text(json.dumps(snap, indent=2), encoding="utf-8")
else:
    alpha = pl.read_parquet(ALPHA_PATH)
    a_is = alpha.filter(pl.col("segment") == "IS")

    # Gates
    a_is = a_is.filter(
        (pl.col("n_trades") >= MIN_TRADES) &
        (pl.col("net_base_mean") >= MIN_NET_MEAN)
    )

    if a_is.height == 0:
        print("[Celda 15] WARNING: no hay filas que pasen gates.")
        design_df = pl.DataFrame()
    else:
        # Score
        a_is = a_is.with_columns(
            (pl.col("sharpe_like") * pl.col("n_trades").cast(pl.Float64).sqrt()).alias("score")
        )

        # Best per symbol/fold
        design_rows = []
        for sym in a_is.get_column("symbol").unique().sort().to_list():
            for fid in a_is.filter(pl.col("symbol") == sym).get_column("fold_id").unique().sort().to_list():
                cand = a_is.filter((pl.col("symbol") == sym) & (pl.col("fold_id") == fid))
                if cand.height == 0:
                    continue
                best = cand.sort("score", descending=True).row(0, named=True)
                h = best["horizon_bars"]
                design_rows.append({
                    "symbol": sym, "fold_id": fid,
                    "best_side": best["side"], "best_horizon": h,
                    "sharpe_like": best["sharpe_like"], "score": best["score"],
                    "n_trades": best["n_trades"],
                    "TIME_STOP_target": h,
                    "MIN_HOLD_target": max(6, int(0.25 * h)),
                    "ENTRY_CONFIRM_target": max(3, int(0.10 * h)),
                })

        design_df = pl.DataFrame(design_rows) if design_rows else pl.DataFrame()

    design_df.write_parquet(str(OUT_DESIGN), compression="zstd")

    snap = {
        "created_utc": _now_utc_iso(), "version": "v2.0.1",
        "gates": {"min_trades": MIN_TRADES, "min_net_mean": MIN_NET_MEAN},
        "n_designs": design_df.height if design_df.height else 0,
    }
    Path(OUT_SNAP).write_text(json.dumps(snap, indent=2, default=str), encoding="utf-8")

    print(f"[Celda 15] OUT: {OUT_DESIGN} ({design_df.height} rows)")
    if design_df.height > 0:
        print(design_df)

print(">>> Celda 15 v2.0.1 :: OK")


>>> Celda 15 v2.0.1 :: Alpha Design (IS-only)
[Celda 15] OUT: C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\alpha_design_v2.parquet (19 rows)
shape: (19, 10)
┌────────┬─────────┬───────────┬─────────────┬───┬──────────┬────────────┬────────────┬────────────┐
│ symbol ┆ fold_id ┆ best_side ┆ best_horizo ┆ … ┆ n_trades ┆ TIME_STOP_ ┆ MIN_HOLD_t ┆ ENTRY_CONF │
│ ---    ┆ ---     ┆ ---       ┆ n           ┆   ┆ ---      ┆ target     ┆ arget      ┆ IRM_target │
│ str    ┆ i64     ┆ str       ┆ ---         ┆   ┆ i64      ┆ ---        ┆ ---        ┆ ---        │
│        ┆         ┆           ┆ i64         ┆   ┆          ┆ i64        ┆ i64        ┆ i64        │
╞════════╪═════════╪═══════════╪═════════════╪═══╪══════════╪════════════╪════════════╪════════════╡
│ BNBUSD ┆ 6       ┆ SHORT     ┆ 96          ┆ … ┆ 35779    ┆ 96         ┆ 24         ┆ 9          │
│ BNBUSD ┆ 7       ┆ SHORT     ┆ 96          ┆ … ┆ 38296    ┆ 96         ┆ 24         ┆ 9        

In [17]:
# ======================================================================================
# Celda 16 v3.0.0 — Challenge-Ready Overlay (ChallengeOverlayStateMachine)
# Edge filter: BTCUSD LONG only (fwd_ret +0.356% @24h)
# Challenge rules: daily -$1,250 / total -$2,500 / target +$1,250 / min 2 days
# Sizing: risk_per_trade / median_SL_loss -> 1 SL ~ $risk
# ======================================================================================

from __future__ import annotations
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 16 v3.0.0 :: Challenge-Ready Overlay")

if "RUN" not in globals():
    raise RuntimeError("[Celda 16] ERROR: RUN no existe.")

if RUN.get("_overlay_applied"):
    raise RuntimeError("[Celda 16] Overlay ya aplicado en este run. Re-ejecutar desde Cell 00.")
RUN["_overlay_applied"] = True

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

TRADES_PATH = ARTIFACTS.get("trades_engine", RUN_DIR / "trades_engine_v2.parquet")
OUT_OVERLAY_TRADES = ARTIFACTS.get("overlay_trades", RUN_DIR / "overlay_trades_v2.parquet")
OUT_OVERLAY_SUMMARY = ARTIFACTS.get("overlay_summary", RUN_DIR / "overlay_summary_v2.parquet")
OUT_SNAP = ARTIFACTS.get("overlay_snapshot", RUN_DIR / "overlay_snapshot_v2.json")
OUT_CHALLENGE = RUN_DIR / "challenge_dashboard_v2.json"

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

# ── Edge filter params ──
SYMBOL_WHITELIST = ["XAUAUD"]
SIDE_FILTER = "LONG"
ENTRY_WEEKDAYS_ONLY = True

# ── Challenge params (prop-firm exam) ──
CHALLENGE_CAPITAL        = 25_000
CHALLENGE_DAILY_MAX_LOSS = 1_250   # USD
CHALLENGE_TOTAL_MAX_LOSS = 2_500   # USD
CHALLENGE_PROFIT_TARGET  = 1_250   # USD
CHALLENGE_MIN_DAYS       = 2
RISK_PER_TRADE_USD       = 75     # optimal from sweep (worst fold DD = -$1,955, no violations)

if not TRADES_PATH.exists():
    print("[Celda 16] WARNING: trades_engine no existe, skip.")
    snap = {"created_utc": _now_utc_iso(), "status": "SKIPPED"}
    Path(OUT_SNAP).write_text(json.dumps(snap, indent=2), encoding="utf-8")
else:
    df = pl.read_parquet(TRADES_PATH)
    n_engine = df.height

    # ── Step 1: Edge filter ──
    df = df.filter(
        pl.col("symbol").is_in(SYMBOL_WHITELIST) &
        (pl.col("side") == SIDE_FILTER)
    )
    n_after_edge = df.height
    print(f"[Celda 16] Edge filter: {n_engine} -> {n_after_edge} (XAUAUD LONG only)")

    # ── Step 2: Weekday filter ──
    df = df.with_columns([
        pl.col("entry_time_utc").cast(pl.Date).alias("_date"),
        pl.col("entry_time_utc").dt.weekday().alias("_dow"),
    ])
    if ENTRY_WEEKDAYS_ONLY:
        df = df.filter(pl.col("_dow") <= 5)
    n_after_weekday = df.height

    # ── Step 3: Sizing (from SL-exit trades) ──
    sl_trades = df.filter(pl.col("exit_reason") == "SL")
    if sl_trades.height > 0:
        sl_return_median = float(sl_trades["net_pnl_base"].abs().median())
    else:
        sl_return_median = 0.003  # fallback
    sl_return_median = max(sl_return_median, 1e-8)
    pos_notional = RISK_PER_TRADE_USD / sl_return_median

    # Sanity check: 1 SL should cost ~$RISK_PER_TRADE_USD
    if sl_trades.height > 0:
        actual_sl_usd = sl_return_median * pos_notional
        print(f"[Celda 16] Sizing: risk=${RISK_PER_TRADE_USD}, SL_ret={sl_return_median:.4%}, "
              f"notional=${pos_notional:,.0f}, 1-SL=${actual_sl_usd:,.2f}")

    # ── Step 4: ChallengeOverlayStateMachine (OOS simulation) ──
    # Save ALL trades (IS+OOS) for overlay output, but simulate challenge on OOS only
    df_sorted = df.sort("entry_time_utc")

    # For overlay_trades: keep all (no filtering by challenge rules on the parquet)
    df_sorted.write_parquet(str(OUT_OVERLAY_TRADES), compression="zstd")
    n_overlay = df_sorted.height

    # Challenge simulation on OOS
    oos = df_sorted.filter(pl.col("segment") == "OOS")

    challenge_result = None
    if oos.height > 0:
        equity = CHALLENGE_CAPITAL
        trading_days = set()
        daily_log = {}
        trades_taken = 0
        trades_skipped = 0
        target_reached = False
        target_day = None
        violated_daily = False
        violated_total = False
        max_daily_loss_seen = 0.0
        max_total_dd_seen = 0.0
        total_wins = 0
        total_win_usd = 0.0
        total_loss_usd = 0.0

        for row in oos.iter_rows(named=True):
            trade_date = row["_date"]
            pnl_usd = row["net_pnl_base"] * pos_notional

            if trade_date not in daily_log:
                daily_log[trade_date] = {"n_trades": 0, "pnl_usd": 0.0, "skipped": 0}
            day = daily_log[trade_date]

            # Daily stop BEFORE trade
            if day["pnl_usd"] <= -CHALLENGE_DAILY_MAX_LOSS:
                day["skipped"] += 1
                trades_skipped += 1
                continue

            # Total stop BEFORE trade
            if (equity - CHALLENGE_CAPITAL) <= -CHALLENGE_TOTAL_MAX_LOSS:
                violated_total = True
                break

            # Take trade
            equity += pnl_usd
            day["n_trades"] += 1
            day["pnl_usd"] += pnl_usd
            trades_taken += 1
            trading_days.add(trade_date)

            if pnl_usd > 0:
                total_wins += 1
                total_win_usd += pnl_usd
            else:
                total_loss_usd += abs(pnl_usd)

            # Daily violation check
            if day["pnl_usd"] <= -CHALLENGE_DAILY_MAX_LOSS:
                violated_daily = True

            # Total DD tracking
            total_dd = equity - CHALLENGE_CAPITAL
            max_total_dd_seen = min(max_total_dd_seen, total_dd)
            if total_dd <= -CHALLENGE_TOTAL_MAX_LOSS:
                violated_total = True
                break

            # Target check
            if total_dd >= CHALLENGE_PROFIT_TARGET and len(trading_days) >= CHALLENGE_MIN_DAYS:
                target_reached = True
                target_day = str(trade_date)
                break

        # Worst daily loss
        for info in daily_log.values():
            max_daily_loss_seen = min(max_daily_loss_seen, info["pnl_usd"])

        # Discipline
        r_days = len(trading_days) >= CHALLENGE_MIN_DAYS
        r_daily = max_daily_loss_seen > -CHALLENGE_DAILY_MAX_LOSS
        r_total = max_total_dd_seen > -CHALLENGE_TOTAL_MAX_LOSS
        r_target = target_reached
        discipline = sum([r_days, r_daily, r_total, r_target]) * 25

        wr = total_wins / trades_taken if trades_taken > 0 else 0
        avg_win = total_win_usd / total_wins if total_wins > 0 else 0
        n_losses = trades_taken - total_wins
        avg_loss = total_loss_usd / n_losses if n_losses > 0 else 0

        # Daily summary
        daily_summary = []
        for d in sorted(daily_log.keys()):
            info = daily_log[d]
            if info["n_trades"] > 0 or info["skipped"] > 0:
                daily_summary.append({
                    "date": str(d), "n_trades": info["n_trades"],
                    "pnl_usd": round(info["pnl_usd"], 2), "skipped": info["skipped"],
                })

        challenge_result = {
            "created_utc": _now_utc_iso(),
            "version": "v3.0.0",
            "sizing": {
                "risk_per_trade_usd": RISK_PER_TRADE_USD,
                "sl_return_median": round(sl_return_median, 6),
                "position_notional": round(pos_notional, 2),
            },
            "challenge": {
                "initial_capital": CHALLENGE_CAPITAL,
                "daily_max_loss_usd": CHALLENGE_DAILY_MAX_LOSS,
                "total_max_loss_usd": CHALLENGE_TOTAL_MAX_LOSS,
                "profit_target_usd": CHALLENGE_PROFIT_TARGET,
                "min_trading_days": CHALLENGE_MIN_DAYS,
            },
            "results": {
                "final_equity": round(equity, 2),
                "final_pnl_usd": round(equity - CHALLENGE_CAPITAL, 2),
                "trades_taken": trades_taken,
                "trades_skipped_daily_stop": trades_skipped,
                "trading_days": len(trading_days),
                "win_rate": round(wr, 4),
                "avg_win_usd": round(avg_win, 2),
                "avg_loss_usd": round(avg_loss, 2),
                "payoff_ratio": round(avg_win / avg_loss, 2) if avg_loss > 0 else 0,
                "max_daily_loss_usd": round(max_daily_loss_seen, 2),
                "max_total_dd_usd": round(max_total_dd_seen, 2),
                "target_reached": target_reached,
                "target_day": target_day,
                "violated_daily_limit": violated_daily,
                "violated_total_limit": violated_total,
                "discipline_pct": discipline,
            },
            "rules": {
                "min_2_days": r_days,
                "daily_loss_ok": r_daily,
                "total_loss_ok": r_total,
                "target_hit": r_target,
            },
            "daily_summary": daily_summary,
        }

        # Print dashboard
        res = challenge_result["results"]
        print(f"[Celda 16] Challenge OOS (base): PnL=${res['final_pnl_usd']:+,.0f}, "
              f"disc={res['discipline_pct']}%, target={'SI' if res['target_reached'] else 'NO'}")
        print(f"[Celda 16] MaxDayLoss=${res['max_daily_loss_usd']:,.0f} "
              f"MaxTotDD=${res['max_total_dd_usd']:,.0f} "
              f"trades={res['trades_taken']} days={res['trading_days']}")

    # Save challenge dashboard
    if challenge_result:
        Path(OUT_CHALLENGE).write_text(
            json.dumps(challenge_result, indent=2, default=str), encoding="utf-8")
        print(f"[Celda 16] OUT: {OUT_CHALLENGE}")

    # Summary
    if n_overlay > 0:
        summary = (
            df_sorted.group_by(["symbol", "segment"])
            .agg([
                pl.len().alias("n_trades"),
                pl.col("net_pnl_base").sum().alias("total_ret"),
                pl.col("net_pnl_base").mean().alias("mean_ret"),
                (pl.col("net_pnl_base") > 0).mean().alias("win_rate"),
            ])
            .sort(["symbol", "segment"])
        )
    else:
        summary = pl.DataFrame()
    summary.write_parquet(str(OUT_OVERLAY_SUMMARY), compression="zstd")

    snap = {
        "created_utc": _now_utc_iso(), "version": "v3.0.0",
        "edge_filter": {"symbols": SYMBOL_WHITELIST, "side": SIDE_FILTER},
        "challenge": {
            "capital": CHALLENGE_CAPITAL,
            "daily_max_loss": CHALLENGE_DAILY_MAX_LOSS,
            "total_max_loss": CHALLENGE_TOTAL_MAX_LOSS,
            "profit_target": CHALLENGE_PROFIT_TARGET,
            "risk_per_trade": RISK_PER_TRADE_USD,
        },
        "sizing": {"sl_return_median": round(sl_return_median, 6),
                   "pos_notional": round(pos_notional, 2)},
        "n_engine": n_engine, "n_after_edge": n_after_edge,
        "n_after_weekday": n_after_weekday, "n_overlay": n_overlay,
    }
    Path(OUT_SNAP).write_text(json.dumps(snap, indent=2, default=str), encoding="utf-8")

    print(f"[Celda 16] trades: {n_engine} -> {n_overlay} (edge+weekday filter)")
    print(f"[Celda 16] OUT: {OUT_OVERLAY_TRADES}")

print(">>> Celda 16 v3.0.0 :: OK")


>>> Celda 16 v2.0.1 :: Execution & Risk Overlay
[Celda 16] trades: 83385 -> 3253 (filtered 80132)
[Celda 16] OUT: C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\overlay_trades_v2.parquet
>>> Celda 16 v2.0.1 :: OK


In [18]:
# ======================================================================================
# Celda 17 v2.0.1 — Seleccion Institucional (OOS-first + gates + score)
# Gates: min_oos_trades=30, max_mdd=-0.20, min_totret=-0.05, min_wr=0.15, max_exposure=0.65
# ======================================================================================

from __future__ import annotations
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 17 v2.0.1 :: Seleccion Institucional")

if "RUN" not in globals():
    raise RuntimeError("[Celda 17] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

OVERLAY_PATH = ARTIFACTS.get("overlay_trades", RUN_DIR / "overlay_trades_v2.parquet")
ENGINE_SNAP_PATH = ARTIFACTS.get("engine_report_snapshot", RUN_DIR / "engine_report_snapshot_v2.json")
OUT_SEL = ARTIFACTS.get("selection", RUN_DIR / "selection_v2.parquet")
OUT_SNAP = ARTIFACTS.get("selection_snapshot", RUN_DIR / "selection_snapshot_v2.json")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

# Gates
MIN_OOS_TRADES = 20
MAX_MDD = -0.35
MIN_TOTRET = -0.15
MIN_WINRATE = 0.10   # BE_WR=17.6% for SL=3/TP=14, 0.10 filters strategies without real edge
MAX_EXPOSURE = 0.65

if not OVERLAY_PATH.exists():
    print("[Celda 17] WARNING: overlay_trades no existe, skip.")
    snap = {"created_utc": _now_utc_iso(), "status": "SKIPPED"}
    Path(OUT_SNAP).write_text(json.dumps(snap, indent=2), encoding="utf-8")
else:
    df = pl.read_parquet(OVERLAY_PATH)
    df_oos = df.filter(pl.col("segment") == "OOS") if df.height > 0 else df

    sel_rows = []
    if df_oos.height > 0:
        for sym in df_oos.get_column("symbol").unique().sort().to_list():
            for side in df_oos.filter(pl.col("symbol") == sym).get_column("side").unique().to_list():
                sub = df_oos.filter((pl.col("symbol") == sym) & (pl.col("side") == side))
                n = sub.height
                if n < MIN_OOS_TRADES:
                    sel_rows.append({"symbol": sym, "side": side, "decision": "NO_GO", "reason": f"n_oos={n}<{MIN_OOS_TRADES}", "score": 0.0, "n_oos": n})
                    continue

                tot_ret = float(sub.get_column("net_pnl_base").sum())
                wr = float((sub.get_column("net_pnl_base") > 0).mean())
                cum = sub.sort("exit_time_utc").with_columns(pl.col("net_pnl_base").cum_sum().alias("_cr"))
                mdd = float((cum.get_column("_cr") - cum.get_column("_cr").cum_max()).min())

                # Score
                sharpe = float(sub.get_column("net_pnl_base").mean()) / max(1e-12, float(sub.get_column("net_pnl_base").std()))
                score = tot_ret + 0.15 * sharpe + 0.05 * (wr - 0.5) - 1.25 * (-mdd) - 0.25 * 0.5

                go = (tot_ret >= MIN_TOTRET and mdd >= MAX_MDD and wr >= MIN_WINRATE)
                sel_rows.append({
                    "symbol": sym, "side": side,
                    "decision": "GO" if go else "NO_GO",
                    "reason": "PASS" if go else "gates",
                    "score": score, "n_oos": n,
                    "tot_ret": tot_ret, "mdd": mdd, "win_rate": wr, "sharpe": sharpe,
                })

    sel_df = pl.DataFrame(sel_rows) if sel_rows else pl.DataFrame()
    sel_df.write_parquet(str(OUT_SEL), compression="zstd")

    snap = {
        "created_utc": _now_utc_iso(), "version": "v2.0.1",
        "gates": {"min_oos_trades": MIN_OOS_TRADES, "max_mdd": MAX_MDD, "min_totret": MIN_TOTRET,
                  "min_wr": MIN_WINRATE, "max_exposure": MAX_EXPOSURE},
        "selections": sel_rows,
    }
    Path(OUT_SNAP).write_text(json.dumps(snap, indent=2, default=str), encoding="utf-8")

    n_go = sum(1 for r in sel_rows if r["decision"] == "GO")
    print(f"[Celda 17] {n_go}/{len(sel_rows)} symbols GO")
    print(f"[Celda 17] OUT: {OUT_SEL}")

print(">>> Celda 17 v2.0.1 :: OK")


>>> Celda 17 v2.0.1 :: Seleccion Institucional
[Celda 17] 0/8 symbols GO
[Celda 17] OUT: C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\selection_v2.parquet
>>> Celda 17 v2.0.1 :: OK


In [19]:
# ======================================================================================
# Celda 18 v2.0.1 — Deploy Pack (freeze config + per-symbol JSONs)
# Reads selection, filters GO (fallback TOPK=2), exports deploy configs.
# ======================================================================================

from __future__ import annotations
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 18 v2.0.1 :: Deploy Pack")

if "RUN" not in globals():
    raise RuntimeError("[Celda 18] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

SEL_PATH = ARTIFACTS.get("selection", RUN_DIR / "selection_v2.parquet")
REGIME_PATH = ARTIFACTS["regime_params_by_fold"]
COST_SNAP_PATH = ARTIFACTS.get("cost_model_snapshot", RUN_DIR / "cost_model_snapshot_v2.json")

OUT_DEPLOY = ARTIFACTS.get("deploy_pack", RUN_DIR / "deploy_pack_v2.parquet")
OUT_DEPLOY_JSON = ARTIFACTS.get("deploy_pack_json", RUN_DIR / "deploy_pack_v2.json")

DEPLOY_DIR = RUN_DIR / "deploy"
DEPLOY_DIR.mkdir(parents=True, exist_ok=True)

TOPK = 2

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

if not SEL_PATH.exists():
    print("[Celda 18] WARNING: selection no existe, skip.")
else:
    sel = pl.read_parquet(SEL_PATH)
    regime = pl.read_parquet(REGIME_PATH)
    cost_snap = json.loads(Path(COST_SNAP_PATH).read_text(encoding="utf-8"))

    go = sel.filter(pl.col("decision") == "GO") if sel.height > 0 and "decision" in sel.columns else pl.DataFrame()
    if go.height == 0 and sel.height > 0 and "score" in sel.columns:
        go = sel.sort("score", descending=True).head(TOPK)
        print(f"[Celda 18] No GO symbols, fallback TOPK={TOPK}")

    deploy_rows = []
    for row in go.iter_rows(named=True):
        sym = row["symbol"]
        side = row["side"]
        rg = regime.filter((pl.col("symbol") == sym) & (pl.col("side") == side))
        rg_dict = rg.to_dicts() if rg.height > 0 else []

        config = {
            "symbol": sym, "side": side, "score": row.get("score", 0),
            "regime_gates": json.dumps(rg_dict, default=str),
            "costs": json.dumps(cost_snap.get("costs_by_symbol", {}).get(sym, {}), default=str),
            "created_utc": _now_utc_iso(),
        }
        deploy_rows.append(config)

        # Per-symbol JSON
        sym_json = DEPLOY_DIR / f"{sym}_{side}_config.json"
        sym_json.write_text(json.dumps(config, indent=2, default=str), encoding="utf-8")

    deploy_df = pl.DataFrame(deploy_rows) if deploy_rows else pl.DataFrame()
    deploy_df.write_parquet(str(OUT_DEPLOY), compression="zstd")
    Path(OUT_DEPLOY_JSON).write_text(json.dumps(deploy_rows, indent=2, default=str), encoding="utf-8")

    print(f"[Celda 18] {len(deploy_rows)} symbols deployed")
    print(f"[Celda 18] OUT: {OUT_DEPLOY}")
    print(f"[Celda 18] OUT: {DEPLOY_DIR}/")

print(">>> Celda 18 v2.0.1 :: OK")


>>> Celda 18 v2.0.1 :: Deploy Pack
[Celda 18] No GO symbols, fallback TOPK=2
[Celda 18] 2 symbols deployed
[Celda 18] OUT: C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\deploy_pack_v2.parquet
[Celda 18] OUT: C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\deploy/
>>> Celda 18 v2.0.1 :: OK


In [20]:
# ======================================================================================
# Celda 19 v2.0.1 — QA Alpha<->Motor Alignment (OOS-first + mismatch report)
# ======================================================================================

from __future__ import annotations
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 19 v2.0.1 :: QA Alpha<->Motor Alignment")

if "RUN" not in globals():
    raise RuntimeError("[Celda 19] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]

ALPHA_PATH = ARTIFACTS.get("alpha_multi_horizon_report", RUN_DIR / "alpha_multi_horizon_report_v2.parquet")
TRADES_PATH = ARTIFACTS.get("trades_engine", RUN_DIR / "trades_engine_v2.parquet")
OUT_QA = ARTIFACTS.get("qa_alignment", RUN_DIR / "qa_alignment_v2.parquet")
OUT_SNAP = ARTIFACTS.get("qa_alignment_snapshot", RUN_DIR / "qa_alignment_snapshot_v2.json")

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

if not ALPHA_PATH.exists() or not TRADES_PATH.exists():
    print("[Celda 19] WARNING: faltan inputs, skip.")
    snap = {"created_utc": _now_utc_iso(), "status": "SKIPPED"}
    Path(OUT_SNAP).write_text(json.dumps(snap, indent=2), encoding="utf-8")
else:
    alpha = pl.read_parquet(ALPHA_PATH)
    trades = pl.read_parquet(TRADES_PATH)

    qa_rows = []
    if trades.height > 0 and alpha.height > 0:
        a_oos = alpha.filter(pl.col("segment") == "OOS")
        t_oos = trades.filter(pl.col("segment") == "OOS")

        for sym in t_oos.get_column("symbol").unique().sort().to_list():
            # Best alpha side OOS
            a_sym = a_oos.filter(pl.col("symbol") == sym)
            if a_sym.height == 0:
                continue
            best_alpha = a_sym.sort("sharpe_like", descending=True).row(0, named=True)

            # Engine best side OOS
            t_sym = t_oos.filter(pl.col("symbol") == sym)
            if t_sym.height == 0:
                continue
            eng_sides = (
                t_sym.group_by("side")
                .agg(pl.col("net_pnl_base").sum().alias("tot"))
                .sort("tot", descending=True)
            )
            eng_best_side = eng_sides.row(0, named=True)["side"]

            # Mismatch flags
            hold_p90 = float(t_sym.get_column("hold_bars").quantile(0.90, interpolation="nearest"))
            alpha_h = best_alpha["horizon_bars"]
            trail_share = t_sym.filter(pl.col("exit_reason") == "TRAIL").height / max(1, t_sym.height)

            qa_rows.append({
                "symbol": sym,
                "alpha_best_side_oos": best_alpha["side"],
                "alpha_best_horizon_oos": alpha_h,
                "alpha_sharpe_oos": best_alpha["sharpe_like"],
                "engine_best_side_oos": eng_best_side,
                "side_mismatch": best_alpha["side"] != eng_best_side,
                "hold_p90_over_alphaH": hold_p90 / alpha_h if alpha_h > 0 else None,
                "trail_dominates_short_hold": trail_share > 0.40,
                "alpha_edge_nonpos_oos": best_alpha["net_base_mean"] <= 0 if "net_base_mean" in best_alpha else False,
            })

    qa_df = pl.DataFrame(qa_rows) if qa_rows else pl.DataFrame()
    qa_df.write_parquet(str(OUT_QA), compression="zstd")

    snap = {
        "created_utc": _now_utc_iso(), "version": "v2.0.1",
        "alignment_report": qa_rows,
    }
    Path(OUT_SNAP).write_text(json.dumps(snap, indent=2, default=str), encoding="utf-8")

    print(f"[Celda 19] OUT: {OUT_QA} ({qa_df.height} rows)")
    if qa_rows:
        for q in qa_rows:
            mismatch = "MISMATCH" if q["side_mismatch"] else "OK"
            print(f"  {q['symbol']}: alpha={q['alpha_best_side_oos']} engine={q['engine_best_side_oos']} [{mismatch}]")

print(">>> Celda 19 v2.0.1 :: OK")


>>> Celda 19 v2.0.1 :: QA Alpha<->Motor Alignment
[Celda 19] OUT: C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\qa_alignment_v2.parquet (4 rows)
  BNBUSD: alpha=LONG engine=LONG [OK]
  BTCUSD: alpha=LONG engine=LONG [OK]
  LVMH: alpha=SHORT engine=LONG [MISMATCH]
  XAUAUD: alpha=LONG engine=LONG [OK]
>>> Celda 19 v2.0.1 :: OK


In [21]:
# ======================================================================================
# Celda 20 v2.0.1 — Run Summary + Manifest Final
# Verifica todos los artifacts, calcula resumen ejecutivo, cierra manifest.
# ======================================================================================

from __future__ import annotations
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict
import polars as pl

print(">>> Celda 20 v2.0.1 :: Run Summary + Manifest Final")

if "RUN" not in globals():
    raise RuntimeError("[Celda 20] ERROR: RUN no existe.")

RUN_DIR: Path = RUN["RUN_DIR"]
ARTIFACTS: Dict[str, Path] = RUN["ARTIFACTS"]
RUN_ID = RUN["RUN_ID"]

def _now_utc_iso() -> str:
    return datetime.now(timezone.utc).isoformat(timespec="seconds")

# Verify all artifacts exist
missing = []
existing = []
for key, path in ARTIFACTS.items():
    if Path(path).exists():
        existing.append(key)
    else:
        missing.append(key)

print(f"[Celda 20] Artifacts: {len(existing)} exist, {len(missing)} missing")
if missing:
    print(f"[Celda 20] MISSING: {missing}")

# Summary stats
summary = {"run_id": RUN_ID, "completion_utc": _now_utc_iso()}

sel_path = ARTIFACTS.get("selection", RUN_DIR / "selection_v2.parquet")
if Path(sel_path).exists():
    sel = pl.read_parquet(sel_path)
    if sel.height > 0 and "decision" in sel.columns:
        summary["symbols_go"] = sel.filter(pl.col("decision") == "GO").height
        summary["symbols_total"] = sel.height

eng_snap_path = ARTIFACTS.get("engine_report_snapshot", RUN_DIR / "engine_report_snapshot_v2.json")
if Path(eng_snap_path).exists():
    eng_snap = json.loads(Path(eng_snap_path).read_text(encoding="utf-8"))
    kpis = eng_snap.get("kpis", {})
    summary["best_sharpe"] = kpis.get("sharpe_like")
    summary["worst_mdd"] = kpis.get("mdd")
    summary["total_return"] = kpis.get("total_return")

summary["artifacts_existing"] = len(existing)
summary["artifacts_missing"] = len(missing)
summary["artifacts_missing_keys"] = missing

# Update manifest
manifest_path = RUN_DIR / "run_manifest_v2.json"
if manifest_path.exists():
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
else:
    manifest = {}

manifest["completion_utc"] = summary["completion_utc"]
manifest["summary"] = summary
manifest_path.write_text(json.dumps(manifest, indent=2, default=str), encoding="utf-8")

# Latest
latest_path = RUN_DIR.parent / "run_manifest_v2_latest.json"
latest_path.write_text(json.dumps(manifest, indent=2, default=str), encoding="utf-8")

print(f"\n{'='*60}")
print(f"  RUN SUMMARY — TREND v2")
print(f"{'='*60}")
for k, v in summary.items():
    print(f"  {k:30s}: {v}")
print(f"{'='*60}")
print(f"[Celda 20] Manifest updated: {manifest_path}")
print(">>> Celda 20 v2.0.1 :: OK")


>>> Celda 20 v2.0.1 :: Run Summary + Manifest Final
[Celda 20] Artifacts: 37 exist, 2 missing
[Celda 20] MISSING: ['trades_baseline', 'summary_baseline']

  RUN SUMMARY — TREND v2
  run_id                        : 20260218_000143_164d8480
  completion_utc                : 2026-02-18T00:02:20+00:00
  symbols_go                    : 0
  symbols_total                 : 8
  best_sharpe                   : -0.0770189238547151
  worst_mdd                     : -32.90706074948353
  total_return                  : -32.219949031643154
  artifacts_existing            : 37
  artifacts_missing             : 2
  artifacts_missing_keys        : ['trades_baseline', 'summary_baseline']
[Celda 20] Manifest updated: C:\Quant\projects\MT5_Data_Extraction\outputs\trend_v2\run_20260218_000143_164d8480\run_manifest_v2.json
>>> Celda 20 v2.0.1 :: OK
