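# In [None]:  (Jupyter cell marker left over from the notebook export; kept as a comment so the file parses as Python)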
"""
========================================================================================
PHASE 04A-2: AGGREGATED FEATURE ENGINEERING FOR TYPE A DETECTION [LARGE INFORMED TRADE]
========================================================================================

Pipeline Position: 
    Phase 03 (Per-Row Features) → Phase 04A-1 (Baselines) → **Phase 04A-2**
    → Phase 05 (Anomaly Detection)

Purpose:
    Transforms per-trade data into feature-rich records optimized for detecting
    Type A unusual options activity (large informed single trades). Computes
    aggregations at multiple levels (contract, expiry, chain) and compares
    against historical baselines to identify statistically unusual activity.

Target Signal (Type A Characteristics):
    - Large trade size relative to contract/chain
    - Paying above VWAP (urgency)
    - OTM strikes (leverage seeking)
    - Near-dated expiries (catalyst-driven)
    - Concentrated in specific contracts
    - Elevated chain volume vs historical baseline

Input:
    - Phase 03 output files: {TICKER}_perrowfeatures_YYYY-MM-DD.parquet
    - Phase 04A-1 baseline table: baseline_{TICKER}_daily.parquet
    
Output:
    - Feature-enriched files: {TICKER}_aggregatedfeatures_YYYY-MM-DD.parquet
    - Each row is a single trade with ~72 additional aggregated features

Author: [Your Name]
Created: 2026-02-XX
Version: 1.0

Dependencies:
    - pandas >= 1.5.0
    - numpy >= 1.20.0
    - pyarrow >= 10.0.0

Usage:
    python "scriptname.py"

================================================================================
"""

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
from typing import Optional, List, Dict, Tuple
import warnings
import logging

# NOTE(review): this silences every warning category for the whole process,
# including any pandas/numpy warnings raised by the feature code below.
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================

# Module-wide settings. The functions below read these values directly from
# this dict (e.g. CONFIG["epsilon"], CONFIG["otm_threshold"]).
CONFIG = {
    # Input folder containing the Phase 03 per-row feature files
    # ({TICKER}_perrowfeatures_YYYY-MM-DD.parquet)
    "input_folder": Path(r"D:\cyclelabs_codes\CL_20251120_siphontrades\01_FIXINGRAWDATA\output_2_perrowfeateng"),
    
    # Baseline folder from Phase 04A-1 (contains baseline_{TICKER}_daily.parquet files)
    "baseline_folder": Path(r"D:\cyclelabs_codes\CL_20251120_siphontrades\01_FIXINGRAWDATA\output_4a1_baseline"),
    
    # Output folder for aggregated features
    "output_folder": Path(r"D:\cyclelabs_codes\CL_20251120_siphontrades\01_FIXINGRAWDATA\output_4a2_aggfeateng"),
    
    # Date range to process (inclusive)
    "start_date": "2024-02-01",
    "end_date": "2025-12-31",
    
    # Tickers to process (None or [] for all tickers)
    "tickers_to_process": ["CIFR"],  # e.g., ["AAPL", "TSLA", "CIFR"]
    
    # OTM percentage thresholds (compared against the 'otm_percentage' column)
    "otm_threshold": 5.0,        # > 5% = OTM
    "deep_otm_threshold": 20.0,  # > 20% = Deep OTM
    
    # DTE thresholds (compared against the 'days_to_expiry' column)
    "short_dte_threshold": 14,   # <= 14 days = short-dated
    "medium_dte_threshold": 45,  # <= 45 days = medium-dated; > 45 days counts as long-dated
    
    # Minimum thresholds for valid signals
    "min_chain_volume": 100,          # Minimum chain volume
    "min_chain_trades": 10,           # Minimum trade count
    "min_contract_trades": 1,         # Minimum trades per contract
    
    # Z-score clipping for composite scores (z-scores are clipped to +/- this value)
    "zscore_clip_max": 5.0,
    
    # Epsilon for division safety (added to denominators that may be zero)
    "epsilon": 1e-8,
    
    # Logging level
    "log_level": logging.INFO,
}

# =============================================================================
# LOGGING SETUP
# =============================================================================

# Configure the root logger once at import time, using the level from CONFIG.
# Each record is formatted as "timestamp | LEVEL | message".
logging.basicConfig(
    level=CONFIG["log_level"],
    format='%(asctime)s | %(levelname)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger used by every function in this file.
logger = logging.getLogger(__name__)

# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def parse_filename(filename: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Parse ticker and date from a Phase 03 per-row feature filename.

    Expected format: {TICKER}_perrowfeatures_YYYY-MM-DD.parquet

    Args:
        filename: Name to parse. Everything before '_perrowfeatures_' is
            taken as the ticker; a trailing '.parquet' is optional.

    Returns:
        (ticker, date_str) when the name matches the expected format, with
        date_str exactly as written in the name. (None, None) when the name
        does not match, the ticker is empty, the date is not a valid
        zero-padded YYYY-MM-DD date, or filename is not a string.
    """
    suffix = '.parquet'
    separator = '_perrowfeatures_'
    date_format = '%Y-%m-%d'

    try:
        # Strip the extension only when it really is a suffix; str.replace
        # would also delete ".parquet" from the middle of the name.
        base = filename[:-len(suffix)] if filename.endswith(suffix) else filename
        parts = base.split(separator)
        if len(parts) != 2:
            return None, None
        ticker, date_str = parts
        if not ticker:
            return None, None
        parsed = datetime.strptime(date_str, date_format)
    except (AttributeError, TypeError, ValueError):
        # Non-string input or an unparseable date: not one of our files.
        return None, None

    # strptime also accepts non-padded dates such as "2024-2-1"; reject them
    # so callers always get the canonical YYYY-MM-DD form.
    if parsed.strftime(date_format) != date_str:
        return None, None
    return ticker, date_str


def safe_divide(numerator, denominator, default=0.0):
    """
    Divide numerator by denominator, substituting default where the
    denominator is zero or missing.

    Works with scalars and with pandas Series / numpy arrays on either side
    (a scalar is broadcast against an array).

    Args:
        numerator: Scalar, Series or ndarray.
        denominator: Scalar, Series or ndarray.
        default: Value used wherever the denominator is 0 or NaN.

    Returns:
        A scalar when both inputs are scalars; otherwise a numpy array
        (np.where drops any pandas index).
    """
    array_like = (pd.Series, np.ndarray)
    if isinstance(numerator, array_like) or isinstance(denominator, array_like):
        invalid = (denominator == 0) | pd.isna(denominator)
        # Zero/NaN denominators are masked out below, so divide exactly (no
        # epsilon bias) and just silence the floating-point warnings.
        with np.errstate(divide="ignore", invalid="ignore"):
            quotient = numerator / denominator
        return np.where(invalid, default, quotient)

    if denominator == 0 or pd.isna(denominator):
        return default
    return numerator / denominator


def compute_hhi(values: pd.Series) -> float:
    """
    Herfindahl-Hirschman Index of a set of values (a concentration measure).

    Each value is converted to its share of the total and the squared shares
    are summed. A total of zero (including an empty Series) yields 0.0.
    """
    grand_total = values.sum()
    if grand_total != 0:
        squared_shares = (values / grand_total) ** 2
        return squared_shares.sum()
    return 0.0


def compute_zscore(value, mean, std):
    """
    Compute the z-score (value - mean) / std, returning 0.0 where it is
    undefined.

    Works with scalars and with pandas Series / numpy arrays in any argument
    position (scalars are broadcast against arrays).

    Args:
        value: Observed value(s).
        mean: Baseline mean(s).
        std: Baseline standard deviation(s).

    Returns:
        0.0 wherever std is 0 or NaN, or mean is NaN; the z-score elsewhere.
        A scalar when all inputs are scalars, otherwise a numpy array.
    """
    array_like = (pd.Series, np.ndarray)
    if any(isinstance(arg, array_like) for arg in (value, mean, std)):
        undefined = (std == 0) | pd.isna(std) | pd.isna(mean)
        # Undefined positions are masked below, so divide exactly (no epsilon
        # bias) and just silence the floating-point warnings.
        with np.errstate(divide="ignore", invalid="ignore"):
            zscore = (value - mean) / std
        return np.where(undefined, 0.0, zscore)

    if std == 0 or pd.isna(std) or pd.isna(mean):
        return 0.0
    return (value - mean) / std


def normalize_series(series: pd.Series) -> pd.Series:
    """
    Min-max scale a Series onto the 0-1 range.

    When every value is equal (max == min) there is no range to scale by, so
    a Series of 0.5 with the same index is returned instead.
    """
    lowest = series.min()
    highest = series.max()
    if highest != lowest:
        return (series - lowest) / (highest - lowest)
    return pd.Series(0.5, index=series.index)


def clip_zscore(zscore, max_val=None):
    """
    Clip a z-score to the symmetric range [-max_val, max_val].

    Args:
        zscore: Scalar, pandas Series or numpy array.
        max_val: Clipping bound; defaults to CONFIG["zscore_clip_max"].

    Returns:
        The clipped value(s), same kind as the input. NaN stays NaN for
        scalars as well as arrays.
    """
    if max_val is None:
        max_val = CONFIG["zscore_clip_max"]
    if isinstance(zscore, (pd.Series, np.ndarray)):
        return np.clip(zscore, -max_val, max_val)
    # min()/max() would turn a NaN scalar into +max_val; keep it missing so
    # scalars behave like the array branch.
    if pd.isna(zscore):
        return zscore
    return max(-max_val, min(max_val, zscore))


# =============================================================================
# FEATURE GROUP 1: CONTRACT-LEVEL AGGREGATIONS
# =============================================================================

def compute_contract_level_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute contract-level aggregations and join them back onto every trade.

    Contract = same strike + expiry + type (identified by the 'ticker' column).

    Args:
        df: One row per trade. Reads the columns 'ticker', 'size', 'price',
            'opt_trade_notional_value', 'open_interest_now' and
            'open_interest_yesterday'.

    Returns:
        A new DataFrame (the input is not modified) with the same rows in
        the same order and a fresh RangeIndex from the merge, plus:
        contract_total_volume, contract_total_notional, contract_trade_count,
        contract_max_size, contract_min_size, contract_mean_size,
        contract_oi, contract_oi_yesterday, contract_vwap,
        trade_price_vs_contract_vwap, trade_size_pct_of_contract,
        trade_size_rank_in_contract, is_largest_in_contract,
        contract_volume_to_oi, contract_oi_change, contract_oi_change_pct,
        contract_hhi.
    """
    logger.debug("  Computing contract-level features...")

    eps = CONFIG["epsilon"]

    # -------------------------------------------------------------------------
    # Aggregate statistics per contract
    # -------------------------------------------------------------------------

    # price * size is computed once, positionally, for the whole frame. A
    # per-group lambda that re-indexes the outer frame by label is slow and
    # misaligns when index labels are not unique.
    per_trade = df[[
        "ticker", "size", "opt_trade_notional_value",
        "open_interest_now", "open_interest_yesterday",
    ]].assign(price_x_size=df["price"] * df["size"])
    by_contract = per_trade.groupby("ticker")

    contract_agg = by_contract.agg(
        contract_total_volume=("size", "sum"),
        contract_total_notional=("opt_trade_notional_value", "sum"),
        contract_trade_count=("size", "count"),
        contract_max_size=("size", "max"),
        contract_min_size=("size", "min"),
        contract_mean_size=("size", "mean"),
        contract_oi=("open_interest_now", "first"),
        contract_oi_yesterday=("open_interest_yesterday", "first"),
        contract_price_sum_product=("price_x_size", "sum"),
    )

    # Contract VWAP = sum(price * size) / sum(size)
    contract_agg["contract_vwap"] = (
        contract_agg["contract_price_sum_product"]
        / (contract_agg["contract_total_volume"] + eps)
    )
    contract_agg = contract_agg.drop(columns=["contract_price_sum_product"])

    # HHI of trade sizes within each contract (are a few trades dominating?).
    # Grouping only the 'size' column avoids handing the grouping column to
    # the applied function.
    contract_hhi = by_contract["size"].apply(compute_hhi)

    # -------------------------------------------------------------------------
    # Join aggregates back to trades
    # -------------------------------------------------------------------------

    df = df.merge(contract_agg.reset_index(), on="ticker", how="left")

    # -------------------------------------------------------------------------
    # Compute trade-level features relative to contract
    # -------------------------------------------------------------------------

    # Price vs VWAP (urgency signal): > 0 means the trade paid above VWAP
    df["trade_price_vs_contract_vwap"] = (
        df["price"] / (df["contract_vwap"] + eps)
    ) - 1

    # Trade size as a fraction of the contract's total volume
    df["trade_size_pct_of_contract"] = safe_divide(
        df["size"], df["contract_total_volume"]
    )

    # Trade size rank within contract (1 = largest; ties share the best rank)
    df["trade_size_rank_in_contract"] = df.groupby("ticker")["size"].rank(
        ascending=False, method="min"
    )

    # Is this the largest trade in the contract?
    df["is_largest_in_contract"] = (
        df["size"] == df["contract_max_size"]
    ).astype(int)

    # Contract volume to open interest (turnover)
    df["contract_volume_to_oi"] = safe_divide(
        df["contract_total_volume"], df["contract_oi"]
    )

    # OI change (positive = new positions being opened)
    df["contract_oi_change"] = df["contract_oi"] - df["contract_oi_yesterday"]
    df["contract_oi_change_pct"] = safe_divide(
        df["contract_oi_change"], df["contract_oi_yesterday"]
    )

    # -------------------------------------------------------------------------
    # Contract-level concentration (HHI)
    # -------------------------------------------------------------------------

    df["contract_hhi"] = df["ticker"].map(contract_hhi)

    return df


# =============================================================================
# FEATURE GROUP 2: CHAIN-LEVEL AGGREGATIONS
# =============================================================================

def compute_chain_level_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute chain-level (entire underlying for the day) aggregations.

    The whole frame is treated as one chain: every chain aggregate is a
    single number that is broadcast onto every row.

    Args:
        df: One row per trade. Reads 'ticker', 'size',
            'opt_trade_notional_value', 'open_interest_now',
            'option_type_call', 'otm_percentage', 'days_to_expiry',
            'contract_total_volume' and 'contract_total_notional'.
            The execution flags 'cndn_multilegstrategy',
            'cndn_autoelectronic', 'cndn_floorexecuted' and the greek
            exposure columns 'delta_size', 'gamma_size', 'vega_size',
            'theta_size' are optional.

    Returns:
        The same DataFrame object, modified in place, with the chain-level
        columns added. A chain_net_* greek column is added only when its
        source column is present.
    """
    logger.debug("  Computing chain-level features...")

    otm_threshold = CONFIG["otm_threshold"]
    deep_otm_threshold = CONFIG["deep_otm_threshold"]
    short_dte_threshold = CONFIG["short_dte_threshold"]
    long_dte_floor = CONFIG["medium_dte_threshold"]

    def volume_where(mask):
        """Total traded size over the rows selected by mask."""
        return df.loc[mask, "size"].sum()

    # -------------------------------------------------------------------------
    # Basic chain aggregates
    # -------------------------------------------------------------------------

    chain_total_volume = df["size"].sum()
    chain_total_notional = df["opt_trade_notional_value"].sum()

    df["chain_total_volume"] = chain_total_volume
    df["chain_total_notional"] = chain_total_notional
    df["chain_trade_count"] = len(df)

    # Total OI: take each contract's OI once, then sum across contracts
    chain_oi = df.groupby("ticker")["open_interest_now"].first().sum()
    df["chain_total_oi"] = chain_oi
    df["chain_volume_to_oi"] = safe_divide(chain_total_volume, chain_oi)

    # -------------------------------------------------------------------------
    # Contract and trade share of chain
    # -------------------------------------------------------------------------

    df["contract_volume_share_of_chain"] = safe_divide(
        df["contract_total_volume"], chain_total_volume
    )
    df["contract_notional_share_of_chain"] = safe_divide(
        df["contract_total_notional"], chain_total_notional
    )
    df["trade_size_pct_of_chain"] = safe_divide(df["size"], chain_total_volume)
    df["trade_notional_pct_of_chain"] = safe_divide(
        df["opt_trade_notional_value"], chain_total_notional
    )

    # -------------------------------------------------------------------------
    # Call/Put distribution
    # -------------------------------------------------------------------------

    call_mask = df["option_type_call"] == 1
    put_mask = df["option_type_call"] == 0

    chain_call_volume = volume_where(call_mask)
    chain_put_volume = volume_where(put_mask)
    chain_call_notional = df.loc[call_mask, "opt_trade_notional_value"].sum()
    chain_put_notional = df.loc[put_mask, "opt_trade_notional_value"].sum()

    df["chain_call_volume"] = chain_call_volume
    df["chain_put_volume"] = chain_put_volume
    df["chain_call_notional"] = chain_call_notional
    df["chain_put_notional"] = chain_put_notional

    # +1 on the put side keeps the ratio finite when no puts traded
    df["call_put_volume_ratio"] = safe_divide(chain_call_volume, chain_put_volume + 1)
    df["call_put_notional_ratio"] = safe_divide(chain_call_notional, chain_put_notional + 1)
    df["call_volume_share"] = safe_divide(chain_call_volume, chain_total_volume)
    df["put_volume_share"] = safe_divide(chain_put_volume, chain_total_volume)

    # -------------------------------------------------------------------------
    # Moneyness distribution
    # NOTE: the ITM (< 0) and ATM (within +/- otm_threshold) buckets overlap,
    # so the four shares are not expected to sum to 1.
    # -------------------------------------------------------------------------

    otm_pct = df["otm_percentage"]

    chain_otm_volume = volume_where(otm_pct > otm_threshold)
    chain_deep_otm_volume = volume_where(otm_pct > deep_otm_threshold)
    chain_itm_volume = volume_where(otm_pct < 0)
    chain_atm_volume = volume_where(
        (otm_pct >= -otm_threshold) & (otm_pct <= otm_threshold)
    )

    df["chain_otm_volume"] = chain_otm_volume
    df["chain_deep_otm_volume"] = chain_deep_otm_volume
    df["chain_itm_volume"] = chain_itm_volume
    df["chain_atm_volume"] = chain_atm_volume

    df["otm_volume_share"] = safe_divide(chain_otm_volume, chain_total_volume)
    df["deep_otm_volume_share"] = safe_divide(chain_deep_otm_volume, chain_total_volume)
    df["itm_volume_share"] = safe_divide(chain_itm_volume, chain_total_volume)
    df["atm_volume_share"] = safe_divide(chain_atm_volume, chain_total_volume)

    # Volume-weighted average OTM
    df["chain_volume_weighted_otm"] = safe_divide(
        (df["otm_percentage"] * df["size"]).sum(), chain_total_volume
    )

    # -------------------------------------------------------------------------
    # DTE distribution
    # -------------------------------------------------------------------------

    chain_short_dte_volume = volume_where(df["days_to_expiry"] <= short_dte_threshold)
    chain_long_dte_volume = volume_where(df["days_to_expiry"] > long_dte_floor)

    df["chain_short_dte_volume"] = chain_short_dte_volume
    df["chain_long_dte_volume"] = chain_long_dte_volume

    df["short_dte_volume_share"] = safe_divide(chain_short_dte_volume, chain_total_volume)
    df["long_dte_volume_share"] = safe_divide(chain_long_dte_volume, chain_total_volume)

    # Volume-weighted average DTE
    df["chain_volume_weighted_dte"] = safe_divide(
        (df["days_to_expiry"] * df["size"]).sum(), chain_total_volume
    )

    # -------------------------------------------------------------------------
    # Execution type distribution
    # -------------------------------------------------------------------------

    # (flag column, resulting share feature); the share is 0.0 when the
    # flag column is absent from the input.
    execution_flags = (
        ("cndn_multilegstrategy", "multileg_volume_share"),
        ("cndn_autoelectronic", "sweep_volume_share"),
        ("cndn_floorexecuted", "floor_volume_share"),
    )
    for flag_col, share_col in execution_flags:
        if flag_col in df.columns:
            df[share_col] = safe_divide(
                volume_where(df[flag_col] == 1), chain_total_volume
            )
        else:
            df[share_col] = 0.0

    # -------------------------------------------------------------------------
    # Chain-level concentration metrics
    # -------------------------------------------------------------------------

    contract_volumes = df.groupby("ticker")["size"].sum()

    # Share of chain volume held by the single busiest contract
    df["chain_max_contract_share"] = safe_divide(
        contract_volumes.max(), chain_total_volume
    )

    # Contract HHI (concentration across contracts)
    df["chain_contract_hhi"] = compute_hhi(contract_volumes)

    # Number of unique contracts traded
    df["chain_unique_contracts"] = len(contract_volumes)

    # -------------------------------------------------------------------------
    # Greeks aggregation (chain-wide)
    # -------------------------------------------------------------------------

    # Each greek is checked on its own: gating all four on 'delta_size'
    # alone raises KeyError when one of the other columns is missing.
    greek_totals = (
        ("delta_size", "chain_net_delta"),
        ("gamma_size", "chain_net_gamma"),
        ("vega_size", "chain_net_vega"),
        ("theta_size", "chain_net_theta"),
    )
    for exposure_col, net_col in greek_totals:
        if exposure_col in df.columns:
            df[net_col] = df[exposure_col].sum()

    return df


# =============================================================================
# FEATURE GROUP 3: EXPIRY-LEVEL AGGREGATIONS
# =============================================================================

def compute_expiry_level_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute expiry-level (same expiration date, all strikes) aggregations.

    Args:
        df: One row per trade; must be non-empty and already carry
            'chain_total_volume', 'chain_total_notional' and
            'contract_total_volume'. Also reads 'opt_expiration_date',
            'size', 'opt_trade_notional_value', 'option_type_call',
            'strike_price' and 'otm_percentage'.

    Returns:
        A new DataFrame (the input is not modified) with the same rows in
        the same order and a fresh RangeIndex from the merge, plus:
        expiry_total_volume, expiry_total_notional, expiry_trade_count,
        expiry_volume_share_of_chain, expiry_notional_share_of_chain,
        expiry_call_put_ratio, expiry_unique_strikes,
        contract_volume_share_of_expiry, expiry_strike_hhi,
        expiry_otm_volume_share.
    """
    logger.debug("  Computing expiry-level features...")

    otm_threshold = CONFIG["otm_threshold"]
    # Chain totals are constant across rows; read them from the first row.
    chain_total_volume = df["chain_total_volume"].iloc[0]
    chain_total_notional = df["chain_total_notional"].iloc[0]

    # -------------------------------------------------------------------------
    # Aggregate by expiry
    # -------------------------------------------------------------------------

    # Split size into call-only and put-only columns up front so the
    # aggregation needs no per-group lambda that looks back into the outer
    # frame by index label.
    call_flag = df["option_type_call"]
    per_trade = df[[
        "opt_expiration_date", "size", "opt_trade_notional_value", "strike_price",
    ]].assign(
        call_size=df["size"].where(call_flag == 1, 0),
        put_size=df["size"].where(call_flag == 0, 0),
    )

    expiry_agg = per_trade.groupby("opt_expiration_date").agg(
        expiry_total_volume=("size", "sum"),
        expiry_total_notional=("opt_trade_notional_value", "sum"),
        expiry_trade_count=("size", "count"),
        expiry_call_volume=("call_size", "sum"),
        expiry_put_volume=("put_size", "sum"),
        expiry_unique_strikes=("strike_price", "nunique"),
    ).reset_index()

    # Expiry share of chain
    expiry_agg["expiry_volume_share_of_chain"] = safe_divide(
        expiry_agg["expiry_total_volume"], chain_total_volume
    )
    expiry_agg["expiry_notional_share_of_chain"] = safe_divide(
        expiry_agg["expiry_total_notional"], chain_total_notional
    )

    # Call/put ratio within expiry (+1 keeps it finite when no puts traded)
    expiry_agg["expiry_call_put_ratio"] = safe_divide(
        expiry_agg["expiry_call_volume"], expiry_agg["expiry_put_volume"] + 1
    )

    # -------------------------------------------------------------------------
    # Per-expiry concentration and OTM share
    # -------------------------------------------------------------------------

    # Only the columns each function needs are selected, so the grouping
    # column is never passed to the applied function.
    by_expiry = df.groupby("opt_expiration_date")

    # HHI of volume across strikes (calls and puts at a strike are pooled)
    expiry_strike_hhi = by_expiry[["strike_price", "size"]].apply(
        lambda g: compute_hhi(g.groupby("strike_price")["size"].sum())
    )

    # Share of the expiry's volume traded in OTM contracts
    expiry_otm_share = by_expiry[["otm_percentage", "size"]].apply(
        lambda g: safe_divide(
            g.loc[g["otm_percentage"] > otm_threshold, "size"].sum(),
            g["size"].sum()
        )
    )

    # -------------------------------------------------------------------------
    # Join back to trades
    # -------------------------------------------------------------------------

    df = df.merge(
        expiry_agg[["opt_expiration_date", "expiry_total_volume", "expiry_total_notional",
                    "expiry_trade_count", "expiry_volume_share_of_chain",
                    "expiry_notional_share_of_chain", "expiry_call_put_ratio",
                    "expiry_unique_strikes"]],
        on="opt_expiration_date",
        how="left"
    )

    # Contract share within expiry
    df["contract_volume_share_of_expiry"] = safe_divide(
        df["contract_total_volume"], df["expiry_total_volume"]
    )

    df["expiry_strike_hhi"] = df["opt_expiration_date"].map(expiry_strike_hhi)
    df["expiry_otm_volume_share"] = df["opt_expiration_date"].map(expiry_otm_share)

    return df


# =============================================================================
# FEATURE GROUP 4: HISTORICAL BASELINE COMPARISONS
# =============================================================================

def compute_baseline_comparison_features(
    df: pd.DataFrame, 
    baseline_df: pd.DataFrame,
    underlying: str,
    trade_date: str
) -> pd.DataFrame:
    """
    Compare today's activity to historical baselines.

    For each chain-level metric, two features are written: the ratio of
    today's value to the baseline mean, and the z-score of today's value
    against the baseline mean and standard deviation.

    Args:
        df: One row per trade; must already carry the chain-level columns
            ('chain_total_volume', 'chain_total_notional',
            'call_volume_share', 'otm_volume_share', 'deep_otm_volume_share',
            'short_dte_volume_share', 'chain_max_contract_share',
            'chain_volume_to_oi') plus 'size' and
            'trade_price_vs_contract_vwap'.
        baseline_df: Baseline table with 'underlying', 'trade_date',
            'baseline_days_count', 'baseline_is_sufficient',
            'baseline_vwap_premium_mean' and a '<prefix>_mean' /
            '<prefix>_std' column pair per metric.
        underlying: Value matched against baseline_df['underlying'].
        trade_date: Value matched against baseline_df['trade_date'].

    Returns:
        The same DataFrame object, modified in place. When no baseline row
        matches, every comparison feature is NaN and baseline_days_count /
        baseline_is_sufficient are 0. When several rows match, the first
        one is used.
    """
    logger.debug("  Computing baseline comparison features...")

    # (ratio feature, z-score feature, baseline column prefix)
    comparisons = (
        ("chain_volume_vs_baseline_mean", "chain_volume_zscore", "baseline_chain_volume"),
        ("chain_notional_vs_baseline_mean", "chain_notional_zscore", "baseline_chain_notional"),
        ("avg_trade_size_vs_baseline", "avg_trade_size_zscore", "baseline_avg_trade_size"),
        ("call_share_vs_baseline", "call_share_zscore", "baseline_call_volume_share"),
        ("otm_share_vs_baseline", "otm_share_zscore", "baseline_otm_volume_share"),
        ("deep_otm_share_vs_baseline", "deep_otm_share_zscore", "baseline_deep_otm_volume_share"),
        ("short_dte_share_vs_baseline", "short_dte_share_zscore", "baseline_short_dte_volume_share"),
        ("max_contract_share_vs_baseline", "max_contract_share_zscore", "baseline_max_contract_share"),
        ("volume_to_oi_vs_baseline", "volume_to_oi_zscore", "baseline_volume_to_oi"),
    )

    # -------------------------------------------------------------------------
    # Get baseline for this underlying and date
    # -------------------------------------------------------------------------

    baseline_row = baseline_df[
        (baseline_df["underlying"] == underlying) &
        (baseline_df["trade_date"] == trade_date)
    ]

    if baseline_row.empty:
        logger.warning(f"    No baseline found for {underlying} on {trade_date}")
        # Keep the output schema stable: comparison features exist but are NaN.
        for ratio_col, zscore_col, _ in comparisons:
            df[ratio_col] = np.nan
            df[zscore_col] = np.nan
        df["vwap_premium_vs_baseline"] = np.nan
        df["baseline_days_count"] = 0
        df["baseline_is_sufficient"] = 0
        return df

    baseline = baseline_row.iloc[0]

    # Store baseline metadata
    df["baseline_days_count"] = baseline["baseline_days_count"]
    df["baseline_is_sufficient"] = baseline["baseline_is_sufficient"]

    # -------------------------------------------------------------------------
    # Today's chain-level values, keyed by the baseline prefix they compare to.
    # Chain columns are constant across rows, so the first row is read.
    # -------------------------------------------------------------------------

    observed_today = {
        "baseline_chain_volume": df["chain_total_volume"].iloc[0],
        "baseline_chain_notional": df["chain_total_notional"].iloc[0],
        "baseline_avg_trade_size": df["size"].mean(),
        "baseline_call_volume_share": df["call_volume_share"].iloc[0],
        "baseline_otm_volume_share": df["otm_volume_share"].iloc[0],
        "baseline_deep_otm_volume_share": df["deep_otm_volume_share"].iloc[0],
        "baseline_short_dte_volume_share": df["short_dte_volume_share"].iloc[0],
        "baseline_max_contract_share": df["chain_max_contract_share"].iloc[0],
        "baseline_volume_to_oi": df["chain_volume_to_oi"].iloc[0],
    }

    # -------------------------------------------------------------------------
    # Ratio and z-score against the baseline for every metric
    # -------------------------------------------------------------------------

    for ratio_col, zscore_col, prefix in comparisons:
        observed = observed_today[prefix]
        baseline_mean = baseline[f"{prefix}_mean"]
        baseline_std = baseline[f"{prefix}_std"]
        df[ratio_col] = safe_divide(observed, baseline_mean)
        df[zscore_col] = compute_zscore(observed, baseline_mean, baseline_std)

    # -------------------------------------------------------------------------
    # VWAP premium (ratio only; no z-score is computed for it)
    # -------------------------------------------------------------------------

    avg_vwap_premium = df["trade_price_vs_contract_vwap"].mean()
    # No epsilon on the denominator: safe_divide already maps a zero/NaN
    # baseline to its default, whereas "+ eps" would turn a zero baseline
    # into an enormous ratio.
    df["vwap_premium_vs_baseline"] = safe_divide(
        avg_vwap_premium, baseline["baseline_vwap_premium_mean"]
    )

    return df


# =============================================================================
# FEATURE GROUP 5: COMPOSITE SCORES (TYPE A DETECTION)
# =============================================================================

def _minutes_since_market_open(timestamps: pd.Series) -> pd.Series:
    """Minutes from 09:30 on each timestamp's own clock, clipped to 0-390."""
    minutes = (
        timestamps.dt.hour * 60 +
        timestamps.dt.minute -
        9 * 60 - 30  # Minutes since 9:30 AM
    )
    # Clip to regular trading hours (0-390 minutes)
    return minutes.clip(0, 390)


def compute_composite_scores(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute composite scores optimized for Type A detection.

    Adds time_of_day_minutes, is_early_session, urgency_score,
    size_dominance_score, informed_flow_score, baseline_anomaly_score and
    type_a_score. Each score is a fixed-weight blend of components scaled
    to the 0-1 range.

    Args:
        df: One row per trade carrying 'sip_timestamp' and the contract,
            chain and baseline-comparison features read below.
            'cndn_autoelectronic' is optional (treated as 0 when absent).

    Returns:
        The same DataFrame object, modified in place. When 'sip_timestamp'
        holds strings, a parsed 'trade_datetime' column is added as well.
    """
    logger.debug("  Computing composite scores...")

    eps = CONFIG["epsilon"]
    zscore_clip = CONFIG["zscore_clip_max"]
    midday_minutes = 195  # Fallback when the time of day cannot be derived

    # -------------------------------------------------------------------------
    # Time of day features
    # NOTE(review): the hour is read as-is, so timestamps are assumed to be
    # in exchange-local time; confirm against the upstream phase.
    # -------------------------------------------------------------------------

    sip_timestamp = df["sip_timestamp"]
    if pd.api.types.is_datetime64_any_dtype(sip_timestamp):
        # Already datetime-typed: use it directly rather than silently
        # falling back to the mid-day constant.
        df["time_of_day_minutes"] = _minutes_since_market_open(sip_timestamp)
    elif sip_timestamp.dtype == object or pd.api.types.is_string_dtype(sip_timestamp):
        try:
            df["trade_datetime"] = pd.to_datetime(sip_timestamp)
            df["time_of_day_minutes"] = _minutes_since_market_open(
                df["trade_datetime"]
            )
        except (ValueError, TypeError, OverflowError, AttributeError) as exc:
            logger.warning(
                "    Could not parse sip_timestamp (%s); "
                "defaulting time of day to mid-session", exc
            )
            df["time_of_day_minutes"] = midday_minutes
    else:
        # Other dtypes (e.g. raw integers) have no unit we can verify here.
        df["time_of_day_minutes"] = midday_minutes

    # Early session indicator (first 30 minutes)
    df["is_early_session"] = (df["time_of_day_minutes"] <= 30).astype(int)

    # -------------------------------------------------------------------------
    # URGENCY SCORE
    # Captures time pressure signals
    # -------------------------------------------------------------------------

    # Time component (earlier = higher urgency)
    time_score = 1 - (df["time_of_day_minutes"] / 390)

    # Price vs VWAP component (paying above average = urgency)
    vwap_premium_norm = normalize_series(
        df["trade_price_vs_contract_vwap"].clip(-0.1, 0.2)
    )

    # Sweep indicator (aggressive execution)
    sweep_indicator = df.get("cndn_autoelectronic", 0)

    df["urgency_score"] = (
        0.40 * vwap_premium_norm +
        0.35 * time_score +
        0.25 * sweep_indicator
    )

    # -------------------------------------------------------------------------
    # SIZE DOMINANCE SCORE
    # Captures whether this trade is unusually large
    # -------------------------------------------------------------------------

    # Trade size relative to contract
    contract_pct_norm = normalize_series(
        df["trade_size_pct_of_contract"].clip(0, 1)
    )

    # Trade size relative to chain
    chain_pct_norm = normalize_series(
        df["trade_size_pct_of_chain"].clip(0, 0.1)  # Clip outliers
    )

    # Inverse rank (rank 1 = highest score)
    rank_score = 1 / (df["trade_size_rank_in_contract"] + eps)
    rank_score_norm = normalize_series(rank_score)

    # Contract turnover (volume / OI)
    turnover_norm = normalize_series(
        df["contract_volume_to_oi"].clip(0, 5)
    )

    df["size_dominance_score"] = (
        0.30 * contract_pct_norm +
        0.30 * chain_pct_norm +
        0.20 * rank_score_norm +
        0.20 * turnover_norm
    )

    # -------------------------------------------------------------------------
    # INFORMED FLOW SCORE (Primary Type A Detector)
    # Combines key signals for informed directional betting
    # -------------------------------------------------------------------------

    # OTM component (higher OTM = more speculative/informed)
    otm_norm = normalize_series(df["otm_percentage"].clip(0, 50))

    # Near-dated component (lower DTE = catalyst-driven)
    dte_score = 1 / (df["days_to_expiry"].clip(1, 365) + eps)
    dte_norm = normalize_series(dte_score)

    # Volume zscore component (unusual day for this underlying);
    # a missing z-score counts as 0, i.e. "not unusual"
    volume_zscore_clipped = clip_zscore(df["chain_volume_zscore"]).fillna(0)
    volume_zscore_norm = (volume_zscore_clipped + zscore_clip) / (2 * zscore_clip)

    # Contract concentration (bet concentrated in this contract)
    concentration_norm = normalize_series(
        df["contract_volume_share_of_chain"].clip(0, 0.5)
    )

    df["informed_flow_score"] = (
        0.25 * normalize_series(df["size_dominance_score"]) +
        0.20 * normalize_series(df["urgency_score"]) +
        0.20 * otm_norm +
        0.15 * dte_norm +
        0.10 * volume_zscore_norm +
        0.10 * concentration_norm
    )

    # -------------------------------------------------------------------------
    # BASELINE ANOMALY SCORE
    # Purely statistical measure of how unusual this day is
    # -------------------------------------------------------------------------

    # Clip z-scores; missing values count as 0
    vol_z = clip_zscore(df["chain_volume_zscore"]).fillna(0)
    deep_otm_z = clip_zscore(df["deep_otm_share_zscore"]).fillna(0)
    max_contract_z = clip_zscore(df["max_contract_share_zscore"]).fillna(0)
    vol_oi_z = clip_zscore(df["volume_to_oi_zscore"]).fillna(0)

    # Directional alignment: call share z-score for calls, negated for puts
    call_z = clip_zscore(df["call_share_zscore"]).fillna(0)
    directional_z = np.where(
        df["option_type_call"] == 1,
        call_z,  # For calls, positive call share is aligned
        -call_z  # For puts, negative call share is aligned
    )

    # Map each clipped z-score from [-clip, +clip] onto 0-1 before weighting
    df["baseline_anomaly_score"] = (
        0.30 * (vol_z + zscore_clip) / (2 * zscore_clip) +
        0.20 * (deep_otm_z + zscore_clip) / (2 * zscore_clip) +
        0.20 * (max_contract_z + zscore_clip) / (2 * zscore_clip) +
        0.15 * (vol_oi_z + zscore_clip) / (2 * zscore_clip) +
        0.15 * (directional_z + zscore_clip) / (2 * zscore_clip)
    )

    # -------------------------------------------------------------------------
    # COMBINED TYPE A SCORE
    # Final score optimized for Type A detection
    # -------------------------------------------------------------------------

    df["type_a_score"] = (
        0.35 * normalize_series(df["informed_flow_score"]) +
        0.30 * normalize_series(df["size_dominance_score"]) +
        0.20 * normalize_series(df["baseline_anomaly_score"]) +
        0.15 * normalize_series(df["urgency_score"])
    )

    return df


# =============================================================================
# FEATURE GROUP 6: FALSE POSITIVE FILTERS
# =============================================================================

def compute_false_positive_filters(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute features to filter out likely false positives.

    Adds binary (0/1) flag columns, a ``relative_spread`` column, a weighted
    ``false_positive_risk`` score and a final ``is_high_quality_signal`` flag.

    Args:
        df: Per-trade DataFrame. The columns ``contract_oi_change``,
            ``otm_percentage``, ``contract_trade_count``,
            ``chain_trade_count``, ``chain_total_volume`` and
            ``baseline_is_sufficient`` are required. The columns
            ``cndn_multilegstrategy``, ``condn_extendedhours``,
            ``close_ask`` and ``close_bid`` are optional.

    Returns:
        The same DataFrame object, with the new columns added in place.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    logger.debug("  Computing false positive filters...")

    def _optional_flag(column: str) -> pd.Series:
        """Return `column` as a 0/1 integer flag; missing column or NaN -> 0."""
        if column not in df.columns:
            return pd.Series(0, index=df.index, dtype=int)
        # Going through float turns None/NA into NaN for every input dtype,
        # so a missing value cannot leak NaN into the risk score below.
        return df[column].astype(float).fillna(0.0).astype(int)

    # -------------------------------------------------------------------------
    # Trade type filters
    # -------------------------------------------------------------------------

    # Multi-leg trades (usually hedges, not directional bets)
    # NOTE(review): this column uses the prefix "cndn_" while the extended
    # hours column uses "condn_" - confirm both names against Phase 03 output.
    df["is_likely_multileg"] = _optional_flag("cndn_multilegstrategy")

    # Extended hours (different dynamics)
    df["is_extended_hours"] = _optional_flag("condn_extendedhours")

    # -------------------------------------------------------------------------
    # Position type filters
    # -------------------------------------------------------------------------

    # Likely closing position (OI decreased); NaN OI change counts as "not closing"
    df["is_likely_closing"] = (
        df["contract_oi_change"] < 0
    ).astype(int)

    # ITM trades (often hedging or assignment-related)
    df["is_itm_trade"] = (df["otm_percentage"] < 0).astype(int)

    # -------------------------------------------------------------------------
    # Liquidity filters
    # -------------------------------------------------------------------------

    # Low liquidity contract
    df["is_low_liquidity_contract"] = (
        df["contract_trade_count"] < CONFIG["min_contract_trades"]
    ).astype(int)

    # Low liquidity chain
    df["is_low_liquidity_chain"] = (
        (df["chain_trade_count"] < CONFIG["min_chain_trades"]) |
        (df["chain_total_volume"] < CONFIG["min_chain_volume"])
    ).astype(int)

    # -------------------------------------------------------------------------
    # Data quality filters
    # -------------------------------------------------------------------------

    # Insufficient baseline. A missing (NaN) sufficiency value is treated as
    # insufficient: a plain `== 0` test is False for NaN and would report a
    # trade with no baseline at all as having a sufficient one.
    # NOTE(review): presumably NaN occurs when no baseline file exists - confirm.
    baseline_ok = df["baseline_is_sufficient"]
    df["is_insufficient_baseline"] = (
        baseline_ok.isna() | (baseline_ok == 0)
    ).astype(int)

    # Wide spread (illiquid, price less meaningful)
    if "close_ask" in df.columns and "close_bid" in df.columns:
        mid_price = (df["close_ask"] + df["close_bid"]) / 2
        spread = df["close_ask"] - df["close_bid"]
        df["relative_spread"] = safe_divide(spread, mid_price)
        # NaN relative spread compares False, i.e. "not wide"
        df["is_wide_spread"] = (df["relative_spread"] > 0.10).astype(int)
    else:
        df["relative_spread"] = np.nan
        df["is_wide_spread"] = 0

    # -------------------------------------------------------------------------
    # Combined False Positive Risk Score
    # -------------------------------------------------------------------------

    # Weights sum to 1.0, so the score lies in [0, 1] for 0/1 flags
    df["false_positive_risk"] = (
        0.25 * df["is_likely_multileg"] +
        0.20 * df["is_likely_closing"] +
        0.15 * df["is_low_liquidity_contract"] +
        0.15 * df["is_low_liquidity_chain"] +
        0.10 * df["is_itm_trade"] +
        0.08 * df["is_extended_hours"] +
        0.05 * df["is_insufficient_baseline"] +
        0.02 * df["is_wide_spread"]
    )

    # -------------------------------------------------------------------------
    # Final filter flag
    # -------------------------------------------------------------------------

    # High-quality signal flag (passes all major filters)
    df["is_high_quality_signal"] = (
        (df["is_likely_multileg"] == 0) &
        (df["is_low_liquidity_chain"] == 0) &
        (df["is_extended_hours"] == 0) &
        (df["false_positive_risk"] < 0.4)
    ).astype(int)

    return df


# =============================================================================
# MAIN PROCESSING FUNCTION
# =============================================================================

def process_single_file(
    filepath: Path,
    underlying: str,
    trade_date: str,
    baseline_folder: Path,
    output_folder: Path
) -> bool:
    """
    Process a single file through all aggregation stages.

    Loads the per-row feature file, runs the six feature engineering stages
    in order, and writes the enriched DataFrame to
    ``{underlying}_aggregatedfeatures_{trade_date}.parquet`` in
    ``output_folder``.

    Args:
        filepath: Parquet file with the per-row features to load.
        underlying: Ticker symbol; rows with a different value in the
            ``underlying`` column (if that column exists) are dropped.
        trade_date: Trade date string, used in the output filename and
            passed to the baseline comparison stage.
        baseline_folder: Folder searched for ``baseline_{underlying}_daily.parquet``.
        output_folder: Folder the output parquet file is written to.

    Returns:
        True if successful, False otherwise (no trades for the ticker, or any
        exception raised while processing - exceptions are logged, not raised).
    """
    try:
        # Load data
        df = pd.read_parquet(filepath)

        # Filter to correct underlying (safety check)
        if "underlying" in df.columns:
            df = df[df["underlying"] == underlying].copy()

        if len(df) == 0:
            logger.warning(f"  No trades found for {underlying} on {trade_date}")
            return False

        logger.info(f"  Loaded {len(df):,} trades")

        # Load baseline for this ticker
        baseline_file = baseline_folder / f"baseline_{underlying}_daily.parquet"
        if baseline_file.exists():
            baseline_df = pd.read_parquet(baseline_file)
            logger.debug(f"  Loaded baseline: {len(baseline_df)} rows")
        else:
            logger.warning(f"  No baseline file found for {underlying}")
            baseline_df = pd.DataFrame()  # Empty dataframe, will trigger NaN baseline features

        # Remember the row count so a stage that drops or duplicates trades
        # can be detected after the pipeline has run.
        original_count = len(df)

        # =====================================================================
        # APPLY FEATURE ENGINEERING STAGES
        # =====================================================================

        # Stage 1: Contract-level features
        df = compute_contract_level_features(df)

        # Stage 2: Chain-level features
        df = compute_chain_level_features(df)

        # Stage 3: Expiry-level features
        df = compute_expiry_level_features(df)

        # Stage 4: Historical baseline comparisons
        df = compute_baseline_comparison_features(
            df, baseline_df, underlying, trade_date
        )

        # Stage 5: Composite scores
        df = compute_composite_scores(df)

        # Stage 6: False positive filters
        df = compute_false_positive_filters(df)

        # Each output row is meant to be one input trade; flag any drift
        # rather than silently writing a file with a different row count.
        if len(df) != original_count:
            logger.warning(
                f"  Row count changed during feature engineering: "
                f"{original_count:,} -> {len(df):,}"
            )

        # =====================================================================
        # SAVE OUTPUT
        # =====================================================================

        output_filename = f"{underlying}_aggregatedfeatures_{trade_date}.parquet"
        output_path = output_folder / output_filename

        df.to_parquet(output_path, index=False)

        # Log summary statistics
        high_quality_count = df["is_high_quality_signal"].sum()
        avg_type_a_score = df["type_a_score"].mean()
        max_type_a_score = df["type_a_score"].max()

        logger.info(f"  Saved: {output_filename}")
        logger.info(f"    Rows: {len(df):,} | Columns: {len(df.columns)}")
        logger.info(f"    High-quality signals: {high_quality_count:,} "
                   f"({100*high_quality_count/len(df):.1f}%)")
        logger.info(f"    Type A Score: mean={avg_type_a_score:.3f}, "
                   f"max={max_type_a_score:.3f}")

        return True

    except Exception as e:
        # Per-file boundary: one bad file must not abort the whole batch.
        logger.error(f"  Error processing {filepath.name}: {e}")
        # Full traceback only at debug level to keep normal logs compact.
        logger.debug("  Traceback for %s:", filepath.name, exc_info=True)
        return False


def main():
    """
    Main execution function.

    Reads folders and filters from ``CONFIG``, verifies the baseline folder
    exists, collects the ``*_perrowfeatures_*.parquet`` input files that pass
    the ticker and date filters, processes them one by one in (date, ticker)
    order, and logs a summary. Returns None in all cases.
    """

    logger.info("=" * 70)
    # Phase label matches the module docstring (this script is Phase 04A-2).
    logger.info("PHASE 04A-2: AGGREGATED FEATURE ENGINEERING")
    logger.info("=" * 70)

    # -------------------------------------------------------------------------
    # SETUP
    # -------------------------------------------------------------------------

    input_folder = CONFIG["input_folder"]
    baseline_folder = CONFIG["baseline_folder"]
    output_folder = CONFIG["output_folder"]

    # Create output folder if needed
    output_folder.mkdir(parents=True, exist_ok=True)

    logger.info(f"Input folder: {input_folder}")
    logger.info(f"Baseline folder: {baseline_folder}")
    logger.info(f"Output folder: {output_folder}")

    # -------------------------------------------------------------------------
    # VERIFY BASELINE FOLDER EXISTS
    # -------------------------------------------------------------------------

    logger.info("-" * 70)
    logger.info("CHECKING BASELINE FOLDER")
    logger.info("-" * 70)

    if not baseline_folder.exists():
        logger.error(f"Baseline folder not found: {baseline_folder}")
        # Baselines are produced by Phase 04A-1 per the module docstring.
        logger.error("Please run Phase 04A-1 first to generate baselines.")
        return

    # Count available baseline files
    baseline_files = list(baseline_folder.glob("baseline_*_daily.parquet"))
    logger.info(f"Found {len(baseline_files)} baseline files in folder")

    # -------------------------------------------------------------------------
    # SCAN INPUT FILES
    # -------------------------------------------------------------------------

    logger.info("-" * 70)
    logger.info("SCANNING INPUT FILES")
    logger.info("-" * 70)

    # Parse date filters (a falsy value disables the corresponding bound)
    start_date = CONFIG["start_date"]
    end_date = CONFIG["end_date"]
    tickers_to_process = CONFIG["tickers_to_process"]

    start_dt = datetime.strptime(start_date, '%Y-%m-%d') if start_date else None
    end_dt = datetime.strptime(end_date, '%Y-%m-%d') if end_date else None

    files_to_process = []

    for filepath in input_folder.glob("*_perrowfeatures_*.parquet"):
        ticker, date_str = parse_filename(filepath.name)

        if ticker is None:
            continue

        # Apply ticker filter
        if tickers_to_process and ticker not in tickers_to_process:
            continue

        # Apply date filter. A filename whose date part cannot be parsed is
        # skipped with a warning instead of aborting the whole scan.
        try:
            file_dt = datetime.strptime(date_str, '%Y-%m-%d')
        except (TypeError, ValueError):
            logger.warning(
                f"Skipping {filepath.name}: unparseable date '{date_str}'"
            )
            continue
        if start_dt and file_dt < start_dt:
            continue
        if end_dt and file_dt > end_dt:
            continue

        files_to_process.append((ticker, date_str, filepath))

    # Sort by date then ticker
    files_to_process.sort(key=lambda item: (item[1], item[0]))

    logger.info(f"Found {len(files_to_process)} files to process")

    if not files_to_process:
        logger.warning("No files to process!")
        return

    # -------------------------------------------------------------------------
    # PROCESS FILES
    # -------------------------------------------------------------------------

    logger.info("-" * 70)
    logger.info("PROCESSING FILES")
    logger.info("-" * 70)

    success_count = 0
    error_count = 0
    total_files = len(files_to_process)

    for position, (ticker, date_str, filepath) in enumerate(files_to_process, start=1):
        logger.info(f"\n[{position}/{total_files}] {ticker} - {date_str}")

        success = process_single_file(
            filepath=filepath,
            underlying=ticker,
            trade_date=date_str,
            baseline_folder=baseline_folder,
            output_folder=output_folder
        )

        if success:
            success_count += 1
        else:
            error_count += 1

    # -------------------------------------------------------------------------
    # SUMMARY
    # -------------------------------------------------------------------------

    logger.info("\n" + "=" * 70)
    logger.info("PROCESSING COMPLETE")
    logger.info("=" * 70)
    logger.info(f"  Successful: {success_count}")
    logger.info(f"  Errors: {error_count}")
    logger.info(f"  Total: {total_files}")

    if success_count > 0:
        # Count output files. This counts every matching file in the output
        # folder, including any left over from earlier runs.
        output_files = list(output_folder.glob("*_aggregatedfeatures_*.parquet"))
        logger.info(f"\nOutput files created: {len(output_files)}")

        # Sample column count from first file
        if output_files:
            sample_df = pd.read_parquet(output_files[0])
            logger.info(f"Output columns per file: {len(sample_df.columns)}")


# =============================================================================
# ENTRY POINT
# =============================================================================

# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()