# In [None]:
"""
============================================================================================
PHASE 04A-1: HISTORICAL BASELINE CONSTRUCTION FOR TYPE A DETECTION [LARGE INFORMED TRADE]
============================================================================================

Pipeline Position: 
    Phase 03 (Per-Row Features) → **Phase 04A-1** → Phase 04A-2 (Aggregated Features)

Purpose:
    Builds rolling historical baseline statistics per underlying ticker. These
    baselines are essential for determining what is "unusual" - a 10,000 contract
    day might be quiet for AAPL but 50x normal for a small biotech.

Input:
    - Phase 03 output files: {TICKER}_perrowfeatures_YYYY-MM-DD.parquet
    
Output:
    - Per-underlying baseline tables: baseline_{TICKER}_daily.parquet
    - One row per trade_date in each file, holding rolling 20-day statistics

Key Features:
    - Strict no-lookahead: baseline for date T uses only the (up to) 20 most
      recent days with data strictly before T
    - Handles missing days gracefully (weekends, holidays, no trading activity)
    - Flags underlyings with insufficient history (baseline_days_count < 5)
    - Computes statistics for volume, notional, trade size, directional bias,
      moneyness distribution, DTE distribution, and concentration metrics

Author: Mariceline Querubin
Created: 2026-02-08
Version: 1.0

Dependencies:
    - pandas >= 1.5.0
    - numpy >= 1.20.0
    - pyarrow >= 10.0.0 (for parquet I/O)

Usage:
    python "scriptname.py"

================================================================================
"""

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Tuple
import warnings
import logging

# Suppress every warning issued through the `warnings` module from here on.
# NOTE(review): this is a blanket filter, so deprecation and dtype warnings
# from libraries are hidden as well — consider narrowing it to specific
# categories if a warning ever needs to be investigated.
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================

# Run-time settings. main() reads the folders, window sizes, date range and
# ticker filter; compute_daily_statistics() reads the OTM / DTE thresholds.
CONFIG = {
    # Input folder scanned for {TICKER}_perrowfeatures_YYYY-MM-DD.parquet files
    "input_folder": Path(r"D:\cyclelabs_codes\CL_20251120_siphontrades\01_FIXINGRAWDATA\output_2_perrowfeateng"),
    
    # Output folder for the per-underlying baseline tables (created if missing)
    "output_folder": Path(r"D:\cyclelabs_codes\CL_20251120_siphontrades\01_FIXINGRAWDATA\output_4a1_baseline"),
    
    # Lookback window for baseline calculation: the number of most recent
    # prior days (with data for the underlying) that feed each baseline row
    "lookback_days": 20,
    
    # Minimum prior days required for baseline_is_sufficient to be 1
    "min_baseline_days": 5,
    
    # Date range to process (inclusive), as YYYY-MM-DD strings
    # Set either bound to None to leave that side of the range open
    "start_date": "2024-02-01",
    "end_date": "2025-12-31",
    
    # Tickers to process (None or [] for all tickers)
    "tickers_to_process": None,  # e.g., ["AAPL", "TSLA", "CIFR"]
    
    # OTM thresholds, compared with a strict ">" against otm_percentage
    "otm_threshold": 5.0,        # > 5% = OTM
    "deep_otm_threshold": 20.0,  # > 20% = Deep OTM
    
    # DTE thresholds, compared with "<=" against days_to_expiry
    "short_dte_threshold": 14,   # <= 14 days = short-dated
    "medium_dte_threshold": 45,  # <= 45 days = medium-dated
    
    # Logging level passed to logging.basicConfig
    "log_level": logging.INFO,
}

# =============================================================================
# LOGGING SETUP
# =============================================================================

# Configure logging once at import time: the level comes from CONFIG and each
# record is rendered as "timestamp | LEVEL | message".
logging.basicConfig(
    level=CONFIG["log_level"],
    format='%(asctime)s | %(levelname)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger shared by every function in this file.
logger = logging.getLogger(__name__)

# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def parse_filename(filename: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Parse ticker and date from a per-row-features parquet filename.
    
    Expected format: {TICKER}_perrowfeatures_YYYY-MM-DD.parquet
    
    Args:
        filename: Name of the parquet file (without any directory part)
        
    Returns:
        Tuple of (ticker, date_str) or (None, None) if parsing fails.
        On success the ticker is non-empty and date_str is always the
        zero-padded ISO form (YYYY-MM-DD), so dates can safely be sorted and
        compared as plain strings by the rest of the pipeline.
    """
    extension = '.parquet'
    marker = '_perrowfeatures_'
    
    try:
        # Strip only a trailing extension; a '.parquet' elsewhere in the name
        # is part of the name and must not be removed.
        if filename.endswith(extension):
            base = filename[:-len(extension)]
        else:
            base = filename
        
        # The marker must occur exactly once to split cleanly
        parts = base.split(marker)
        if len(parts) != 2:
            return None, None
        
        ticker, raw_date = parts
        
        # A file named "_perrowfeatures_<date>" carries no ticker at all
        if not ticker:
            return None, None
        
        # strptime tolerates non-padded fields such as "2024-1-5"; re-emit the
        # parsed date in canonical form so string ordering matches date order.
        date_str = datetime.strptime(raw_date, '%Y-%m-%d').date().isoformat()
        
        return ticker, date_str
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed date; TypeError/AttributeError: non-str input
        return None, None


def get_trading_days_before(target_date: str, n_days: int, 
                            available_dates: List[str]) -> List[str]:
    """
    Get the N trading days before target_date from available dates.
    
    Args:
        target_date: Target date in YYYY-MM-DD format
        n_days: Number of trading days to look back; zero or a negative
            value yields an empty list
        available_dates: List of available trading dates (YYYY-MM-DD);
            order does not matter and repeated entries are counted once
        
    Returns:
        List of up to n_days distinct trading dates strictly before
        target_date (sorted ascending, oldest first)
        
    Raises:
        ValueError: If target_date, or any available date that has to be
            examined, is not in YYYY-MM-DD format
    """
    date_format = '%Y-%m-%d'
    target_dt = datetime.strptime(target_date, date_format)
    
    # A negative count would otherwise slice from the wrong end of the list
    if n_days <= 0:
        return []
    
    # Parse each distinct date once; the dict also removes duplicates
    parsed = {d: datetime.strptime(d, date_format) for d in available_dates}
    
    # Keep dates strictly before the target, ordered by actual date value
    prior_dates = sorted(
        (d for d, dt in parsed.items() if dt < target_dt),
        key=parsed.__getitem__,
    )
    
    # The last n_days entries are the most recent ones, already oldest first
    return prior_dates[-n_days:]


def safe_divide(numerator: float, denominator: float, 
                default: float = 0.0) -> float:
    """Divide numerator by denominator, falling back to default when the
    denominator is zero or missing (NaN/None)."""
    unusable_denominator = denominator == 0 or pd.isna(denominator)
    return default if unusable_denominator else numerator / denominator


def compute_hhi(values: pd.Series) -> float:
    """
    Herfindahl-Hirschman Index of a set of non-negative quantities.
    
    Each value is converted to its share of the grand total and the squared
    shares are summed. The result is 1/n when all n values are equal and
    1.0 when a single value accounts for everything. A zero grand total
    (including an empty series) yields 0.0.
    """
    grand_total = values.sum()
    
    # No mass at all: shares are undefined, report zero concentration
    if grand_total == 0:
        return 0.0
    
    squared_shares = (values / grand_total) ** 2
    return squared_shares.sum()


# =============================================================================
# DAILY STATISTICS COMPUTATION
# =============================================================================

def compute_daily_statistics(df: pd.DataFrame, trade_date: str) -> Optional[Dict]:
    """
    Compute aggregate statistics for a single underlying on a single day.
    
    Required columns: size, opt_trade_notional_value, option_type_call,
    otm_percentage, days_to_expiry, ticker. Optional columns (their metrics
    fall back to 0 when the column is absent): price, open_interest_now,
    cndn_multilegstrategy, cndn_autoelectronic, cndn_floorexecuted.
    
    Thresholds for the moneyness and DTE buckets are read from the
    module-level CONFIG.
    
    Args:
        df: DataFrame containing all trades for one underlying on one day
            Expected to have Phase 03A columns
        trade_date: The date being processed; stored verbatim under the
            "trade_date" key of the result
        
    Returns:
        Dictionary of daily statistics (keys prefixed with "daily_", plus
        "trade_date"), or None when df has no rows
    """
    stats = {"trade_date": trade_date}
    
    # Handle empty dataframe: callers treat None as "no statistics for this day"
    if len(df) == 0:
        return None
    
    # -------------------------------------------------------------------------
    # VOLUME & NOTIONAL STATISTICS
    # -------------------------------------------------------------------------
    
    # One row is one trade, so the row count is the trade count
    stats["daily_chain_volume"] = df["size"].sum()
    stats["daily_chain_notional"] = df["opt_trade_notional_value"].sum()
    stats["daily_chain_trade_count"] = len(df)
    stats["daily_avg_trade_size"] = df["size"].mean()
    stats["daily_median_trade_size"] = df["size"].median()
    stats["daily_max_trade_size"] = df["size"].max()
    
    # -------------------------------------------------------------------------
    # CALL/PUT DISTRIBUTION
    # -------------------------------------------------------------------------
    
    # option_type_call is treated as a 1/0 flag; rows holding any other value
    # (including NaN) fall into neither bucket
    call_mask = df["option_type_call"] == 1
    put_mask = df["option_type_call"] == 0
    
    call_volume = df.loc[call_mask, "size"].sum()
    put_volume = df.loc[put_mask, "size"].sum()
    call_notional = df.loc[call_mask, "opt_trade_notional_value"].sum()
    put_notional = df.loc[put_mask, "opt_trade_notional_value"].sum()
    
    stats["daily_call_volume"] = call_volume
    stats["daily_put_volume"] = put_volume
    stats["daily_call_volume_share"] = safe_divide(
        call_volume, stats["daily_chain_volume"]
    )
    # NOTE(review): the +1 is always added, not only when the put side is
    # zero, so both ratios are slightly below the true call/put ratio (and the
    # notional ratio adds one currency unit rather than one contract).
    stats["daily_call_put_volume_ratio"] = safe_divide(
        call_volume, put_volume + 1  # Add 1 to avoid division by zero
    )
    stats["daily_call_put_notional_ratio"] = safe_divide(
        call_notional, put_notional + 1
    )
    
    # -------------------------------------------------------------------------
    # MONEYNESS DISTRIBUTION
    # -------------------------------------------------------------------------
    
    # OTM: otm_percentage > threshold (positive = OTM for both calls and puts)
    otm_threshold = CONFIG["otm_threshold"]
    deep_otm_threshold = CONFIG["deep_otm_threshold"]
    
    # NOTE(review): these buckets are not a partition. Deep OTM is a subset of
    # OTM, and any row with otm_percentage in [-otm_threshold, 0) is counted
    # as both ITM and ATM, so the volumes/shares below do not sum to the chain
    # total — confirm this is intended.
    otm_mask = df["otm_percentage"] > otm_threshold
    deep_otm_mask = df["otm_percentage"] > deep_otm_threshold
    itm_mask = df["otm_percentage"] < 0
    atm_mask = (df["otm_percentage"] >= -otm_threshold) & \
               (df["otm_percentage"] <= otm_threshold)
    
    otm_volume = df.loc[otm_mask, "size"].sum()
    deep_otm_volume = df.loc[deep_otm_mask, "size"].sum()
    itm_volume = df.loc[itm_mask, "size"].sum()
    atm_volume = df.loc[atm_mask, "size"].sum()
    
    stats["daily_otm_volume"] = otm_volume
    stats["daily_deep_otm_volume"] = deep_otm_volume
    stats["daily_itm_volume"] = itm_volume
    stats["daily_atm_volume"] = atm_volume
    
    # Shares are reported for OTM, deep OTM and ITM only (no ATM share)
    stats["daily_otm_volume_share"] = safe_divide(
        otm_volume, stats["daily_chain_volume"]
    )
    stats["daily_deep_otm_volume_share"] = safe_divide(
        deep_otm_volume, stats["daily_chain_volume"]
    )
    stats["daily_itm_volume_share"] = safe_divide(
        itm_volume, stats["daily_chain_volume"]
    )
    
    # Volume-weighted average OTM percentage
    if stats["daily_chain_volume"] > 0:
        stats["daily_volume_weighted_otm_pct"] = (
            (df["otm_percentage"] * df["size"]).sum() / stats["daily_chain_volume"]
        )
    else:
        stats["daily_volume_weighted_otm_pct"] = 0.0
    
    # -------------------------------------------------------------------------
    # DTE DISTRIBUTION
    # -------------------------------------------------------------------------
    
    short_dte_threshold = CONFIG["short_dte_threshold"]
    medium_dte_threshold = CONFIG["medium_dte_threshold"]
    
    # Three non-overlapping buckets: short (<= short threshold), medium
    # (above short, <= medium threshold) and long (above medium threshold)
    short_dte_mask = df["days_to_expiry"] <= short_dte_threshold
    medium_dte_mask = (df["days_to_expiry"] > short_dte_threshold) & \
                      (df["days_to_expiry"] <= medium_dte_threshold)
    long_dte_mask = df["days_to_expiry"] > medium_dte_threshold
    
    short_dte_volume = df.loc[short_dte_mask, "size"].sum()
    medium_dte_volume = df.loc[medium_dte_mask, "size"].sum()
    long_dte_volume = df.loc[long_dte_mask, "size"].sum()
    
    stats["daily_short_dte_volume"] = short_dte_volume
    stats["daily_medium_dte_volume"] = medium_dte_volume
    stats["daily_long_dte_volume"] = long_dte_volume
    
    stats["daily_short_dte_volume_share"] = safe_divide(
        short_dte_volume, stats["daily_chain_volume"]
    )
    stats["daily_medium_dte_volume_share"] = safe_divide(
        medium_dte_volume, stats["daily_chain_volume"]
    )
    stats["daily_long_dte_volume_share"] = safe_divide(
        long_dte_volume, stats["daily_chain_volume"]
    )
    
    # Volume-weighted average DTE
    if stats["daily_chain_volume"] > 0:
        stats["daily_volume_weighted_dte"] = (
            (df["days_to_expiry"] * df["size"]).sum() / stats["daily_chain_volume"]
        )
    else:
        stats["daily_volume_weighted_dte"] = 0.0
    
    # -------------------------------------------------------------------------
    # CONCENTRATION METRICS
    # -------------------------------------------------------------------------
    
    # Contract-level concentration (what % does the most active contract have?)
    # The "ticker" column is used as the contract key here — presumably the
    # option contract symbol rather than the underlying; verify against Phase 03.
    contract_volumes = df.groupby("ticker")["size"].sum()
    
    if len(contract_volumes) > 0:
        max_contract_volume = contract_volumes.max()
        stats["daily_max_contract_share"] = safe_divide(
            max_contract_volume, stats["daily_chain_volume"]
        )
        stats["daily_contract_hhi"] = compute_hhi(contract_volumes)
        stats["daily_unique_contracts"] = len(contract_volumes)
    else:
        stats["daily_max_contract_share"] = 0.0
        stats["daily_contract_hhi"] = 0.0
        stats["daily_unique_contracts"] = 0
    
    # Trade-level HHI within the chain (each trade's share of total volume)
    stats["daily_trade_hhi"] = compute_hhi(df["size"])
    
    # -------------------------------------------------------------------------
    # PRICE / URGENCY METRICS
    # -------------------------------------------------------------------------
    
    # Compute VWAP premium per contract, then average across chain
    # This requires groupby contract first
    # NOTE(review): the final figure weights (price / contract VWAP - 1) by
    # size. For any contract with non-null prices that sum is algebraically
    # zero (sum(size * price) / VWAP equals sum(size)), so this metric looks
    # like it is ~0 up to floating-point noise for every day — confirm what
    # "premium" was meant to be measured against.
    if "price" in df.columns and len(df) > 0:
        # For each contract, compute VWAP (NaN when the contract's size sums to 0)
        contract_vwap = df.groupby("ticker").apply(
            lambda g: (g["price"] * g["size"]).sum() / g["size"].sum()
            if g["size"].sum() > 0 else np.nan
        )
        
        # Join back and compute premium on a copy so the caller's frame is untouched
        df_temp = df.copy()
        df_temp["contract_vwap"] = df_temp["ticker"].map(contract_vwap)
        df_temp["price_vs_vwap"] = (df_temp["price"] / df_temp["contract_vwap"]) - 1
        
        # Volume-weighted average VWAP premium over rows with a defined premium
        valid_mask = df_temp["price_vs_vwap"].notna()
        if valid_mask.sum() > 0:
            stats["daily_avg_vwap_premium"] = (
                (df_temp.loc[valid_mask, "price_vs_vwap"] * 
                 df_temp.loc[valid_mask, "size"]).sum() / 
                df_temp.loc[valid_mask, "size"].sum()
            )
        else:
            stats["daily_avg_vwap_premium"] = 0.0
    else:
        stats["daily_avg_vwap_premium"] = 0.0
    
    # -------------------------------------------------------------------------
    # OPEN INTEREST / TURNOVER METRICS
    # -------------------------------------------------------------------------
    
    # Get unique contracts with their OI
    if "open_interest_now" in df.columns:
        # One OI value per contract: the first non-null one in row order
        # (assumes OI is constant per contract within the day — TODO confirm)
        contract_oi = df.groupby("ticker")["open_interest_now"].first()
        total_oi = contract_oi.sum()
        
        stats["daily_total_oi"] = total_oi
        stats["daily_volume_to_oi"] = safe_divide(
            stats["daily_chain_volume"], total_oi
        )
    else:
        stats["daily_total_oi"] = 0
        stats["daily_volume_to_oi"] = 0.0
    
    # -------------------------------------------------------------------------
    # EXECUTION TYPE DISTRIBUTION
    # -------------------------------------------------------------------------
    
    # Each cndn_* column is treated as a 1/0 flag; the share is the volume of
    # flagged trades over total chain volume.
    
    # Multi-leg trades
    if "cndn_multilegstrategy" in df.columns:
        multileg_volume = df.loc[df["cndn_multilegstrategy"] == 1, "size"].sum()
        stats["daily_multileg_volume_share"] = safe_divide(
            multileg_volume, stats["daily_chain_volume"]
        )
    else:
        stats["daily_multileg_volume_share"] = 0.0
    
    # Sweep/aggressive electronic
    if "cndn_autoelectronic" in df.columns:
        sweep_volume = df.loc[df["cndn_autoelectronic"] == 1, "size"].sum()
        stats["daily_sweep_volume_share"] = safe_divide(
            sweep_volume, stats["daily_chain_volume"]
        )
    else:
        stats["daily_sweep_volume_share"] = 0.0
    
    # Floor trades
    if "cndn_floorexecuted" in df.columns:
        floor_volume = df.loc[df["cndn_floorexecuted"] == 1, "size"].sum()
        stats["daily_floor_volume_share"] = safe_divide(
            floor_volume, stats["daily_chain_volume"]
        )
    else:
        stats["daily_floor_volume_share"] = 0.0
    
    return stats


# =============================================================================
# BASELINE COMPUTATION
# =============================================================================

# Aggregations available to the baseline metric table below. Each takes the
# window of one daily column and reduces it to a scalar.
_BASELINE_AGGREGATORS = {
    "mean": lambda s: s.mean(),
    "std": lambda s: s.std(),
    "median": lambda s: s.median(),
    "p25": lambda s: s.quantile(0.25),
    "p75": lambda s: s.quantile(0.75),
}

# Single source of truth for the baseline schema:
# (output key, daily statistics column, aggregation name), in output order.
_BASELINE_METRIC_SPECS = (
    # Volume statistics
    ("baseline_chain_volume_mean", "daily_chain_volume", "mean"),
    ("baseline_chain_volume_std", "daily_chain_volume", "std"),
    ("baseline_chain_volume_median", "daily_chain_volume", "median"),
    ("baseline_chain_volume_p25", "daily_chain_volume", "p25"),
    ("baseline_chain_volume_p75", "daily_chain_volume", "p75"),
    # Notional statistics
    ("baseline_chain_notional_mean", "daily_chain_notional", "mean"),
    ("baseline_chain_notional_std", "daily_chain_notional", "std"),
    ("baseline_chain_notional_median", "daily_chain_notional", "median"),
    # Trade count statistics
    ("baseline_chain_trade_count_mean", "daily_chain_trade_count", "mean"),
    ("baseline_chain_trade_count_std", "daily_chain_trade_count", "std"),
    # Trade size statistics
    ("baseline_avg_trade_size_mean", "daily_avg_trade_size", "mean"),
    ("baseline_avg_trade_size_std", "daily_avg_trade_size", "std"),
    ("baseline_max_trade_size_mean", "daily_max_trade_size", "mean"),
    # Call/Put distribution
    ("baseline_call_volume_share_mean", "daily_call_volume_share", "mean"),
    ("baseline_call_volume_share_std", "daily_call_volume_share", "std"),
    ("baseline_call_put_ratio_mean", "daily_call_put_volume_ratio", "mean"),
    ("baseline_call_put_ratio_std", "daily_call_put_volume_ratio", "std"),
    # Moneyness distribution
    ("baseline_otm_volume_share_mean", "daily_otm_volume_share", "mean"),
    ("baseline_otm_volume_share_std", "daily_otm_volume_share", "std"),
    ("baseline_deep_otm_volume_share_mean", "daily_deep_otm_volume_share", "mean"),
    ("baseline_deep_otm_volume_share_std", "daily_deep_otm_volume_share", "std"),
    ("baseline_itm_volume_share_mean", "daily_itm_volume_share", "mean"),
    ("baseline_volume_weighted_otm_mean", "daily_volume_weighted_otm_pct", "mean"),
    # DTE distribution
    ("baseline_short_dte_volume_share_mean", "daily_short_dte_volume_share", "mean"),
    ("baseline_short_dte_volume_share_std", "daily_short_dte_volume_share", "std"),
    ("baseline_medium_dte_volume_share_mean", "daily_medium_dte_volume_share", "mean"),
    ("baseline_long_dte_volume_share_mean", "daily_long_dte_volume_share", "mean"),
    ("baseline_volume_weighted_dte_mean", "daily_volume_weighted_dte", "mean"),
    # Concentration metrics
    ("baseline_max_contract_share_mean", "daily_max_contract_share", "mean"),
    ("baseline_max_contract_share_std", "daily_max_contract_share", "std"),
    ("baseline_contract_hhi_mean", "daily_contract_hhi", "mean"),
    ("baseline_unique_contracts_mean", "daily_unique_contracts", "mean"),
    # Turnover metrics
    ("baseline_volume_to_oi_mean", "daily_volume_to_oi", "mean"),
    ("baseline_volume_to_oi_std", "daily_volume_to_oi", "std"),
    # Price/Urgency metrics
    ("baseline_vwap_premium_mean", "daily_avg_vwap_premium", "mean"),
    ("baseline_vwap_premium_std", "daily_avg_vwap_premium", "std"),
    # Execution type metrics
    ("baseline_multileg_share_mean", "daily_multileg_volume_share", "mean"),
    ("baseline_sweep_share_mean", "daily_sweep_volume_share", "mean"),
    ("baseline_floor_share_mean", "daily_floor_volume_share", "mean"),
)


def compute_baseline_for_date(
    underlying: str,
    target_date: str,
    historical_stats: pd.DataFrame,
    lookback_days: int,
    min_baseline_days: int
) -> Dict:
    """
    Compute rolling baseline statistics for a single underlying on a single date.
    
    Uses only data from dates strictly prior to target_date to avoid lookahead.
    The window is the `lookback_days` most recent such rows, so days without
    data simply do not occupy a slot.
    
    Every returned dictionary carries the same keys in the same order,
    whether or not any history exists. With no prior rows all metrics are
    NaN; with exactly one prior row the *_std metrics are NaN (sample
    standard deviation).
    
    Args:
        underlying: Ticker symbol
        target_date: Date to compute baseline for (YYYY-MM-DD)
        historical_stats: DataFrame of daily statistics (output of
            compute_daily_statistics), one row per trade_date with
            trade_date stored as a YYYY-MM-DD string
        lookback_days: Number of trading days to look back
        min_baseline_days: Minimum days required for valid baseline
        
    Returns:
        Dictionary with underlying, trade_date, baseline_days_count,
        baseline_is_sufficient (0/1) and one entry per baseline metric
        
    Raises:
        ValueError: If target_date is not in YYYY-MM-DD format
    """
    baseline = {
        "underlying": underlying,
        "trade_date": target_date,
    }
    
    # Filter to dates strictly before target_date. The column is parsed in one
    # vectorized call instead of one strptime per row.
    target_dt = datetime.strptime(target_date, '%Y-%m-%d')
    trade_dates = pd.to_datetime(historical_stats["trade_date"], format='%Y-%m-%d')
    hist = historical_stats[trade_dates < target_dt]
    
    # Sort by date descending and take lookback_days (most recent first)
    hist = hist.sort_values("trade_date", ascending=False).head(lookback_days)
    
    # Record how many days we have
    window_size = len(hist)
    baseline["baseline_days_count"] = window_size
    baseline["baseline_is_sufficient"] = int(window_size >= min_baseline_days)
    
    # Emit the full metric set from the shared spec so the no-history row has
    # exactly the same schema as every other row.
    for metric_name, daily_column, aggregation in _BASELINE_METRIC_SPECS:
        if window_size == 0:
            baseline[metric_name] = np.nan
        else:
            baseline[metric_name] = _BASELINE_AGGREGATORS[aggregation](hist[daily_column])
    
    return baseline


# =============================================================================
# MAIN PROCESSING FUNCTIONS
# =============================================================================

def scan_input_files(input_folder: Path, 
                     tickers_to_process: Optional[List[str]] = None,
                     start_date: Optional[str] = None,
                     end_date: Optional[str] = None) -> Dict[str, List[Tuple[str, Path]]]:
    """
    Discover per-row-feature parquet files and group them by ticker.
    
    Only files directly inside input_folder whose names match
    "*_perrowfeatures_*.parquet" and parse cleanly via parse_filename are
    considered.
    
    Args:
        input_folder: Folder to scan
        tickers_to_process: Tickers to keep; None or an empty list keeps all
        start_date: Earliest file date to keep (inclusive, YYYY-MM-DD), or
            None for no lower bound
        end_date: Latest file date to keep (inclusive, YYYY-MM-DD), or None
            for no upper bound
        
    Returns:
        Dictionary mapping ticker -> list of (date, filepath) tuples, each
        list sorted by date string
    """
    date_format = '%Y-%m-%d'
    earliest = datetime.strptime(start_date, date_format) if start_date else None
    latest = datetime.strptime(end_date, date_format) if end_date else None
    
    grouped = {}
    
    for candidate in input_folder.glob("*_perrowfeatures_*.parquet"):
        ticker, date_str = parse_filename(candidate.name)
        
        # Name does not follow the expected pattern
        if ticker is None:
            continue
        
        # Ticker filter is active only when a non-empty collection was given
        if tickers_to_process and ticker not in tickers_to_process:
            continue
        
        # Inclusive date window
        file_dt = datetime.strptime(date_str, date_format)
        before_window = earliest is not None and file_dt < earliest
        after_window = latest is not None and file_dt > latest
        if before_window or after_window:
            continue
        
        grouped.setdefault(ticker, []).append((date_str, candidate))
    
    # Order each ticker's entries chronologically (by date string)
    return {
        ticker: sorted(entries, key=lambda entry: entry[0])
        for ticker, entries in grouped.items()
    }


def process_ticker(ticker: str, 
                   file_list: List[Tuple[str, Path]],
                   lookback_days: int,
                   min_baseline_days: int) -> pd.DataFrame:
    """
    Build the baseline table for one underlying.
    
    First reduces every daily file to one row of daily statistics, then
    derives a rolling baseline row for each date that produced statistics.
    Files that fail to load or process are logged and skipped.
    
    Args:
        ticker: Underlying ticker symbol
        file_list: List of (date, filepath) tuples for this ticker
        lookback_days: Number of trading days for rolling window
        min_baseline_days: Minimum days required for valid baseline
        
    Returns:
        DataFrame of baseline statistics for this ticker (empty when no
        daily statistics could be computed)
    """
    logger.info(f"Processing {ticker}: {len(file_list)} files")
    
    # -------------------------------------------------------------------------
    # STEP 1: One row of daily statistics per input file
    # -------------------------------------------------------------------------
    
    per_day_rows = []
    
    for date_str, filepath in file_list:
        try:
            trades = pd.read_parquet(filepath)
            
            # Keep only this underlying's rows when the column is available
            if "underlying" in trades.columns:
                trades = trades[trades["underlying"] == ticker]
            
            if len(trades) == 0:
                logger.debug(f"  {date_str}: No trades for {ticker}")
                continue
            
            day_stats = compute_daily_statistics(trades, date_str)
            if not day_stats:
                continue
            
            per_day_rows.append(day_stats)
            logger.debug(f"  {date_str}: {day_stats['daily_chain_volume']} contracts, "
                       f"{day_stats['daily_chain_trade_count']} trades")
            
        except Exception as e:
            # Best effort: one bad file must not abort the whole ticker
            logger.warning(f"  Error processing {filepath.name}: {e}")
            continue
    
    if not per_day_rows:
        logger.warning(f"No daily statistics computed for {ticker}")
        return pd.DataFrame()
    
    daily_stats_df = pd.DataFrame(per_day_rows)
    logger.info(f"  Computed daily stats for {len(daily_stats_df)} days")
    
    # -------------------------------------------------------------------------
    # STEP 2: One rolling-baseline row per date with statistics
    # -------------------------------------------------------------------------
    
    baseline_rows = [
        compute_baseline_for_date(
            underlying=ticker,
            target_date=target_date,
            historical_stats=daily_stats_df,
            lookback_days=lookback_days,
            min_baseline_days=min_baseline_days
        )
        for target_date in sorted(daily_stats_df["trade_date"].unique())
    ]
    
    baseline_df = pd.DataFrame(baseline_rows)
    logger.info(f"  Computed baselines for {len(baseline_df)} days")
    
    return baseline_df


def main():
    """
    Run Phase 04A-1 end to end.
    
    Reads every setting from the module-level CONFIG, scans the input folder,
    builds the baseline table for each ticker, writes one
    baseline_{TICKER}_daily.parquet per underlying into the output folder,
    and logs a coverage summary. Returns early (after logging an error) when
    no input files are found or no baselines could be computed.
    """
    
    logger.info("=" * 70)
    # Banner matches this script's phase (it previously named Phase 03B-1)
    logger.info("PHASE 04A-1: HISTORICAL BASELINE CONSTRUCTION")
    logger.info("=" * 70)
    
    # -------------------------------------------------------------------------
    # SETUP
    # -------------------------------------------------------------------------
    
    input_folder = CONFIG["input_folder"]
    output_folder = CONFIG["output_folder"]
    
    # Create output folder if needed
    output_folder.mkdir(parents=True, exist_ok=True)
    
    logger.info(f"Input folder: {input_folder}")
    logger.info(f"Output folder: {output_folder}")
    logger.info(f"Lookback days: {CONFIG['lookback_days']}")
    logger.info(f"Minimum baseline days: {CONFIG['min_baseline_days']}")
    
    # -------------------------------------------------------------------------
    # SCAN INPUT FILES
    # -------------------------------------------------------------------------
    
    logger.info("-" * 70)
    logger.info("SCANNING INPUT FILES")
    logger.info("-" * 70)
    
    files_by_ticker = scan_input_files(
        input_folder=input_folder,
        tickers_to_process=CONFIG["tickers_to_process"],
        start_date=CONFIG["start_date"],
        end_date=CONFIG["end_date"]
    )
    
    if not files_by_ticker:
        logger.error("No input files found!")
        return
    
    input_file_count = sum(len(v) for v in files_by_ticker.values())
    logger.info(f"Found {len(files_by_ticker)} tickers, {input_file_count} total files")
    
    # -------------------------------------------------------------------------
    # PROCESS EACH TICKER
    # -------------------------------------------------------------------------
    
    logger.info("-" * 70)
    logger.info("PROCESSING TICKERS")
    logger.info("-" * 70)
    
    # Only non-empty baseline tables are collected
    all_baselines = []
    ticker_count = len(files_by_ticker)
    
    for position, (ticker, file_list) in enumerate(sorted(files_by_ticker.items()), start=1):
        logger.info(f"\n[{position}/{ticker_count}] Processing {ticker}")
        
        baseline_df = process_ticker(
            ticker=ticker,
            file_list=file_list,
            lookback_days=CONFIG["lookback_days"],
            min_baseline_days=CONFIG["min_baseline_days"]
        )
        
        if len(baseline_df) > 0:
            all_baselines.append(baseline_df)
    
    # -------------------------------------------------------------------------
    # SAVE OUTPUT (Per-Ticker Files)
    # -------------------------------------------------------------------------
    
    if not all_baselines:
        logger.error("No baselines computed!")
        return
    
    logger.info("-" * 70)
    logger.info("SAVING OUTPUT")
    logger.info("-" * 70)
    
    total_rows = 0
    files_saved = 0
    
    for baseline_df in all_baselines:
        # Each table holds a single underlying, so the first row names it
        underlying_ticker = baseline_df["underlying"].iloc[0]
        
        # Sort by date
        baseline_df = baseline_df.sort_values("trade_date").reset_index(drop=True)
        
        # Save to parquet with ticker in filename
        output_filename = f"baseline_{underlying_ticker}_daily.parquet"
        output_path = output_folder / output_filename
        baseline_df.to_parquet(output_path, index=False)
        
        total_rows += len(baseline_df)
        files_saved += 1
        
        logger.info(f"Saved: {output_filename} ({len(baseline_df)} rows)")
    
    logger.info("-" * 70)
    logger.info(f"Total files saved: {files_saved}")
    logger.info(f"Total rows: {total_rows:,}")
    
    # Print summary statistics: per underlying, how many dates were produced,
    # the average window size, and the fraction of dates with enough history
    logger.info("\nBaseline Coverage Summary:")
    final_baseline_df = pd.concat(all_baselines, ignore_index=True)
    coverage = final_baseline_df.groupby("underlying").agg({
        "trade_date": "count",
        "baseline_days_count": "mean",
        "baseline_is_sufficient": "mean"
    }).rename(columns={
        "trade_date": "total_days",
        "baseline_days_count": "avg_baseline_days",
        "baseline_is_sufficient": "pct_sufficient"
    })
    
    # Only the first 20 underlyings are shown to keep the log readable
    logger.info(f"\n{coverage.head(20).to_string()}")
    
    logger.info("\n" + "=" * 70)
    logger.info("PHASE 04A-1 Historical Baseline Construction for the Large Informed Trade -- COMPLETE")
    logger.info("=" * 70)


# =============================================================================
# ENTRY POINT
# =============================================================================

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()