In [None]:
"""
================================================================================
PHASE 04: ANOMALY DETECTION FOR TYPE A UNUSUAL OPTIONS ACTIVITY
================================================================================

Pipeline Position: 
    Phase 03B-2 (Aggregated Features) → **Phase 04** → Manual Review / Trading

Purpose:
    Detects Type A unusual options activity (large informed trades) using 
    per-underlying Isolation Forest models trained on rolling historical data.
    
Architecture:
    - Per-underlying models: Each underlying has its own IF model
    - Rolling training: 30-day lookback, minimum 20 days required
    - Hierarchical detection: Chain filter → Trade scoring → Contract aggregation
    - Percentile-based thresholds: trades at or above CONFIG["anomaly_percentile"]
      of each underlying's scores are flagged (90 => roughly the top 10%)

Input:
    - Phase 03B-2 output files: {TICKER}_aggregatedfeatures_YYYY-MM-DD.parquet
    
Output:
    - uoa_daily_summary_YYYY-MM-DD.csv: One row per underlying that passed filter
    - uoa_flagged_contracts_YYYY-MM-DD.csv: Contracts with flagged trades
    - uoa_flagged_trades_YYYY-MM-DD.parquet: All trades with anomaly scores
    - uoa_anomalies_YYYY-MM-DD.csv: Flat ranked list of all anomalies
    - uoa_run_log_YYYY-MM-DD.json: Run metadata

Author: [Your Name]
Created: 2026-02-XX
Version: 1.0

Dependencies:
    - pandas >= 1.5.0
    - numpy >= 1.20.0
    - scikit-learn >= 1.0.0
    - pyarrow >= 10.0.0

Usage:
    python Phase_04_anomaly_detection.py

Important Notes: 
    - input_folder: must contain data for one ticker only (output of Phase 04A-2)
    - output_folder: must be subdivided into per-ticker subfolders to avoid confusion

================================================================================
"""

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Tuple
import json
import warnings
import logging
from sklearn.ensemble import IsolationForest

# NOTE(review): this suppresses every warning for the whole process, including
# any emitted by pandas / scikit-learn; narrow the filter if warnings matter.
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================

# Single module-level settings dictionary read by the functions below.
CONFIG = {
    # Input folder containing Phase 03B-2 output files
    "input_folder": Path(r"D:\cyclelabs_codes\CL_20251120_siphontrades\01_FIXINGRAWDATA\output_4a2_aggfeateng"),
    
    # Output folder for anomaly detection results
    "output_folder": Path(r"D:\cyclelabs_codes\CL_20251120_siphontrades\01_FIXINGRAWDATA\output_5a_anomaly\CIFR"),
    
    # -------------------------------------------------------------------------
    # DATE CONFIGURATION
    # -------------------------------------------------------------------------
    # Option 1: Single target date
    #   - Set "single_date_mode": True
    #   - Set "target_date": "YYYY-MM-DD"
    #
    # Option 2: Multiple target dates (date range)
    #   - Set "single_date_mode": False
    #   - Set "start_date" and "end_date": "YYYY-MM-DD"
    # -------------------------------------------------------------------------
    
    "single_date_mode": False,  # True = single date, False = date range
    
    # For single date mode
    "target_date": "2024-03-15",
    
    # For multiple dates mode (ignored if single_date_mode is True)
    "start_date": "2025-01-01",
    "end_date": "2025-12-31",
    
    # Rolling window for training (trading days)
    "training_window_days": 30,
    
    # Minimum history required for valid model
    "min_history_days": 20,
    
    # Chain-level filter thresholds
    "chain_volume_zscore_threshold": 2.0,
    "chain_min_volume": 20,
    "chain_min_trades": 2,
    
    # Anomaly threshold (percentile-based)
    # Trades scoring at or above this percentile are flagged, so 90 flags
    # roughly the top 10% (use 95 for the top 5%).
    "anomaly_percentile": 90,
    
    # Contract aggregation
    "contract_top_k": 5,  # Top-K mean for contract scores
    
    # Isolation Forest parameters
    "if_contamination": 0.05,  # Conservative estimate
    "if_n_estimators": 100,
    "if_random_state": 42,
    
    # Features for Isolation Forest
    "if_features": [
        'trade_size_pct_of_chain',
        'trade_size_pct_of_contract',
        'contract_volume_share_of_chain',
        'trade_price_vs_contract_vwap',
        'otm_percentage',
        'days_to_expiry',
        'contract_volume_to_oi',
        'contract_hhi',
    ],
    
    # Logging level
    "log_level": logging.INFO,
}

# =============================================================================
# LOGGING SETUP
# =============================================================================

# Configure logging once at import time using the level from CONFIG; records
# are formatted as "timestamp | LEVEL | message".
logging.basicConfig(
    level=CONFIG["log_level"],
    format='%(asctime)s | %(levelname)s | %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger shared by the functions in this file.
logger = logging.getLogger(__name__)

# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def parse_filename(filename: str) -> Tuple[Optional[str], Optional[str]]:
    """
    Parse ticker and date from Phase 03B-2 output filename.

    Expected format: {TICKER}_aggregatedfeatures_YYYY-MM-DD.parquet

    Args:
        filename: Bare file name (no directory part).

    Returns:
        (ticker, date_str) when the name matches the expected format and the
        date part parses with '%Y-%m-%d'; otherwise (None, None).
    """
    if not isinstance(filename, str):
        return None, None

    # Strip only a trailing ".parquet"; replacing every occurrence would
    # alter tickers and break the ticker -> filename round trip.
    suffix = '.parquet'
    base = filename[:-len(suffix)] if filename.endswith(suffix) else filename

    parts = base.split('_aggregatedfeatures_')
    if len(parts) != 2:
        return None, None
    ticker, date_str = parts

    # Only the date validation is expected to fail, so catch just that.
    try:
        datetime.strptime(date_str, '%Y-%m-%d')
    except ValueError:
        return None, None

    return ticker, date_str


def get_available_dates_for_underlying(
    input_folder: Path, 
    underlying: str
) -> List[str]:
    """
    List every date for which the given underlying has a feature file.

    Globs input_folder for "{underlying}_aggregatedfeatures_*.parquet", keeps
    the names that parse_filename accepts, and returns their date strings in
    ascending (string-sorted) order.
    """
    candidates = input_folder.glob(f"{underlying}_aggregatedfeatures_*.parquet")
    parsed_dates = (parse_filename(candidate.name)[1] for candidate in candidates)
    return sorted(found for found in parsed_dates if found)


def get_trading_days_before(
    target_date: str, 
    available_dates: List[str], 
    n_days: int
) -> List[str]:
    """
    Return the most recent n_days dates that fall strictly before target_date.

    Dates are 'YYYY-MM-DD' strings; the result is sorted ascending and may be
    shorter than n_days when fewer earlier dates exist.
    """
    cutoff = datetime.strptime(target_date, '%Y-%m-%d')

    earlier = []
    for day in available_dates:
        if datetime.strptime(day, '%Y-%m-%d') < cutoff:
            earlier.append(day)

    # Newest first so the slice keeps the dates closest to the target.
    earlier.sort(reverse=True)
    window = earlier[:n_days]
    window.sort()
    return window


def load_underlying_data(
    input_folder: Path, 
    underlying: str, 
    date_str: str
) -> Optional[pd.DataFrame]:
    """
    Read the feature parquet for one underlying on one date.

    Returns the loaded DataFrame, or None when the file does not exist or
    cannot be read (a read failure is logged as a warning).
    """
    filepath = input_folder / f"{underlying}_aggregatedfeatures_{date_str}.parquet"

    if filepath.exists():
        try:
            loaded = pd.read_parquet(filepath)
        except Exception as e:
            # Best effort: one unreadable file should not abort the run.
            logger.warning(f"Error loading {filepath}: {e}")
            loaded = None
        return loaded

    return None


# =============================================================================
# CHAIN-LEVEL FILTER
# =============================================================================

def check_chain_filter(df: pd.DataFrame) -> Tuple[bool, str]:
    """
    Decide whether an underlying's option chain is unusual enough to score.

    Chain-level statistics are read from the first row of df. Checks run in a
    fixed order and the first failure determines the reason string.

    Returns:
        Tuple of (passes: bool, reason: str); reason is "passed" on success.
    """
    if not len(df):
        return False, "no_trades"

    def first_value(column):
        return df[column].iloc[0]

    # Read all four statistics up front so a missing column fails immediately.
    zscore = first_value('chain_volume_zscore')
    volume = first_value('chain_total_volume')
    trade_count = first_value('chain_trade_count')
    baseline_flag = first_value('baseline_is_sufficient')

    if pd.isna(zscore):
        return False, "missing_zscore"

    if baseline_flag != 1:
        return False, "insufficient_baseline"

    if zscore < CONFIG["chain_volume_zscore_threshold"]:
        verdict = (False, f"volume_zscore_{zscore:.2f}_below_threshold")
    elif volume < CONFIG["chain_min_volume"]:
        verdict = (False, f"volume_{volume}_below_minimum")
    elif trade_count < CONFIG["chain_min_trades"]:
        verdict = (False, f"trade_count_{trade_count}_below_minimum")
    else:
        verdict = (True, "passed")

    return verdict


# =============================================================================
# ISOLATION FOREST TRAINING AND SCORING
# =============================================================================

def prepare_features(df: pd.DataFrame, features: List[str]) -> pd.DataFrame:
    """
    Build the numeric feature matrix handed to the Isolation Forest.

    Selects the requested columns, turns +/-inf into NaN, then fills NaN in
    each column with that column's median (or 0 when the median itself is
    NaN, i.e. the column has no usable values). The input frame is not modified.
    """
    matrix = df[features].replace([np.inf, -np.inf], np.nan)

    for name in matrix.columns:
        fill_value = matrix[name].median()
        matrix[name] = matrix[name].fillna(0 if pd.isna(fill_value) else fill_value)

    return matrix


def train_isolation_forest(
    training_data: pd.DataFrame,
    features: List[str]
) -> Optional[IsolationForest]:
    """
    Fit an Isolation Forest on historical trades.

    Args:
        training_data: Historical trades used for fitting
        features: Names of the feature columns to train on

    Returns:
        The fitted IsolationForest, or None when training_data has fewer
        than 50 rows
    """
    # Too few samples for a meaningful model.
    if len(training_data) < 50:
        return None

    feature_matrix = prepare_features(training_data, features)

    # Hyper-parameters come from the module-level CONFIG.
    forest = IsolationForest(
        contamination=CONFIG["if_contamination"],
        n_estimators=CONFIG["if_n_estimators"],
        random_state=CONFIG["if_random_state"],
        n_jobs=-1
    )
    forest.fit(feature_matrix)

    return forest


def score_trades(
    model: IsolationForest,
    today_data: pd.DataFrame,
    features: List[str]
) -> pd.DataFrame:
    """
    Score today's trades with a fitted model.

    Works on a copy of today_data and adds two columns:
      - anomaly_score_raw: the negated output of model.score_samples, so a
        larger value means more anomalous
      - anomaly_score: anomaly_score_raw min-max scaled to [0, 1] within this
        batch (0.5 for every row when all raw scores are equal)
    """
    scored = today_data.copy()

    feature_matrix = prepare_features(scored, features)

    # score_samples gives lower values to more abnormal samples; flip the sign
    # so that higher = more anomalous.
    scored['anomaly_score_raw'] = -model.score_samples(feature_matrix)

    lowest = scored['anomaly_score_raw'].min()
    highest = scored['anomaly_score_raw'].max()

    if highest > lowest:
        scored['anomaly_score'] = (scored['anomaly_score_raw'] - lowest) / (highest - lowest)
    else:
        # Degenerate batch: no spread to scale by.
        scored['anomaly_score'] = 0.5

    return scored


def flag_anomalies_percentile(
    df: pd.DataFrame,
    percentile: float = 95
) -> pd.DataFrame:
    """
    Mark trades whose anomaly_score is at or above the given percentile.

    Adds 'is_anomaly' (1/0) and 'anomaly_threshold' (the cutoff value, same on
    every row) to df in place and returns the same DataFrame object.
    """
    scores = df['anomaly_score']
    cutoff = np.percentile(scores, percentile)

    df['is_anomaly'] = scores.ge(cutoff).astype(int)
    df['anomaly_threshold'] = cutoff

    return df


# =============================================================================
# CONTRACT-LEVEL AGGREGATION
# =============================================================================

def aggregate_to_contract_level(df: pd.DataFrame, top_k: int = 3) -> pd.DataFrame:
    """
    Aggregate trade-level anomaly scores to contract level.

    Groups the scored trades by 'ticker' (the option contract) and produces
    one row per contract, sorted by contract_anomaly_score descending.

    Args:
        df: Scored trades; must contain 'ticker', 'underlying',
            'anomaly_score', 'is_anomaly', 'size',
            'opt_trade_notional_value', 'contract_volume_share_of_chain',
            'otm_percentage', 'days_to_expiry', 'option_type_call' and
            'strike_price'. The frame is not modified.
        top_k: Number of highest trade scores averaged into
            contract_anomaly_score (top-K mean).

    Returns:
        DataFrame with 'ticker' plus the aggregated columns.
    """
    def top_k_mean(scores, k=top_k):
        top_scores = sorted(scores, reverse=True)[:k]
        return np.mean(top_scores) if top_scores else 0

    # Zero out size/notional on unflagged rows so a plain group sum yields the
    # flagged totals. This avoids per-group df.loc[x.index, ...] lookups,
    # which break when the index contains duplicate labels.
    flagged_mask = df['is_anomaly'] == 1
    work = df.assign(
        _flagged_size=df['size'].where(flagged_mask, 0),
        _flagged_notional=df['opt_trade_notional_value'].where(flagged_mask, 0),
    )

    contract_agg = work.groupby('ticker').agg(
        underlying=('underlying', 'first'),
        contract_anomaly_score=('anomaly_score', lambda x: top_k_mean(x)),
        contract_max_score=('anomaly_score', 'max'),
        contract_mean_score=('anomaly_score', 'mean'),
        num_trades=('size', 'count'),
        num_flagged_trades=('is_anomaly', 'sum'),
        total_volume=('size', 'sum'),
        flagged_volume=('_flagged_size', 'sum'),
        total_notional=('opt_trade_notional_value', 'sum'),
        flagged_notional=('_flagged_notional', 'sum'),
        contract_volume_share=('contract_volume_share_of_chain', 'first'),
        otm_percentage=('otm_percentage', 'first'),
        days_to_expiry=('days_to_expiry', 'first'),
        option_type=('option_type_call', lambda x: 'CALL' if x.iloc[0] == 1 else 'PUT'),
        strike_price=('strike_price', 'first'),
    ).reset_index()

    # Sort by contract anomaly score descending
    contract_agg = contract_agg.sort_values('contract_anomaly_score', ascending=False)

    return contract_agg


# =============================================================================
# MAIN PROCESSING FOR ONE UNDERLYING
# =============================================================================

def process_underlying(
    underlying: str,
    target_date: str,
    input_folder: Path,
    features: List[str]
) -> Dict:
    """
    Process a single underlying through the full anomaly detection pipeline.

    Steps: load the target-date file, apply the chain-level filter, pick up to
    CONFIG["training_window_days"] earlier dates, build a training set from
    rows with is_high_quality_signal == 1, train an Isolation Forest, score
    and flag today's high-quality trades, then summarise per contract and for
    the whole chain.

    Args:
        underlying: Ticker of the underlying to process.
        target_date: Date to score, 'YYYY-MM-DD'.
        input_folder: Folder holding the aggregated-feature parquet files.
        features: Feature column names passed to the Isolation Forest.

    Returns:
        Dictionary with keys 'underlying', 'target_date', 'status', 'reason',
        'today_data', 'flagged_trades', 'contract_summary', 'chain_summary'.
        'status' is 'processed', 'filtered' or 'skipped'; the data entries
        stay None unless status is 'processed'.
    """
    result = {
        'underlying': underlying,
        'target_date': target_date,
        'status': None,
        'reason': None,
        'today_data': None,
        'flagged_trades': None,
        'contract_summary': None,
        'chain_summary': None,
    }

    def _stop(status: str, reason: str) -> Dict:
        """Record an early-exit status/reason and hand back the result dict."""
        result['status'] = status
        result['reason'] = reason
        return result

    # -------------------------------------------------------------------------
    # Step 1: Load today's data
    # -------------------------------------------------------------------------

    today_data = load_underlying_data(input_folder, underlying, target_date)

    if today_data is None or len(today_data) == 0:
        return _stop('skipped', 'no_data_for_target_date')

    # -------------------------------------------------------------------------
    # Step 2: Check chain-level filter
    # -------------------------------------------------------------------------

    passes_filter, filter_reason = check_chain_filter(today_data)

    if not passes_filter:
        return _stop('filtered', filter_reason)

    # -------------------------------------------------------------------------
    # Step 3: Get historical dates for training
    # -------------------------------------------------------------------------

    available_dates = get_available_dates_for_underlying(input_folder, underlying)
    historical_dates = get_trading_days_before(
        target_date,
        available_dates,
        CONFIG["training_window_days"]
    )

    if len(historical_dates) < CONFIG["min_history_days"]:
        return _stop('skipped', f'insufficient_history_{len(historical_dates)}_days')

    # -------------------------------------------------------------------------
    # Step 4: Load historical data for training
    # -------------------------------------------------------------------------

    historical_dfs = []
    for hist_date in historical_dates:
        hist_df = load_underlying_data(input_folder, underlying, hist_date)
        if hist_df is None:
            continue
        # Filter to high-quality signals only for training
        hist_df_filtered = hist_df[hist_df['is_high_quality_signal'] == 1].copy()
        if len(hist_df_filtered) > 0:
            historical_dfs.append(hist_df_filtered)

    if not historical_dfs:
        return _stop('skipped', 'no_valid_historical_data')

    training_data = pd.concat(historical_dfs, ignore_index=True)

    if len(training_data) < 50:
        return _stop('skipped', f'insufficient_training_samples_{len(training_data)}')

    logger.debug(f"  Training data: {len(training_data)} trades from {len(historical_dfs)} days")

    # -------------------------------------------------------------------------
    # Step 5: Select today's high-quality trades
    # -------------------------------------------------------------------------
    # Checked before training so no model is fitted only to be discarded.
    # The reported reason is unchanged because training cannot return None
    # once the 50-sample check above has passed.

    today_hq = today_data[today_data['is_high_quality_signal'] == 1].copy()

    if len(today_hq) == 0:
        return _stop('skipped', 'no_high_quality_trades_today')

    # -------------------------------------------------------------------------
    # Step 6: Train Isolation Forest and score today's trades
    # -------------------------------------------------------------------------

    model = train_isolation_forest(training_data, features)

    if model is None:
        return _stop('skipped', 'model_training_failed')

    scored_data = score_trades(model, today_hq, features)

    # -------------------------------------------------------------------------
    # Step 7: Flag anomalies (at/above CONFIG["anomaly_percentile"])
    # -------------------------------------------------------------------------

    flagged_data = flag_anomalies_percentile(
        scored_data,
        percentile=CONFIG["anomaly_percentile"]
    )

    # -------------------------------------------------------------------------
    # Step 8: Aggregate to contract level
    # -------------------------------------------------------------------------

    contract_summary = aggregate_to_contract_level(
        flagged_data,
        top_k=CONFIG["contract_top_k"]
    )

    # -------------------------------------------------------------------------
    # Step 9: Build chain summary
    # -------------------------------------------------------------------------

    def to_native(val):
        """Convert a numpy scalar to a native Python value (missing -> None)."""
        # Test for missing values first: a numpy NaN is also an np.floating
        # instance and would otherwise never reach the None branch.
        if pd.isna(val):
            return None
        if isinstance(val, np.integer):
            return int(val)
        if isinstance(val, np.floating):
            return float(val)
        return val

    def chain_stat(column):
        """Chain-level statistics are read from the first row of today's data."""
        return today_data[column].iloc[0]

    call_share = chain_stat('call_volume_share')
    if call_share > 0.6:
        direction = 'BULLISH'
    elif call_share < 0.4:
        direction = 'BEARISH'
    else:
        direction = 'NEUTRAL'

    flagged_rows = flagged_data[flagged_data['is_anomaly'] == 1]
    has_contracts = len(contract_summary) > 0

    chain_summary = {
        'underlying': underlying,
        'target_date': target_date,
        'chain_volume_zscore': to_native(chain_stat('chain_volume_zscore')),
        'chain_total_volume': to_native(chain_stat('chain_total_volume')),
        'chain_total_notional': to_native(chain_stat('chain_total_notional')),
        'chain_trade_count': to_native(chain_stat('chain_trade_count')),
        'call_put_volume_ratio': to_native(chain_stat('call_put_volume_ratio')),
        'call_volume_share': to_native(call_share),
        'deep_otm_volume_share': to_native(chain_stat('deep_otm_volume_share')),
        'short_dte_volume_share': to_native(chain_stat('short_dte_volume_share')),
        'direction': direction,
        'num_contracts_traded': len(contract_summary),
        'num_contracts_flagged': len(contract_summary[contract_summary['num_flagged_trades'] > 0]),
        'num_trades_scored': len(flagged_data),
        'num_trades_flagged': int(flagged_data['is_anomaly'].sum()),
        'flagged_volume': int(flagged_rows['size'].sum()),
        'flagged_notional': float(flagged_rows['opt_trade_notional_value'].sum()),
        'top_contract': contract_summary.iloc[0]['ticker'] if has_contracts else None,
        'top_contract_score': to_native(contract_summary.iloc[0]['contract_anomaly_score']) if has_contracts else None,
        'anomaly_threshold': to_native(flagged_data['anomaly_threshold'].iloc[0]),
        'training_days': len(historical_dfs),
        'training_samples': len(training_data),
    }

    # -------------------------------------------------------------------------
    # Return results
    # -------------------------------------------------------------------------

    result['status'] = 'processed'
    result['reason'] = 'success'
    result['today_data'] = flagged_data
    result['flagged_trades'] = flagged_rows.copy()
    result['contract_summary'] = contract_summary
    result['chain_summary'] = chain_summary

    return result


# =============================================================================
# OUTPUT GENERATION
# =============================================================================

def generate_anomaly_list(all_results: List[Dict]) -> pd.DataFrame:
    """
    Build one ranked table of every flagged trade across all underlyings.

    Keeps all columns of the flagged-trade frames, orders rows by
    anomaly_score (highest first) and prepends a 1-based 'rank' column.
    Returns an empty DataFrame when nothing was flagged.
    """
    frames = [
        res['flagged_trades'].copy()
        for res in all_results
        if res['status'] == 'processed'
        and res['flagged_trades'] is not None
        and len(res['flagged_trades']) > 0
    ]

    if not frames:
        return pd.DataFrame()

    ranked = (
        pd.concat(frames, ignore_index=True)
        .sort_values('anomaly_score', ascending=False)
        .reset_index(drop=True)
    )
    ranked.insert(0, 'rank', range(1, len(ranked) + 1))

    return ranked


def generate_daily_summary(all_results: List[Dict]) -> pd.DataFrame:
    """
    Build the per-underlying summary table for one run.

    Takes the chain_summary of every result with status 'processed', orders
    the rows by chain_volume_zscore (largest first) and prepends a 1-based
    'rank' column. Returns an empty DataFrame when nothing was processed.
    """
    rows = [
        res['chain_summary']
        for res in all_results
        if res['status'] == 'processed'
    ]

    if not rows:
        return pd.DataFrame()

    table = pd.DataFrame(rows)
    table = table.sort_values('chain_volume_zscore', ascending=False)
    table = table.reset_index(drop=True)
    table.insert(0, 'rank', range(1, len(table) + 1))

    return table


def generate_flagged_contracts(all_results: List[Dict]) -> pd.DataFrame:
    """
    Build the ranked table of contracts that contain flagged trades.

    For every processed result, keeps the contract rows whose
    num_flagged_trades is positive, projects them onto a fixed set of output
    columns ('ticker' is emitted as 'contract'), sorts by
    contract_anomaly_score descending and prepends a 1-based 'rank' column.
    Returns an empty DataFrame when no contract qualifies.
    """
    # (output column, source column) in the order they appear in the output.
    field_map = (
        ('underlying', 'underlying'),
        ('contract', 'ticker'),
        ('option_type', 'option_type'),
        ('strike_price', 'strike_price'),
        ('days_to_expiry', 'days_to_expiry'),
        ('otm_percentage', 'otm_percentage'),
        ('contract_anomaly_score', 'contract_anomaly_score'),
        ('contract_max_score', 'contract_max_score'),
        ('contract_volume_share', 'contract_volume_share'),
        ('num_trades', 'num_trades'),
        ('num_flagged_trades', 'num_flagged_trades'),
        ('total_volume', 'total_volume'),
        ('flagged_volume', 'flagged_volume'),
        ('total_notional', 'total_notional'),
        ('flagged_notional', 'flagged_notional'),
    )

    rows = []
    for res in all_results:
        if res['status'] != 'processed':
            continue

        summary = res['contract_summary']
        if summary is None:
            continue

        with_flags = summary[summary['num_flagged_trades'] > 0].copy()
        rows.extend(
            {out_key: row[src_key] for out_key, src_key in field_map}
            for _, row in with_flags.iterrows()
        )

    if not rows:
        return pd.DataFrame()

    table = pd.DataFrame(rows)
    table = table.sort_values('contract_anomaly_score', ascending=False).reset_index(drop=True)
    table.insert(0, 'rank', range(1, len(table) + 1))

    return table


def generate_all_flagged_trades(all_results: List[Dict]) -> pd.DataFrame:
    """
    Stack the scored trades ('today_data') of every processed underlying.

    Rows are ordered by anomaly_score, highest first, with a fresh 0-based
    index. Returns an empty DataFrame when there is nothing to combine.
    """
    frames = [
        res['today_data']
        for res in all_results
        if res['status'] == 'processed'
        and res['today_data'] is not None
        and len(res['today_data']) > 0
    ]

    if not frames:
        return pd.DataFrame()

    stacked = pd.concat(frames, ignore_index=True)
    return stacked.sort_values('anomaly_score', ascending=False).reset_index(drop=True)


def generate_run_log(
    target_date: str,
    all_results: List[Dict],
    processing_time: float
) -> Dict:
    """
    Assemble the run-metadata dictionary for one target date.

    Contains a timestamp, the relevant CONFIG values, counts of
    processed / filtered / skipped underlyings, a "{status}_{reason}" tally
    and the flagged trade / volume / notional totals over processed results.
    numpy scalars and arrays are converted to native Python values so the
    result can be JSON-serialised.
    """
    def _to_plain(obj):
        # Recursively replace numpy values with native Python equivalents.
        if isinstance(obj, dict):
            return {key: _to_plain(value) for key, value in obj.items()}
        if isinstance(obj, list):
            return [_to_plain(item) for item in obj]
        if isinstance(obj, (np.integer,)):
            return int(obj)
        if isinstance(obj, (np.floating,)):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return obj

    # Tally of "<status>_<reason>" combinations, in first-seen order.
    tally = {}
    for entry in all_results:
        label = f"{entry['status']}_{entry['reason']}"
        tally.setdefault(label, 0)
        tally[label] += 1

    processed = [entry for entry in all_results if entry['status'] == 'processed']

    flagged_trades = 0
    flagged_volume = 0
    flagged_notional = 0
    for entry in processed:
        summary = entry['chain_summary']
        flagged_trades += summary['num_trades_flagged']
        flagged_volume += summary['flagged_volume']
        flagged_notional += summary['flagged_notional']

    filtered_count = sum(1 for entry in all_results if entry['status'] == 'filtered')
    skipped_count = sum(1 for entry in all_results if entry['status'] == 'skipped')

    run_log = {
        'run_timestamp': datetime.now().isoformat(),
        'target_date': target_date,
        'config': {
            'training_window_days': CONFIG['training_window_days'],
            'min_history_days': CONFIG['min_history_days'],
            'chain_volume_zscore_threshold': CONFIG['chain_volume_zscore_threshold'],
            'anomaly_percentile': CONFIG['anomaly_percentile'],
            'if_features': CONFIG['if_features'],
        },
        'underlyings_total': len(all_results),
        'underlyings_processed': len(processed),
        'underlyings_filtered': filtered_count,
        'underlyings_skipped': skipped_count,
        'status_breakdown': tally,
        'total_flagged_trades': int(flagged_trades),
        'total_flagged_volume': int(flagged_volume),
        'total_flagged_notional': float(flagged_notional),
        'processing_time_seconds': round(processing_time, 2),
    }

    return _to_plain(run_log)


# =============================================================================
# MAIN EXECUTION
# =============================================================================

def get_all_underlyings_for_date(input_folder: Path, target_date: str) -> List[str]:
    """
    List the underlyings that have a feature file for target_date.

    Globs input_folder for "*_aggregatedfeatures_{target_date}.parquet" and
    returns the tickers parse_filename extracts, sorted ascending.
    """
    matches = input_folder.glob(f"*_aggregatedfeatures_{target_date}.parquet")
    tickers = (parse_filename(match.name)[0] for match in matches)
    return sorted(symbol for symbol in tickers if symbol)


def get_all_available_dates(input_folder: Path) -> List[str]:
    """
    Collect the distinct dates covered by the feature files in input_folder.

    Every "*_aggregatedfeatures_*.parquet" name is run through
    parse_filename; the unique date strings are returned sorted ascending.
    """
    parsed = (
        parse_filename(candidate.name)[1]
        for candidate in input_folder.glob("*_aggregatedfeatures_*.parquet")
    )
    unique_dates = {found for found in parsed if found}
    return sorted(unique_dates)


def get_dates_in_range(
    input_folder: Path, 
    start_date: str, 
    end_date: str
) -> List[str]:
    """
    Return the available dates that fall between start_date and end_date.

    Both bounds are 'YYYY-MM-DD' strings and are inclusive; the result keeps
    the ascending order produced by get_all_available_dates.
    """
    candidates = get_all_available_dates(input_folder)

    lower = datetime.strptime(start_date, '%Y-%m-%d')
    upper = datetime.strptime(end_date, '%Y-%m-%d')

    in_range = []
    for day in candidates:
        parsed = datetime.strptime(day, '%Y-%m-%d')
        if lower <= parsed <= upper:
            in_range.append(day)

    return in_range


def process_single_date(target_date: str, input_folder: Path, output_folder: Path) -> Dict:
    """
    Run the complete anomaly detection pipeline for one target date.

    Each underlying that has a file for ``target_date`` is passed to
    ``process_underlying``. The collected results are then turned into the
    anomaly list, daily summary, flagged contracts, scored trades and run log,
    which are written to ``output_folder``; a summary is logged at the end.
    Tabular outputs are only written when they contain at least one row, the
    run log is always written.

    Returns:
        Dictionary with run statistics. If no input exists for the date, the
        dictionary carries status 'skipped' with reason 'no_data'.
    """
    started_at = datetime.now()
    heavy_rule = "=" * 70
    light_rule = "-" * 70

    logger.info(heavy_rule)
    logger.info(f"PROCESSING DATE: {target_date}")
    logger.info(heavy_rule)

    # ---- Discover the underlyings available for this date -------------------
    underlyings = get_all_underlyings_for_date(input_folder, target_date)

    if not underlyings:
        logger.warning(f"No data found for date {target_date}")
        return {
            'target_date': target_date,
            'status': 'skipped',
            'reason': 'no_data',
            'underlyings_total': 0,
        }

    underlying_total = len(underlyings)
    logger.info(f"Found {underlying_total} underlyings with data for {target_date}")

    # ---- Score every underlying ---------------------------------------------
    logger.info(light_rule)
    logger.info("PROCESSING UNDERLYINGS")
    logger.info(light_rule)

    all_results = []

    for position, underlying in enumerate(underlyings, start=1):
        logger.info(f"[{position}/{underlying_total}] Processing {underlying}...")

        outcome = process_underlying(
            underlying=underlying,
            target_date=target_date,
            input_folder=input_folder,
            features=CONFIG["if_features"]
        )
        all_results.append(outcome)

        if outcome['status'] != 'processed':
            logger.info(f"  ✗ {outcome['status'].upper()}: {outcome['reason']}")
            continue

        chain = outcome['chain_summary']
        logger.info(f"  ✓ Processed: {chain['num_trades_flagged']} flagged trades, "
                   f"z-score={chain['chain_volume_zscore']:.2f}, "
                   f"direction={chain['direction']}")

    # ---- Write the output files ---------------------------------------------
    logger.info(light_rule)
    logger.info("GENERATING OUTPUTS")
    logger.info(light_rule)

    # Flat ranked anomaly list
    anomaly_df = generate_anomaly_list(all_results)
    anomaly_path = output_folder / f"uoa_anomalies_{target_date}.csv"
    if len(anomaly_df) > 0:
        anomaly_df.to_csv(anomaly_path, index=False)
        logger.info(f"Saved anomaly list: {anomaly_path.name} ({len(anomaly_df)} anomalies)")
    else:
        logger.info("No anomalies detected")

    # Per-underlying daily summary
    summary_df = generate_daily_summary(all_results)
    summary_path = output_folder / f"uoa_daily_summary_{target_date}.csv"
    if len(summary_df) > 0:
        summary_df.to_csv(summary_path, index=False)
        logger.info(f"Saved daily summary: {summary_path.name} ({len(summary_df)} underlyings)")

    # Contracts holding flagged trades
    contracts_df = generate_flagged_contracts(all_results)
    contracts_path = output_folder / f"uoa_flagged_contracts_{target_date}.csv"
    if len(contracts_df) > 0:
        contracts_df.to_csv(contracts_path, index=False)
        logger.info(f"Saved flagged contracts: {contracts_path.name} ({len(contracts_df)} contracts)")

    # Every scored trade
    trades_df = generate_all_flagged_trades(all_results)
    trades_path = output_folder / f"uoa_flagged_trades_{target_date}.parquet"
    if len(trades_df) > 0:
        trades_df.to_parquet(trades_path, index=False)
        logger.info(f"Saved all scored trades: {trades_path.name} ({len(trades_df)} trades)")

    # Run metadata (elapsed time is measured up to this point)
    processing_time = (datetime.now() - started_at).total_seconds()
    run_log = generate_run_log(target_date, all_results, processing_time)
    log_path = output_folder / f"uoa_run_log_{target_date}.json"
    with open(log_path, 'w') as f:
        json.dump(run_log, f, indent=2)
    logger.info(f"Saved run log: {log_path.name}")

    # ---- Log the summary ----------------------------------------------------
    logger.info(light_rule)
    logger.info(f"SUMMARY FOR {target_date}")
    logger.info(light_rule)

    def count_with_status(wanted: str) -> int:
        # Number of per-underlying results that ended in the given status.
        return sum(1 for entry in all_results if entry['status'] == wanted)

    processed_count = count_with_status('processed')
    filtered_count = count_with_status('filtered')
    skipped_count = count_with_status('skipped')

    logger.info(f"Underlyings processed: {processed_count}")
    logger.info(f"Underlyings filtered (normal day): {filtered_count}")
    logger.info(f"Underlyings skipped (insufficient data): {skipped_count}")

    if len(anomaly_df) > 0:
        logger.info(f"\nTotal anomalies detected: {len(anomaly_df)}")
        logger.info(f"Unique underlyings with anomalies: {anomaly_df['underlying'].nunique()}")
        logger.info(f"Total flagged volume: {anomaly_df['size'].sum():,.0f} contracts")
        logger.info(f"Total flagged notional: ${anomaly_df['opt_trade_notional_value'].sum():,.2f}")

        logger.info(f"\nTop 10 Anomalies:")
        logger.info("-" * 50)
        top_rows = anomaly_df.head(10)
        for _, row in top_rows.iterrows():
            logger.info(f"  #{row['rank']}: {row['underlying']} | {row['ticker']} | "
                       f"Score={row['anomaly_score']:.3f} | "
                       f"Size={row['size']:,} | "
                       f"OTM={row['otm_percentage']:.1f}%")

    logger.info(f"\nProcessing time: {processing_time:.1f} seconds")

    return {
        'target_date': target_date,
        'status': 'completed',
        'underlyings_total': underlying_total,
        'underlyings_processed': processed_count,
        'underlyings_filtered': filtered_count,
        'underlyings_skipped': skipped_count,
        'anomalies_detected': len(anomaly_df),
        'processing_time_seconds': processing_time,
    }


def main():
    """
    Run Phase 04 anomaly detection for the configured date or date range.

    All settings come from the module-level ``CONFIG``:

    - ``input_folder`` / ``output_folder``: where inputs are read and outputs
      written (the output folder is created if missing).
    - ``single_date_mode``: when truthy only ``target_date`` is processed;
      otherwise every available date between ``start_date`` and ``end_date``
      (inclusive) is processed.
    - ``training_window_days``, ``min_history_days``,
      ``chain_volume_zscore_threshold``, ``anomaly_percentile``: logged here
      for traceability.

    Each date is handled by ``process_single_date``. In multi-date mode with
    more than one date, an overall summary is logged and written to
    ``uoa_batch_summary_{start_date}_to_{end_date}.json`` in the output folder.

    Returns:
        None. Returns early (after logging an error) if there is no date to
        process.
    """
    
    overall_start_time = datetime.now()
    
    logger.info("=" * 70)
    logger.info("PHASE 04: ANOMALY DETECTION FOR TYPE A UOA")
    logger.info("=" * 70)
    
    # -------------------------------------------------------------------------
    # SETUP
    # -------------------------------------------------------------------------
    
    input_folder = CONFIG["input_folder"]
    output_folder = CONFIG["output_folder"]
    single_date_mode = CONFIG["single_date_mode"]
    
    output_folder.mkdir(parents=True, exist_ok=True)
    
    logger.info(f"Input folder: {input_folder}")
    logger.info(f"Output folder: {output_folder}")
    logger.info(f"Mode: {'Single Date' if single_date_mode else 'Multiple Dates'}")
    logger.info(f"Training window: {CONFIG['training_window_days']} days")
    logger.info(f"Minimum history: {CONFIG['min_history_days']} days")
    logger.info(f"Chain volume z-score threshold: {CONFIG['chain_volume_zscore_threshold']}")
    logger.info(f"Anomaly percentile: {CONFIG['anomaly_percentile']}%")
    
    # -------------------------------------------------------------------------
    # DETERMINE DATES TO PROCESS
    # -------------------------------------------------------------------------
    
    if single_date_mode:
        target_dates = [CONFIG["target_date"]]
        logger.info(f"Target date: {CONFIG['target_date']}")
    else:
        start_date = CONFIG["start_date"]
        end_date = CONFIG["end_date"]
        target_dates = get_dates_in_range(input_folder, start_date, end_date)
        logger.info(f"Date range: {start_date} to {end_date}")
        logger.info(f"Found {len(target_dates)} dates with data in range")
    
    if not target_dates:
        logger.error("No dates to process!")
        return
    
    # -------------------------------------------------------------------------
    # PROCESS EACH DATE
    # -------------------------------------------------------------------------
    
    date_results = []
    
    for i, target_date in enumerate(target_dates):
        if not single_date_mode:
            # Per-date banner is only useful when several dates are processed.
            logger.info(f"\n{'#' * 70}")
            logger.info(f"# DATE {i+1}/{len(target_dates)}: {target_date}")
            logger.info(f"{'#' * 70}")
        
        result = process_single_date(target_date, input_folder, output_folder)
        date_results.append(result)
    
    # -------------------------------------------------------------------------
    # OVERALL SUMMARY (for multi-date mode)
    # -------------------------------------------------------------------------
    
    if not single_date_mode and len(target_dates) > 1:
        overall_time = (datetime.now() - overall_start_time).total_seconds()
        
        logger.info("\n" + "=" * 70)
        logger.info("OVERALL SUMMARY (ALL DATES)")
        logger.info("=" * 70)
        
        completed_dates = [r for r in date_results if r['status'] == 'completed']
        skipped_dates = [r for r in date_results if r['status'] == 'skipped']
        
        # .get() with a default keeps the totals safe if a result lacks a key.
        total_anomalies = sum(r.get('anomalies_detected', 0) for r in completed_dates)
        total_processed = sum(r.get('underlyings_processed', 0) for r in completed_dates)
        
        logger.info(f"Dates processed: {len(completed_dates)}/{len(target_dates)}")
        logger.info(f"Dates skipped (no data): {len(skipped_dates)}")
        logger.info(f"Total anomalies detected: {total_anomalies}")
        logger.info(f"Total underlyings processed: {total_processed}")
        logger.info(f"Overall processing time: {overall_time:.1f} seconds")
        
        # Save overall summary
        def convert_to_native(obj):
            """Convert numpy types to native Python types for JSON serialization."""
            if isinstance(obj, (np.integer,)):
                return int(obj)
            elif isinstance(obj, (np.floating,)):
                return float(obj)
            elif isinstance(obj, np.ndarray):
                return obj.tolist()
            elif isinstance(obj, dict):
                return {k: convert_to_native(v) for k, v in obj.items()}
            elif isinstance(obj, list):
                return [convert_to_native(i) for i in obj]
            return obj
        
        overall_summary = {
            'run_timestamp': datetime.now().isoformat(),
            'mode': 'multiple_dates',
            'start_date': CONFIG['start_date'],
            'end_date': CONFIG['end_date'],
            'dates_processed': len(completed_dates),
            'dates_skipped': len(skipped_dates),
            'total_anomalies': int(total_anomalies),
            'total_underlyings_processed': int(total_processed),
            'overall_processing_time_seconds': round(overall_time, 2),
            'date_results': convert_to_native(date_results),
        }
        
        overall_log_path = output_folder / f"uoa_batch_summary_{CONFIG['start_date']}_to_{CONFIG['end_date']}.json"
        # Explicit encoding so the file does not depend on the platform default.
        with open(overall_log_path, 'w', encoding='utf-8') as f:
            json.dump(overall_summary, f, indent=2)
        logger.info(f"\nSaved batch summary: {overall_log_path.name}")
    
    # Completion banner names the same phase as the opening banner above.
    logger.info("\n" + "=" * 70)
    logger.info("PHASE 04: ANOMALY DETECTION FOR TYPE A UOA COMPLETE")
    logger.info("=" * 70)


# =============================================================================
# ENTRY POINT
# =============================================================================

# Run the pipeline only when this file is executed as a script, so that
# importing the module does not trigger a run.
if __name__ == "__main__":
    main()