# In [None]:
"""
================================================================================
Phase 02A - Merge Underlying Asset OHLCVT × Options Trades Data
================================================================================

DESCRIPTION:
    Enriches raw options trade data with end-of-day (EOD) underlying stock 
    information by joining intraday options trades with daily stock aggregates.

OPERATIONS:
    1. Converts sip_timestamp from nanoseconds (UTC) to NYC timezone string
    2. Maps underlying stock closing price to each trade
    3. Maps underlying stock daily volume to each trade
    4. Exports enriched data as date-partitioned parquet files

--------------------------------------------------------------------------------
INPUT FILES
--------------------------------------------------------------------------------

1. Options Trades (trades_folder):
   - Path:     {trades_folder}/{YYYY-MM-DD}.parquet
   - Required: sip_timestamp (int64, nanoseconds UTC), underlying (string)

2. EOD Stock Aggregates (eod_folder):
   - Path:     {eod_folder}/{YYYY-MM-DD}.parquet
   - Required: ticker (string), close (float), volume (int/float)

--------------------------------------------------------------------------------
OUTPUT FILES
--------------------------------------------------------------------------------

Enriched Trades (output_folder):
   - Path:     {output_folder}/{YYYY-MM-DD}.parquet
   - Converts: sip_timestamp (in place, to 'YYYY-MM-DD HH:MM:SS' NYC time string)
   - Adds:     underlying_close (float, EOD close price)
               underlying_volume (float, EOD volume)
   - Note:     Unmatched underlyings will have NaN for close/volume

--------------------------------------------------------------------------------
CONFIGURATION
--------------------------------------------------------------------------------

Modify the variables below:
    start_date_str  : First date to process (inclusive)
    end_date_str    : Last date to process (inclusive)
    trades_folder   : Path to options trades parquet files
    eod_folder      : Path to EOD stock aggregates parquet files
    output_folder   : Path for enriched output files

--------------------------------------------------------------------------------
USAGE
--------------------------------------------------------------------------------

    $ python Phase_02A_merge_underlying.py

--------------------------------------------------------------------------------
DEPENDENCIES
--------------------------------------------------------------------------------

    pandas>=1.5.0, pyarrow>=10.0.0

================================================================================
"""


import pandas as pd
from datetime import datetime, timedelta
import os

# ---------------------------------------------------------------------------
# Run configuration -- edit these values before executing the script.
# ---------------------------------------------------------------------------

# Date window to process; both bounds are inclusive and use YYYY-MM-DD format.
start_date_str = "2020-01-02"
end_date_str = "2020-01-02"

# Folder containing the options trades parquet files ({YYYY-MM-DD}.parquet).
trades_folder = r"D:\cyclelabs_codes\CL_20251120_siphontrades\ALL_TRADES"

# Folder containing the EOD stocks parquet files ({YYYY-MM-DD}.parquet).
eod_folder = r"D:\cyclelabs_codes\CL_20251120_siphontrades\US_STOCKS_DAY_AGGS"

# Folder where the updated trades parquet files are saved.
output_folder = "./output"

def _attach_underlying_eod(trades, eod):
    """Add NYC-formatted timestamps and EOD close/volume columns to *trades*.

    The frame is modified in place and also returned.

    Args:
        trades: Options trades frame with ``sip_timestamp`` (epoch
            nanoseconds, interpreted as UTC) and ``underlying`` columns.
        eod: Daily aggregates frame with ``ticker``, ``close`` and
            ``volume`` columns.

    Returns:
        ``trades`` with ``sip_timestamp`` replaced by a
        ``'YYYY-MM-DD HH:MM:SS'`` America/New_York string and with the new
        ``underlying_close`` / ``underlying_volume`` columns.
    """
    # Interpret the raw integers as UTC instants, shift them to New York
    # wall-clock time (tz_convert applies DST rules), then render as text.
    # The format stops at whole seconds, so sub-second precision is dropped.
    utc_stamps = pd.to_datetime(trades['sip_timestamp'], unit='ns', utc=True)
    nyc_stamps = utc_stamps.dt.tz_convert('America/New_York')
    trades['sip_timestamp'] = nyc_stamps.dt.strftime('%Y-%m-%d %H:%M:%S')

    # Plain dict lookups keyed by ticker. If a ticker occurs more than once
    # in the EOD frame, the last occurrence wins.
    eod_by_ticker = eod.set_index('ticker')
    close_by_ticker = eod_by_ticker['close'].to_dict()
    volume_by_ticker = eod_by_ticker['volume'].to_dict()

    # Underlyings with no EOD row map to NaN.
    underlyings = trades['underlying']
    trades['underlying_close'] = underlyings.map(close_by_ticker)
    trades['underlying_volume'] = underlyings.map(volume_by_ticker)
    return trades


def process_dates(start_date_str, end_date_str, trades_folder, eod_folder, output_folder):
    """Enrich each day's options trades with the underlying's EOD data.

    For every calendar day from ``start_date_str`` to ``end_date_str``
    (both inclusive), reads ``{date}.parquet`` from the trades and EOD
    folders, converts ``sip_timestamp`` to a New York time string, maps the
    underlying's close and volume onto each trade, and writes the result to
    ``{output_folder}/{date}.parquet``.

    A day is skipped with a printed message when either input file is
    missing. Any exception raised while processing a day is printed and the
    loop moves on to the next day.

    Args:
        start_date_str: First date to process, ``YYYY-MM-DD``.
        end_date_str: Last date to process, ``YYYY-MM-DD``.
        trades_folder: Folder containing the trades parquet files.
        eod_folder: Folder containing the EOD stocks parquet files.
        output_folder: Folder for the enriched files; created if absent.

    Raises:
        ValueError: If either date string does not match ``YYYY-MM-DD``.
    """
    first_day = datetime.strptime(start_date_str, '%Y-%m-%d')
    last_day = datetime.strptime(end_date_str, '%Y-%m-%d')

    os.makedirs(output_folder, exist_ok=True)

    # An end date before the start date yields an empty range, so nothing
    # is processed in that case.
    day_count = (last_day - first_day).days + 1
    for day_offset in range(day_count):
        date_str = (first_day + timedelta(days=day_offset)).strftime('%Y-%m-%d')
        trade_file = os.path.join(trades_folder, f"{date_str}.parquet")
        eod_file = os.path.join(eod_folder, f"{date_str}.parquet")
        output_file = os.path.join(output_folder, f"{date_str}.parquet")

        # Both inputs are required; typically absent on non-trading days.
        if not (os.path.exists(trade_file) and os.path.exists(eod_file)):
            print(f"Files not found for {date_str}, skipping.")
            continue

        # Per-day boundary: one bad file must not abort the whole date range.
        try:
            day_trades = pd.read_parquet(trade_file)
            day_eod = pd.read_parquet(eod_file)
            enriched = _attach_underlying_eod(day_trades, day_eod)
            enriched.to_parquet(output_file, index=False)
            print(f"Processed and saved: {date_str}")
        except Exception as exc:
            print(f"Error processing {date_str}: {exc}")

# Script entry point: when the file is executed directly (not imported), run
# the enrichment using the module-level configuration values.
if __name__ == "__main__":
    process_dates(
        start_date_str=start_date_str,
        end_date_str=end_date_str,
        trades_folder=trades_folder,
        eod_folder=eod_folder,
        output_folder=output_folder,
    )