# In [None]:
"""
================================================================================
01v1_ingest_optionstrades_massive.ipynb
================================================================================
Options Trade-by-Trade Data Ingestion from Massive S3 Files

DESCRIPTION
-----------
Downloads raw options trade data from Massive's S3-compatible storage and saves
it as daily Parquet files. Each file contains ALL options trades for ALL 
optionable tickers for a single trading day.

CONFIGURATION
-------------
Set these variables before running:

    ACCESS_KEY   : str  - Massive S3 access key
    SECRET_KEY   : str  - Massive S3 secret key
    START_DATE   : str  - Start date in "YYYY-MM-DD" format
    END_DATE     : str  - End date in "YYYY-MM-DD" format
    OUTPUT_DIR   : str  - Directory path for output files (default: "./ALL_TRADES")
    MAX_WORKERS  : int  - Worker count shown in the run banner (default: 10).
                          Dates are currently processed sequentially, so this
                          value does not add parallelism.

OUTPUT
------
Parquet files saved to OUTPUT_DIR with naming convention:

    {OUTPUT_DIR}/YYYY-MM-DD.parquet

Each file contains trades with the following columns:

    ticker         : str  - Option symbol (e.g., "O:TSLA240119C00250000")
                            Format: O:{UNDERLYING}{YYMMDD}{C/P}{STRIKE*1000}
    conditions     : int  - Trade condition code (see Massive docs)
    correction     : int  - Correction indicator (0 = no correction)
    exchange       : int  - Exchange code where trade occurred
    price          : float - Trade price per share (multiply by 100 for contract value)
    sip_timestamp  : str  - Unix timestamp in nanoseconds (UTC)
    size           : int  - Number of contracts traded
    underlying     : str  - Underlying ticker symbol (e.g., "TSLA"); added by this
                            script, derived from the option symbol

NOTES
-----
- Only processes business days (Monday-Friday)
- Skips dates that already have output files (idempotent)
- Holidays and unavailable dates are logged and skipped gracefully
- Data typically available ~1 hour after market close

USAGE
-----
1. Set ACCESS_KEY and SECRET_KEY with your Massive credentials
2. Set START_DATE and END_DATE for desired date range
3. Run the script

EXAMPLE
-------
    ACCESS_KEY = "your_access_key"
    SECRET_KEY = "your_secret_key"
    START_DATE = "2024-01-01"
    END_DATE   = "2024-01-31"
    OUTPUT_DIR = "./ALL_TRADES"
    
    # Then run the script

================================================================================
"""


import os
import pandas as pd
from datetime import datetime, timedelta
from concurrent.futures import ProcessPoolExecutor, as_completed
import boto3
from botocore.config import Config
from tqdm import tqdm
import tempfile

# ==================== CONFIGURATION ====================

# Massive S3 credentials, passed to boto3 by get_s3_client(). They are blank
# here and must be filled in before running.
# NOTE(review): avoid committing real keys to version control.
ACCESS_KEY = ""
SECRET_KEY = ""

# Bucket name used for downloads and the endpoint URL the S3 client targets.
BUCKET_NAME = "flatfiles"
S3_ENDPOINT = "https://files.massive.com"

# Date range to ingest, in "YYYY-MM-DD" format. Both ends are inclusive.
START_DATE = "2020-01-01"
END_DATE   = "2020-01-20"

# Directory that receives one {date}.parquet file per day. It is created
# as soon as this module is executed/imported if it does not already exist.
OUTPUT_DIR = "./ALL_TRADES"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Printed in the run banner. The main loop in this file handles dates one at
# a time, so this value does not currently control any parallelism.
MAX_WORKERS = 10

# ======================================================


def get_s3_client():
    """Build a boto3 S3 client pointed at the Massive endpoint.

    The client is configured from the module-level ``S3_ENDPOINT``,
    ``ACCESS_KEY`` and ``SECRET_KEY`` values. Requests use Signature V4
    signing and path-style addressing, and the botocore retry setting
    ``max_attempts`` is 3.

    Returns:
        A new boto3 S3 client; a fresh one is created on every call.
    """
    client_config = Config(
        signature_version="s3v4",
        retries={"max_attempts": 3},
        s3={"addressing_style": "path"},
    )
    connection_kwargs = {
        "endpoint_url": S3_ENDPOINT,
        "region_name": "us-east-1",
        "aws_access_key_id": ACCESS_KEY,
        "aws_secret_access_key": SECRET_KEY,
        "config": client_config,
    }
    return boto3.client("s3", **connection_kwargs)


def daterange(start_date, end_date):
    """Yield every weekday from start_date through end_date, inclusive.

    Args:
        start_date: First calendar day to consider, as "YYYY-MM-DD".
        end_date: Last calendar day to consider, as "YYYY-MM-DD".

    Yields:
        "YYYY-MM-DD" strings for each Monday-Friday in the range, in
        ascending order. Weekends are skipped; market holidays are not
        filtered here. Nothing is yielded when end_date precedes start_date.
    """
    date_format = "%Y-%m-%d"
    current = datetime.strptime(start_date, date_format)
    last = datetime.strptime(end_date, date_format)
    one_day = timedelta(days=1)

    while current <= last:
        # weekday(): Monday is 0 ... Friday is 4, Saturday 5, Sunday 6.
        if current.weekday() <= 4:
            yield current.strftime(date_format)
        current += one_day

def _extract_underlying(option_symbol):
    """Return the underlying ticker embedded in an option symbol.

    For a symbol such as ``O:SPY230324C00450000`` the ``O:`` prefix is
    dropped and everything before the first digit is returned (``"SPY"``).
    A symbol without the ``O:`` prefix is returned unchanged, and a prefixed
    symbol containing no digit is returned with only the prefix removed.
    Missing values (NaN/None) give ``None``.
    """
    try:
        if pd.isna(option_symbol):
            return None
        symbol = str(option_symbol)
        if symbol.startswith("O:"):
            symbol = symbol[2:]
            # The expiry date is the first run of digits after the ticker.
            for i, char in enumerate(symbol):
                if char.isdigit():
                    return symbol[:i]
        return symbol
    except (TypeError, ValueError):
        # Best effort: an unparseable cell becomes a missing underlying
        # instead of aborting the whole day.
        return None


def download_day(date: str, output_dir: "str | None" = None):
    """Download one day of options trades and store it as a Parquet file.

    The gzipped CSV for ``date`` is fetched from the Massive bucket, an
    ``underlying`` column is derived from the option symbol column, and the
    result is written to ``{output_dir}/{date}.parquet``.

    The Parquet file is first written to a ``.tmp`` sibling and then moved
    into place with ``os.replace``. An interrupted run therefore never
    leaves a truncated ``{date}.parquet`` that the skip check below would
    mistake for a finished day.

    Args:
        date: Trading day in "YYYY-MM-DD" format.
        output_dir: Directory for the Parquet file. Defaults to the
            module-level ``OUTPUT_DIR``.

    Returns:
        A one-line status string: it starts with "✓" when the day was
        written or already present, "−" when there was nothing to store
        (no remote file, or no trades), and "✗" on an error.
    """
    target_dir = OUTPUT_DIR if output_dir is None else output_dir
    outpath = os.path.join(target_dir, f"{date}.parquet")

    # Early skip: if the daily file already exists, don't download.
    if os.path.exists(outpath):
        return f"✓ {date}: already processed"

    filename = f"{date}.csv.gz"
    s3_key = f"us_options_opra/trades_v1/{date[:4]}/{date[5:7]}/{filename}"
    partial_path = f"{outpath}.tmp"

    try:
        # A private scratch directory avoids clashes with other runs that
        # download the same date, and is removed automatically on exit.
        with tempfile.TemporaryDirectory() as scratch_dir:
            temp_path = os.path.join(scratch_dir, filename)

            try:
                s3 = get_s3_client()
                s3.download_file(BUCKET_NAME, s3_key, temp_path)
            except Exception as e:
                msg = str(e)
                # Normal behavior: Massive returns 403 for "file not found".
                # Only download errors are classified this way, so a parse
                # or write failure is never reported as a holiday.
                if "Forbidden" in msg or "404" in msg or "NoSuchKey" in msg:
                    return f"− {date}: no file (holiday or unavailable)"
                return f"✗ {date}: {msg[:200]}"

            df = pd.read_csv(temp_path, compression="gzip", low_memory=False)

        # Use the first known symbol column that the file actually has.
        possible_ticker_cols = ("ticker", "underlying_ticker", "underlying", "symbol")
        ticker_col = next(
            (col for col in possible_ticker_cols if col in df.columns), None
        )
        if ticker_col is None:
            return f"✗ {date}: no ticker column found. Available: {list(df.columns)}"

        if df.empty:
            return f"− {date}: no trades"

        df["underlying"] = df[ticker_col].apply(_extract_underlying)

        # Write to a side file, then promote it in a single rename.
        os.makedirs(target_dir, exist_ok=True)
        df.to_parquet(partial_path, compression="snappy")
        os.replace(partial_path, outpath)

        size_mb = os.path.getsize(outpath) / 1e6
        return f"✓ {date}: {len(df):,} trades ({size_mb:.2f} MB)"

    except Exception as e:
        return f"✗ {date}: {str(e)[:200]}"

    finally:
        # A partial file is only left behind when the write failed midway.
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass


# ==================== MAIN ====================

if __name__ == "__main__":
    # Materialise the weekday list up front so the banner can report a count.
    dates = list(daterange(START_DATE, END_DATE))

    # Run banner: one print call, one line per argument.
    print(
        "Downloading options trades (all tickers)",
        f"Date range : {START_DATE} → {END_DATE}",
        f"Days       : {len(dates)} business days",
        f"Workers    : {MAX_WORKERS}",
        f"Output     : {OUTPUT_DIR}\n",
        sep="\n",
    )

    # Dates are handled one at a time (kept sequential for easier debugging);
    # each call returns a one-line status string that is echoed as-is.
    for day in tqdm(dates, desc="Progress"):
        print(download_day(day))

    # Summary covers every Parquet file in OUTPUT_DIR, including files
    # produced by earlier runs.
    rule = "=" * 60
    print("\nSummary:")
    print(rule)
    files = [name for name in os.listdir(OUTPUT_DIR) if name.endswith(".parquet")]
    total_bytes = 0
    for name in files:
        total_bytes += os.path.getsize(os.path.join(OUTPUT_DIR, name))
    total_gb = total_bytes / 1e9
    print(f"  {len(files)} files, {total_gb:.2f} GB")
    print(rule)
    print("Done!")