# Get all Historical Trades where DTE > 0

In [12]:
import pandas as pd
from sqlalchemy import create_engine, text
import os
from datetime import datetime, timedelta
import sys
import time

# Database connection setup
DB_CONFIG = {
    'dbname': 'defaultdb',
    'user': 'doadmin',
    'password': 'AVNS_SrG4Bo3B7uCNEPONkE4', # Consider moving to environment variable
    'host': 'vvv-trading-db-do-user-2110609-0.i.db.ondigitalocean.com',
    'port': '25060'
}

DATABASE_URL = f"postgresql://{DB_CONFIG['user']}:{DB_CONFIG['password']}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['dbname']}"
engine = create_engine(DATABASE_URL, connect_args={'sslmode': 'require'})

# Create data directory
os.makedirs('data', exist_ok=True)

# Generate filenames with current timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
darkpool_filename = f'data/darkpool_trades_historical_dte_gt0_{timestamp}.csv'
options_filename = f'data/options_flow_historical_dte_gt0_{timestamp}.csv'

print("Fetching ALL historical dark pool trades where DTE > 0...")
print("This might take some time depending on the database size...")

# Modified query for dark pool trades - fetch all historical
darkpool_query = """
SELECT 
    t.*,
    date_trunc('hour', t.executed_at) as trade_hour,
    t.price - t.nbbo_bid as price_impact,
    (t.price - t.nbbo_bid) / t.nbbo_bid as price_impact_pct,
    CASE 
        WHEN t.size >= 10000 THEN 'Block Trade'
        WHEN t.premium >= 0.02 THEN 'High Premium'
        ELSE 'Regular'
    END as trade_type,
    count(*) over (partition by t.symbol, date_trunc('hour', t.executed_at)) as trades_per_hour,
    sum(t.size) over (partition by t.symbol, date_trunc('hour', t.executed_at)) as volume_per_hour
FROM trading.darkpool_trades t
ORDER BY t.executed_at DESC
"""

# Modified query for options flow - fetch all historical
options_query = """
SELECT 
    f.*,
    date_trunc('hour', f.collected_at) as flow_hour,
    CASE 
        WHEN f.premium >= 1000000 THEN 'Whale'
        WHEN f.premium >= 100000 THEN 'Large'
        ELSE 'Regular'
    END as flow_size,
    count(*) over (partition by f.symbol, date_trunc('hour', f.collected_at)) as flows_per_hour,
    sum(f.premium) over (partition by f.symbol, date_trunc('hour', f.collected_at)) as premium_per_hour,
    sum(f.contract_size) over (partition by f.symbol, date_trunc('hour', f.collected_at)) as contracts_per_hour
FROM trading.options_flow f
ORDER BY f.collected_at DESC
"""

# Process data in chunks to handle potentially large datasets
def fetch_in_chunks(query, engine, filename, chunk_size=10000):
    """Fetch data in chunks to avoid memory issues with large datasets.
    
    Args:
        query: SQL query as string
        engine: SQLAlchemy engine
        filename: Output file to save chunks
        chunk_size: Number of rows per chunk
    """
    start_time = time.time()
    connection = engine.connect().execution_options(stream_results=True)
    chunks = []
    
    try:
        print(f"Executing query...")
        # Convert the string query to a SQLAlchemy text object
        result = connection.execute(text(query))
        
        total_rows = 0
        chunk_num = 0
        
        while True:
            chunk = result.fetchmany(chunk_size)
            if not chunk:
                break
                
            chunk_df = pd.DataFrame(chunk, columns=result.keys())
            chunks.append(chunk_df)
            total_rows += len(chunk_df)
            chunk_num += 1
            
            elapsed = time.time() - start_time
            rows_per_sec = total_rows / elapsed if elapsed > 0 else 0
            
            print(f"Fetched {total_rows} rows so far... ({rows_per_sec:.2f} rows/sec, chunk {chunk_num})")
            
            # Write chunk to disk to avoid memory issues
            if len(chunks) >= 5:  # After accumulating 5 chunks
                print(f"Writing chunks to {filename}...")
                combined = pd.concat(chunks, ignore_index=True)
                if not os.path.exists(filename):
                    combined.to_csv(filename, index=False)
                else:
                    combined.to_csv(filename, mode='a', header=False, index=False)
                chunks = []  # Clear the chunks from memory
                print(f"Wrote {len(combined)} rows to file, continuing fetch...")
                
        # Process any remaining chunks
        if chunks:
            print(f"Writing final chunks to {filename}...")
            combined = pd.concat(chunks, ignore_index=True)
            if not os.path.exists(filename):
                combined.to_csv(filename, index=False)
            else:
                combined.to_csv(filename, mode='a', header=False, index=False)
                
        total_time = time.time() - start_time
        print(f"Fetch completed in {total_time:.2f} seconds")
        return total_rows
        
    finally:
        connection.close()

# Fetch and save darkpool trades in chunks
print("\nFetching and saving darkpool trades...")
total_darkpool_rows = fetch_in_chunks(darkpool_query, engine, darkpool_filename)
print(f"Completed saving {total_darkpool_rows} darkpool trades to {darkpool_filename}")

# Fetch and save options flow data in chunks
print("\nFetching and saving options flow data...")
total_options_rows = fetch_in_chunks(options_query, engine, options_filename)
print(f"Completed saving {total_options_rows} option flows to {options_filename}")

print("\nFull data fetch complete.")
print("\nGenerating summary statistics...")

# Process and summarize data from the saved files
try:
    # Load a sample of the data to generate summary statistics
    trades_sample = pd.read_csv(darkpool_filename, nrows=100000)
    options_sample = pd.read_csv(options_filename, nrows=100000)
    
    # Process darkpool trades
    trades_sample['executed_at'] = pd.to_datetime(trades_sample['executed_at'])
    if 'collection_time' in trades_sample.columns:
        trades_sample['collection_time'] = pd.to_datetime(trades_sample['collection_time'])
    trades_sample['trade_hour'] = pd.to_datetime(trades_sample['trade_hour'])
    
    # Process options flow
    options_sample['collected_at'] = pd.to_datetime(options_sample['collected_at'])
    if 'created_at' in options_sample.columns:
        options_sample['created_at'] = pd.to_datetime(options_sample['created_at'])
    options_sample['expiry'] = pd.to_datetime(options_sample['expiry'])
    options_sample['flow_hour'] = pd.to_datetime(options_sample['flow_hour'])
    
    # Print darkpool trade summary (from sample)
    print("\nDarkpool Trade sample summary by symbol (first 100k rows):")
    print(trades_sample.groupby('symbol').agg({
        'size': ['count', 'sum', 'mean'],
        'premium': ['mean', 'max'],
        'price_impact_pct': 'mean'
    }).round(2))
    
    # Print options flow summary (from sample)
    print("\nOptions Flow sample summary by symbol (first 100k rows):")
    print(options_sample.groupby('symbol').agg({
        'premium': ['count', 'sum', 'mean', 'max'],
        'contract_size': ['sum', 'mean'],
        'iv_rank': 'mean'
    }).round(2))
    
    # Print date ranges for samples
    print("\nDate ranges (from samples):")
    print("Darkpool Trades:")
    print(f"Earliest trade in sample: {trades_sample['executed_at'].min()}")
    print(f"Latest trade in sample: {trades_sample['executed_at'].max()}")
    print(f"Total trades fetched: {total_darkpool_rows}")
    
    print("\nOptions Flow:")
    print(f"Earliest flow in sample: {options_sample['collected_at'].min()}")
    print(f"Latest flow in sample: {options_sample['collected_at'].max()}")
    print(f"Total flows fetched: {total_options_rows}")
    
except Exception as e:
    print(f"Error generating summary statistics: {str(e)}")
    print("Data has been saved to files, but summary statistics could not be generated.")

print("\nFull data is available in:")
print(f"- {darkpool_filename}")
print(f"- {options_filename}")

Fetching ALL historical dark pool trades where DTE > 0...
This might take some time depending on the database size...

Fetching and saving darkpool trades...
Executing query...
Fetched 10000 rows so far... (5019.17 rows/sec, chunk 1)
Fetched 16062 rows so far... (6534.12 rows/sec, chunk 2)
Writing final chunks to data/darkpool_trades_historical_dte_gt0_20250506_101732.csv...
Fetch completed in 2.91 seconds
Completed saving 16062 darkpool trades to data/darkpool_trades_historical_dte_gt0_20250506_101732.csv

Fetching and saving options flow data...
Executing query...
Fetch completed in 0.27 seconds
Completed saving 0 option flows to data/options_flow_historical_dte_gt0_20250506_101732.csv

Full data fetch complete.

Generating summary statistics...
Error generating summary statistics: [Errno 2] No such file or directory: 'data/options_flow_historical_dte_gt0_20250506_101732.csv'
Data has been saved to files, but summary statistics could not be generated.

Full data is available in:
- da

# Get Trades for Last 24 Hours

In [11]:
import pandas as pd
from sqlalchemy import create_engine
import os
from datetime import datetime, timedelta

# Database connection setup (same as before)
DB_CONFIG = {
    'dbname': 'defaultdb',
    'user': 'doadmin',
    'password': 'AVNS_SrG4Bo3B7uCNEPONkE4',
    'host': 'vvv-trading-db-do-user-2110609-0.i.db.ondigitalocean.com',
    'port': '25060'
}

DATABASE_URL = f"postgresql://{DB_CONFIG['user']}:{DB_CONFIG['password']}@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['dbname']}"
engine = create_engine(DATABASE_URL, connect_args={'sslmode': 'require'})

# Calculate timestamp for 24 hours ago
twenty_four_hours_ago = datetime.now() - timedelta(hours=24)

# Query for dark pool trades from last 24 hours
darkpool_query = """
SELECT 
    t.*,
    date_trunc('hour', t.executed_at) as trade_hour,
    t.price - t.nbbo_bid as price_impact,
    (t.price - t.nbbo_bid) / t.nbbo_bid as price_impact_pct,
    CASE 
        WHEN t.size >= 10000 THEN 'Block Trade'
        WHEN t.premium >= 0.02 THEN 'High Premium'
        ELSE 'Regular'
    END as trade_type,
    count(*) over (partition by t.symbol, date_trunc('hour', t.executed_at)) as trades_per_hour,
    sum(t.size) over (partition by t.symbol, date_trunc('hour', t.executed_at)) as volume_per_hour
FROM trading.darkpool_trades t
WHERE t.executed_at >= %(twenty_four_hours_ago)s
ORDER BY t.executed_at DESC
"""

# Query for options flow from last 24 hours
options_query = """
SELECT 
    f.*,
    date_trunc('hour', f.collected_at) as flow_hour,
    CASE 
        WHEN f.premium >= 1000000 THEN 'Whale'
        WHEN f.premium >= 100000 THEN 'Large'
        ELSE 'Regular'
    END as flow_size,
    count(*) over (partition by f.symbol, date_trunc('hour', f.collected_at)) as flows_per_hour,
    sum(f.premium) over (partition by f.symbol, date_trunc('hour', f.collected_at)) as premium_per_hour,
    sum(f.contract_size) over (partition by f.symbol, date_trunc('hour', f.collected_at)) as contracts_per_hour
FROM trading.options_flow f
WHERE f.collected_at >= %(twenty_four_hours_ago)s
ORDER BY f.collected_at DESC
"""

# Fetch both datasets with the time parameter
print("Fetching dark pool trades from last 24 hours...")
trades_df = pd.read_sql_query(darkpool_query, engine, params={'twenty_four_hours_ago': twenty_four_hours_ago})

print("Fetching options flow data from last 24 hours...")
options_df = pd.read_sql_query(options_query, engine, params={'twenty_four_hours_ago': twenty_four_hours_ago})

# Process darkpool trades
trades_df['executed_at'] = pd.to_datetime(trades_df['executed_at'])
trades_df['collection_time'] = pd.to_datetime(trades_df['collection_time'])
trades_df['trade_hour'] = pd.to_datetime(trades_df['trade_hour'])

# Process options flow
options_df['collected_at'] = pd.to_datetime(options_df['collected_at'])
options_df['created_at'] = pd.to_datetime(options_df['created_at'])
options_df['expiry'] = pd.to_datetime(options_df['expiry'])
options_df['flow_hour'] = pd.to_datetime(options_df['flow_hour'])

# Create data directory
os.makedirs('data', exist_ok=True)

# Generate filenames with current timestamp
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
darkpool_filename = f'data/darkpool_trades_24h_{timestamp}.csv'
options_filename = f'data/options_flow_24h_{timestamp}.csv'

# Save both datasets
trades_df.to_csv(darkpool_filename, index=False)
options_df.to_csv(options_filename, index=False)

print(f"\nSaved {len(trades_df)} trades to {darkpool_filename}")
print(f"Saved {len(options_df)} option flows to {options_filename}")

# Print darkpool trade summary
print("\nDarkpool Trade summary by symbol:")
print(trades_df.groupby('symbol').agg({
    'size': ['count', 'sum', 'mean'],
    'premium': ['mean', 'max'],
    'price_impact_pct': 'mean'
}).round(2))

# Print options flow summary
print("\nOptions Flow summary by symbol:")
print(options_df.groupby('symbol').agg({
    'premium': ['count', 'sum', 'mean', 'max'],
    'contract_size': ['sum', 'mean'],
    'iv_rank': 'mean'
}).round(2))

# Print date ranges for both datasets
print("\nDate ranges:")
print("Darkpool Trades:")
print(f"Earliest trade: {trades_df['executed_at'].min()}")
print(f"Latest trade: {trades_df['executed_at'].max()}")
print(f"Total trades: {len(trades_df)}")
print(f"Total volume: {trades_df['size'].sum():,.0f}")

print("\nOptions Flow:")
print(f"Earliest flow: {options_df['collected_at'].min()}")
print(f"Latest flow: {options_df['collected_at'].max()}")
print(f"Total flows: {len(options_df)}")
print(f"Total premium: ${options_df['premium'].sum():,.2f}")

Fetching dark pool trades from last 24 hours...
Fetching options flow data from last 24 hours...

Saved 624 trades to data/darkpool_trades_24h_20250505_220016.csv
Saved 0 option flows to data/options_flow_24h_20250505_220016.csv

Darkpool Trade summary by symbol:
        size                           premium              price_impact_pct
       count       sum       mean         mean          max             mean
symbol                                                                      
AAPL      17   15143.0     890.76    178183.42    300217.50              0.0
ABNB       4    6100.0    1525.00    190850.87    200291.84              0.0
ALL        1     600.0     600.00    120820.02    120820.02             -0.0
AMAT       2    1382.0     691.00    107569.09    110260.95              0.0
AMD        2    5700.0    2850.00    288955.84    466424.82              0.0
...      ...       ...        ...          ...          ...              ...
XLK        1  101326.0  101326.00  21954436