# Dark Pool Trade Analysis

This notebook connects to the production database and analyzes dark pool trades.

In [22]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
from datetime import datetime, timedelta
import pytz
from dotenv import load_dotenv

# Set plot style
#plt.style.use('seaborn')
#sns.set_palette('deep')
%matplotlib inline

In [23]:
# Database connection setup
# Credentials come from environment variables (optionally populated from a
# local .env file via python-dotenv) so that no secret lives in the notebook.
# NOTE(review): the password that was previously hardcoded in this notebook
# should be rotated, since it has been stored in plain text.
from urllib.parse import quote_plus

load_dotenv()

DB_CONFIG = {
    'dbname': os.environ.get('DB_NAME', 'defaultdb'),
    'user': os.environ.get('DB_USER', 'doadmin'),
    'password': os.environ['DB_PASSWORD'],  # required: KeyError if unset
    'host': os.environ['DB_HOST'],          # required: KeyError if unset
    'port': os.environ.get('DB_PORT', '25060')
}

# Create database URL. User and password are percent-encoded so characters
# such as '@', ':' or '/' in a credential cannot corrupt the URL.
DATABASE_URL = (
    f"postgresql://{quote_plus(DB_CONFIG['user'])}:{quote_plus(DB_CONFIG['password'])}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['dbname']}"
)

# Create engine with SSL required
engine = create_engine(
    DATABASE_URL,
    connect_args={
        'sslmode': 'require'
    }
)

In [24]:
# Function to fetch trades for a specific date
def fetch_trades(date_str=None):
    """Return the dark pool trades executed on one calendar date.

    Runs a parameterized query against ``trading.darkpool_trades`` through the
    notebook-level ``engine`` and returns the rows ordered by ``executed_at``.

    Parameters
    ----------
    date_str : str, optional
        Value compared against ``DATE(executed_at)``, e.g. ``'2025-04-17'``.
        When ``None``, the test data date ``'2025-04-17'`` is used.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pd.read_sql_query`` returns for the query.
    """
    # Only None triggers the fallback to our test data date
    target_date = '2025-04-17' if date_str is None else date_str

    sql = """
    SELECT *
    FROM trading.darkpool_trades
    WHERE DATE(executed_at) = %(date)s
    ORDER BY executed_at
    """

    # The date is bound as a named parameter rather than formatted into the SQL
    return pd.read_sql_query(sql, engine, params={'date': target_date})

# Load the April 17th trades, report how many rows came back,
# and end the cell on a preview of the first rows.
trades_df = fetch_trades(date_str='2025-04-17')
print(f"Fetched {len(trades_df)} trades")
trades_df.head()

Fetched 15 trades


Unnamed: 0,id,tracking_id,symbol,size,price,volume,premium,executed_at,nbbo_ask,nbbo_bid,market_center,sale_cond_codes,collection_time,created_at
0,415,70331564740492,QQQ,493.0,445.3,44810365.0,219532.9,2025-04-17 23:32:11+00:00,445.55,445.18,L,,2025-04-19 19:54:42.651306+00:00,2025-04-19 19:54:42.663577+00:00
1,414,70374608487124,SPY,473.0,527.21,79846628.0,249370.33,2025-04-17 23:32:54+00:00,527.24,527.01,L,,2025-04-19 19:54:42.651306+00:00,2025-04-19 19:54:42.663577+00:00
2,413,70454350009635,QQQ,299.0,445.3884,44811967.0,133171.1316,2025-04-17 23:34:14+00:00,445.39,445.3,L,,2025-04-19 19:54:42.651306+00:00,2025-04-19 19:54:42.663577+00:00
3,412,70469265910490,QQQ,1000.0,445.25,44813023.0,445250.0,2025-04-17 23:34:29+00:00,445.39,445.3,L,,2025-04-19 19:54:42.651306+00:00,2025-04-19 19:54:42.663577+00:00
4,411,70629848116153,QQQ,729.0,445.1111,44814745.0,324485.9919,2025-04-17 23:37:09+00:00,445.4,445.11,L,,2025-04-19 19:54:42.651306+00:00,2025-04-19 19:54:42.663577+00:00


# Get Latest Collector Logs from Prod DB

In [21]:
import os
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Database connection setup
# Credentials come from environment variables (optionally populated from a
# local .env file) so that no secret lives in the notebook.
# NOTE(review): the password that was previously hardcoded in this cell
# should be rotated, since it has been stored in plain text.
load_dotenv()

DB_CONFIG = {
    'dbname': os.environ.get('DB_NAME', 'defaultdb'),
    'user': os.environ.get('DB_USER', 'doadmin'),
    'password': os.environ['DB_PASSWORD'],  # required: KeyError if unset
    'host': os.environ['DB_HOST'],          # required: KeyError if unset
    'port': os.environ.get('DB_PORT', '25060')
}

# Create database URL. User and password are percent-encoded so characters
# such as '@', ':' or '/' in a credential cannot corrupt the URL.
DATABASE_URL = (
    f"postgresql://{quote_plus(DB_CONFIG['user'])}:{quote_plus(DB_CONFIG['password'])}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['dbname']}"
)

# Create engine with SSL required
engine = create_engine(
    DATABASE_URL,
    connect_args={
        'sslmode': 'require'
    }
)

# Query the most recent logs.
# Besides the raw log fields, each row carries the minute it falls in and the
# number of log rows sharing that minute (window count over the whole table).
query = """
SELECT 
    timestamp,
    level,
    message,
    date_trunc('minute', timestamp) as log_minute,
    count(*) over (partition by date_trunc('minute', timestamp)) as logs_per_minute
FROM trading.collector_logs
ORDER BY timestamp DESC
LIMIT 10
"""

logs_df = pd.read_sql_query(query, engine)
print(f"\nMost recent {len(logs_df)} log entries:")
display(logs_df)


Most recent 10 log entries:


Unnamed: 0,timestamp,level,message,log_minute,logs_per_minute
0,2025-05-03 11:30:02.981436+00:00,INFO,Market closed - weekend (Saturday),2025-05-03 11:30:00+00:00,1
1,2025-05-03 11:25:02.817692+00:00,INFO,Market closed - weekend (Saturday),2025-05-03 11:25:00+00:00,1
2,2025-05-03 11:20:02.814416+00:00,INFO,Market closed - weekend (Saturday),2025-05-03 11:20:00+00:00,1
3,2025-05-03 11:15:02.773588+00:00,INFO,Market closed - weekend (Saturday),2025-05-03 11:15:00+00:00,1
4,2025-05-03 11:10:02.932881+00:00,INFO,Market closed - weekend (Saturday),2025-05-03 11:10:00+00:00,1
5,2025-05-03 11:05:03.010078+00:00,INFO,Market closed - weekend (Saturday),2025-05-03 11:05:00+00:00,1
6,2025-05-03 11:00:03.149783+00:00,INFO,Market closed - weekend (Saturday),2025-05-03 11:00:00+00:00,1
7,2025-05-03 10:55:03.087744+00:00,INFO,Market closed - weekend (Saturday),2025-05-03 10:55:00+00:00,1
8,2025-05-03 10:50:03.044299+00:00,INFO,Market closed - weekend (Saturday),2025-05-03 10:50:00+00:00,1
9,2025-05-03 10:45:03.115710+00:00,INFO,Market closed - weekend (Saturday),2025-05-03 10:45:00+00:00,1


# Get All Trades and Save as CSV

In [24]:
import os
from datetime import datetime
from urllib.parse import quote_plus

import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine

# Database connection setup
# Credentials come from environment variables (optionally populated from a
# local .env file) so that no secret lives in the notebook.
# NOTE(review): the password that was previously hardcoded in this cell
# should be rotated, since it has been stored in plain text.
load_dotenv()

DB_CONFIG = {
    'dbname': os.environ.get('DB_NAME', 'defaultdb'),
    'user': os.environ.get('DB_USER', 'doadmin'),
    'password': os.environ['DB_PASSWORD'],  # required: KeyError if unset
    'host': os.environ['DB_HOST'],          # required: KeyError if unset
    'port': os.environ.get('DB_PORT', '25060'),
    'sslmode': 'require'
}

# Create database URL. User and password are percent-encoded so characters
# such as '@', ':' or '/' in a credential cannot corrupt the URL.
DATABASE_URL = (
    f"postgresql://{quote_plus(DB_CONFIG['user'])}:{quote_plus(DB_CONFIG['password'])}"
    f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['dbname']}"
)

# Create engine with SSL required
engine = create_engine(
    DATABASE_URL,
    connect_args={
        'sslmode': DB_CONFIG['sslmode']
    }
)

# Query all dark pool trades with enhanced metrics.
# NULLIF guards the percentage against a zero bid: the row gets a NULL
# price_impact_pct instead of the whole query failing with division by zero.
# NOTE(review): the 'High Premium' threshold of 0.02 looks far too low if
# premium is a dollar amount, as the values in the earlier preview suggest;
# virtually every non-block trade would then be labelled 'High Premium'.
# Confirm the intended unit.
query = """
SELECT 
    t.*,
    date_trunc('hour', t.executed_at) as trade_hour,
    t.price - t.nbbo_bid as price_impact,
    (t.price - t.nbbo_bid) / NULLIF(t.nbbo_bid, 0) as price_impact_pct,
    CASE 
        WHEN t.size >= 10000 THEN 'Block Trade'
        WHEN t.premium >= 0.02 THEN 'High Premium'
        ELSE 'Regular'
    END as trade_type,
    count(*) over (partition by t.symbol, date_trunc('hour', t.executed_at)) as trades_per_hour,
    sum(t.size) over (partition by t.symbol, date_trunc('hour', t.executed_at)) as volume_per_hour
FROM trading.darkpool_trades t
ORDER BY t.executed_at DESC
"""

# Fetch trades
print("Fetching all dark pool trades...")
trades_df = pd.read_sql_query(query, engine)

# Convert timestamp columns
trades_df['executed_at'] = pd.to_datetime(trades_df['executed_at'])
trades_df['collection_time'] = pd.to_datetime(trades_df['collection_time'])
trades_df['trade_hour'] = pd.to_datetime(trades_df['trade_hour'])

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

# Generate filename with current (local) timestamp so exports never collide
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'data/darkpool_trades_all_{timestamp}.csv'

# Save to CSV
trades_df.to_csv(filename, index=False)
print(f"\nSaved {len(trades_df)} trades to {filename}")

# Print summary statistics
# NOTE(review): rounding to 2 decimals can flatten the mean price_impact_pct
# to 0.0 when the fractions are small; consider a finer precision for it.
print("\nTrade summary by symbol:")
print(trades_df.groupby('symbol').agg({
    'size': ['count', 'sum', 'mean'],
    'premium': ['mean', 'max'],
    'price_impact_pct': 'mean'
}).round(2))

print("\nDate range of trades:")
print(f"Earliest trade: {trades_df['executed_at'].min()}")
print(f"Latest trade: {trades_df['executed_at'].max()}")
print(f"Total number of trades: {len(trades_df)}")
print(f"Total volume: {trades_df['size'].sum():,.0f}")

Fetching all dark pool trades...

Saved 15438 trades to data/darkpool_trades_all_20250505_163629.csv

Trade summary by symbol:
        size                        premium               price_impact_pct
       count        sum     mean       mean           max             mean
symbol                                                                    
EFA      173   650794.0  3761.82  317011.44  6.541874e+06              0.0
EZU       16    82671.0  5166.94  286552.86  7.372200e+05              0.0
FXF       11    23632.0  2148.36  231496.59  5.462500e+05              0.0
GLD     1390  1506038.0  1083.48  332324.27  2.323045e+07              0.0
QQQ     4370  4098614.0   937.90  433871.53  1.137197e+08              0.0
SLV      109   944937.0  8669.15  259586.57  1.963644e+06             -0.0
SMH      190   859456.0  4523.45  905758.92  6.960936e+07              0.0
SPY     8307  5729794.0   689.75  373708.70  8.161365e+07              0.0
TLT      504  1651191.0  3276.17  289415.31  3.5

# Get All Trades from the Last 24 Hours

In [25]:
import os
from contextlib import closing
from datetime import datetime, timedelta, timezone

import pandas as pd
import psycopg2
from dotenv import load_dotenv
from psycopg2.extras import RealDictCursor

# Database connection setup (connects on a different database name and port
# than the SQLAlchemy cells in this notebook).
# Credentials come from environment variables (optionally populated from a
# local .env file) so that no secret lives in the notebook.
# NOTE(review): the password that was previously hardcoded in this cell
# should be rotated, since it has been stored in plain text.
load_dotenv()

DB_CONFIG = {
    'dbname': os.environ.get('DB_POOL_NAME', 'trading-pool'),
    'user': os.environ.get('DB_USER', 'doadmin'),
    'password': os.environ['DB_PASSWORD'],  # required: KeyError if unset
    'host': os.environ['DB_HOST'],          # required: KeyError if unset
    'port': os.environ.get('DB_POOL_PORT', '25061'),
    'sslmode': 'require'
}

# Calculate timestamp for 24 hours ago.
# Use an aware UTC timestamp: a naive local datetime would be interpreted in
# the database session's time zone and shift the window by the local offset.
twenty_four_hours_ago = datetime.now(timezone.utc) - timedelta(hours=24)

# Query dark pool trades from last 24 hours with enhanced metrics.
# NULLIF guards the percentage against a zero bid: the row gets a NULL
# price_impact_pct instead of the whole query failing with division by zero.
# NOTE(review): the 'High Premium' threshold of 0.02 looks far too low if
# premium is a dollar amount; confirm the intended unit.
query = """
SELECT 
    t.*,
    date_trunc('hour', t.executed_at) as trade_hour,
    t.price - t.nbbo_bid as price_impact,
    (t.price - t.nbbo_bid) / NULLIF(t.nbbo_bid, 0) as price_impact_pct,
    CASE 
        WHEN t.size >= 10000 THEN 'Block Trade'
        WHEN t.premium >= 0.02 THEN 'High Premium'
        ELSE 'Regular'
    END as trade_type,
    count(*) over (partition by t.symbol, date_trunc('hour', t.executed_at)) as trades_per_hour,
    sum(t.size) over (partition by t.symbol, date_trunc('hour', t.executed_at)) as volume_per_hour
FROM trading.darkpool_trades t
WHERE t.executed_at >= %s
ORDER BY t.executed_at DESC
"""

try:
    # Fetch trades
    print("Fetching dark pool trades from last 24 hours...")

    # Create a direct connection. psycopg2's own `with connection:` only ends
    # the transaction and leaves the connection open, so closing() is used to
    # make sure the connection is released when the block exits.
    with closing(psycopg2.connect(**DB_CONFIG)) as conn:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(query, (twenty_four_hours_ago,))
            results = cur.fetchall()

    # Convert to DataFrame (one dict per row from RealDictCursor)
    trades_df = pd.DataFrame(results)

    if not trades_df.empty:
        # Convert timestamp columns
        trades_df['executed_at'] = pd.to_datetime(trades_df['executed_at'])
        trades_df['collection_time'] = pd.to_datetime(trades_df['collection_time'])
        trades_df['trade_hour'] = pd.to_datetime(trades_df['trade_hour'])

        # Create data directory if it doesn't exist
        os.makedirs('data', exist_ok=True)

        # Generate filename with current (local) timestamp
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        filename = f'data/darkpool_trades_24h_{timestamp}.csv'

        # Save to CSV
        trades_df.to_csv(filename, index=False)
        print(f"\nSaved {len(trades_df)} trades to {filename}")

        # Print summary statistics
        print("\nTrade summary by symbol:")
        print(trades_df.groupby('symbol').agg({
            'size': ['count', 'sum', 'mean'],
            'premium': ['mean', 'max'],
            'price_impact_pct': 'mean'
        }).round(2))

        print("\nDate range of trades:")
        print(f"Earliest trade: {trades_df['executed_at'].min()}")
        print(f"Latest trade: {trades_df['executed_at'].max()}")
        print(f"Total number of trades: {len(trades_df)}")
        print(f"Total volume: {trades_df['size'].sum():,.0f}")

        # Additional time-based analysis
        print("\nHourly trade distribution:")
        hourly_stats = trades_df.groupby(trades_df['executed_at'].dt.hour).agg({
            'size': ['count', 'sum'],
            'premium': 'mean'
        }).round(2)
        print(hourly_stats)
    else:
        print("No trades found in the last 24 hours")

except Exception as e:
    # Best-effort cell: report the failure instead of raising.
    # NOTE(review): this also hides programming errors (e.g. a missing
    # column); consider narrowing to psycopg2.Error.
    print(f"Error: {str(e)}")

Fetching dark pool trades from last 24 hours...
No trades found in the last 24 hours
