# Analysis Pipeline

Download and analyze `node_fills_by_block` data from Hyperliquid S3.

**Output**: `data/hl-mainnet-node-data/node_fills_by_block/hourly/` (shared with scripts)

Hours that already exist locally as parquet files are loaded from disk instead of being downloaded again.

---

## Setup

In [None]:
from datetime import datetime

import pandas as pd
import polars as pl

from vigil import get_s3_client, LOCAL_DATA_DIR, download
from vigil.config import HL_BUCKET, HL_PREFIX
from vigil.transforms import parse_fills, save_parquet

# Set both pandas display limits to None so wide fill tables are not cut off.
for display_option in ("display.max_columns", "display.width"):
    pd.set_option(display_option, None)

s3 = get_s3_client()

# Local directory for the hourly parquet files; created up front so later
# cells can assume it exists.
FILLS_DIR = LOCAL_DATA_DIR / HL_BUCKET / HL_PREFIX
FILLS_DIR.mkdir(parents=True, exist_ok=True)

print("S3 client configured", f"Output: {FILLS_DIR.resolve()}", sep="\n")

---

## Download Configuration

Configure the dates and hours to download as `(YYYYMMDD, [hours])` entries in `DOWNLOADS`. Data is available from Jul 27, 2025.

In [None]:
# =============================================================================
# DOWNLOAD CONFIGURATION
# =============================================================================
# node_fills_by_block available: Jul 27, 2025 → present
# Each hour file is ~15-25 MB compressed, ~50-100 MB decompressed
# Full day = 24 files = ~400-600 MB compressed
# =============================================================================

# Full UTC day (matches Allium's daily volume)
# Each entry is (date as YYYYMMDD, list of UTC hours 0-23 to fetch for that date).
DOWNLOADS = [
    ("20251101", list(range(24))),  # Nov 1: hours 0-23 (full day UTC)
]

total_hours = sum(len(hours) for _, hours in DOWNLOADS)

# Derive the printed window from DOWNLOADS itself so the label cannot drift
# away from the configuration when dates or hours are edited.
scheduled = [(date, hour) for date, hours in DOWNLOADS for hour in hours]
if scheduled:
    # YYYYMMDD strings sort chronologically, so tuple min/max give the window edges.
    (first_date, first_hour), (last_date, last_hour) = min(scheduled), max(scheduled)
    first_day = datetime.strptime(first_date, "%Y%m%d")
    last_day = datetime.strptime(last_date, "%Y%m%d")
    print(
        f"Time window: {first_day:%b} {first_day.day} {first_hour:02d}:00 UTC"
        f" → {last_day:%b} {last_day.day} {last_hour:02d}:59 UTC"
    )
else:
    print("Time window: (no hours configured)")

print(f"Total hours: {total_hours}")
for date, hours in DOWNLOADS:
    if hours:
        # min/max instead of hours[0]/hours[-1]: correct for unsorted lists too.
        print(f"  {date}: hours {min(hours)}-{max(hours)} ({len(hours)} files)")
    else:
        print(f"  {date}: no hours configured")

Time window: Nov 1 00:00 UTC → Nov 1 23:59 UTC
Total hours: 24
  20251101: hours 0-23 (24 files)


In [None]:
from datetime import datetime, timedelta


def download_fills(date: str, hour: int) -> tuple[list, bool]:
    """Return the fills for one date/hour, preferring the local parquet copy.

    Args:
        date: Day to fetch, used verbatim in the local path and the S3 key.
        hour: Hour to fetch, used verbatim in the local path and the S3 key.

    Returns:
        ``(fills, was_cached)``. ``was_cached`` is True only when the fills were
        read from an existing local parquet file. When the S3 download raises,
        the error is printed and ``([], False)`` is returned.
    """
    cache_file = FILLS_DIR / date / f"{hour}.parquet"

    # Local copy wins: read it back and hand the rows over as dicts.
    if cache_file.exists():
        return pl.read_parquet(cache_file).to_dicts(), True

    # No local copy: fetch the compressed hour file from S3 via vigil.
    try:
        raw_lz4 = download(HL_BUCKET, f"{HL_PREFIX}/{date}/{hour}.lz4", s3)
    except Exception as err:
        print(f"Error downloading {date}/{hour}: {err}")
        return [], False

    # Parse, and persist only when there is something to store.
    parsed = parse_fills(raw_lz4)
    if parsed:
        save_parquet(parsed, cache_file)
    return parsed, False


def date_range(start: str, end: str):
    """Yield each day from ``start`` through ``end`` (both included) as ``YYYYMMDD``.

    Both arguments are ``YYYYMMDD`` strings. Nothing is yielded when ``start``
    is later than ``end``.
    """
    first_day = datetime.strptime(start, "%Y%m%d")
    last_day = datetime.strptime(end, "%Y%m%d")
    # Both bounds parse to midnight, so the difference is a whole number of days.
    day_span = (last_day - first_day).days
    for offset in range(day_span + 1):
        yield (first_day + timedelta(days=offset)).strftime("%Y%m%d")


# Confirms that this cell ran and the download helpers are now defined.
print("Download functions defined")

---

## Download Data

In [None]:
# Download all configured date/hour combinations
all_fills = []
total_files = sum(len(hours) for _, hours in DOWNLOADS)
cached = 0
downloaded = 0
missing = 0  # hours that yielded nothing (download error or no fills parsed)

print(f"Processing {total_files} files...")
for date, hours in DOWNLOADS:
    for hour in hours:
        fills, was_cached = download_fills(date, hour)
        all_fills.extend(fills)
        if was_cached:
            cached += 1
            print(f"  {date}/{hour:02d}: {len(fills):,} fills (cached)")
        elif fills:
            downloaded += 1
            print(f"  {date}/{hour:02d}: {len(fills):,} fills (downloaded → saved)")
        else:
            # download_fills returns ([], False) when the S3 fetch failed, and it
            # only writes parquet for non-empty results, so nothing was saved here.
            # Reporting this as "downloaded → saved" would hide the gap.
            missing += 1
            print(f"  {date}/{hour:02d}: no fills (download failed or file empty; nothing saved)")

print(f"\nTotal: {len(all_fills):,} fills")
print(f"Downloaded: {downloaded}, From cache: {cached}")
if missing:
    print(f"Missing: {missing} (downstream totals exclude these hours)")

Processing 24 files...
  20251101/00: 192,439 fills (cached)
  20251101/01: 245,390 fills (cached)
  20251101/02: 206,714 fills (cached)
  20251101/03: 221,804 fills (cached)
  20251101/04: 199,720 fills (cached)
  20251101/05: 164,158 fills (cached)
  20251101/06: 182,996 fills (cached)
  20251101/07: 192,330 fills (cached)
  20251101/08: 202,526 fills (cached)
  20251101/09: 169,798 fills (cached)
  20251101/10: 151,956 fills (cached)
  20251101/11: 159,560 fills (cached)
  20251101/12: 155,006 fills (cached)
  20251101/13: 196,854 fills (cached)
  20251101/14: 249,310 fills (cached)
  20251101/15: 250,014 fills (cached)
  20251101/16: 202,614 fills (cached)
  20251101/17: 212,180 fills (cached)
  20251101/18: 157,028 fills (cached)
  20251101/19: 151,440 fills (cached)
  20251101/20: 158,960 fills (cached)
  20251101/21: 165,362 fills (cached)
  20251101/22: 155,848 fills (cached)
  20251101/23: 126,712 fills (cached)

Total: 4,470,719 fills
Downloaded: 0, From cache: 24


In [None]:
# Build the working DataFrame in one pass: fix column types first, then add
# the derived columns (later entries may read columns assigned earlier).
df = pd.DataFrame(all_fills).assign(
    # Parse types
    time=lambda d: pd.to_datetime(d['time'], unit='ms'),
    px=lambda d: pd.to_numeric(d['px']),
    sz=lambda d: pd.to_numeric(d['sz']),
    # Unparseable PnL / fee values become 0 rather than raising
    closedPnl=lambda d: pd.to_numeric(d['closedPnl'], errors='coerce').fillna(0),
    fee=lambda d: pd.to_numeric(d['fee'], errors='coerce').fillna(0),
    # Computed columns
    volume=lambda d: d['px'] * d['sz'],
    is_maker=lambda d: ~d['crossed'],
    is_close=lambda d: d['dir'].str.startswith('Close'),
    is_win=lambda d: (d['closedPnl'] > 0) & d['is_close'],
)

print(f"DataFrame shape: {df.shape}")
df.head()

DataFrame shape: (4470719, 25)


Unnamed: 0,coin,px,sz,side,time,startPosition,dir,closedPnl,hash,oid,crossed,fee,tid,cloid,feeToken,twapId,user,block_time,builder,builderFee,liquidation,volume,is_maker,is_close,is_win
0,PURR/USDC,0.12967,14.0,B,2025-11-01 00:00:00.078,13836023.40823,Buy,0.0,0x00000000000000000000000000000000000000000000...,218761682244,False,-1.8e-05,460005683590811,0x00000000000000001a11502348312c7f,USDC,,0x9266865bb6afb4c4f618544dd3b8c970f17aa664,2025-11-01T00:00:00.078617617,,,,1.81538,True,False,False
1,PURR/USDC,0.12967,14.0,A,2025-11-01 00:00:00.078,14.88976,Sell,-0.00238,0x00000000000000000000000000000000000000000000...,218761756374,True,0.001271,460005683590811,,USDC,,0xeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee,2025-11-01T00:00:00.078617617,,,,1.81538,False,False,False
2,PURR/USDC,0.12967,0.02012,A,2025-11-01 00:00:00.078,0.02012,Spot Dust Conversion,0.0,0x00000000000000000000000000000000000000000000...,218761756374,True,0.0,0,,USDC,,0x07265663b688c5123e6600f9374f989e0869cdcd,2025-11-01T00:00:00.078617617,,,,0.002609,False,False,False
3,PURR/USDC,0.12967,0.57875,A,2025-11-01 00:00:00.078,0.57875,Spot Dust Conversion,0.0,0x00000000000000000000000000000000000000000000...,218761756374,True,0.0,0,,USDC,,0x1d117a040a8847e8b5f17009e8b272b6b42718c8,2025-11-01T00:00:00.078617617,,,,0.075047,False,False,False
4,PURR/USDC,0.12967,0.5154,A,2025-11-01 00:00:00.078,0.5154,Spot Dust Conversion,0.0,0x00000000000000000000000000000000000000000000...,218761756374,True,0.0,0,,USDC,,0x25bcbd6f3841b5d5b58502dea182ca1d570648c6,2025-11-01T00:00:00.078617617,,,,0.066832,False,False,False


In [None]:
# Schema overview: one line per column with its pandas dtype
print("Column types:", df.dtypes, sep="\n")

Column types:
coin                     object
px                      float64
sz                      float64
side                     object
time             datetime64[ns]
startPosition            object
dir                      object
closedPnl               float64
hash                     object
oid                       int64
crossed                    bool
fee                     float64
tid                       int64
cloid                    object
feeToken                 object
twapId                  float64
user                     object
block_time               object
builder                  object
builderFee               object
liquidation              object
volume                  float64
is_maker                   bool
is_close                   bool
is_win                     bool
dtype: object


---

## Data Validation

Verify data integrity and provide hashes for external verification on [HypurrScan](https://hypurrscan.io).

In [None]:
# Sample transaction for external verification on HypurrScan
SAMPLE_SEED = 42  # fixed seed so Restart & Run All always picks the same fill

print("=" * 60)
print("SAMPLE TRANSACTION FOR VERIFICATION")
print("=" * 60)
print("\nVerify on https://hypurrscan.io/tx/<hash>\n")

# Candidates: fills with a real (non-zero) transaction hash and a decent volume.
# Zero hashes start with many zeros like 0x00000000000000000000...
# na=False treats a missing hash as "not a candidate" instead of breaking the mask.
has_hex_prefix = df['hash'].str.startswith('0x', na=False)
is_zero_hash = df['hash'].str.startswith('0x00000000', na=False)
valid_hashes = df[has_hex_prefix & ~is_zero_hash & (df['volume'] > 100)]

if valid_hashes.empty:
    # Fail with an explicit message rather than pandas' generic sampling error.
    raise ValueError("No fills with a non-zero hash and volume > 100 to sample from")
sample = valid_hashes.sample(1, random_state=SAMPLE_SEED).iloc[0]

print(f"Hash: {sample['hash']}")
print(f"URL: https://hypurrscan.io/tx/{sample['hash']}")
print("\nExpected values on HypurrScan:")
print(f"  Coin: {sample['coin']}")
print(f"  Side: {'Buy' if sample['side'] == 'B' else 'Sell'}")
print(f"  Price: {sample['px']}")
print(f"  Size: {sample['sz']}")
print(f"  User: {sample['user']}")
print(f"  Time: {sample['time']}")

# Show both sides of this fill's trade. One hash can cover several fills, so
# match on tid (trade ID): the buyer row and the seller row share the same tid.
print("\n" + "=" * 60)
print("THIS FILL'S TRADE PAIR (buyer + seller)")
print("=" * 60)
df.loc[df['tid'] == sample['tid'], ['coin', 'side', 'px', 'sz', 'user', 'time', 'volume']]

SAMPLE TRANSACTION FOR VERIFICATION

Verify on https://hypurrscan.io/tx/<hash>

Hash: 0x2223aafc2bcd612f239d042ea7bc3c0201c800e1c6c08001c5ec564eeac13b19
URL: https://hypurrscan.io/tx/0x2223aafc2bcd612f239d042ea7bc3c0201c800e1c6c08001c5ec564eeac13b19

Expected values on HypurrScan:
  Coin: ZEC
  Side: Sell
  Price: 411.9
  Size: 1.33
  User: 0xa289ee1e56c0d5d041db762e6123e78af0f7d9ad
  Time: 2025-11-01 22:45:53.207000

THIS FILL'S TRADE PAIR (buyer + seller)


Unnamed: 0,coin,side,px,sz,user,time,volume
4312235,ZEC,B,411.9,1.33,0x203826e24261223d0afd5c3f54255c317f813c9d,2025-11-01 22:45:53.207,547.827
4312236,ZEC,A,411.9,1.33,0xa289ee1e56c0d5d041db762e6123e78af0f7d9ad,2025-11-01 22:45:53.207,547.827


In [None]:
# Daily summary for comparison with Allium dashboard
print("=" * 60)
print("DAILY SUMMARY (compare with Allium)")
print("=" * 60)


def _window_edge(ts):
    """Format a timestamp as e.g. 'Nov 1 00:00 UTC' (no zero-padded day)."""
    return f"{ts.strftime('%b')} {ts.day} {ts.strftime('%H:%M')} UTC"


# Report the window the loaded fills actually cover instead of a hard-coded
# label, so the summary stays truthful when the download config changes.
# 'time' was parsed from epoch milliseconds, hence the UTC suffix.
window_start, window_end = df['time'].min(), df['time'].max()
print(f"\nTime window: {_window_edge(window_start)} → {_window_edge(window_end)}")

total_volume = df['volume'].sum()
one_sided_volume = total_volume / 2  # Standard volume = one side only

print("\nMetrics:")
print(f"  Raw volume (both sides): ${total_volume:,.0f}")
print(f"  One-sided volume:        ${one_sided_volume:,.0f}  ← Compare to Allium")
# NOTE(review): fills // 2 is an approximation; rows such as the
# "Spot Dust Conversion" fills do not obviously come in buyer/seller pairs.
print(f"  Total trades: {len(df) // 2:,}")
print(f"  Unique traders: {df['user'].nunique():,}")
print(f"  Unique coins: {df['coin'].nunique():,}")

# Top coins by one-sided volume (both sides of a trade are in the same coin,
# so halving the per-coin sum removes the double count).
top_coins = df.groupby('coin')['volume'].sum().sort_values(ascending=False).head(3) / 2
print("\nTop 3 coins by volume (one-sided):")
for coin, vol in top_coins.items():
    print(f"  {coin}: ${vol:,.0f}")

print("\nCompare at: https://hyperliquid.allium.so/")

DAILY SUMMARY (compare with Allium)

Time window: Nov 1 00:00 UTC → Nov 1 23:59 UTC

Metrics:
  Raw volume (both sides): $6,287,586,074
  One-sided volume:        $3,143,793,037  ← Compare to Allium
  Total trades: 2,235,359
  Unique traders: 33,654
  Unique coins: 284

Top 3 coins by volume (one-sided):
  BTC: $1,036,236,948
  ETH: $531,646,739
  HYPE: $290,804,669

Compare at: https://hyperliquid.allium.so/


---

## Analysis 1: Volume by Trader

In [None]:
# Volume by trader (one-sided = the trader's own side of each trade).
# A trade produces two fill rows, one per counterparty (same tid, different
# user), so a trader's rows already cover only their own side. Halving them,
# as is correct for market-wide or per-coin totals, would under-report each
# trader by 2x and produce fractional trade counts.
volume_by_trader = df.groupby('user').agg(
    volume=('volume', 'sum'),     # notional traded by this user
    trades=('volume', 'count'),   # one fill row per trade the user took part in
).sort_values('volume', ascending=False)

print("Top 10 Traders by Volume (one-sided):")
volume_by_trader.head(10)

Top 10 Traders by Volume (one-sided):


Unnamed: 0_level_0,volume,trades
user,Unnamed: 1_level_1,Unnamed: 2_level_1
0xecb63caa47c7c4e77f60f1ce858cf28dc2b82b00,83556750.0,66930.0
0x023a3d058020fb76cca98f01b3c48c8938a22355,78826880.0,24486.0
0xc6ac58a7a63339898aeda32499a8238a46d88e84,77905960.0,4770.5
0xb4321b142b2a03ce20fcab2007ff6990b9acba93,62473150.0,53619.5
0x53babe76166eae33c861aeddf9ce89af20311cd0,62040340.0,3134.0
0x0fd468a73084daa6ea77a9261e40fdec3e67e0c7,61605070.0,5401.5
0x4129c62faf652fea61375dcd9ca8ce24b2bb8b95,59178360.0,2928.5
0xf9109ada2f73c62e9889b45453065f0d99260a2d,47318420.0,11196.5
0x31ca8395cf837de08b24da3f660e77761dfb974b,44357620.0,118755.0
0x7ca165f354e3260e2f8d5a7508cc9dd2fa009235,41933830.0,1509.5


---

## Analysis 2: PnL by Trader

In [None]:
# Per-trader realized PnL and fees, then PnL net of fees.
# NOTE(review): fees are summed regardless of the feeToken column; confirm
# they are all denominated in the same currency as closedPnl.
pnl_by_trader = df.groupby('user').agg(
    realized_pnl=('closedPnl', 'sum'),
    fees=('fee', 'sum'),
)
pnl_by_trader['net_pnl'] = pnl_by_trader['realized_pnl'] - pnl_by_trader['fees']
pnl_by_trader = pnl_by_trader.sort_values('net_pnl', ascending=False)

print("Top 10 Traders by Net PnL:")
pnl_by_trader.head(10)

Top 10 Traders by Net PnL:


Unnamed: 0_level_0,realized_pnl,fees,net_pnl
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x519c721de735f7c9e6146d167852e60d60496a47,1547679.0,3013.097564,1544666.0
0xb83de012dba672c76a7dbbbf3e459cb59d7d6e36,1406339.0,4625.399776,1401713.0
0xc7847f80861d2e47c40dfacf9f69e0d0c1393e53,652373.9,515.614846,651858.2
0xe7ec7fbf4f195fc8e57d814e15c3a2857cb632a3,523958.7,2456.97127,521501.7
0x152e41f0b83e6cad4b5dc730c1d6279b7d67c9dc,382439.4,62.606912,382376.8
0xd28e005c992b168d0b20fca312958e105bb260ab,367686.5,226.978921,367459.5
0x5b5d51203a0f9079f8aeb098a6523a13f298c060,264403.3,600.22566,263803.1
0x6f97d329b072e0f7b74575565d806a4351b8f824,252960.3,3630.859715,249329.4
0x45d26f28196d226497130c4bac709d808fed4029,237665.2,7.741103,237657.4
0xb88f3bc2ad32d3d256e26347d1ad24332a18185d,240288.4,4347.065724,235941.3


---

## Analysis 3: Maker vs Taker Ratio

In [None]:
# Share of each trader's fills that were maker (non-crossing) fills.
maker_ratio = df.groupby('user').agg(
    total_trades=('is_maker', 'count'),
    maker_trades=('is_maker', 'sum'),
)
maker_ratio['maker_pct'] = maker_ratio['maker_trades'] / maker_ratio['total_trades'] * 100
maker_ratio = maker_ratio.sort_values('maker_pct', ascending=False)

print("Top 10 Traders by Maker %:")
# Ignore traders with fewer than 10 fills so tiny samples do not dominate.
maker_ratio.loc[maker_ratio['total_trades'] >= 10].head(10)

Top 10 Traders by Maker %:


Unnamed: 0_level_0,total_trades,maker_trades,maker_pct
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x66ef7afad1017c6c88a4fadf62390896c7310c17,130,130,100.0
0x67711c813ba72e4a78e917bce13e5512b74937e4,125,125,100.0
0x67684f0a7feb016a74ecb59b118218fa0c35e18b,12,12,100.0
0x6766011c6cb266b8bbf012ae3134b85efb808eef,100,100,100.0
0x675673e13942a539327d39cd508132b65180ca91,212,212,100.0
0x6712fb7df334528bcc5f3bfeb6af2ba1f2a2cc6b,137,137,100.0
0x6707013459fa40ab655710bb08690693d539b113,175,175,100.0
0x6706073a3895a204293aa1504c42369bc0eef0da,38,38,100.0
0x66face9a3ad6adf9e16fa7d4aac6a2f5ff229add,137,137,100.0
0x66f159caa7b4d39a9a0f589f5ef6952a7e220b83,132,132,100.0


---

## Analysis 4: Win Rate

In [None]:
# Win rate is measured over closing fills only (dir starts with 'Close').
closes = df[df['is_close']]

win_rate = closes.groupby('user').agg(
    total_closes=('is_win', 'count'),
    wins=('is_win', 'sum'),
)
win_rate['win_rate'] = win_rate['wins'] / win_rate['total_closes'] * 100
win_rate = win_rate.sort_values('win_rate', ascending=False)

print("Top 10 Traders by Win Rate (min 5 closes):")
win_rate.loc[win_rate['total_closes'] >= 5].head(10)

Top 10 Traders by Win Rate (min 5 closes):


Unnamed: 0_level_0,total_closes,wins,win_rate
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0xa10b79e143c02340f666bcd8c2a0cf19e930d254,23,23,100.0
0x5559da6ec434c5723d0ce9c4da7f29e3f8a3d43b,117,117,100.0
0xa14bd11714ac6393f745a9be7a508a4bb2de9073,31,31,100.0
0x359531e03b3616e38160b5f76885ff3fbb6aea9a,23,23,100.0
0x358bbbcf62c2163778476b3637b72905535dbd92,9,9,100.0
0x35d039a6035c5052b627b78af16a594fa3df4371,13,13,100.0
0xa06179d29ffe3742beb69e35c4b086d8ba37c80c,9,9,100.0
0xa23190045c4aebeb724844ce622465475e539bae,6549,6549,100.0
0x352deb23bebae8b4c57d0ae341d9c1951fd8425a,34,34,100.0
0x352cf4057384b42646c219c769224ba77264dc52,11,11,100.0


---

## Analysis 5: Volume by Coin

In [None]:
# Volume by coin. Both sides of a trade are fills in the same coin, so the raw
# per-coin sums count every trade twice and are halved to get one-sided figures.
coin_totals = df.groupby('coin').agg(
    volume_raw=('volume', 'sum'),
    fills=('volume', 'count'),
    unique_traders=('user', 'nunique'),
)
coin_totals['volume'] = coin_totals['volume_raw'] / 2  # One-sided volume
coin_totals['trades'] = coin_totals['fills'] / 2       # Each trade = 2 fills

volume_by_coin = (
    coin_totals[['unique_traders', 'volume', 'trades']]
    .sort_values('volume', ascending=False)
)

print("Top 10 Coins by Volume (one-sided):")
volume_by_coin.head(10)

Top 10 Coins by Volume (one-sided):


Unnamed: 0_level_0,unique_traders,volume,trades
coin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BTC,8610,1036237000.0,207306.0
ETH,5021,531646700.0,91169.0
HYPE,6834,290804700.0,371692.0
SOL,4844,241754900.0,102639.0
ZEC,3930,230293100.0,159427.0
VIRTUAL,2305,78544740.0,94680.0
@107,2369,77861980.0,70605.0
PUMP,1515,47350950.0,44568.0
xyz:XYZ100,1001,44942140.0,23541.0
@142,1355,43311290.0,50029.0


---

## Analysis 6: Trader Profile (Combined Metrics)

In [None]:
def trader_profile(df):
    """Build one row of combined metrics per trader, sorted by net PnL (descending).

    Args:
        df: Fills DataFrame with columns ``user``, ``coin``, ``volume``,
            ``closedPnl``, ``fee``, ``is_maker``, ``is_close`` and ``is_win``.

    Returns:
        DataFrame indexed by ``user`` with the raw aggregates (``volume_raw``,
        ``fills``, ``realized_pnl``, ``fees``, ``maker_trades``, ``coins_traded``,
        ``total_closes``, ``wins``) plus the derived ``volume``, ``trades``,
        ``net_pnl``, ``maker_pct`` and ``win_rate``. Traders with no closing
        fills get NaN for ``total_closes``, ``wins`` and ``win_rate``.
    """
    closes = df[df['is_close']]

    profile = df.groupby('user').agg(
        volume_raw=('volume', 'sum'),
        fills=('volume', 'count'),
        realized_pnl=('closedPnl', 'sum'),
        fees=('fee', 'sum'),
        maker_trades=('is_maker', 'sum'),
        coins_traded=('coin', 'nunique')
    )

    # Win statistics come from closing fills only
    win_stats = closes.groupby('user').agg(
        total_closes=('is_win', 'count'),
        wins=('is_win', 'sum')
    )

    # Left join keeps traders that never closed a position in the window
    profile = profile.join(win_stats)
    # A trader appears on exactly one side of each trade, so their own fills are
    # already one-sided: no halving (that is only right for market/coin totals,
    # where both counterparties' rows are summed together).
    profile['volume'] = profile['volume_raw']
    profile['trades'] = profile['fills']
    profile['net_pnl'] = profile['realized_pnl'] - profile['fees']
    profile['maker_pct'] = profile['maker_trades'] / profile['fills'] * 100
    profile['win_rate'] = profile['wins'] / profile['total_closes'] * 100

    return profile.sort_values('net_pnl', ascending=False)

# Build the combined per-trader table and show the headline columns
profiles = trader_profile(df)
profile_columns = ['volume', 'trades', 'net_pnl', 'maker_pct', 'win_rate', 'coins_traded']
print("Trader Profiles (Top 10 by Net PnL):")
profiles.head(10)[profile_columns]

Trader Profiles (Top 10 by Net PnL):


Unnamed: 0_level_0,volume,trades,net_pnl,maker_pct,win_rate,coins_traded
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0x519c721de735f7c9e6146d167852e60d60496a47,4615655.0,1947.5,1544666.0,0.0,100.0,1
0xb83de012dba672c76a7dbbbf3e459cb59d7d6e36,13043860.0,4916.0,1401713.0,8.980879,100.0,5
0xc7847f80861d2e47c40dfacf9f69e0d0c1393e53,766631.7,167.5,651858.2,90.447761,100.0,2
0xe7ec7fbf4f195fc8e57d814e15c3a2857cb632a3,5121742.0,3841.0,521501.7,0.0,100.0,2
0x152e41f0b83e6cad4b5dc730c1d6279b7d67c9dc,7769157.0,2622.5,382376.8,99.161106,100.0,1
0xd28e005c992b168d0b20fca312958e105bb260ab,927204.4,696.5,367459.5,100.0,100.0,2
0x5b5d51203a0f9079f8aeb098a6523a13f298c060,3223345.0,3150.0,263803.1,10.380952,100.0,4
0x6f97d329b072e0f7b74575565d806a4351b8f824,15982660.0,1125.0,249329.4,63.333333,100.0,1
0x45d26f28196d226497130c4bac709d808fed4029,12287.51,23.0,237657.4,0.0,100.0,1
0xb88f3bc2ad32d3d256e26347d1ad24332a18185d,6210097.0,1953.0,235941.3,0.0,77.292046,2


---

## Summary Statistics

In [None]:
# Headline numbers for the loaded fills, emitted as a single text block
fill_count = len(df)
summary_lines = [
    "Dataset Summary",
    "=" * 40,
    f"Total fills: {fill_count:,}",
    f"Total trades: {fill_count // 2:,}",
    f"Unique traders: {df['user'].nunique():,}",
    f"Unique coins: {df['coin'].nunique():,}",
    f"Total volume (one-sided): ${df['volume'].sum() / 2:,.0f}",
    f"Date range: {df['time'].min()} to {df['time'].max()}",
]
print("\n".join(summary_lines))

Dataset Summary
Total fills: 4,470,719
Total trades: 2,235,359
Unique traders: 33,654
Unique coins: 284
Total volume (one-sided): $3,143,793,037
Date range: 2025-11-01 00:00:00.078000 to 2025-11-01 23:59:59.752000
