# Analysis Pipeline

Download and analyze `node_fills_by_block` data from Hyperliquid S3.

---

## Setup

In [178]:
import os
import json
import boto3
import lz4.frame
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from dotenv import load_dotenv

load_dotenv()

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# S3 client with requester-pays
s3 = boto3.client(
    's3',
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
    region_name=os.getenv('AWS_REGION', 'us-east-2')
)
REQUEST_PAYER = {'RequestPayer': 'requester'}
BUCKET = 'hl-mainnet-node-data'

print("S3 client configured")

S3 client configured


---

## Download Configuration

Configure date range and hours to download. Data is available from Jul 27, 2025.

In [179]:
# =============================================================================
# DOWNLOAD CONFIGURATION
# =============================================================================
# node_fills_by_block available: Jul 27, 2025 → present
# Each hour file is ~15-25 MB compressed, ~50-100 MB decompressed
# Full day = 24 files = ~400-600 MB compressed
# =============================================================================

# Full UTC day (matches Allium's daily volume)
DOWNLOADS = [
    ("20251101", list(range(24))),  # Nov 1: hours 0-23 (full day UTC)
]

total_hours = sum(len(hours) for _, hours in DOWNLOADS)
print(f"Time window: Nov 1 00:00 UTC → Nov 1 23:59 UTC")
print(f"Total hours: {total_hours}")
for date, hours in DOWNLOADS:
    print(f"  {date}: hours {hours[0]}-{hours[-1]} ({len(hours)} files)")

Time window: Nov 1 00:00 UTC → Nov 1 23:59 UTC
Total hours: 24
  20251101: hours 0-23 (24 files)


In [180]:
def download_fills(date: str, hour: int) -> list:
    """Download and parse fills for a specific date/hour from S3."""
    key = f'node_fills_by_block/hourly/{date}/{hour}.lz4'
    
    try:
        response = s3.get_object(Bucket=BUCKET, Key=key, **REQUEST_PAYER)
        compressed = response['Body'].read()
        decompressed = lz4.frame.decompress(compressed)
        
        # Parse JSON lines - each line is a block
        fills = []
        for line in decompressed.decode().strip().split('\n'):
            if not line.strip():
                continue
            block = json.loads(line)
            # Each event is [user_address, fill_data]
            for user, fill_data in block.get('events', []):
                fill_data['user'] = user
                fill_data['block_time'] = block['block_time']
                fills.append(fill_data)
        
        return fills
    except Exception as e:
        print(f"Error downloading {key}: {e}")
        return []

def date_range(start: str, end: str):
    """Generate dates between start and end (inclusive)."""
    start_dt = datetime.strptime(start, '%Y%m%d')
    end_dt = datetime.strptime(end, '%Y%m%d')
    current = start_dt
    while current <= end_dt:
        yield current.strftime('%Y%m%d')
        current += timedelta(days=1)

print("Download functions defined")


Download functions defined


---

## Download Data

In [None]:
# Download all configured date/hour combinations
all_fills = []
total_files = sum(len(hours) for _, hours in DOWNLOADS)

print(f"Downloading {total_files} files...")
for date, hours in DOWNLOADS:
    for hour in hours:
        fills = download_fills(date, hour)
        all_fills.extend(fills)
        print(f"  {date}/{hour:02d}: {len(fills):,} fills")

print(f"\nTotal: {len(all_fills):,} fills downloaded")

Downloading 24 files...
  20251031/04: 302,422 fills
  20251031/05: 222,824 fills
  20251031/06: 211,852 fills
  20251031/07: 254,628 fills
  20251031/08: 325,216 fills
  20251031/09: 272,156 fills
  20251031/10: 225,986 fills
  20251031/11: 333,592 fills
  20251031/12: 405,896 fills
  20251031/13: 362,562 fills
  20251031/14: 393,624 fills
  20251031/15: 355,778 fills
  20251031/16: 379,106 fills
  20251031/17: 411,294 fills
  20251031/18: 358,770 fills
  20251031/19: 279,270 fills
  20251031/20: 214,492 fills
  20251031/21: 189,298 fills
  20251031/22: 181,854 fills
  20251031/23: 145,750 fills
  20251101/00: 192,439 fills
  20251101/01: 245,390 fills
  20251101/02: 206,714 fills
  20251101/03: 221,804 fills

Total: 6,692,717 fills downloaded


In [None]:
# Convert to DataFrame with proper types
df = pd.DataFrame(all_fills)

# Parse types
df['time'] = pd.to_datetime(df['time'], unit='ms')
df['px'] = pd.to_numeric(df['px'])
df['sz'] = pd.to_numeric(df['sz'])
df['closedPnl'] = pd.to_numeric(df['closedPnl'], errors='coerce').fillna(0)
df['fee'] = pd.to_numeric(df['fee'], errors='coerce').fillna(0)

# Add computed columns
df['volume'] = df['px'] * df['sz']
df['is_maker'] = ~df['crossed']
df['is_close'] = df['dir'].str.startswith('Close')
df['is_win'] = (df['closedPnl'] > 0) & df['is_close']

print(f"DataFrame shape: {df.shape}")
df.head()

DataFrame shape: (6692717, 25)


Unnamed: 0,coin,px,sz,side,time,startPosition,dir,closedPnl,hash,oid,crossed,fee,tid,feeToken,twapId,user,block_time,cloid,builderFee,builder,liquidation,volume,is_maker,is_close,is_win
0,ORDI,4.6229,4.14,B,2025-10-31 03:59:59.883,1018.81,Open Long,0.0,0x358a7cb7075161343704042e89e33701c200949ca254...,217843589969,True,0.0,580290431370130,USDC,,0x31ca8395cf837de08b24da3f660e77761dfb974b,2025-10-31T03:59:59.883472673,,,,,19.138806,False,False,False
1,ORDI,4.6229,4.14,A,2025-10-31 03:59:59.883,-14.19,Open Short,0.0,0x358a7cb7075161343704042e89e33701c200949ca254...,217843583792,False,-0.000191,580290431370130,USDC,,0xc1fce740d83a60de67d039aa927a678ff78c202f,2025-10-31T03:59:59.883472673,0x20251031000000000000000000167742,,,,19.138806,True,False,False
2,PUMP,0.004436,37584.0,B,2025-10-31 04:00:00.038,88485617.0,Open Long,0.0,0x00000000000000000000000000000000000000000000...,217843590259,True,0.066689,105512302174433,USDC,1320081.0,0x3d9ee32b4d8c5f4232b48a4d05e9325b23fe7a61,2025-10-31T04:00:00.038284606,,,,,166.722624,False,False,False
3,PUMP,0.004436,37584.0,A,2025-10-31 04:00:00.038,37584.0,Close Long,0.864432,0x00000000000000000000000000000000000000000000...,217843542818,False,-0.001667,105512302174433,USDC,,0x230633a6e555acac1a865d09db2197c864e58927,2025-10-31T04:00:00.038284606,0x20251031000000000000000000883347,,,,166.722624,True,True,True
4,PUMP,0.004436,233428.0,B,2025-10-31 04:00:00.038,88523201.0,Open Long,0.0,0x00000000000000000000000000000000000000000000...,217843590259,True,0.414194,956346725003962,USDC,1320081.0,0x3d9ee32b4d8c5f4232b48a4d05e9325b23fe7a61,2025-10-31T04:00:00.038284606,,,,,1035.486608,False,False,False


In [None]:
# Schema overview
print("Column types:")
print(df.dtypes)

Column types:
coin                     object
px                      float64
sz                      float64
side                     object
time             datetime64[ns]
startPosition            object
dir                      object
closedPnl               float64
hash                     object
oid                       int64
crossed                    bool
fee                     float64
tid                       int64
feeToken                 object
twapId                  float64
user                     object
block_time               object
cloid                    object
builderFee               object
builder                  object
liquidation              object
volume                  float64
is_maker                   bool
is_close                   bool
is_win                     bool
dtype: object


---

## Data Validation

Verify data integrity and provide hashes for external verification on [HypurrScan](https://hypurrscan.io).

In [None]:
# Sample transaction for external verification on HypurrScan
print("=" * 60)
print("SAMPLE TRANSACTION FOR VERIFICATION")
print("=" * 60)
print("\nVerify on https://hypurrscan.io/tx/<hash>\n")

# Get one sample transaction (exclude null/zero hashes, decent volume)
# Zero hashes start with many zeros like 0x00000000000000000000...
valid_hashes = df[
    df['hash'].str.startswith('0x') & 
    ~df['hash'].str.startswith('0x00000000') &  # Exclude zero-prefixed hashes
    (df['volume'] > 100)
]
sample = valid_hashes.sample(1).iloc[0]

print(f"Hash: {sample['hash']}")
print(f"URL: https://hypurrscan.io/tx/{sample['hash']}")
print(f"\nExpected values on HypurrScan:")
print(f"  Coin: {sample['coin']}")
print(f"  Side: {'Buy' if sample['side'] == 'B' else 'Sell'}")
print(f"  Price: {sample['px']}")
print(f"  Size: {sample['sz']}")
print(f"  User: {sample['user']}")
print(f"  Time: {sample['time']}")

# Show THIS specific fill's trade pair (by tid = trade ID, not hash)
# One hash can have multiple fills, but each fill has a unique tid
print(f"\n" + "=" * 60)
print("THIS FILL'S TRADE PAIR (buyer + seller)")
print("=" * 60)
df[df['tid'] == sample['tid']][['coin', 'side', 'px', 'sz', 'user', 'time', 'volume']]

SAMPLE TRANSACTION FOR VERIFICATION

Verify on https://hypurrscan.io/tx/<hash>

Hash: 0xbb9115811bf56121bd0a042e91b1130208500066b6f87ff35f59c0d3daf93b0c
URL: https://hypurrscan.io/tx/0xbb9115811bf56121bd0a042e91b1130208500066b6f87ff35f59c0d3daf93b0c

Expected values on HypurrScan:
  Coin: ZEC
  Side: Sell
  Price: 386.16
  Size: 1.37
  User: 0x5661e424b3b482231b8fc2a59c96dbe157fd70fc
  Time: 2025-10-31 15:09:39.043000

THIS FILL'S TRADE PAIR (buyer + seller)


Unnamed: 0,coin,side,px,sz,user,time,volume
3379460,ZEC,B,386.16,1.37,0x744cf47e88d9d0847544f0ac2fa7575cf5925f79,2025-10-31 15:09:39.043,529.0392
3379461,ZEC,A,386.16,1.37,0x5661e424b3b482231b8fc2a59c96dbe157fd70fc,2025-10-31 15:09:39.043,529.0392


In [None]:
# Daily summary for comparison with Allium dashboard
print("=" * 60)
print("DAILY SUMMARY (compare with Allium)")
print("=" * 60)
print(f"\nTime window: Nov 1 00:00 UTC → Nov 1 23:59 UTC")

total_volume = df['volume'].sum()
one_sided_volume = total_volume / 2  # Standard volume = one side only

print(f"\nMetrics:")
print(f"  Raw volume (both sides): ${total_volume:,.0f}")
print(f"  One-sided volume:        ${one_sided_volume:,.0f}  ← Compare to Allium")
print(f"  Total trades: {len(df) // 2:,}")
print(f"  Unique traders: {df['user'].nunique():,}")
print(f"  Unique coins: {df['coin'].nunique():,}")

# Compute top coins inline
top_coins = df.groupby('coin')['volume'].sum().sort_values(ascending=False).head(3) / 2
print(f"\nTop 3 coins by volume (one-sided):")
for coin, vol in top_coins.items():
    print(f"  {coin}: ${vol:,.0f}")

print(f"\nCompare at: https://hyperliquid.allium.so/")

---

## Analysis 1: Volume by Trader

In [None]:
# Volume by trader (one-sided, divide by 2)
volume_by_trader = df.groupby('user').agg(
    volume_raw=('volume', 'sum'),
    fills=('volume', 'count')
).assign(
    volume=lambda x: x['volume_raw'] / 2,  # One-sided volume
    trades=lambda x: x['fills'] / 2        # Each trade = 2 fills
).drop(columns=['volume_raw', 'fills']).sort_values('volume', ascending=False)

print("Top 10 Traders by Volume (one-sided):")
volume_by_trader.head(10)

Top 10 Traders by Volume (one-sided):


Unnamed: 0_level_0,volume,trades
user,Unnamed: 1_level_1,Unnamed: 2_level_1
0xc6ac58a7a63339898aeda32499a8238a46d88e84,299081600.0,17520.5
0x2db3a3ffd278c7f5335fcff936403e01e75c5209,156657200.0,4811.0
0xecb63caa47c7c4e77f60f1ce858cf28dc2b82b00,142119900.0,100156.0
0x0fd468a73084daa6ea77a9261e40fdec3e67e0c7,132836500.0,10500.5
0x53babe76166eae33c861aeddf9ce89af20311cd0,131400000.0,6756.5
0x023a3d058020fb76cca98f01b3c48c8938a22355,123007400.0,29169.0
0x7ca165f354e3260e2f8d5a7508cc9dd2fa009235,115055100.0,25102.5
0x4129c62faf652fea61375dcd9ca8ce24b2bb8b95,112052800.0,7247.0
0x09bc1cf4d9f0b59e1425a8fde4d4b1f7d3c9410d,98129870.0,3422.0
0xf517639a8872e756ac98d3c65507d2ebc25cc032,81968120.0,2217.5


---

## Analysis 2: PnL by Trader

In [None]:
pnl_by_trader = df.groupby('user').agg(
    realized_pnl=('closedPnl', 'sum'),
    fees=('fee', 'sum')
).assign(
    net_pnl=lambda x: x['realized_pnl'] - x['fees']
).sort_values('net_pnl', ascending=False)

print("Top 10 Traders by Net PnL:")
pnl_by_trader.head(10)

Top 10 Traders by Net PnL:


Unnamed: 0_level_0,realized_pnl,fees,net_pnl
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x5b5d51203a0f9079f8aeb098a6523a13f298c060,1435727.0,8101.406731,1427626.0
0xffbd3e51ae0e2c4407434e157965c064f2a11628,1125287.0,4172.003643,1121115.0
0x7fe8cfe481ec2f702692a9152e3f002f3e417ac6,1098269.0,3114.582042,1095154.0
0xb83de012dba672c76a7dbbbf3e459cb59d7d6e36,700297.9,1673.483551,698624.5
0x9263c1bd29aa87a118242f3fbba4517037f8cc7a,565241.9,6325.819761,558916.1
0xf35a60331a38326a6af92badd89622555181fb59,432050.2,10155.579636,421894.6
0x88b6addc407b2b809443aea0cf54221c6149b5c0,419094.6,4756.485259,414338.1
0xdc519874071cca5457fdf8b3c31760ac1ca16ef7,359619.8,936.872607,358682.9
0x34365f472a3af2ff8167970a519931437e2c2094,327327.9,1541.918027,325786.0
0xc26cbb6483229e0d0f9a1cab675271eda535b8f4,265178.4,1182.289136,263996.1


---

## Analysis 3: Maker vs Taker Ratio

In [None]:
maker_ratio = df.groupby('user').agg(
    total_trades=('is_maker', 'count'),
    maker_trades=('is_maker', 'sum')
).assign(
    maker_pct=lambda x: x['maker_trades'] / x['total_trades'] * 100
).sort_values('maker_pct', ascending=False)

print("Top 10 Traders by Maker %:")
maker_ratio[maker_ratio['total_trades'] >= 10].head(10)

Top 10 Traders by Maker %:


Unnamed: 0_level_0,total_trades,maker_trades,maker_pct
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x0000000000d6ac00071c44a2375a8dd624176623,68,68,100.0
0x63dcc7a72f32015c09e09233d877809725bc02ac,81,81,100.0
0x662823f3280bb67da4eec0c4195709caacf20215,16,16,100.0
0x6606f1da9da7cc51d30c5020fb9d5f48c32c5a58,259,259,100.0
0x65e4741ba9d57e0b9e3481dc0e2e5846b8f1c7cc,259,259,100.0
0x65b1a489057d36539a6ce135e081d256730ddca0,255,255,100.0
0x65aee08c9235025355ac6c5ad020fb167ecef4fe,18,18,100.0
0x6569d5eb0e2ffcfa895b051b85718640de78a913,248,248,100.0
0x65628165c784712daf0e8939e7daa5d07974379b,259,259,100.0
0x654b150b69f7bd49a74b4f038d55405b70156c6c,254,254,100.0


---

## Analysis 4: Win Rate

In [None]:
# Only consider closing trades
closes = df[df['is_close']]

win_rate = closes.groupby('user').agg(
    total_closes=('is_win', 'count'),
    wins=('is_win', 'sum')
).assign(
    win_rate=lambda x: x['wins'] / x['total_closes'] * 100
).sort_values('win_rate', ascending=False)

print("Top 10 Traders by Win Rate (min 5 closes):")
win_rate[win_rate['total_closes'] >= 5].head(10)

Top 10 Traders by Win Rate (min 5 closes):


Unnamed: 0_level_0,total_closes,wins,win_rate
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x7f567d00603ac57727e5b5befb2f3cc4fffc4113,28,28,100.0
0x4212661bc3e818cc2eac01867ad6072f72cfa28b,5,5,100.0
0x821088bf8886dfe6652f0c49039994eef8a08d99,5,5,100.0
0x41ff103cfa1d2636cea85b5276513d2f4b9adfd1,13,13,100.0
0x4221f7f14dc9c4564240fdfb8c7b5d26dd65e067,14,14,100.0
0xd6c8d66920f450ef6603acf7560b958dc51b03cb,10,10,100.0
0x41ee75cd2d5c532cd7b0ba3304a8a2ca15917865,55,55,100.0
0xaf525fd01f417a2c59b68985c5fdddea2d5776d2,9,9,100.0
0xd7180d066cd92310cff8b90979db4638031e33dc,25,25,100.0
0xd6f1707716e423cba19dc7b8c198779b4920194b,36,36,100.0


---

## Analysis 5: Volume by Coin

In [None]:
# Volume by coin (one-sided, divide by 2)
volume_by_coin = df.groupby('coin').agg(
    volume_raw=('volume', 'sum'),
    fills=('volume', 'count'),
    unique_traders=('user', 'nunique')
).assign(
    volume=lambda x: x['volume_raw'] / 2,  # One-sided volume
    trades=lambda x: x['fills'] / 2        # Each trade = 2 fills
).drop(columns=['volume_raw', 'fills']).sort_values('volume', ascending=False)

print("Top 10 Coins by Volume (one-sided):")
volume_by_coin.head(10)

Top 10 Coins by Volume (one-sided):


Unnamed: 0_level_0,unique_traders,volume,trades
coin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BTC,12964,3027283000.0,519032.0
ETH,7512,1037832000.0,185941.0
HYPE,8709,551806800.0,699531.0
SOL,6772,481664500.0,171630.0
ZEC,3729,233098800.0,168813.0
@107,3091,140947200.0,111381.0
PUMP,2197,116875900.0,84488.0
XRP,2564,82143310.0,31481.0
TRUMP,1434,78891720.0,41449.0
TAO,1951,65291100.0,58846.0


---

## Analysis 6: Trader Profile (Combined Metrics)

In [None]:
def trader_profile(df):
    closes = df[df['is_close']]
    
    profile = df.groupby('user').agg(
        volume_raw=('volume', 'sum'),
        fills=('volume', 'count'),
        realized_pnl=('closedPnl', 'sum'),
        fees=('fee', 'sum'),
        maker_trades=('is_maker', 'sum'),
        coins_traded=('coin', 'nunique')
    )
    
    # Add win rate from closes
    win_stats = closes.groupby('user').agg(
        total_closes=('is_win', 'count'),
        wins=('is_win', 'sum')
    )
    
    profile = profile.join(win_stats)
    profile['volume'] = profile['volume_raw'] / 2      # One-sided volume
    profile['trades'] = profile['fills'] / 2          # Each trade = 2 fills
    profile['net_pnl'] = profile['realized_pnl'] - profile['fees']
    profile['maker_pct'] = profile['maker_trades'] / profile['fills'] * 100
    profile['win_rate'] = profile['wins'] / profile['total_closes'] * 100
    
    return profile.sort_values('net_pnl', ascending=False)

profiles = trader_profile(df)
print("Trader Profiles (Top 10 by Net PnL):")
profiles[['volume', 'trades', 'net_pnl', 'maker_pct', 'win_rate', 'coins_traded']].head(10)

Trader Profiles (Top 10 by Net PnL):


Unnamed: 0_level_0,volume,trades,net_pnl,maker_pct,win_rate,coins_traded
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0x5b5d51203a0f9079f8aeb098a6523a13f298c060,20910700.0,13220.0,1427626.0,3.78593,100.0,5
0xffbd3e51ae0e2c4407434e157965c064f2a11628,9172488.0,1836.5,1121115.0,59.78764,100.0,5
0x7fe8cfe481ec2f702692a9152e3f002f3e417ac6,31865740.0,1669.5,1095154.0,78.556454,98.048634,3
0xb83de012dba672c76a7dbbbf3e459cb59d7d6e36,4369541.0,2507.5,698624.5,4.386839,99.658613,6
0x9263c1bd29aa87a118242f3fbba4517037f8cc7a,11714740.0,2247.5,558916.1,2.98109,90.54505,3
0xf35a60331a38326a6af92badd89622555181fb59,24804110.0,2658.0,421894.6,19.657637,86.089058,5
0x88b6addc407b2b809443aea0cf54221c6149b5c0,19745630.0,3026.5,414338.1,50.140426,100.0,5
0xdc519874071cca5457fdf8b3c31760ac1ca16ef7,904591.7,596.0,358682.9,12.583893,,1
0x34365f472a3af2ff8167970a519931437e2c2094,8290382.0,206.5,325786.0,70.217918,100.0,4
0xc26cbb6483229e0d0f9a1cab675271eda535b8f4,3340449.0,283.0,263996.1,41.519435,100.0,3


---

## Summary Statistics

In [None]:
print("Dataset Summary")
print("=" * 40)
print(f"Total fills: {len(df):,}")
print(f"Total trades: {len(df) // 2:,}")
print(f"Unique traders: {df['user'].nunique():,}")
print(f"Unique coins: {df['coin'].nunique():,}")
print(f"Total volume (one-sided): ${df['volume'].sum() / 2:,.0f}")
print(f"Date range: {df['time'].min()} to {df['time'].max()}")

Dataset Summary
Total fills: 6,692,717
Total trades: 3,346,358
Unique traders: 40,753
Unique coins: 310
Total volume (one-sided): $6,567,212,527
Date range: 2025-10-31 03:59:59.883000 to 2025-11-01 03:59:59.663000
