# Analysis Pipeline

Download and analyze `node_fills_by_block` data from Hyperliquid S3.

---

## Setup

In [47]:
import os
import json
import boto3
import lz4.frame
import pandas as pd
from pathlib import Path
from datetime import datetime, timedelta
from dotenv import load_dotenv

load_dotenv()

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# S3 client with requester-pays
s3 = boto3.client(
    's3',
    aws_access_key_id=os.getenv('AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.getenv('AWS_SECRET_ACCESS_KEY'),
    region_name=os.getenv('AWS_REGION', 'us-east-2')
)
REQUEST_PAYER = {'RequestPayer': 'requester'}
BUCKET = 'hl-mainnet-node-data'

print("S3 client configured")

S3 client configured


---

## Download Configuration

Configure date range and hours to download. Data is available from Jul 27, 2025.

In [48]:
# =============================================================================
# DOWNLOAD CONFIGURATION
# =============================================================================
# node_fills_by_block available: Jul 27, 2025 â†’ present
# Each hour file is ~15-25 MB compressed, ~50-100 MB decompressed
# =============================================================================

# Date range to download (YYYYMMDD format)
START_DATE = "20251101"  # Nov 1, 2025
END_DATE = "20251101"    # Nov 1, 2025 (same day = 1 day of data)

# Hours to download (0-23). Use range for multiple hours.
HOURS = [12]  # Just noon for quick test
# HOURS = list(range(24))  # Full day

print(f"Will download: {START_DATE} to {END_DATE}")
print(f"Hours: {HOURS}")
print(f"Estimated files: {len(HOURS)} per day")

Will download: 20251101 to 20251101
Hours: [12]
Estimated files: 1 per day


In [49]:
def download_fills(date: str, hour: int) -> list:
    """Download and parse fills for a specific date/hour from S3."""
    key = f'node_fills_by_block/hourly/{date}/{hour}.lz4'
    
    try:
        response = s3.get_object(Bucket=BUCKET, Key=key, **REQUEST_PAYER)
        compressed = response['Body'].read()
        decompressed = lz4.frame.decompress(compressed)
        
        # Parse JSON lines - each line is a block
        fills = []
        for line in decompressed.decode().strip().split('\n'):
            if not line.strip():
                continue
            block = json.loads(line)
            # Each event is [user_address, fill_data]
            for user, fill_data in block.get('events', []):
                fill_data['user'] = user
                fill_data['block_time'] = block['block_time']
                fills.append(fill_data)
        
        return fills
    except Exception as e:
        print(f"Error downloading {key}: {e}")
        return []

def date_range(start: str, end: str):
    """Generate dates between start and end (inclusive)."""
    start_dt = datetime.strptime(start, '%Y%m%d')
    end_dt = datetime.strptime(end, '%Y%m%d')
    current = start_dt
    while current <= end_dt:
        yield current.strftime('%Y%m%d')
        current += timedelta(days=1)

print("Download functions defined")

Download functions defined


---

## Download Data

In [50]:
# Download all configured date/hour combinations
all_fills = []
dates = list(date_range(START_DATE, END_DATE))
total_files = len(dates) * len(HOURS)

print(f"Downloading {total_files} files...")
for i, date in enumerate(dates):
    for hour in HOURS:
        fills = download_fills(date, hour)
        all_fills.extend(fills)
        print(f"  {date}/{hour:02d}: {len(fills):,} fills")

print(f"\nTotal: {len(all_fills):,} fills downloaded")

Downloading 1 files...
  20251101/12: 155,006 fills

Total: 155,006 fills downloaded


In [60]:
# Convert to DataFrame with proper types
df = pd.DataFrame(all_fills)

# Parse types
df['time'] = pd.to_datetime(df['time'], unit='ms')
df['px'] = pd.to_numeric(df['px'])
df['sz'] = pd.to_numeric(df['sz'])
df['closedPnl'] = pd.to_numeric(df['closedPnl'], errors='coerce').fillna(0)
df['fee'] = pd.to_numeric(df['fee'], errors='coerce').fillna(0)

# Add computed columns
df['volume'] = df['px'] * df['sz']
df['is_maker'] = ~df['crossed']
df['is_close'] = df['dir'].str.startswith('Close')
df['is_win'] = (df['closedPnl'] > 0) & df['is_close']

print(f"DataFrame shape: {df.shape}")
df.head()

DataFrame shape: (155006, 25)


Unnamed: 0,coin,px,sz,side,time,startPosition,dir,closedPnl,hash,oid,crossed,fee,tid,feeToken,twapId,user,block_time,cloid,builderFee,builder,liquidation,volume,is_maker,is_close,is_win
0,SOL,186.1,0.07,B,2025-11-01 12:00:00.072,129.84,Open Long,0.0,0x00000000000000000000000000000000000000000000...,219188562384,True,0.005569,574880777843654,USDC,1323168.0,0x2f79e7993359e37091f8298c9706c75243da65a5,2025-11-01T12:00:00.072708858,,,,,13.027,False,False,False
1,SOL,186.1,0.07,A,2025-11-01 12:00:00.072,23.29,Close Long,0.005327,0x00000000000000000000000000000000000000000000...,219188406659,False,-0.00026,574880777843654,USDC,,0xa880d6cc607a05ea617307ab3b0d335e8d8424ee,2025-11-01T12:00:00.072708858,0x00000000000000000000000001c11135,,,,13.027,True,True,True
2,kLUNC,0.043634,727.0,B,2025-11-01 12:00:00.072,93324.0,Open Long,0.0,0xd1bb92f72debcd68d335042ea03c6b02010f00dcc8ee...,219188347328,False,-0.000317,310875290034250,USDC,,0x7717a7a245d9f950e586822b8c9b46863ed7bd7e,2025-11-01T12:00:00.072708858,0x3a46f6a9f96fe583be081c8f6f04205d,,,,31.721918,True,False,False
3,kLUNC,0.043634,727.0,A,2025-11-01 12:00:00.072,2796963.0,Close Long,-0.292254,0xd1bb92f72debcd68d335042ea03c6b02010f00dcc8ee...,219188563177,True,0.0,310875290034250,USDC,,0x31ca8395cf837de08b24da3f660e77761dfb974b,2025-11-01T12:00:00.072708858,,,,,31.721918,False,True,False
4,MEME,0.001559,9906.0,B,2025-11-01 12:00:00.273,52469471.0,Open Long,0.0,0x52a54788e5cf05c5541f042ea03c6e0205ee006e80c2...,219188565409,True,0.0,557384256234142,USDC,,0x31ca8395cf837de08b24da3f660e77761dfb974b,2025-11-01T12:00:00.273834313,,,,,15.443454,False,False,False


In [61]:
# Schema overview
print("Column types:")
print(df.dtypes)

Column types:
coin                     object
px                      float64
sz                      float64
side                     object
time             datetime64[ns]
startPosition            object
dir                      object
closedPnl               float64
hash                     object
oid                       int64
crossed                    bool
fee                     float64
tid                       int64
feeToken                 object
twapId                  float64
user                     object
block_time               object
cloid                    object
builderFee               object
builder                  object
liquidation              object
volume                  float64
is_maker                   bool
is_close                   bool
is_win                     bool
dtype: object


---

## Analysis 1: Volume by Trader

In [62]:
volume_by_trader = df.groupby('user').agg(
    volume=('volume', 'sum'),
    trades=('volume', 'count')
).sort_values('volume', ascending=False)

print("Top 10 Traders by Volume:")
volume_by_trader.head(10)

Top 10 Traders by Volume:


Unnamed: 0_level_0,volume,trades
user,Unnamed: 1_level_1,Unnamed: 2_level_1
0x2db3a3ffd278c7f5335fcff936403e01e75c5209,7142583.0,307
0x023a3d058020fb76cca98f01b3c48c8938a22355,5111722.0,1771
0xecb63caa47c7c4e77f60f1ce858cf28dc2b82b00,4197145.0,4524
0x4129c62faf652fea61375dcd9ca8ce24b2bb8b95,3211587.0,171
0x31ca8395cf837de08b24da3f660e77761dfb974b,3108309.0,9395
0x0fd468a73084daa6ea77a9261e40fdec3e67e0c7,2980411.0,350
0x010461c14e146ac35fe42271bdc1134ee31c703a,2813421.0,8679
0xf9109ada2f73c62e9889b45453065f0d99260a2d,2787546.0,766
0x558afca8d68f6a7335837b06ce218a234978225b,2781562.0,430
0x17fc9786b2f98de35f5447ce70d49e4067ebefb0,2775274.0,226


---

## Analysis 2: PnL by Trader

In [63]:
pnl_by_trader = df.groupby('user').agg(
    realized_pnl=('closedPnl', 'sum'),
    fees=('fee', 'sum')
).assign(
    net_pnl=lambda x: x['realized_pnl'] - x['fees']
).sort_values('net_pnl', ascending=False)

print("Top 10 Traders by Net PnL:")
pnl_by_trader.head(10)

Top 10 Traders by Net PnL:


Unnamed: 0_level_0,realized_pnl,fees,net_pnl
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x382aaa98abb877bb6661035987ebe12af1f60f34,37482.717237,470.865391,37011.851846
0x4ae1de7391fc9fca5746f41b0a00750d6e954ea7,26963.114324,396.683547,26566.430777
0xb83de012dba672c76a7dbbbf3e459cb59d7d6e36,24633.50184,82.152164,24551.349676
0xaa82c72777b61e6be717c1cfb8420b78d87dc600,9459.99279,269.16605,9190.82674
0x413c7a0a3489563350219bc96965a7da02f0fffc,8768.258702,38.870932,8729.38777
0x3d9ee32b4d8c5f4232b48a4d05e9325b23fe7a61,8528.143252,33.844637,8494.298615
0x9e02aca9865e1859bb7865f6f64801e804a173df,5044.328652,55.656611,4988.672041
0xd831409a48d8b21d59245c12b1540a0820446203,4559.1,549.7112,4009.3888
0xb4321b142b2a03ce20fcab2007ff6990b9acba93,4009.122316,94.404034,3914.718282
0x0d7661a7b7a89b40b6db128900557e4f1d1b3789,3710.254377,55.356048,3654.898329


---

## Analysis 3: Maker vs Taker Ratio

In [64]:
maker_ratio = df.groupby('user').agg(
    total_trades=('is_maker', 'count'),
    maker_trades=('is_maker', 'sum')
).assign(
    maker_pct=lambda x: x['maker_trades'] / x['total_trades'] * 100
).sort_values('maker_pct', ascending=False)

print("Top 10 Traders by Maker %:")
maker_ratio[maker_ratio['total_trades'] >= 10].head(10)

Top 10 Traders by Maker %:


Unnamed: 0_level_0,total_trades,maker_trades,maker_pct
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x86b6796b1b05d7890c9770f2f1762cf4ba3a8360,22,22,100.0
0x874c53f1ffbeb38707bd66e8bfa20e6177a7e207,11,11,100.0
0x8557558a43e853e4a2af7136eb241e2c8595f6f9,22,22,100.0
0x8311e3d4eff32c1b59b83e844bbffa824ab0f0da,312,312,100.0
0x839fb7089af2091b9bd9487a977c41daf56394c0,68,68,100.0
0x8bf3ac5c417624b3bde7901de3c4ad7659599060,119,119,100.0
0x8c14e3341eb81a21358ae62fff9598e7d8505185,25,25,100.0
0x8c21e89a714a724932957c8a679ee67888e69d2a,13,13,100.0
0x8d6f23c74c88611aa40c81bf6d73be609812093b,29,29,100.0
0x88ff73aef0d2de3c9217a9299e39ece3c5589837,32,32,100.0


---

## Analysis 4: Win Rate

In [65]:
# Only consider closing trades
closes = df[df['is_close']]

win_rate = closes.groupby('user').agg(
    total_closes=('is_win', 'count'),
    wins=('is_win', 'sum')
).assign(
    win_rate=lambda x: x['wins'] / x['total_closes'] * 100
).sort_values('win_rate', ascending=False)

print("Top 10 Traders by Win Rate (min 5 closes):")
win_rate[win_rate['total_closes'] >= 5].head(10)

Top 10 Traders by Win Rate (min 5 closes):


Unnamed: 0_level_0,total_closes,wins,win_rate
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x001005306685021a4edaf9de69c9b5b2a34c4caa,18,18,100.0
0x931ec1f0c23d4238d66b82e14e0e414f77d85125,35,35,100.0
0x9380413d90fdd69bc8590875e002475336aa7e79,7,7,100.0
0x929c8bfca61241d3367650a26a4bacdcfcdc5b77,10,10,100.0
0x9207896a6aff93f11acb398ab5b59827cafa9985,6,6,100.0
0x98d710b3c1e229bd2f6236ff1cd8a5195ae0719d,8,8,100.0
0x99eb6bf6cdc3f06eb9a8d1d2df30ea12d5dc0323,13,13,100.0
0x9767a3bd890ca584bc990a38f7d4b9c14cb9f3e0,11,11,100.0
0x9549fd0926212664e9a00092d0813bd687c63d74,5,5,100.0
0x8935d4ccbf09479ebb86efadf8077edba84acfb8,7,7,100.0


---

## Analysis 5: Volume by Coin

In [66]:
volume_by_coin = df.groupby('coin').agg(
    volume=('volume', 'sum'),
    trades=('volume', 'count'),
    unique_traders=('user', 'nunique')
).sort_values('volume', ascending=False)

print("Top 10 Coins by Volume:")
volume_by_coin.head(10)

Top 10 Coins by Volume:


Unnamed: 0_level_0,volume,trades,unique_traders
coin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BTC,46042490.0,11978,851
ETH,29928970.0,5568,576
HYPE,16972650.0,19140,2035
SOL,8285681.0,6422,459
ZEC,7122311.0,7732,364
@107,6519163.0,4298,205
VIRTUAL,4991725.0,7838,434
PUMP,3692366.0,3840,278
XRP,3591921.0,1664,212
TAO,2688449.0,3118,240


---

## Analysis 6: Trader Profile (Combined Metrics)

In [67]:
def trader_profile(df):
    closes = df[df['is_close']]
    
    profile = df.groupby('user').agg(
        volume=('volume', 'sum'),
        trades=('volume', 'count'),
        realized_pnl=('closedPnl', 'sum'),
        fees=('fee', 'sum'),
        maker_trades=('is_maker', 'sum'),
        coins_traded=('coin', 'nunique')
    )
    
    # Add win rate from closes
    win_stats = closes.groupby('user').agg(
        total_closes=('is_win', 'count'),
        wins=('is_win', 'sum')
    )
    
    profile = profile.join(win_stats)
    profile['net_pnl'] = profile['realized_pnl'] - profile['fees']
    profile['maker_pct'] = profile['maker_trades'] / profile['trades'] * 100
    profile['win_rate'] = profile['wins'] / profile['total_closes'] * 100
    
    return profile.sort_values('net_pnl', ascending=False)

profiles = trader_profile(df)
print("Trader Profiles (Top 10 by Net PnL):")
profiles[['volume', 'trades', 'net_pnl', 'maker_pct', 'win_rate', 'coins_traded']].head(10)

Trader Profiles (Top 10 by Net PnL):


Unnamed: 0_level_0,volume,trades,net_pnl,maker_pct,win_rate,coins_traded
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0x382aaa98abb877bb6661035987ebe12af1f60f34,1345330.0,33,37011.851846,0.0,100.0,1
0x4ae1de7391fc9fca5746f41b0a00750d6e954ea7,953774.4,249,26566.430777,93.574297,,1
0xb83de012dba672c76a7dbbbf3e459cb59d7d6e36,409848.6,113,24551.349676,28.318584,100.0,1
0xaa82c72777b61e6be717c1cfb8420b78d87dc600,672915.2,142,9190.82674,0.0,90.140845,8
0x413c7a0a3489563350219bc96965a7da02f0fffc,114326.3,49,8729.38777,0.0,100.0,3
0x3d9ee32b4d8c5f4232b48a4d05e9325b23fe7a61,96699.04,60,8494.298615,0.0,100.0,1
0x9e02aca9865e1859bb7865f6f64801e804a173df,139142.4,694,4988.672041,0.0,77.956204,64
0xd831409a48d8b21d59245c12b1540a0820446203,1221580.0,44,4009.3888,0.0,100.0,1
0xb4321b142b2a03ce20fcab2007ff6990b9acba93,2121199.0,2618,3914.718282,31.359817,59.195003,28
0x0d7661a7b7a89b40b6db128900557e4f1d1b3789,153766.9,53,3654.898329,0.0,100.0,1


---

## Summary Statistics

In [68]:
print("Dataset Summary")
print("=" * 40)
print(f"Total fills: {len(df):,}")
print(f"Unique traders: {df['user'].nunique():,}")
print(f"Unique coins: {df['coin'].nunique():,}")
print(f"Total volume: ${df['volume'].sum():,.0f}")
print(f"Date range: {df['time'].min()} to {df['time'].max()}")

Dataset Summary
Total fills: 155,006
Unique traders: 5,661
Unique coins: 227
Total volume: $160,886,878
Date range: 2025-11-01 12:00:00.072000 to 2025-11-01 12:59:59.844000
