In [40]:
market='0xf379849db9a1b62cff6c201d84b744fdb751b326fa7bcd771735b34fc31643dd'

In [41]:
import pathlib, json, time, datetime, requests
from typing import List, Dict, Any, Optional

MARKET_ID = market

BASE_URL = "https://data-api.polymarket.com/trades"  # per docs
MAX_LIMIT = 500  # API max
OUTPUT_DIR = pathlib.Path('/Users/kate/projects/polymarket/data/trades')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def fetch_trades_batch(market: str, limit: int = MAX_LIMIT, offset: int = 0, taker_only: bool = True) -> List[Dict[str, Any]]:
    params = {
        'market': market,
        'limit': min(limit, MAX_LIMIT),
        'offset': offset,
        'takerOnly': str(taker_only).lower(),  # 'true' / 'false'
    }
    r = requests.get(BASE_URL, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()
    if not isinstance(data, list):
        raise ValueError(f'Unexpected response type: {type(data)} | {data}')
    return data

def download_all_trades(market: str, batch_limit: int = MAX_LIMIT, sleep: float = 0.1, max_batches: Optional[int] = None, taker_only: bool = True) -> pathlib.Path:
    ts = datetime.datetime.now(datetime.UTC).strftime('%Y%m%dT%H%M%SZ')
    base = f"{market[:16]}_{ts}"
    jsonl_path = OUTPUT_DIR / f"{base}.jsonl"
    total = 0
    offset = 0
    batch_no = 0
    with jsonl_path.open('w') as f:
        while True:
            batch_no += 1
            batch = fetch_trades_batch(market, limit=batch_limit, offset=offset, taker_only=taker_only)
            if not batch:
                break
            for tr in batch:
                f.write(json.dumps(tr) + '\n')
            got = len(batch)
            total += got
            offset += got
            # Stop conditions
            if got < batch_limit:
                # Last page
                break
            if max_batches and batch_no >= max_batches:
                break
            if sleep:
                time.sleep(sleep)
    summary = {
        'market': market,
        'file': jsonl_path.name,
        'total_trades': total,
        'batches': batch_no,
        'generated_at_utc': ts,
        'note': 'Trades returned newest-first (descending timestamp). Offset pagination accumulates all until exhaustion.'
    }
    (OUTPUT_DIR / f"{base}_summary.json").write_text(json.dumps(summary, indent=2))
    print(f"Saved {total} trades across {batch_no} batch(es) -> {jsonl_path}")
    return jsonl_path

# Run download
_ = download_all_trades(MARKET_ID)


Saved 1539 trades across 4 batch(es) -> /Users/kate/projects/polymarket/data/trades/0xf379849db9a1b6_20250823T212035Z.jsonl


In [42]:
# Load latest downloaded trades for the current `market` into a DataFrame (asset forced to string)
import pandas as pd, pathlib, json, datetime, os
from typing import List, Dict, Any

MARKET_ID = globals().get('market')
if not MARKET_ID:
    raise ValueError("`market` variable not defined. Set it before running this cell.")

trades_dir = pathlib.Path('/Users/kate/projects/polymarket/data/trades')
prefix = MARKET_ID[:16] + '_'

# Find matching jsonl files
candidates = sorted(
    [p for p in trades_dir.glob(f'{prefix}*.jsonl') if p.is_file()],
    key=lambda p: p.stat().st_mtime,
    reverse=True
)
if not candidates:
    raise FileNotFoundError(f'No trade files found matching prefix {prefix} in {trades_dir}')

latest_file = candidates[0]
print(f'Loading trades from {latest_file.name}')

# Manually parse JSONL to control dtypes and preserve very large identifiers as strings
rows: List[Dict[str, Any]] = []
with latest_file.open() as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        # Force asset to string exactly as in file (avoid pandas numeric inference / float formatting)
        if 'asset' in obj:
            obj['asset'] = str(obj['asset'])
        rows.append(obj)

df = pd.DataFrame(rows)

# # Ensure pandas didn't coerce asset
# if 'asset' in df.columns:
#     df['asset'] = df['asset'].astype('string')  # pandas string dtype retains exact text

# Add a human-readable UTC datetime column if timestamp present (assumes seconds)
if 'timestamp' in df.columns:
    def _to_dt(x):
        try:
            return datetime.datetime.fromtimestamp(int(x), datetime.timezone.utc)
        except Exception:
            return pd.NaT
    df['dt'] = df['timestamp'].map(_to_dt)

# Sort ascending by timestamp if present
if 'timestamp' in df.columns:
    df.sort_values('timestamp', inplace=True)

print(f'Trades loaded: {len(df)} rows')
if 'asset' in df.columns:
    sample_asset = df['asset'].iloc[0]
    print(f'asset dtype: {df['asset'].dtype}; sample asset length: {len(sample_asset)}')

df.head(5)

Loading trades from 0xf379849db9a1b6_20250823T212035Z.jsonl
Trades loaded: 1539 rows
asset dtype: object; sample asset length: 77


Unnamed: 0,proxyWallet,side,asset,conditionId,size,price,timestamp,title,slug,icon,eventSlug,outcome,outcomeIndex,name,pseudonym,bio,profileImage,profileImageOptimized,transactionHash,dt
1538,0xe33d60a1aa150ae45bad73fbe9538e9ed1c86cd1,BUY,4409588219637920227474221118166029281666484960...,0xf379849db9a1b62cff6c201d84b744fdb751b326fa7b...,1.960783,0.51,1755864076,"Bitcoin Up or Down - August 23, 3AM ET",bitcoin-up-or-down-august-23-3am-et,https://polymarket-upload.s3.us-east-2.amazona...,bitcoin-up-or-down-august-23-3am-et,Up,0,gogi1983,Low-Simvastatin,,,,0x1cce66de880f243fcb6a2b9a9d42e96aee4f1f04e942...,2025-08-22 12:01:16+00:00
1537,0xe33d60a1aa150ae45bad73fbe9538e9ed1c86cd1,BUY,8121000211403539345216250678934244437616051254...,0xf379849db9a1b62cff6c201d84b744fdb751b326fa7b...,13.725489,0.51,1755864082,"Bitcoin Up or Down - August 23, 3AM ET",bitcoin-up-or-down-august-23-3am-et,https://polymarket-upload.s3.us-east-2.amazona...,bitcoin-up-or-down-august-23-3am-et,Down,1,gogi1983,Low-Simvastatin,,,,0xd62398cbd68d359b82c71f28a38c10af7cef51ed3a42...,2025-08-22 12:01:22+00:00
1536,0xd245d5be0b37020971f5a173b77d0b25b822356a,BUY,8121000211403539345216250678934244437616051254...,0xf379849db9a1b62cff6c201d84b744fdb751b326fa7b...,196.07843,0.51,1755907166,"Bitcoin Up or Down - August 23, 3AM ET",bitcoin-up-or-down-august-23-3am-et,https://polymarket-upload.s3.us-east-2.amazona...,bitcoin-up-or-down-august-23-3am-et,Down,1,duskmere,Downright-Accountability,,,,0xcf409482f36cdeec15d16a09583cb779a573769ff3e3...,2025-08-22 23:59:26+00:00
1535,0xd245d5be0b37020971f5a173b77d0b25b822356a,SELL,8121000211403539345216250678934244437616051254...,0xf379849db9a1b62cff6c201d84b744fdb751b326fa7b...,196.07,0.49,1755915757,"Bitcoin Up or Down - August 23, 3AM ET",bitcoin-up-or-down-august-23-3am-et,https://polymarket-upload.s3.us-east-2.amazona...,bitcoin-up-or-down-august-23-3am-et,Down,1,duskmere,Downright-Accountability,,,,0xa82afee76f36e4720a009fab076c65d41c4b6e8526b2...,2025-08-23 02:22:37+00:00
1534,0xd60a93d052a3baf029e6d19819c625c6f734b069,BUY,4409588219637920227474221118166029281666484960...,0xf379849db9a1b62cff6c201d84b744fdb751b326fa7b...,27.156861,0.51,1755918975,"Bitcoin Up or Down - August 23, 3AM ET",bitcoin-up-or-down-august-23-3am-et,https://polymarket-upload.s3.us-east-2.amazona...,bitcoin-up-or-down-august-23-3am-et,Up,0,palmergabriela,Parched-Clarification,,,,0x4159d4510f00ca01a6134f370cc68cac05f34a7b2f1c...,2025-08-23 03:16:15+00:00


In [43]:
# Compute per-wallet UP positions at a cutoff timestamp
from typing import Optional
import pandas as pd, math, datetime

CUTOFF = datetime.datetime(2025, 8, 23, 13, 0, 0, tzinfo=datetime.timezone.utc)
UP_LABELS = {"Yes", "YES", "Up", "UP"}
DOWN_LABELS = {"No", "NO", "Down", "DOWN"}

if 'dt' not in df.columns:
    raise ValueError("DataFrame df must contain 'dt' datetime column; re-run load cell.")

required_cols = {'proxyWallet', 'side', 'size', 'price', 'name'}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing required trade columns: {missing}")

if 'outcome' not in df.columns and 'outcomeIndex' not in df.columns:
    raise ValueError("Need either 'outcome' or 'outcomeIndex' column to classify trades.")

work = df[df['dt'] <= CUTOFF].copy()

def classify_outcome(row):
    if 'outcome' in row and isinstance(row['outcome'], str):
        o = row['outcome']
        if o in UP_LABELS:
            return 'UP'
        if o in DOWN_LABELS:
            return 'DOWN'
    # if 'outcomeIndex' in row:
    #     try:
    #         return 'UP' if int(row['outcomeIndex']) == 0 else 'DOWN'
    #     except Exception:
    #         return 'UNKNOWN'
    # return 'UNKNOWN'

work['__class'] = work.apply(classify_outcome, axis=1)
unknown_ct = (work['__class'] == 'UNKNOWN').sum()
if unknown_ct:
    print(f"Warning: {unknown_ct} trades could not be classified (tagged UNKNOWN). They are ignored.")
work = work[work['__class'] != 'UNKNOWN']

def compute_deltas(row):
    side = row['side']
    size = float(row['size'])
    price = float(row['price'])
    cls = row['__class']
    if cls == 'UP':
        if side == 'BUY':
            return size, -price * size
        else:
            return -size, +price * size
    else:  # DOWN
        price_up = 1.0 - price
        if side == 'BUY':
            return -size, +price_up * size
        else:
            return +size, -price_up * size

work[['delta_shares_up','cash_flow']] = work.apply(lambda r: pd.Series(compute_deltas(r)), axis=1)

agg = work.groupby('proxyWallet', as_index=False).agg(
    name=('name','first'),
    shares_up=('delta_shares_up','sum'),
    cash_flow=('cash_flow','sum'),
    trade_count=('delta_shares_up','count')
)

def avg_entry(row):
    if row['shares_up'] > 0:
        return (-row['cash_flow']) / row['shares_up'] if row['shares_up'] != 0 else math.nan
    return math.nan

agg['avg_entry_price_est'] = agg.apply(avg_entry, axis=1)
agg.sort_values('shares_up', ascending=False, inplace=True)
agg.reset_index(drop=True, inplace=True)

print(f"Cutoff: {CUTOFF.isoformat()} | Wallets: {len(agg)}")
agg.head(25)

Cutoff: 2025-08-23T13:00:00+00:00 | Wallets: 189


Unnamed: 0,proxyWallet,name,shares_up,cash_flow,trade_count,avg_entry_price_est
0,0x3d2d66eb933cfa7aa7b9fc21e6614f080de99360,,2009.97,-1082.133,23,0.538383
1,0x13b066438ef6690b0d4064f5839a61e833e2bb21,reikano,1520.37,-775.3887,1,0.51
2,0xb563eb0184543459596fd1011d013b7451600115,,1287.8,-1028.412,28,0.798581
3,0x98b5ca5a6a02a75477cfb69e37ba97b8a70c4c9e,,1061.13,-551.6677,13,0.519887
4,0x0f863d92dd2b960e3eb6a23a35fd92a91981404e,Qualitative,701.773949,-281.717923,28,0.401437
5,0x7485d661b858b117a66e1b4fcbecfaea87ac1393,1TickWonder2,635.87,94.4454,60,-0.148529
6,0x834ea21b0b55e1fa3804e2aaf5c15fe0b5648015,XnXo,510.0,-169.2,102,0.331765
7,0x969ae5dca76cd421caa13db352c9c3498312e4c5,WXYXPo,414.99,-296.6478,1,0.714831
8,0xb9fc8078fd6c0275c631ec10fcf8d5cc52d6da76,jacobfox,343.0,-175.56,2,0.511837
9,0x68a7047c014bd0e18327404a2f76fa19ad9adf02,lukaspieza,300.0,-153.0,1,0.51


In [47]:
# Plot btcPrice from estimator JSONL vs Polymarket UP price + model estimate (cropped to estimator time range)
import json, pathlib, pandas as pd, plotly.graph_objects as go, datetime, re
from typing import List, Dict, Any

# Ensure trades df exists
if 'df' not in globals():
    raise ValueError("Trades DataFrame `df` not found. Run the load trades cell first.")

plot_df = df.copy()
for col in ['price','size','dt']:
    if col not in plot_df.columns:
        raise ValueError(f'Missing column {col} in trades DataFrame.')

# Classify outcomes if needed
if '__class' not in plot_df.columns:
    UP_LABELS = {"Yes", "YES", "Up", "UP"}
    DOWN_LABELS = {"No", "NO", "Down", "DOWN"}
    def _cls(row):
        if 'outcome' in row and isinstance(row['outcome'], str):
            if row['outcome'] in UP_LABELS: return 'UP'
            if row['outcome'] in DOWN_LABELS: return 'DOWN'
        if 'outcomeIndex' in row:
            try:
                return 'UP' if int(row['outcomeIndex']) == 0 else 'DOWN'
            except Exception:
                return 'UNKNOWN'
        return 'UNKNOWN'
    plot_df['__class'] = plot_df.apply(_cls, axis=1)

plot_df = plot_df[plot_df['__class'] != 'UNKNOWN'].copy()
plot_df['up_price'] = plot_df.apply(lambda r: float(r['price']) if r['__class'] == 'UP' else 1.0 - float(r['price']), axis=1)
plot_df['size'] = plot_df['size'].astype(float)

# --- Load estimator file (provided path) ---
estimator_dir = pathlib.Path('/Users/kate/projects/polymarket/data/estimator')
base_id = None
if 'market' in globals():
    base_id = re.sub(r'^0x','', str(market))

estimator_file = None
if base_id:
    candidate = estimator_dir / f"{base_id}.jsonl"
    if candidate.exists():
        estimator_file = candidate
if estimator_file is None and base_id:
    patt = base_id[:12]
    matches = [p for p in estimator_dir.glob('*.jsonl') if patt in p.name]
    if matches:
        estimator_file = matches[0]
if estimator_file is None or not estimator_file.exists():
    manual = estimator_dir / f'{market}.jsonl'
    if manual.exists():
        estimator_file = manual

est_df = pd.DataFrame()
if estimator_file and estimator_file.exists():
    rows = []
    with estimator_file.open() as f:
        for line in f:
            line=line.strip()
            if not line: continue
            try:
                rows.append(json.loads(line))
            except Exception:
                pass
    if rows:
        est_df = pd.DataFrame(rows)
        tcol = 'timestamp' if 'timestamp' in est_df.columns else None
        if tcol is not None:
            est_df['dt'] = pd.to_datetime(est_df[tcol], utc=True, errors='coerce')
        else:
            for alt in ['time','ts','datetime','dt']:
                if alt in est_df.columns:
                    est_df['dt'] = pd.to_datetime(est_df[alt], utc=True, errors='coerce')
                    break
        est_df = est_df.dropna(subset=['dt'])
        for c in ['btcPrice','target','estimate']:
            if c in est_df.columns:
                est_df[c] = pd.to_numeric(est_df[c], errors='coerce')
        keep_cols = [c for c in ['dt','btcPrice','estimate','target'] if c in est_df.columns]
        est_df = est_df[keep_cols].sort_values('dt')
else:
    print('Estimator file not found; only Polymarket prices will be shown.')

# Crop trades to estimator time range if estimator data present
if not est_df.empty:
    est_start, est_end = est_df['dt'].min(), est_df['dt'].max()
    plot_df = plot_df[(plot_df['dt'] >= est_start) & (plot_df['dt'] <= est_end)].copy()

# Recompute per-second weighted average after potential cropping
if not plot_df.empty:
    plot_df['dt_sec'] = plot_df['dt'].dt.floor('s')
    agg_up = (plot_df.groupby('dt_sec')
              .apply(lambda g: (g['up_price'] * g['size']).sum() / g['size'].sum(), include_groups=False)
              .rename('up_wap')
              .to_frame()
              .reset_index())
else:
    agg_up = pd.DataFrame(columns=['dt_sec','up_wap'])


In [50]:
fig = go.Figure()

# btcPrice (and target) on primary axis
if not est_df.empty and 'btcPrice' in est_df.columns:
    fig.add_trace(go.Scatter(x=est_df['dt'], y=est_df['btcPrice'], name='btcPrice', mode='lines', line=dict(color='#1f77b4')))
if not est_df.empty and 'target' in est_df.columns:
    fig.add_trace(go.Scatter(x=est_df['dt'], y=est_df['target'], name='Target', mode='lines', line=dict(color='#9467bd', dash='dot')))  # primary axis now

# Polymarket probability & estimator outputs on secondary axis
if not agg_up.empty:
    fig.add_trace(go.Scatter(x=agg_up['dt_sec'], y=agg_up['up_wap'], name='Polymarket UP WAP', mode='lines', line=dict(color='#ff7f0e'), yaxis='y2'))
if not plot_df.empty:
    fig.add_trace(go.Scatter(x=plot_df['dt'], y=plot_df['up_price'], name='Trades (UP price)', mode='markers', marker=dict(color='rgba(255,127,14,0.35)', size=6), hovertext=plot_df['size'], hovertemplate='Time=%{x}<br>Price=%{y:.3f}<br>Size=%{hovertext}<extra></extra>', yaxis='y2'))
if not est_df.empty and 'estimate' in est_df.columns:
    fig.add_trace(go.Scatter(x=est_df['dt'], y=est_df['estimate'], name='Model Estimate', mode='lines', line=dict(color='#2ca02c', dash='dash'), yaxis='y2'))

# Determine dynamic probability axis range
prob_series = []
if not agg_up.empty:
    prob_series.append(agg_up['up_wap'])
if not plot_df.empty:
    prob_series.append(plot_df['up_price'])
if not est_df.empty and 'estimate' in est_df.columns:
    prob_series.append(est_df['estimate'])

if prob_series:
    import numpy as _np
    combined = _np.concatenate([s.dropna().to_numpy() for s in prob_series])
    if combined.size:
        pmin = float(combined.min())
        pmax = float(combined.max())
        span = pmax - pmin if pmax > pmin else 0.05
        pad = span * 0.05
        y2_range = [max(0.0, pmin - pad), min(1.0 if pmax <= 1.0 else pmax + pad, pmax + pad)]
    else:
        y2_range = [0,1]
else:
    y2_range = [0,1]

xrange = None
if not est_df.empty:
    xrange = [est_df['dt'].min(), est_df['dt'].max()]

fig.update_layout(
    xaxis=dict(title='Time', range=xrange),
    yaxis=dict(title='btcPrice / Target', side='left', fixedrange=False),
    yaxis2=dict(title='Probability (UP / Estimate)', overlaying='y', side='right', range=y2_range, tickformat='.3f'),
    legend=dict(orientation='h', yanchor='bottom', y=1.02, x=0),
    margin=dict(l=60,r=60,t=60,b=40),
    hovermode='x unified'
)
fig.show(renderer="browser")

In [52]:
trades2 = df.copy()[['dt', 'side', 'outcome', 'name', 'price', 'size', 'proxyWallet']]

In [66]:
import pandas as pd

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

UP_LABELS = {"Yes", "YES", "Up", "UP"}

# up_price: normalize to UP outcome probability
trades2['up_price'] = trades2.apply(
    lambda r: float(r['price']) if r.get('outcome') in UP_LABELS else 1.0 - float(r['price']),
    axis=1
)

# up_side: perspective of the UP outcome (BUY means increasing UP exposure)
def _up_side(r):
    side = r.get('side')
    if r.get('outcome') in UP_LABELS:
        return side  # already oriented to UP
    # Outcome is DOWN: a BUY on DOWN reduces UP exposure => treat as SELL (and vice‑versa)
    if side == 'BUY':
        return 'SELL'
    if side == 'SELL':
        return 'BUY'
    return side

trades2['up_side'] = trades2.apply(_up_side, axis=1)

trades2[['dt', 'up_side', 'up_price', 'size', 'name', 'proxyWallet', 'side', 'outcome']].tail(100)

Unnamed: 0,dt,up_side,up_price,size,name,proxyWallet,side,outcome
99,2025-08-23 07:59:27+00:00,SELL,0.305542,143.99714,Ispa,0x61dd979431ee8a115ed7d0f259b9c24ea3aecb5a,BUY,Down
98,2025-08-23 07:59:29+00:00,BUY,0.39,17.1,1TickWonder2,0x7485d661b858b117a66e1b4fcbecfaea87ac1393,BUY,Up
96,2025-08-23 07:59:31+00:00,SELL,0.309827,47.814082,dCent,0xcff059bfd494ea15775bb8e822527464d3198db3,BUY,Down
97,2025-08-23 07:59:31+00:00,SELL,0.293138,141.47042,badseer,0xcb8cf3a14bb4e679287b8d8ae5f0830a22920a55,BUY,Down
94,2025-08-23 07:59:33+00:00,SELL,0.39,8.5,0xf247584e41117bbBe4Cc06E4d2C95741792a5216-1742469835200,0xf247584e41117bbbe4cc06e4d2c95741792a5216,SELL,Up
93,2025-08-23 07:59:33+00:00,BUY,0.39,17.0,1TickWonder2,0x7485d661b858b117a66e1b4fcbecfaea87ac1393,BUY,Up
95,2025-08-23 07:59:33+00:00,BUY,0.35,15.6,0xf247584e41117bbBe4Cc06E4d2C95741792a5216-1742469835200,0xf247584e41117bbbe4cc06e4d2c95741792a5216,SELL,Down
92,2025-08-23 07:59:35+00:00,SELL,0.317057,87.85507,TheCryptoEcon,0x14fe1af09048acd90a447fb8527c695086e7e5eb,BUY,Down
90,2025-08-23 07:59:35+00:00,BUY,0.35,15.3,1TickWonder2,0x7485d661b858b117a66e1b4fcbecfaea87ac1393,BUY,Up
91,2025-08-23 07:59:35+00:00,SELL,0.315127,29.450722,jd143793,0xb9ea1b3b75a59100be1da6ae6eb33acec84d5590,BUY,Down
