In [None]:
from pathlib import Path

# Estimator dump to analyse. The file name encodes both identifiers as "<slug>_<market>.jsonl".
estimator_path = 'data/estimator/2025-08-23/bitcoin-up-or-down-august-22-8pm-et_0x5b1a5ba5964d16da4817b15c5d056c7a2f9f4de32bfd0b980a46fbeb09e144f3.jsonl'

# Fail early on a mistyped extension: removesuffix() is a silent no-op when the suffix
# does not match, which would leave the junk characters inside `market`.
if not estimator_path.endswith('.jsonl'):
    raise ValueError(f'estimator_path must end with .jsonl: {estimator_path}')

# Split on the last underscore: everything after it is the market id, everything before the slug.
market = Path(estimator_path).name.rsplit('_', 1)[-1].removesuffix('.jsonl')
slug = Path(estimator_path).name.rsplit('_', 1)[0]
print(f'market: {market} slug: {slug}')

market: 0x5b1a5ba5964d16da4817b15c5d056c7a2f9f4de32bfd0b980a46fbeb09e144f3 slug: bitcoin-up-or-down-august-22-8pm-et


In [None]:
import pathlib, json, time, datetime, requests
from typing import List, Dict, Any, Optional

# Trades endpoint (per docs) and the largest page size the API accepts.
BASE_URL = "https://data-api.polymarket.com/trades"
MAX_LIMIT = 500

# Market whose trades are downloaded; `market` is parsed from the estimator file name.
MARKET_ID = market

# Directory holding the downloaded trades; created (with parents) when missing.
OUTPUT_DIR = pathlib.Path('/Users/kate/projects/polymarket/data/trades')
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

def fetch_trades_batch(market: str, limit: int = MAX_LIMIT, offset: int = 0, taker_only: bool = True) -> List[Dict[str, Any]]:
    """Fetch a single page of trades for ``market`` from ``BASE_URL``.

    Args:
        market: Value sent as the ``market`` query parameter.
        limit: Requested page size; anything above ``MAX_LIMIT`` is clamped to it.
        offset: Value sent as the ``offset`` query parameter.
        taker_only: Sent as ``takerOnly`` after lower-casing its ``str()`` form
            (``'true'`` / ``'false'`` for booleans).

    Returns:
        The decoded JSON payload, which is guaranteed to be a list.

    Raises:
        requests.HTTPError: If the response status signals an error.
        ValueError: If the decoded payload is not a list.
    """
    query = dict(
        market=market,
        limit=min(limit, MAX_LIMIT),
        offset=offset,
        takerOnly=str(taker_only).lower(),
    )
    response = requests.get(BASE_URL, params=query, timeout=30)
    response.raise_for_status()
    data = response.json()
    if isinstance(data, list):
        return data
    # Anything but a list (e.g. an error object) is reported with its full content.
    raise ValueError(f'Unexpected response type: {type(data)} | {data}')

# Downloaded files are named after the first 16 characters of the market id.
base = market[:16]
jsonl_path = OUTPUT_DIR.joinpath(base + ".jsonl")

def download_all_trades(market: str, batch_limit: int = MAX_LIMIT, sleep: float = 0.1, max_batches: Optional[int] = None, taker_only: bool = True) -> pathlib.Path:
    """Download all trades for ``market`` into a JSONL file and write a summary beside it.

    Pages through ``fetch_trades_batch`` with offset pagination until an empty page or a
    short page is returned, or until ``max_batches`` requests have been made. Each trade
    is written as one JSON object per line.

    Args:
        market: Market identifier; its first 16 characters name the output files.
        batch_limit: Requested page size. Clamped to ``MAX_LIMIT`` so the short-page
            test compares against the size that is actually requested.
        sleep: Seconds to wait between requests; a falsy value disables the wait.
        max_batches: Optional cap on the number of requests; ``None``/``0`` means no cap.
        taker_only: Forwarded to ``fetch_trades_batch``.

    Returns:
        Path of the JSONL file inside ``OUTPUT_DIR``.
    """
    # Name the outputs from the argument, not from notebook-level globals, so a call
    # for another market cannot overwrite this market's files.
    stem = market[:16]
    out_path = OUTPUT_DIR / f"{stem}.jsonl"
    # The notebook skips the download whenever the JSONL file exists, so write to a
    # scratch file and rename at the end: a failed run must not leave a truncated
    # file that later looks like a complete cache.
    part_path = OUTPUT_DIR / f"{stem}.jsonl.part"

    # fetch_trades_batch never requests more than MAX_LIMIT rows; comparing `got`
    # with an unclamped batch_limit would end pagination after the first page.
    page_size = min(batch_limit, MAX_LIMIT)

    total = 0
    offset = 0
    batch_no = 0  # counts requests made, including a final empty page
    with part_path.open('w') as f:
        while True:
            batch_no += 1
            batch = fetch_trades_batch(market, limit=page_size, offset=offset, taker_only=taker_only)
            if not batch:
                break
            for tr in batch:
                f.write(json.dumps(tr) + '\n')
            got = len(batch)
            total += got
            offset += got
            # Stop conditions
            if got < page_size:
                # Last page
                break
            if max_batches and batch_no >= max_batches:
                break
            if sleep:
                time.sleep(sleep)
    part_path.replace(out_path)

    # Timestamp of this download (previously an undefined name, which raised NameError).
    ts = datetime.datetime.now(datetime.timezone.utc).isoformat()
    summary = {
        'market': market,
        'file': out_path.name,
        'total_trades': total,
        'batches': batch_no,
        'generated_at_utc': ts,
        'note': 'Trades returned newest-first (descending timestamp). Offset pagination accumulates all until exhaustion.'
    }
    (OUTPUT_DIR / f"{stem}_summary.json").write_text(json.dumps(summary, indent=2))
    print(f"Saved {total} trades across {batch_no} batch(es) -> {out_path}")
    return out_path


# Hit the API only on a cache miss; an existing JSONL file is reused as-is.
trades_cached = jsonl_path.exists()
if not trades_cached:
    download_all_trades(MARKET_ID)


Saved 1603 trades across 4 batch(es) -> /Users/kate/projects/polymarket/data/trades/0x5b1a5ba5964d16.jsonl


In [103]:
# Load the downloaded trades for the current `market` into a DataFrame (asset forced to string)
import pandas as pd, pathlib, json, datetime, os
from typing import List, Dict, Any

MARKET_ID = globals().get('market')
if not MARKET_ID:
    raise ValueError("`market` variable not defined. Set it before running this cell.")

# NOTE(review): `trades_dir` and `prefix` are not used below; the file that is read is
# `jsonl_path`, which is set by the download cell.
trades_dir = pathlib.Path('/Users/kate/projects/polymarket/data/trades')
prefix = MARKET_ID[:16] + '_'


print(f'Loading trades from {jsonl_path}')

# Manually parse JSONL to control dtypes and preserve very large identifiers as strings
rows: List[Dict[str, Any]] = []
with jsonl_path.open() as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        # Force asset to string exactly as in file (avoid pandas numeric inference / float formatting)
        if 'asset' in obj:
            obj['asset'] = str(obj['asset'])
        rows.append(obj)

df = pd.DataFrame(rows)

if 'timestamp' in df.columns:
    # Human-readable UTC datetime column (assumes epoch seconds). Converted in one
    # vectorised call; values that cannot be parsed become NaT instead of raising.
    df['dt'] = pd.to_datetime(
        pd.to_numeric(df['timestamp'], errors='coerce'),
        unit='s', utc=True, errors='coerce',
    )
    # Oldest trade first. Rebinding (rather than inplace=True) keeps the cell idempotent.
    df = df.sort_values('timestamp')

print(f'Trades loaded: {len(df)} rows')
if 'asset' in df.columns:
    sample_asset = df['asset'].iloc[0]
    # Double-quoted f-string: reusing the same quote character inside the replacement
    # field is a SyntaxError on Python versions before 3.12.
    print(f"asset dtype: {df['asset'].dtype}; sample asset length: {len(sample_asset)}")

df.head(5)

Loading trades from /Users/kate/projects/polymarket/data/trades/0x5b1a5ba5964d16.jsonl
Trades loaded: 1603 rows
asset dtype: object; sample asset length: 77
Trades loaded: 1603 rows
asset dtype: object; sample asset length: 77


Unnamed: 0,proxyWallet,side,asset,conditionId,size,price,timestamp,title,slug,icon,eventSlug,outcome,outcomeIndex,name,pseudonym,bio,profileImage,profileImageOptimized,transactionHash,dt
1602,0xa87b16935a61ae9ff437ac32629902672eff4a2c,BUY,46349047764236974385535746285296226035177175012761594138753819910305185388740,0x5b1a5ba5964d16da4817b15c5d056c7a2f9f4de32bfd0b980a46fbeb09e144f3,1.960783,0.51,1755836177,"Bitcoin Up or Down - August 22, 8PM ET",bitcoin-up-or-down-august-22-8pm-et,https://polymarket-upload.s3.us-east-2.amazonaws.com/BTC+fullsize.png,bitcoin-up-or-down-august-22-8pm-et,Down,1,juanana,Altruistic-Waitress,,,,0xbb14b97cefa968d1955ad66246e56db8369e4f646c7eafc7c6aee52c5f1861e8,2025-08-22 04:16:17+00:00
1601,0xcaa561d03036d8a08aa51482dd8d0166a9925059,BUY,82515967107008156628613328664833259059202574148668817462341408913714350323199,0x5b1a5ba5964d16da4817b15c5d056c7a2f9f4de32bfd0b980a46fbeb09e144f3,5.0,0.51,1755839995,"Bitcoin Up or Down - August 22, 8PM ET",bitcoin-up-or-down-august-22-8pm-et,https://polymarket-upload.s3.us-east-2.amazonaws.com/BTC+fullsize.png,bitcoin-up-or-down-august-22-8pm-et,Up,0,hhlovequantt,Emotional-Wish,,,,0xab11089d9813f435d6939e42b6d680d7583909b499cfa85772c4cd13d1748af3,2025-08-22 05:19:55+00:00
1600,0xcaa561d03036d8a08aa51482dd8d0166a9925059,BUY,46349047764236974385535746285296226035177175012761594138753819910305185388740,0x5b1a5ba5964d16da4817b15c5d056c7a2f9f4de32bfd0b980a46fbeb09e144f3,5.0,0.51,1755840005,"Bitcoin Up or Down - August 22, 8PM ET",bitcoin-up-or-down-august-22-8pm-et,https://polymarket-upload.s3.us-east-2.amazonaws.com/BTC+fullsize.png,bitcoin-up-or-down-august-22-8pm-et,Down,1,hhlovequantt,Emotional-Wish,,,,0xe3c613553baaeb80a8ac7270583a176d717d281ee95da17d7e64e251fffad3d6,2025-08-22 05:20:05+00:00
1599,0xe33d60a1aa150ae45bad73fbe9538e9ed1c86cd1,BUY,82515967107008156628613328664833259059202574148668817462341408913714350323199,0x5b1a5ba5964d16da4817b15c5d056c7a2f9f4de32bfd0b980a46fbeb09e144f3,1.960783,0.51,1755859024,"Bitcoin Up or Down - August 22, 8PM ET",bitcoin-up-or-down-august-22-8pm-et,https://polymarket-upload.s3.us-east-2.amazonaws.com/BTC+fullsize.png,bitcoin-up-or-down-august-22-8pm-et,Up,0,gogi1983,Low-Simvastatin,,,,0x0f292ad997afa7e6a5a0a93f35e1f5f51cb7534631abef769156855cd9f717d1,2025-08-22 10:37:04+00:00
1598,0xe33d60a1aa150ae45bad73fbe9538e9ed1c86cd1,BUY,46349047764236974385535746285296226035177175012761594138753819910305185388740,0x5b1a5ba5964d16da4817b15c5d056c7a2f9f4de32bfd0b980a46fbeb09e144f3,5.882351,0.51,1755859030,"Bitcoin Up or Down - August 22, 8PM ET",bitcoin-up-or-down-august-22-8pm-et,https://polymarket-upload.s3.us-east-2.amazonaws.com/BTC+fullsize.png,bitcoin-up-or-down-august-22-8pm-et,Down,1,gogi1983,Low-Simvastatin,,,,0xc63109d575cb48bbdf5e150e6c2d031f2ef8dabdd0980e6aa687a345b79e596f,2025-08-22 10:37:10+00:00


In [104]:
# Compute per-wallet UP positions at a cutoff timestamp
from typing import Optional
import pandas as pd, math, datetime

# Outcome labels treated as the UP side and as the DOWN side.
UP_LABELS = {"Yes", "YES", "Up", "UP"}
DOWN_LABELS = {"No", "NO", "Down", "DOWN"}

if 'dt' not in df.columns:
    raise ValueError("DataFrame df must contain 'dt' datetime column; re-run load cell.")

required_cols = {'proxyWallet', 'side', 'size', 'price', 'name'}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing required trade columns: {missing}")

if 'outcome' not in df.columns and 'outcomeIndex' not in df.columns:
    raise ValueError("Need either 'outcome' or 'outcomeIndex' column to classify trades.")

# Cutoff for the position snapshot. A value already set in the kernel is honoured
# (it must be timezone-aware to compare against the UTC `dt` column); otherwise
# fall back to the cutoff shown in this cell's recorded output, so the cell also
# runs on a fresh kernel.
CUTOFF = globals().get('CUTOFF')
if CUTOFF is None:
    CUTOFF = datetime.datetime(2025, 8, 23, 13, 0, tzinfo=datetime.timezone.utc)

# Only trades at or before the cutoff count towards the positions. Rows whose `dt`
# is NaT compare False and are therefore excluded as well.
work = df.loc[df['dt'] <= CUTOFF].copy()

def classify_outcome(row):
    """Classify one trade row as ``'UP'``, ``'DOWN'`` or ``'UNKNOWN'``.

    The ``outcome`` label decides when it is a string found in ``UP_LABELS`` or
    ``DOWN_LABELS``. Otherwise ``outcomeIndex`` is used, with index 0 meaning UP and
    any other integer meaning DOWN (the same rule the plotting cell applies).

    Args:
        row: Mapping-like trade record (a dict or a pandas Series) that may contain
            ``outcome`` and/or ``outcomeIndex``.

    Returns:
        ``'UP'``, ``'DOWN'`` or ``'UNKNOWN'``. Never ``None``, because the caller
        filters on the ``'UNKNOWN'`` tag.
    """
    label = row.get('outcome')
    if isinstance(label, str):
        if label in UP_LABELS:
            return 'UP'
        if label in DOWN_LABELS:
            return 'DOWN'
    index = row.get('outcomeIndex')
    if index is not None:
        try:
            return 'UP' if int(index) == 0 else 'DOWN'
        except (TypeError, ValueError, OverflowError):
            # Non-numeric, NaN or infinite index: cannot be classified.
            return 'UNKNOWN'
    return 'UNKNOWN'

work['__class'] = work.apply(classify_outcome, axis=1)
# A classifier that falls through without a return yields None/NaN rather than the
# 'UNKNOWN' tag. Treat both as unclassified, otherwise such rows pass the filter and
# are booked as DOWN trades by compute_deltas.
unclassified = work['__class'].isna() | (work['__class'] == 'UNKNOWN')
unknown_ct = int(unclassified.sum())
if unknown_ct:
    print(f"Warning: {unknown_ct} trades could not be classified (tagged UNKNOWN). They are ignored.")
# .copy() so the column assignments that follow act on an independent frame.
work = work[~unclassified].copy()

def compute_deltas(row):
    """Translate one trade into its effect on an UP-share position.

    Every trade is expressed in UP terms: a trade whose ``__class`` is not ``'UP'``
    is priced at ``1 - price`` and its direction is mirrored. Any side other than
    ``'BUY'`` is handled as a sell.

    Args:
        row: Trade record providing ``side``, ``size``, ``price`` and ``__class``.

    Returns:
        ``(delta_shares_up, cash_flow)``. Shares gained are positive; cash paid out
        is negative.
    """
    size = float(row['size'])
    price = float(row['price'])
    is_up = row['__class'] == 'UP'
    is_buy = row['side'] == 'BUY'

    # Price of the UP share implied by this trade.
    up_price = price if is_up else 1.0 - price
    notional = up_price * size

    # UP exposure grows on BUY-UP and on SELL-DOWN, i.e. when both flags agree.
    if is_buy == is_up:
        return size, -notional
    return -size, notional

# Expand the (delta_shares_up, cash_flow) pair returned for each trade into two columns.
work[['delta_shares_up', 'cash_flow']] = work.apply(compute_deltas, axis=1, result_type='expand')

# One row per wallet: first name seen, net UP shares, net cash flow, number of trades.
agg = work.groupby('proxyWallet', as_index=False).agg(
    name=pd.NamedAgg(column='name', aggfunc='first'),
    shares_up=pd.NamedAgg(column='delta_shares_up', aggfunc='sum'),
    cash_flow=pd.NamedAgg(column='cash_flow', aggfunc='sum'),
    trade_count=pd.NamedAgg(column='delta_shares_up', aggfunc='count'),
)

def avg_entry(row):
    """Estimate the average price paid per UP share for one wallet.

    Args:
        row: Record providing ``shares_up`` and ``cash_flow``.

    Returns:
        ``-cash_flow / shares_up`` when ``shares_up`` is strictly positive,
        otherwise NaN.
    """
    held = row['shares_up']
    if not held > 0:
        # Flat, short or NaN position: no entry price is reported.
        return math.nan
    return (-row['cash_flow']) / held

# Add the entry-price estimate, then rank wallets by net UP shares (largest first)
# with a fresh 0..n-1 index.
agg = (
    agg.assign(avg_entry_price_est=agg.apply(avg_entry, axis=1))
       .sort_values('shares_up', ascending=False)
       .reset_index(drop=True)
)

print(f"Cutoff: {CUTOFF.isoformat()} | Wallets: {len(agg)}")
agg.head(25)

Cutoff: 2025-08-23T13:00:00+00:00 | Wallets: 136


Unnamed: 0,proxyWallet,name,shares_up,cash_flow,trade_count,avg_entry_price_est
0,0xa9b73b68f02f26238e2fa948fe0b7dbeb5bff43c,mts0215,2938.26,2.9364,2,-0.000999
1,0xba2c47e32555714e5dc3f623f9b1a1ade2fc050e,a.h.,2871.761856,-49.799995,52,0.017341
2,0x35c0732e069faea97c11aa9cab045562eaab81d6,,1699.0,-1.699,1,0.001
3,0xbd88ce95f23f674a41dcc6735052942115321ca6,xlw2na,1570.06,-18.297,5,0.011654
4,0x834ea21b0b55e1fa3804e2aaf5c15fe0b5648015,XnXo,845.55,-253.9168,171,0.300298
5,0x544658c3633215c89e0d92cad8f972d53ebcfe65,PikachuMoney,820.5,-311.79,1,0.38
6,0x7485d661b858b117a66e1b4fcbecfaea87ac1393,1TickWonder2,561.03,-291.1895,29,0.519027
7,0x104a733d68fa06e916daa5ba7ab9f464ace2d487,1749177280,472.1,59.51769,2,-0.12607
8,0x51626a91e631a2d36ac04966f1e7ac4a69ec3991,Dimidima,413.2009,-253.869998,2,0.614398
9,0x76bc5994bf0a12d08a791b897d1fe1affea7205b,Brundle,409.01,-92.1986,4,0.225419


In [105]:
# Echo the estimator file path; the plotting cell below opens this file.
estimator_path

'data/estimator/2025-08-23/bitcoin-up-or-down-august-22-8pm-et_0x5b1a5ba5964d16da4817b15c5d056c7a2f9f4de32bfd0b980a46fbeb09e144f3.jsonl'

In [106]:
# Plot btcPrice from estimator JSONL vs Polymarket UP price + model estimate (cropped to estimator time range)
import json, pathlib, pandas as pd, plotly.graph_objects as go, datetime, re
from typing import List, Dict, Any

# Ensure trades df exists
if 'df' not in globals():
    raise ValueError("Trades DataFrame `df` not found. Run the load trades cell first.")

plot_df = df.copy()
for col in ['price','size','dt']:
    if col not in plot_df.columns:
        raise ValueError(f'Missing column {col} in trades DataFrame.')

# Classify outcomes if needed
if '__class' not in plot_df.columns:
    UP_LABELS = {"Yes", "YES", "Up", "UP"}
    DOWN_LABELS = {"No", "NO", "Down", "DOWN"}
    def _cls(row):
        """Return 'UP', 'DOWN' or 'UNKNOWN' for one trade row.

        The outcome label is used when recognised; otherwise outcomeIndex 0 means UP
        and any other integer means DOWN.
        """
        if 'outcome' in row and isinstance(row['outcome'], str):
            if row['outcome'] in UP_LABELS: return 'UP'
            if row['outcome'] in DOWN_LABELS: return 'DOWN'
        if 'outcomeIndex' in row:
            try:
                return 'UP' if int(row['outcomeIndex']) == 0 else 'DOWN'
            except (TypeError, ValueError, OverflowError):
                # Missing, NaN or non-numeric index.
                return 'UNKNOWN'
        return 'UNKNOWN'
    plot_df['__class'] = plot_df.apply(_cls, axis=1)

plot_df = plot_df[plot_df['__class'] != 'UNKNOWN'].copy()
# Express every trade price as the price of the UP share. Done column-wise: a
# row-wise apply returns a DataFrame when no rows are left after the filter above,
# which makes the single-column assignment fail.
plot_df['up_price'] = plot_df['price'].astype(float).where(
    plot_df['__class'] == 'UP',
    1.0 - plot_df['price'].astype(float),
)
plot_df['size'] = plot_df['size'].astype(float)

# --- Load estimator file (provided path) ---
estimator_dir = pathlib.Path('/Users/kate/projects/polymarket/')
# NOTE(review): `base_id` is not used anywhere below in this cell.
base_id = None
if 'market' in globals():
    base_id = re.sub(r'^0x','', str(market))

est_df = pd.DataFrame()
rows = []
skipped_lines = 0
with (estimator_dir / estimator_path).open() as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            rows.append(json.loads(line))
        except json.JSONDecodeError:
            # Keep the load best-effort, but count bad lines instead of hiding them.
            skipped_lines += 1
if skipped_lines:
    print(f'Warning: skipped {skipped_lines} malformed line(s) in {estimator_path}')

if rows:
    est_df = pd.DataFrame(rows)
    # First time-like column present, in order of preference.
    tcol = next((c for c in ['timestamp', 'time', 'ts', 'datetime', 'dt'] if c in est_df.columns), None)
    if tcol is None:
        # No time column: estimator rows cannot be placed on the time axis. Fall back
        # to an empty frame (handled downstream) instead of a KeyError in dropna().
        print(f'Warning: no time column found in {estimator_path}; estimator data ignored.')
        est_df = pd.DataFrame()
    else:
        # NOTE(review): without unit=, numeric values are read by pd.to_datetime as
        # nanoseconds since the epoch — confirm the estimator writes ISO strings.
        est_df['dt'] = pd.to_datetime(est_df[tcol], utc=True, errors='coerce')
        est_df = est_df.dropna(subset=['dt'])
        for c in ['btcPrice','target','estimate']:
            if c in est_df.columns:
                est_df[c] = pd.to_numeric(est_df[c], errors='coerce')
        keep_cols = [c for c in ['dt','btcPrice','estimate','target'] if c in est_df.columns]
        est_df = est_df[keep_cols].sort_values('dt')

# Keep only the trades that fall inside the estimator's time span (bounds inclusive).
if not est_df.empty:
    est_start = est_df['dt'].min()
    est_end = est_df['dt'].max()
    plot_df = plot_df[plot_df['dt'].between(est_start, est_end)].copy()


def _size_weighted_up_price(group):
    """Size-weighted average of `up_price` over one group of trades."""
    return (group['up_price'] * group['size']).sum() / group['size'].sum()


# One size-weighted UP price per second, computed after any cropping above.
if plot_df.empty:
    agg_up = pd.DataFrame(columns=['dt_sec','up_wap'])
else:
    plot_df['dt_sec'] = plot_df['dt'].dt.floor('s')
    agg_up = (
        plot_df.groupby('dt_sec')
        .apply(_size_weighted_up_price, include_groups=False)
        .rename('up_wap')
        .to_frame()
        .reset_index()
    )


In [107]:
fig = go.Figure()

# Primary (left) axis: the btcPrice and target columns of the estimator file.
if not est_df.empty:
    if 'btcPrice' in est_df.columns:
        fig.add_trace(go.Scatter(
            x=est_df['dt'],
            y=est_df['btcPrice'],
            name='btcPrice',
            mode='lines',
            line=dict(color='#1f77b4'),
        ))
    if 'target' in est_df.columns:
        fig.add_trace(go.Scatter(
            x=est_df['dt'],
            y=est_df['target'],
            name='Target',
            mode='lines',
            line=dict(color='#9467bd', dash='dot'),
        ))

# Secondary (right, y2) axis: per-second UP price, individual trades, model estimate.
if not agg_up.empty:
    fig.add_trace(go.Scatter(
        x=agg_up['dt_sec'],
        y=agg_up['up_wap'],
        name='Polymarket UP WAP',
        mode='lines',
        line=dict(color='#ff7f0e'),
        yaxis='y2',
    ))
if not plot_df.empty:
    fig.add_trace(go.Scatter(
        x=plot_df['dt'],
        y=plot_df['up_price'],
        name='Trades (UP price)',
        mode='markers',
        marker=dict(color='rgba(255,127,14,0.35)', size=6),
        hovertext=plot_df['size'],
        hovertemplate='Time=%{x}<br>Price=%{y:.3f}<br>Size=%{hovertext}<extra></extra>',
        yaxis='y2',
    ))
if not est_df.empty and 'estimate' in est_df.columns:
    fig.add_trace(go.Scatter(
        x=est_df['dt'],
        y=est_df['estimate'],
        name='Model Estimate',
        mode='lines',
        line=dict(color='#2ca02c', dash='dash'),
        yaxis='y2',
    ))

# Gather every series drawn on the probability axis so its range can follow the data.
prob_series = []
if not agg_up.empty:
    prob_series.append(agg_up['up_wap'])
if not plot_df.empty:
    prob_series.append(plot_df['up_price'])
if not est_df.empty and 'estimate' in est_df.columns:
    prob_series.append(est_df['estimate'])

# Default to the full [0, 1] range; tighten it only when there are values to look at.
y2_range = [0,1]
if prob_series:
    import numpy as _np
    combined = _np.concatenate([s.dropna().to_numpy() for s in prob_series])
    if combined.size:
        pmin = float(combined.min())
        pmax = float(combined.max())
        # A flat series gets a nominal 0.05 span so the padding is not zero.
        span = pmax - pmin if pmax > pmin else 0.05
        pad = span * 0.05
        # Clamp to [0, 1], except that data above 1 is still shown in full.
        y2_range = [
            max(0.0, pmin - pad),
            pmax + pad if pmax > 1.0 else min(1.0, pmax + pad),
        ]

# Pin the x axis to the estimator's time span when there is one; None leaves it automatic.
xrange = [est_df['dt'].min(), est_df['dt'].max()] if not est_df.empty else None

fig.update_layout(
    title=f"Market: {slug} ({market})",
    xaxis={'title': 'Time', 'range': xrange},
    yaxis={'title': 'btcPrice / Target', 'side': 'left', 'fixedrange': False},
    yaxis2={
        'title': 'Probability (UP / Estimate)',
        'overlaying': 'y',
        'side': 'right',
        'range': y2_range,
        'tickformat': '.3f',
    },
    # Horizontal legend anchored by its top edge below the plot area.
    legend={
        'orientation': 'h',
        'yanchor': 'top',
        'y': -0.15,
        'x': 0,
        'xanchor': 'left',
    },
    margin={'l': 60, 'r': 60, 't': 60, 'b': 40},
    hovermode='x unified',
)
fig.show(renderer="browser")

In [108]:
# Independent copy of the columns needed for the per-trade UP-side view.
trades2 = df.loc[:, ['dt', 'side', 'outcome', 'name', 'price', 'size', 'proxyWallet']].copy()

In [109]:
import pandas as pd

# NOTE(review): these options are global and stay in effect for every later cell;
# max_rows=None prints whole frames wherever one is displayed without head()/tail().
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

UP_LABELS = {"Yes", "YES", "Up", "UP"}

# up_price: normalize to UP outcome probability. Any outcome not in UP_LABELS is
# priced as 1 - price. Computed column-wise: a row-wise apply returns a DataFrame on
# an empty frame, which cannot be assigned to a single column.
trades2['up_price'] = trades2['price'].astype(float).where(
    trades2['outcome'].isin(UP_LABELS),
    1.0 - trades2['price'].astype(float),
)

# up_side: perspective of the UP outcome (BUY means increasing UP exposure)
def _up_side(r):
    """Return the trade side as seen from the UP outcome.

    A trade on an outcome listed in ``UP_LABELS`` keeps its side. For any other
    outcome BUY and SELL are swapped, because buying the opposite outcome reduces UP
    exposure and selling it increases it. A side that is neither ``'BUY'`` nor
    ``'SELL'`` (including a missing one) is returned unchanged.
    """
    side = r.get('side')
    if r.get('outcome') in UP_LABELS:
        return side
    if side == 'BUY':
        mirrored = 'SELL'
    elif side == 'SELL':
        mirrored = 'BUY'
    else:
        mirrored = side
    return mirrored

# Attach the UP-oriented side to every trade.
trades2 = trades2.assign(up_side=trades2.apply(_up_side, axis=1))

# Last ten rows, UP-oriented columns first and the raw side/outcome last.
trades2.tail(10)[['dt', 'up_side', 'up_price', 'size', 'name', 'proxyWallet', 'side', 'outcome']]

Unnamed: 0,dt,up_side,up_price,size,name,proxyWallet,side,outcome
9,2025-08-23 01:09:22+00:00,BUY,0.01,10.0,0x8EdACaad527Bff6a64C3d3fdE42fE2DE6dA8ACc9-1750224522875,0x8edacaad527bff6a64c3d3fde42fe2de6da8acc9,SELL,Down
8,2025-08-23 01:25:55+00:00,BUY,0.001,3591.0,mts0215,0xa9b73b68f02f26238e2fa948fe0b7dbeb5bff43c,SELL,Down
7,2025-08-23 01:26:59+00:00,BUY,0.001,8.19,quchuanping,0xac578fb03d867fa484dc3d70678ba955da67602b,SELL,Down
6,2025-08-23 01:27:19+00:00,BUY,0.001,1699.0,,0x35c0732e069faea97c11aa9cab045562eaab81d6,BUY,Up
5,2025-08-23 01:30:09+00:00,BUY,0.001,250.0,TechNinjaX,0x959567abebfe0fb7c4fd4be0136b17d0fe391b16,SELL,Down
4,2025-08-23 01:33:17+00:00,BUY,0.001,1660.0,xlw2na,0xbd88ce95f23f674a41dcc6735052942115321ca6,SELL,Down
3,2025-08-23 01:41:53+00:00,BUY,0.001,5.0,take888,0xf278a463908a736e139046c917f826d44a1ebe16,SELL,Down
2,2025-08-23 01:46:43+00:00,BUY,0.001,600.01,1749177280,0x104a733d68fa06e916daa5ba7ab9f464ace2d487,SELL,Down
1,2025-08-23 01:48:55+00:00,BUY,0.001,5.0,matrix12,0xbd8a49743aec88e31731f7a9ef5ffbb70ea9a5cf,SELL,Down
0,2025-08-23 01:49:57+00:00,BUY,0.001,27.45,Bluedog47,0xaa669a8624affb7ba6973d086726126526f06a3c,SELL,Down
