In [426]:
from pathlib import Path

# Estimator log under analysis. Its file name follows "<slug>_<market id>.jsonl".
estimator_path = 'data/estimator/2025-08-23/bitcoin-up-or-down-august-23-8am-et_0x127c15c02f13cc43c57754d9c40babc2f93233eb6a0bd7a08d34835148991d41.jsonl'

# Split the file name once, at its last underscore: slug on the left, market id on the right.
name_parts = Path(estimator_path).name.rsplit('_', 1)
slug = name_parts[0]
market = name_parts[-1].removesuffix('.jsonl')
print(f'market: {market} slug: {slug}')

market: 0x127c15c02f13cc43c57754d9c40babc2f93233eb6a0bd7a08d34835148991d41 slug: bitcoin-up-or-down-august-23-8am-et


In [427]:
import pathlib, json, time, datetime, requests
from typing import List, Dict, Any, Optional

# Market id whose trades are downloaded; `market` must already be defined by an earlier cell.
MARKET_ID = market

BASE_URL = "https://data-api.polymarket.com/trades"  # trades endpoint, per docs
MAX_LIMIT = 500  # API max page size; requested limits are clamped to this
# NOTE(review): absolute, user-specific path — confirm before running on another machine.
OUTPUT_DIR = pathlib.Path('/Users/kate/projects/polymarket/data/trades')
# Side effect of running this cell: create the output directory (and missing parents).
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

def fetch_trades_batch(market: str, limit: int = MAX_LIMIT, offset: int = 0, taker_only: bool = True) -> List[Dict[str, Any]]:
    """Fetch one page of trades for ``market`` from ``BASE_URL``.

    Args:
        market: Market id sent as the ``market`` query parameter.
        limit: Requested page size; clamped to ``MAX_LIMIT`` before sending.
        offset: Number of trades to skip (offset pagination).
        taker_only: Sent as the lowercase string form in ``takerOnly``.

    Returns:
        The decoded JSON payload, which must be a list of trade dicts.

    Raises:
        requests.HTTPError: If the response status is an error.
        ValueError: If the decoded payload is not a list.
    """
    query = dict(
        market=market,
        limit=min(limit, MAX_LIMIT),
        offset=offset,
        takerOnly=str(taker_only).lower(),  # 'true' / 'false'
    )
    response = requests.get(BASE_URL, params=query, timeout=30)
    response.raise_for_status()
    payload = response.json()
    if isinstance(payload, list):
        return payload
    raise ValueError(f'Unexpected response type: {type(payload)} | {payload}')

# Trades file is named after the first 16 characters of the market id.
base = market[:16]
jsonl_path = OUTPUT_DIR.joinpath(base + ".jsonl")

def download_all_trades(market: str, batch_limit: int = MAX_LIMIT, sleep: float = 0.1, max_batches: Optional[int] = None, taker_only: bool = True) -> pathlib.Path:
    """Download all trades for ``market`` into a JSONL file plus a summary JSON.

    Pages through ``fetch_trades_batch`` with offset pagination until an empty
    page or a short page is returned, or until ``max_batches`` pages were saved.

    Args:
        market: Market id to download. Output file names are derived from its
            first 16 characters, so the result always matches this argument.
        batch_limit: Requested page size (effectively capped at ``MAX_LIMIT``).
        sleep: Seconds to wait between consecutive page requests; falsy disables.
        max_batches: Optional cap on the number of pages to save.
        taker_only: Forwarded to ``fetch_trades_batch``.

    Returns:
        Path of the written ``.jsonl`` file inside ``OUTPUT_DIR``.

    Raises:
        Whatever ``fetch_trades_batch`` raises. In that case no (partial)
        ``.jsonl`` file is left behind under the final name.
    """
    # Derive the output names from the argument rather than from notebook
    # globals, so calling this for another market cannot overwrite the wrong file.
    stem = market[:16]
    out_path = OUTPUT_DIR / f"{stem}.jsonl"
    # Write to a temporary sibling first: callers use "file exists" as the
    # "already downloaded" signal, so a half-written file must never get the final name.
    tmp_path = out_path.with_name(out_path.name + '.tmp')
    # fetch_trades_batch clamps the page size to MAX_LIMIT; the short-page test
    # must use the same clamped value or a larger batch_limit would stop after page one.
    page_size = min(batch_limit, MAX_LIMIT)

    total = 0
    offset = 0
    batch_no = 0  # counts pages that actually contained trades
    try:
        with tmp_path.open('w') as f:
            while True:
                batch = fetch_trades_batch(market, limit=page_size, offset=offset, taker_only=taker_only)
                if not batch:
                    break
                batch_no += 1
                f.writelines(json.dumps(tr) + '\n' for tr in batch)
                got = len(batch)
                total += got
                offset += got
                # Stop conditions
                if got < page_size:
                    # Last page
                    break
                if max_batches and batch_no >= max_batches:
                    break
                if sleep:
                    time.sleep(sleep)
    except BaseException:
        # Drop the partial download, then let the original error propagate.
        tmp_path.unlink(missing_ok=True)
        raise
    tmp_path.replace(out_path)

    summary = {
        'market': market,
        'file': out_path.name,
        'total_trades': total,
        'batches': batch_no,
        'note': 'Trades returned newest-first (descending timestamp). Offset pagination accumulates all until exhaustion.'
    }
    (OUTPUT_DIR / f"{stem}_summary.json").write_text(json.dumps(summary, indent=2))
    print(f"Saved {total} trades across {batch_no} batch(es) -> {out_path}")
    return out_path


# Download trades only when there is no usable local copy. A zero-byte file is
# treated as missing: it is what an interrupted download leaves behind, and
# trusting it would silently load zero trades on every later run.
if not jsonl_path.exists() or jsonl_path.stat().st_size == 0:
    download_all_trades(MARKET_ID)


In [428]:
# Load latest downloaded trades for the current `market` into a DataFrame (asset forced to string)
import pandas as pd, pathlib, json, datetime, os
from typing import List, Dict, Any

MARKET_ID = globals().get('market')
if not MARKET_ID:
    raise ValueError("`market` variable not defined. Set it before running this cell.")

trades_dir = pathlib.Path('/Users/kate/projects/polymarket/data/trades')
prefix = MARKET_ID[:16] + '_'  # not used below; kept in case another cell reads it

# Resolve the trades file from the current market instead of relying on a
# `jsonl_path` left over from the download cell, which may be stale or undefined.
jsonl_path = trades_dir / f"{MARKET_ID[:16]}.jsonl"
if not jsonl_path.exists():
    raise FileNotFoundError(f'No trades file for market {MARKET_ID} at {jsonl_path}; run the download cell first.')

print(f'Loading trades from {jsonl_path}')

# Parse the JSONL by hand so dtypes stay under our control; in particular the
# very large `asset` identifiers are kept as exact strings.
rows: List[Dict[str, Any]] = []
with jsonl_path.open() as f:
    for obj in (json.loads(raw) for raw in f if raw.strip()):
        # Stringify before pandas sees the value (no numeric inference / float formatting).
        if 'asset' in obj:
            obj['asset'] = str(obj['asset'])
        rows.append(obj)

df = pd.DataFrame(rows)

# # Ensure pandas didn't coerce asset
# if 'asset' in df.columns:
#     df['asset'] = df['asset'].astype('string')  # pandas string dtype retains exact text

# When a `timestamp` column is present: derive a timezone-aware UTC `dt` column
# from it (values are interpreted as epoch seconds), then order trades oldest-first.
if 'timestamp' in df.columns:
    def _to_dt(x):
        """Convert an epoch-seconds value to an aware UTC datetime, or NaT if that fails."""
        try:
            epoch_seconds = int(x)
            return datetime.datetime.fromtimestamp(epoch_seconds, tz=datetime.timezone.utc)
        except Exception:
            return pd.NaT

    df['dt'] = df['timestamp'].map(_to_dt)
    df = df.sort_values('timestamp')

print(f'Trades loaded: {len(df)} rows')
if 'asset' in df.columns:
    # Report dtype and string length to confirm the identifiers were not coerced.
    asset_col = df['asset']
    sample_asset = asset_col.iloc[0]
    print(f'asset dtype: {asset_col.dtype}; sample asset length: {len(sample_asset)}')

df.head(5)

Loading trades from /Users/kate/projects/polymarket/data/trades/0x127c15c02f13cc.jsonl


Trades loaded: 1375 rows
asset dtype: object; sample asset length: 77


Unnamed: 0,proxyWallet,side,asset,conditionId,size,price,timestamp,title,slug,icon,eventSlug,outcome,outcomeIndex,name,pseudonym,bio,profileImage,profileImageOptimized,transactionHash,dt
1374,0xe33d60a1aa150ae45bad73fbe9538e9ed1c86cd1,BUY,39464925550823896963253209197317144316931661251374887215448106844001972097392,0x127c15c02f13cc43c57754d9c40babc2f93233eb6a0bd7a08d34835148991d41,13.725489,0.51,1755864874,"Bitcoin Up or Down - August 23, 8AM ET",bitcoin-up-or-down-august-23-8am-et,https://polymarket-upload.s3.us-east-2.amazonaws.com/BTC+fullsize.png,bitcoin-up-or-down-august-23-8am-et,Up,0,gogi1983,Low-Simvastatin,,,,0x5855595565ecd4f502529cd6ba43a0d1a6ce32a1291e5df4d3e5e8563a8c49d7,2025-08-22 12:14:34+00:00
1373,0xe33d60a1aa150ae45bad73fbe9538e9ed1c86cd1,BUY,83019924045357935608809501851767199087742603664266370680549095065457529134480,0x127c15c02f13cc43c57754d9c40babc2f93233eb6a0bd7a08d34835148991d41,1.960783,0.51,1755864878,"Bitcoin Up or Down - August 23, 8AM ET",bitcoin-up-or-down-august-23-8am-et,https://polymarket-upload.s3.us-east-2.amazonaws.com/BTC+fullsize.png,bitcoin-up-or-down-august-23-8am-et,Down,1,gogi1983,Low-Simvastatin,,,,0x8eba6517030c6058ed110354c080833c6d19dea3caa997f1bbdf323018c8df55,2025-08-22 12:14:38+00:00
1372,0xd245d5be0b37020971f5a173b77d0b25b822356a,BUY,83019924045357935608809501851767199087742603664266370680549095065457529134480,0x127c15c02f13cc43c57754d9c40babc2f93233eb6a0bd7a08d34835148991d41,196.07843,0.51,1755908342,"Bitcoin Up or Down - August 23, 8AM ET",bitcoin-up-or-down-august-23-8am-et,https://polymarket-upload.s3.us-east-2.amazonaws.com/BTC+fullsize.png,bitcoin-up-or-down-august-23-8am-et,Down,1,duskmere,Downright-Accountability,,,,0x6b507537a15a7160a8a4581bce4a871d8faf4a28d9596a9027db8c0ef9010df9,2025-08-23 00:19:02+00:00
1371,0xd245d5be0b37020971f5a173b77d0b25b822356a,SELL,83019924045357935608809501851767199087742603664266370680549095065457529134480,0x127c15c02f13cc43c57754d9c40babc2f93233eb6a0bd7a08d34835148991d41,196.07,0.49,1755914853,"Bitcoin Up or Down - August 23, 8AM ET",bitcoin-up-or-down-august-23-8am-et,https://polymarket-upload.s3.us-east-2.amazonaws.com/BTC+fullsize.png,bitcoin-up-or-down-august-23-8am-et,Down,1,duskmere,Downright-Accountability,,,,0xd9db418c2ec99db2e652d51a19ee0e83e6202baff10e765a1499a2d66aa1ca0a,2025-08-23 02:07:33+00:00
1370,0x09bc1527a6f9ebc0fcf0c5b555dfdf8ad56a0082,BUY,39464925550823896963253209197317144316931661251374887215448106844001972097392,0x127c15c02f13cc43c57754d9c40babc2f93233eb6a0bd7a08d34835148991d41,1.960783,0.51,1755923831,"Bitcoin Up or Down - August 23, 8AM ET",bitcoin-up-or-down-august-23-8am-et,https://polymarket-upload.s3.us-east-2.amazonaws.com/BTC+fullsize.png,bitcoin-up-or-down-august-23-8am-et,Up,0,bobofish,Happy-Go-Lucky-Disclaimer,,https://polymarket-upload.s3.us-east-2.amazonaws.com/profile-image-3263295-c9b704d8-c868-4934-a99b-22b387ab355c.jpeg,,0x3de65792041b2587f506df6806e308fbadff00d10d481dae6e0106f51acc085f,2025-08-23 04:37:11+00:00


In [429]:
# Compute per-wallet UP positions at a cutoff timestamp
from typing import Optional
import pandas as pd, math, datetime

# Outcome labels that identify the UP and the DOWN token respectively.
UP_LABELS = {"Yes", "YES", "Up", "UP"}
DOWN_LABELS = {"No", "NO", "Down", "DOWN"}

# Validate the trades frame before doing any work on it.
trade_columns = set(df.columns)

if 'dt' not in trade_columns:
    raise ValueError("DataFrame df must contain 'dt' datetime column; re-run load cell.")

required_cols = {'proxyWallet', 'side', 'size', 'price', 'name'}
missing = required_cols - trade_columns
if missing:
    raise ValueError(f"Missing required trade columns: {missing}")

if not ({'outcome', 'outcomeIndex'} & trade_columns):
    raise ValueError("Need either 'outcome' or 'outcomeIndex' column to classify trades.")

# Work on a copy so the loaded trades stay untouched.
work = df.copy()

def classify_outcome(row):
    """Classify a trade row as ``'UP'``, ``'DOWN'`` or ``'UNKNOWN'``.

    The textual ``outcome`` label is tried first (against ``UP_LABELS`` /
    ``DOWN_LABELS``). If it is absent or unrecognised, ``outcomeIndex`` is used
    as a fallback: index 0 means UP, any other integer means DOWN.

    Args:
        row: Mapping-like trade record (dict or pandas Series).

    Returns:
        ``'UP'``, ``'DOWN'``, or ``'UNKNOWN'`` when neither field is usable.
        It never returns ``None``, so callers can filter on ``'UNKNOWN'``.
    """
    if 'outcome' in row and isinstance(row['outcome'], str):
        o = row['outcome']
        if o in UP_LABELS:
            return 'UP'
        if o in DOWN_LABELS:
            return 'DOWN'
    if 'outcomeIndex' in row:
        try:
            return 'UP' if int(row['outcomeIndex']) == 0 else 'DOWN'
        except (TypeError, ValueError, OverflowError):
            # Missing / NaN / non-numeric index: fall through to UNKNOWN.
            pass
    return 'UNKNOWN'

work['__class'] = work.apply(classify_outcome, axis=1)
# Keep only rows positively classified as UP or DOWN. Comparing against the
# literal 'UNKNOWN' alone would let None/NaN classes through, and those rows
# would then be booked as DOWN by compute_deltas.
classified = work['__class'].isin(['UP', 'DOWN'])
unknown_ct = int((~classified).sum())
if unknown_ct:
    print(f"Warning: {unknown_ct} trades could not be classified (tagged UNKNOWN). They are ignored.")
# .copy() so the column assignments that follow do not target a slice of the original frame.
work = work[classified].copy()

def compute_deltas(row):
    """Express one trade as a change in UP shares and a cash flow.

    Rows whose ``__class`` is not ``'UP'`` are treated as DOWN trades and are
    converted to UP terms with ``1.0 - price``; a side other than ``'BUY'`` is
    treated as a sell.

    Args:
        row: Mapping-like record with ``side``, ``size``, ``price``, ``__class``.

    Returns:
        Tuple ``(delta_shares_up, cash_flow)``. Trades that add UP exposure
        (BUY UP, SELL DOWN) return ``(+size, -up_price * size)``; the others
        return ``(-size, +up_price * size)``.
    """
    qty = float(row['size'])
    px = float(row['price'])
    is_up = row['__class'] == 'UP'
    is_buy = row['side'] == 'BUY'

    up_px = px if is_up else 1.0 - px
    notional = up_px * qty
    # Buying UP and selling DOWN both increase UP exposure.
    adds_up_exposure = is_buy if is_up else not is_buy
    if adds_up_exposure:
        return qty, -notional
    return -qty, notional

# Per-trade deltas: column 0 is the UP-share change, column 1 the cash flow.
trade_deltas = work.apply(compute_deltas, axis=1, result_type='expand')
work['delta_shares_up'] = trade_deltas[0]
work['cash_flow'] = trade_deltas[1]

# Collapse to one row per wallet.
agg = (
    work
    .groupby('proxyWallet', as_index=False)
    .agg(
        name=pd.NamedAgg(column='name', aggfunc='first'),
        shares_up=pd.NamedAgg(column='delta_shares_up', aggfunc='sum'),
        cash_flow=pd.NamedAgg(column='cash_flow', aggfunc='sum'),
        trade_count=pd.NamedAgg(column='delta_shares_up', aggfunc='count'),
    )
)

def avg_entry(row):
    """Estimate the average UP entry price for a wallet row.

    Returns ``-cash_flow / shares_up`` when the net UP position is strictly
    positive; otherwise (flat, short, or NaN position) returns NaN.
    """
    net_shares = row['shares_up']
    if not net_shares > 0:
        return math.nan
    return (-row['cash_flow']) / net_shares

# Attach the entry-price estimate, then rank wallets by net UP position (largest first).
agg['avg_entry_price_est'] = agg.apply(avg_entry, axis=1)
agg = agg.sort_values('shares_up', ascending=False).reset_index(drop=True)

agg.head(25)

Unnamed: 0,proxyWallet,name,shares_up,cash_flow,trade_count,avg_entry_price_est
0,0xb9fc8078fd6c0275c631ec10fcf8d5cc52d6da76,jacobfox,2265.0,-1156.29,6,0.510503
1,0x104a733d68fa06e916daa5ba7ab9f464ace2d487,1749177280,1990.0,-458.406,3,0.230355
2,0x8b424e123b219710a6ae219f6b61290879534eb6,MartinDupon,1076.480758,-552.759993,14,0.513488
3,0x3d2d66eb933cfa7aa7b9fc21e6614f080de99360,,873.63,-335.8721,17,0.384456
4,0x5d746fbafd770c170bf8863363f9987b31ecbe87,hwylson,822.99,-1.0299,2,0.001251
5,0xecbeba24a9e7d8fc3091db367f2578e911ac42ff,Maximus22,744.55,-37.4455,2,0.050293
6,0x3930f220bb01ccd843478928b6a640f6b7757995,ALEXYANNIS,645.98577,-292.999998,2,0.45357
7,0x86674715a977c452e3af11a1ac99ed5d17019356,0x86674715A977c452E3Af11A1ac99ED5d17019356-1750778348148,594.98,-0.49998,2,0.00084
8,0xc631d9d610b9939f0b915b1916864e9b806876f6,EGH,587.99,14.5909,14,-0.024815
9,0xf1e0bee55e83ebf7fc60f38667cfea7eb5520f73,lexfry11,577.99,-60.4389,1,0.104567


In [430]:
# Echo the estimator file path chosen in the first cell (it is a relative path).
estimator_path

'data/estimator/2025-08-23/bitcoin-up-or-down-august-23-8am-et_0x127c15c02f13cc43c57754d9c40babc2f93233eb6a0bd7a08d34835148991d41.jsonl'

In [431]:
# Plot btcPrice from estimator JSONL vs Polymarket UP price + model estimate (cropped to estimator time range)
import json, pathlib, pandas as pd, plotly.graph_objects as go, datetime, re
from typing import List, Dict, Any

# The trades frame must have been loaded by an earlier cell.
if 'df' not in globals():
    raise ValueError("Trades DataFrame `df` not found. Run the load trades cell first.")

plot_df = df.copy()

# Fail on the first required column that is absent.
missing_plot_col = next((c for c in ('price', 'size', 'dt') if c not in plot_df.columns), None)
if missing_plot_col is not None:
    raise ValueError(f'Missing column {missing_plot_col} in trades DataFrame.')

# Classify each trade as UP / DOWN / UNKNOWN unless a `__class` column already exists.
if '__class' not in plot_df.columns:
    UP_LABELS = {"Yes", "YES", "Up", "UP"}
    DOWN_LABELS = {"No", "NO", "Down", "DOWN"}

    def _cls(row):
        """Classify by `outcome` label first, then by `outcomeIndex` (0 => UP, else DOWN)."""
        label = row['outcome'] if 'outcome' in row else None
        if isinstance(label, str):
            if label in UP_LABELS:
                return 'UP'
            if label in DOWN_LABELS:
                return 'DOWN'
        if 'outcomeIndex' not in row:
            return 'UNKNOWN'
        try:
            outcome_idx = int(row['outcomeIndex'])
        except Exception:
            return 'UNKNOWN'
        return 'UP' if outcome_idx == 0 else 'DOWN'

    plot_df['__class'] = plot_df.apply(_cls, axis=1)

# Drop unclassifiable trades, then express every price from the UP side.
plot_df = plot_df[plot_df['__class'] != 'UNKNOWN'].copy()

def _as_up_price(r):
    """Return the trade price in UP terms: unchanged for UP, ``1.0 - price`` otherwise."""
    px = float(r['price'])
    return px if r['__class'] == 'UP' else 1.0 - px

plot_df['up_price'] = plot_df.apply(_as_up_price, axis=1)
plot_df['size'] = plot_df['size'].astype(float)

# --- Load estimator file (provided path) ---
# Project root that the relative estimator path is resolved against.
estimator_dir = pathlib.Path('/Users/kate/projects/polymarket/')
# Market id without a leading "0x" prefix; None when `market` has not been defined.
base_id = re.sub(r'^0x','', str(market)) if 'market' in globals() else None

# Read the estimator JSONL into `est_df` (left empty when nothing usable is found).
est_df = pd.DataFrame()
rows = []
skipped_lines = 0
with (estimator_dir / estimator_path).open() as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        try:
            rows.append(json.loads(line))
        except json.JSONDecodeError:
            # Tolerate malformed lines, but count them so data loss is visible.
            skipped_lines += 1
if skipped_lines:
    print(f'Warning: skipped {skipped_lines} malformed line(s) in {estimator_path}')

if rows:
    est_df = pd.DataFrame(rows)
    # Prefer `timestamp`; otherwise use the first alternative time column present.
    time_col = next((c for c in ['timestamp', 'time', 'ts', 'datetime', 'dt'] if c in est_df.columns), None)
    if time_col is None:
        # Without a time column nothing can be plotted; fall back to the empty
        # frame (which the later cells already guard against) instead of
        # failing with a KeyError on the missing `dt` column.
        print(f'Warning: no time column found in {estimator_path}; estimator data ignored.')
        est_df = pd.DataFrame()
    else:
        est_df['dt'] = pd.to_datetime(est_df[time_col], utc=True, errors='coerce')
        est_df = est_df.dropna(subset=['dt'])
        for c in ['btcPrice','target','estimate','scalingFactor']:
            if c in est_df.columns:
                est_df[c] = pd.to_numeric(est_df[c], errors='coerce')
        keep_cols = [c for c in ['dt','btcPrice','estimate','target','scalingFactor'] if c in est_df.columns]
        est_df = est_df[keep_cols].sort_values('dt')

# Restrict trades to the (inclusive) time window covered by the estimator data, if any.
if not est_df.empty:
    est_start = est_df['dt'].min()
    est_end = est_df['dt'].max()
    in_window = (plot_df['dt'] >= est_start) & (plot_df['dt'] <= est_end)
    plot_df = plot_df[in_window].copy()

# Size-weighted average UP price per second, computed after any cropping.
if plot_df.empty:
    agg_up = pd.DataFrame(columns=['dt_sec','up_wap'])
else:
    plot_df['dt_sec'] = plot_df['dt'].dt.floor('s')

    def _size_weighted_price(g):
        """Size-weighted mean of `up_price` within one group of trades."""
        return (g['up_price'] * g['size']).sum() / g['size'].sum()

    per_second = plot_df.groupby('dt_sec').apply(_size_weighted_price, include_groups=False)
    agg_up = per_second.rename('up_wap').to_frame().reset_index()


In [432]:
# Show the first estimator row as a quick check of the parsed columns.
est_df.iloc[0]

dt               2025-08-23 12:00:01+00:00
btcPrice                         115310.13
estimate                          0.507622
target                           115310.13
scalingFactor                     1.235823
Name: 0, dtype: object

In [433]:
fig = go.Figure()

# btcPrice (and target) on primary axis
if not est_df.empty and 'btcPrice' in est_df.columns:
    fig.add_trace(go.Scatter(x=est_df['dt'], y=est_df['btcPrice'], name='btcPrice', mode='lines', line=dict(color='#1f77b4')))

target_val = None
# Scaling factor is only used for the legend label. Guard it like every other
# estimator column so an empty frame or a missing column does not abort the plot;
# NaN keeps the later `:.2f` formatting valid.
scaling_factor = float('nan')
if not est_df.empty and 'scalingFactor' in est_df.columns:
    known_factors = est_df['scalingFactor'].dropna()
    if not known_factors.empty:
        scaling_factor = float(known_factors.iloc[0])

y_primary_range = None
known_targets = est_df['target'].dropna() if (not est_df.empty and 'target' in est_df.columns) else None
if known_targets is not None and not known_targets.empty:
    # Use the first non-null target; the first row itself may be NaN.
    target_val = float(known_targets.iloc[0])
    # Compute symmetric y-range so target is centered
    if 'btcPrice' in est_df.columns and not est_df['btcPrice'].dropna().empty:
        # Plain float so the printed range is not wrapped in np.float64(...).
        max_dev = float((est_df['btcPrice'] - target_val).abs().max())
        pad = max_dev * 0.1 if max_dev > 0 else 1.0
        span = max_dev + pad
        print(f"Target: {target_val:.3f} | Max deviation: {max_dev:.3f} | Span: {span:.3f}")
        y_primary_range = [target_val - span, target_val + span]
    else:
        y_primary_range = [target_val - 1, target_val + 1]

    # Horizontal reference line at the target price.
    fig.add_trace(go.Scatter(
        x=est_df['dt'],
        y=[target_val]*len(est_df),
        name='Target',
        mode='lines',
        line=dict(color='#9467bd', dash='dot'),
    ))
    # Annotate
    fig.add_annotation(x=est_df['dt'].min(),
                       y=target_val,
                       xanchor='left',
                       yanchor='bottom',
                       text='Target',
                       showarrow=False,
                       font=dict(color='#9467bd'))

print(y_primary_range)

# Probability-like series (market WAP, raw trades, model estimate) go on the secondary axis.
has_wap = not agg_up.empty
has_trades = not plot_df.empty
has_estimate = not est_df.empty and 'estimate' in est_df.columns

if has_wap:
    fig.add_trace(go.Scatter(
        x=agg_up['dt_sec'],
        y=agg_up['up_wap'],
        name='Polymarket UP WAP',
        mode='lines',
        line={'color': '#ff7f0e'},
        yaxis='y2',
    ))
if has_trades:
    fig.add_trace(go.Scatter(
        x=plot_df['dt'],
        y=plot_df['up_price'],
        name='Trades (UP price)',
        mode='markers',
        marker={'color': 'rgba(255,127,14,0.35)', 'size': 6},
        hovertext=plot_df['size'],
        hovertemplate='Time=%{x}<br>Price=%{y:.3f}<br>Size=%{hovertext}<extra></extra>',
        yaxis='y2',
    ))
if has_estimate:
    fig.add_trace(go.Scatter(
        x=est_df['dt'],
        y=est_df['estimate'],
        name=f'Estimate (factor {scaling_factor:.2f})',
        mode='lines',
        line={'color': '#2ca02c', 'dash': 'dash'},
        yaxis='y2',
    ))

# Series collected for a dynamic probability range; the axis itself is pinned to [0, 1] below.
prob_series = []
if has_wap:
    prob_series.append(agg_up['up_wap'])
if has_trades:
    prob_series.append(plot_df['up_price'])
if has_estimate:
    prob_series.append(est_df['estimate'])

y2_range = [0,1]

# The x-axis spans exactly the estimator window when estimator data exists.
xrange = [est_df['dt'].min(), est_df['dt'].max()] if not est_df.empty else None

# Two y-axes: BTC price on the left, probabilities on the right; legend sits below the plot.
fig.update_layout(
    title=f"Market: {slug} ({market})",
    xaxis={'title': 'Time', 'range': xrange},
    yaxis={
        'title': 'btcPrice / Target',
        'side': 'left',
        'fixedrange': False,
        'range': y_primary_range,
        'tickformat': ",.2f",
    },
    yaxis2={
        'title': 'Probability (UP / Estimate)',
        'overlaying': 'y',
        'side': 'right',
        'range': y2_range,
        'tickformat': '.3f',
    },
    legend={
        'orientation': 'h',
        'yanchor': 'top',
        'y': -0.15,  # push below plot
        'x': 0,
        'xanchor': 'left',
    },
    margin={'l': 60, 'r': 60, 't': 60, 'b': 40},
    hovermode='x unified',
)
fig.show(renderer="browser")

Target: 115310.130 | Max deviation: 176.240 | Span: 193.864
[np.float64(115116.26600000002), np.float64(115503.99399999999)]


In [434]:
# Independent frame holding only the columns needed for the trade-by-trade view.
trades2 = df[['dt', 'side', 'outcome', 'name', 'price', 'size', 'proxyWallet']].copy()

In [435]:
import pandas as pd

# Show full cell contents and every row when frames are displayed.
for display_option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, None)

UP_LABELS = {"Yes", "YES", "Up", "UP"}

# up_price: normalize to UP outcome probability
def _up_price(r):
    """Return the price in UP terms: as-is for UP-labelled rows, ``1.0 - price`` otherwise."""
    px = float(r['price'])
    if r.get('outcome') in UP_LABELS:
        return px
    return 1.0 - px

trades2['up_price'] = trades2.apply(_up_price, axis=1)

# up_side: perspective of the UP outcome (BUY means increasing UP exposure)
def _up_side(r):
    """Return the trade side from the UP outcome's point of view.

    Rows whose ``outcome`` is in ``UP_LABELS`` keep their side. For all other
    rows BUY and SELL are swapped (buying DOWN reduces UP exposure and vice
    versa); any other side value is returned unchanged.
    """
    side = r.get('side')
    already_up = r.get('outcome') in UP_LABELS
    if already_up:
        return side
    return 'SELL' if side == 'BUY' else ('BUY' if side == 'SELL' else side)

trades2['up_side'] = trades2.apply(_up_side, axis=1)

# Last ten trades, with the UP-normalised columns first and the raw side/outcome last.
preview_cols = ['dt', 'up_side', 'up_price', 'size', 'name', 'proxyWallet', 'side', 'outcome']
trades2[preview_cols].tail(10)

Unnamed: 0,dt,up_side,up_price,size,name,proxyWallet,side,outcome
9,2025-08-23 13:13:13+00:00,BUY,0.001,599.98,0x86674715A977c452E3Af11A1ac99ED5d17019356-1750778348148,0x86674715a977c452e3af11a1ac99ed5d17019356,SELL,Down
8,2025-08-23 13:15:17+00:00,BUY,0.001,68.12,"sc,mnbszcnm,bzsdm,fc",0x577346971f8d0a1338bc340a7f540a0b6f1ddc71,SELL,Down
7,2025-08-23 13:17:47+00:00,BUY,0.001,5.05,Jaijai,0x49a2062020deb19eb15608072d3484a05b833103,SELL,Down
6,2025-08-23 13:20:09+00:00,BUY,0.001,230.0,TechNinjaX,0x959567abebfe0fb7c4fd4be0136b17d0fe391b16,SELL,Down
5,2025-08-23 13:20:49+00:00,BUY,0.001,1000.0,1749177280,0x104a733d68fa06e916daa5ba7ab9f464ace2d487,SELL,Down
4,2025-08-23 13:27:01+00:00,BUY,0.001,1065.76,8787dada,0x2edb696cea63b3d02f6400857403d38ce2f43780,SELL,Down
3,2025-08-23 13:30:07+00:00,BUY,0.001,8.0,,0x4b43d2a40699a9a48bfadef6d85f3440817a65ab,SELL,Down
2,2025-08-23 13:30:09+00:00,BUY,0.001,20.0,,0x51a02132af17252e6993c3b668ea462d46234a02,SELL,Down
1,2025-08-23 13:53:11+00:00,BUY,0.001,379.0,bluehero,0x4c66e891ae4bb54cd83b0849cd7e8bfee97ac7b4,SELL,Down
0,2025-08-23 14:02:19+00:00,BUY,0.001,5.0,ashonetop,0xf73678e43f00a1c1df6774c4160b9a08859563ef,SELL,Down
