In [5]:
import pandas as pd
from pathlib import Path

# Load the per-market split assignment table.
base = Path(r"C:\Users\nimro\PolyQuant-features\data")
splits = pd.read_parquet(base / "market_splits.parquet")

# One row per market, and exactly the three expected split labels.
assert splits["market_id"].is_unique
assert set(splits["split"].unique()) == {"train", "val", "test"}

# Row count and end_ts range for each split.
g = splits.groupby("split")["end_ts"].agg(["count", "min", "max"])
print(g)

# time-respecting boundaries (ties allowed): the latest end_ts of a split
# must not exceed the earliest end_ts of the split that follows it.
earliest_end, latest_end = g["min"], g["max"]
for earlier_split, later_split in (("train", "val"), ("val", "test")):
    assert latest_end[earlier_split] <= earliest_end[later_split]

        count         min         max
split                                
test    24576  1762459200  1857168000
train  114682  1309897260  1760540400
val     24574  1760540400  1762459200


In [6]:
import random
import pyarrow.parquet as pq

# Lookup: market_id (as str) -> split label, taken from the splits table.
m2s = {
    market_id: split_name
    for market_id, split_name in zip(splits["market_id"].astype(str), splits["split"])
}

def sample_check_folder(folder, expected_split, n_files=5, n_rowgroups=3, seed=None):
    """Spot-check that sampled row groups in ``folder`` only hold markets of one split.

    A random subset of the ``*.parquet`` files in ``folder`` is opened, a random
    subset of each file's row groups is read (``market_id`` column only), and
    every distinct market id is looked up in the module-level ``m2s`` mapping.

    Parameters
    ----------
    folder : pathlib.Path
        Directory whose ``*.parquet`` files are sampled (non-recursive).
    expected_split : str
        Split label every sampled market id must map to in ``m2s``.
    n_files : int
        Maximum number of files to sample.
    n_rowgroups : int
        Maximum number of row groups to sample per file.
    seed : int or None
        When given, sampling uses a private ``random.Random(seed)`` so the same
        files and row groups are checked on every run. ``None`` (default) keeps
        the previous behaviour of drawing from the global ``random`` state.

    Raises
    ------
    RuntimeError
        If ``folder`` contains no parquet files, or if a sampled row group
        contains a market id whose ``m2s`` entry differs from ``expected_split``
        (ids missing from ``m2s`` also count, because ``.get`` yields ``None``).
    """
    # Sorted so a fixed seed selects the same files even if the filesystem
    # yields directory entries in a different order.
    paths = sorted(folder.glob("*.parquet"))
    if not paths:
        raise RuntimeError(f"No parquet files in {folder}")

    rng = random if seed is None else random.Random(seed)

    for p in rng.sample(paths, k=min(n_files, len(paths))):
        pf = pq.ParquetFile(str(p))
        rgs = list(range(pf.num_row_groups))
        for rg in rng.sample(rgs, k=min(n_rowgroups, len(rgs))):
            # Only the market_id column is read, to keep the check cheap.
            t = pf.read_row_group(rg, columns=["market_id"])
            mids = t.column(0).to_pandas().astype(str).unique()
            bad = [m for m in mids if m2s.get(m) != expected_split]
            if bad:
                raise RuntimeError(f"Found wrong markets in {expected_split}: file={p.name}, examples={bad[:10]}")
    print(f"OK: sampled {expected_split}")

base = Path(r"C:\Users\nimro\PolyQuant-features\data")
# Each split lives in a sub-folder named after the split label.
for split_name in ("train", "val", "test"):
    sample_check_folder(base / split_name, split_name)

OK: sampled train
OK: sampled val
OK: sampled test


In [7]:
import pyarrow.parquet as pq
from pathlib import Path

def collect_seen_markets(folder: Path):
    """Return the set of all market ids (as ``str``) found in ``folder``.

    Every ``*.parquet`` file directly inside ``folder`` is scanned row group by
    row group, reading only the ``market_id`` column. An empty folder yields an
    empty set.
    """
    market_ids = set()
    for parquet_path in folder.glob("*.parquet"):
        parquet_file = pq.ParquetFile(str(parquet_path))
        for group_index in range(parquet_file.num_row_groups):
            id_column = parquet_file.read_row_group(group_index, columns=["market_id"]).column(0)
            # De-duplicate per row group before merging into the running set.
            market_ids.update(id_column.to_pandas().astype(str).unique())
    return market_ids

base = Path(r"C:\Users\nimro\PolyQuant-features\data")
# Full scan of each split folder (train, then val, then test).
seen_train, seen_val, seen_test = (
    collect_seen_markets(base / split_name) for split_name in ("train", "val", "test")
)

# Number of market ids shared by each pair of splits.
overlap_sizes = (
    len(seen_train & seen_val),
    len(seen_train & seen_test),
    len(seen_val & seen_test),
)
print("overlaps:", *overlap_sizes)

# No market may appear in more than one split.
for overlap_size in overlap_sizes:
    assert overlap_size == 0


overlaps: 0 0 0


In [8]:
# Every market id listed in the splits table, as str.
expected = set(splits["market_id"].astype(str))

# Market ids present in the split folders but absent from the splits table.
foreign_train = seen_train.difference(expected)
foreign_val   = seen_val.difference(expected)
foreign_test  = seen_test.difference(expected)

print(len(foreign_train), len(foreign_val), len(foreign_test))
assert not foreign_train and not foreign_val and not foreign_test

0 0 0


In [9]:
import sqlite3
from pathlib import Path
from datetime import datetime, timezone
import pandas as pd
import numpy as np

# Project root; the SQLite database and the split folders are located relative to it.
BASE = Path(r"C:\Users\nimro\PolyQuant-features")
DB_PATH = BASE.joinpath("sql", "polymarket.db")
DATA_DIR = BASE.joinpath("data")  # uses your split output folders

def iso_z_to_epoch(s: str) -> int:
    """Convert an ISO-8601 timestamp string to whole epoch seconds.

    Every ``"Z"`` in ``s`` is replaced by ``"+00:00"`` before parsing with
    ``datetime.fromisoformat``. A value that parses without a UTC offset is
    interpreted as UTC. The fractional part of the timestamp is dropped by
    ``int()``.
    """
    parsed = datetime.fromisoformat(s.replace("Z", "+00:00"))
    aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
    return int(aware.timestamp())

def load_market_end_ts(db_path: Path) -> dict:
    """Build a ``market_id -> end timestamp`` lookup from the SQLite database.

    Reads ``condition_id`` (exposed as ``market_id``) and ``end_date`` from the
    ``markets`` table, skipping rows whose ``end_date`` is NULL. Market ids are
    cast to ``str`` and each ``end_date`` is converted with ``iso_z_to_epoch``.
    If several rows share a market id, the last one returned by the query wins.
    The connection is closed even when the query raises.
    """
    query = """
            SELECT condition_id AS market_id, end_date
            FROM markets
            WHERE end_date IS NOT NULL
            """
    connection = sqlite3.connect(str(db_path))
    try:
        markets = pd.read_sql_query(query, connection)
    finally:
        connection.close()

    market_ids = markets["market_id"].astype(str)
    end_epochs = markets["end_date"].map(iso_z_to_epoch)
    return dict(zip(market_ids, end_epochs))

# Build the market_id -> end timestamp lookup from the SQLite database at DB_PATH.
market_end_ts = load_market_end_ts(DB_PATH)
# Last expression of the cell: shows how many markets are in the lookup.
len(market_end_ts)


163832

In [None]:
import sqlite3
from pathlib import Path
from datetime import datetime, timezone
import random
import numpy as np
import pandas as pd
import pyarrow.dataset as ds
import pyarrow.parquet as pq

# Project root; the SQLite database and the split folders are located relative to it.
BASE = Path(r"C:\Users\nimro\PolyQuant-features")
DB_PATH = BASE.joinpath("sql", "polymarket.db")
DATA_DIR = BASE.joinpath("data")

def iso_z_to_epoch(s: str) -> int:
    """Convert an ISO-8601 timestamp string to whole epoch seconds.

    Every ``"Z"`` in ``s`` is replaced by ``"+00:00"`` before parsing with
    ``datetime.fromisoformat``. A value that parses without a UTC offset is
    interpreted as UTC. The fractional part of the timestamp is dropped by
    ``int()``.
    """
    parsed = datetime.fromisoformat(s.replace("Z", "+00:00"))
    aware = parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=timezone.utc)
    return int(aware.timestamp())

def ts_human(ts: int) -> str:
    """Format epoch seconds as ``YYYY-MM-DD HH:MM:SS UTC``.

    ``ts`` is passed through ``int()`` first, so any fractional part is dropped.
    """
    moment = datetime.fromtimestamp(int(ts), tz=timezone.utc)
    return moment.strftime("%Y-%m-%d %H:%M:%S UTC")

def load_market_end_ts(db_path: Path) -> dict:
    """Build a ``market_id -> end timestamp`` lookup from the SQLite database.

    Reads ``condition_id`` (exposed as ``market_id``) and ``end_date`` from the
    ``markets`` table, skipping rows whose ``end_date`` is NULL. Market ids are
    cast to ``str`` and each ``end_date`` is converted with ``iso_z_to_epoch``.
    If several rows share a market id, the last one returned by the query wins.
    The connection is closed even when the query raises.
    """
    query = """
            SELECT condition_id AS market_id, end_date
            FROM markets
            WHERE end_date IS NOT NULL
            """
    connection = sqlite3.connect(str(db_path))
    try:
        markets = pd.read_sql_query(query, connection)
    finally:
        connection.close()

    market_ids = markets["market_id"].astype(str)
    end_epochs = markets["end_date"].map(iso_z_to_epoch)
    return dict(zip(market_ids, end_epochs))

# market_id (str) -> market end time in epoch seconds, loaded from the SQLite database at DB_PATH.
market_end_ts = load_market_end_ts(DB_PATH)



sample file: part-000020.parquet
picked user_id: 0xfb1c3c1ab4fb2d0cbcbb9538c8d4d357dd95963e
rows for user in train: 511729

Example: two consecutive trades (time-ordered for this user) where user_historical_pnl_before changes

Trade A
  trade_uid: e178f1fcb157648953654e8c4cb4660ff59b196a
  ts: 2024-12-07 04:38:55 UTC
  market_id: 0xbb79169bc66adb4fa273611251e410bb2b06fc7a1168f752762a24c5c6b36bc3
  market_end: 2024-12-14 18:00:00 UTC
  price: 0.14 edge: -0.14
  user_historical_pnl_before: 0.0

Trade B
  trade_uid: 7fc35d42c579cc88294734316bd9b9ed133f4ea4
  ts: 2024-12-09 10:39:56 UTC
  market_id: 0x5f25affd5084282827c386dfe3fde5d9f7c51c1fa2d161642aefaf78990de951
  market_end: 2024-12-17 00:30:00 UTC
  price: 0.31 edge: -0.31
  user_historical_pnl_before: 10.659995085388754

Delta user_historical_pnl_before: 10.659995085388754

Markets (that this user traded in train) with end_ts in (TradeA.ts, TradeB.ts]: 23
  0xf6d46620924630920aad970d6790896ed7ba5a88c16e60b3e018ee4ead493b0f 2024-12-08

In [22]:
# ---- pick a user quickly from a random train parquet file ----
candidate_files = (DATA_DIR / "train").glob("*.parquet")
train_files = sorted(candidate_files)
p = random.choice(train_files)
pf = pq.ParquetFile(str(p))
# Only row group 0 of the chosen file is inspected; the user with the most
# rows in that row group is picked.
first_row_group = pf.read_row_group(0, columns=["user_id"])
u = first_row_group.column(0).to_pandas().astype(str)
user_counts = u.value_counts()
user_id = user_counts.index[0]
print("sample file:", p.name)
print("picked user_id:", user_id)

# ---- load all rows for that user from TRAIN split only ----
cols = ["trade_uid","user_id","market_id","timestamp","price","edge","user_historical_pnl_before"]
dataset = ds.dataset(str(DATA_DIR / "train"), format="parquet")

# The user filter is handed to the dataset scan instead of being applied after loading.
table = dataset.to_table(columns=cols, filter=(ds.field("user_id") == user_id))

# Normalise market_id to str and order the user's trades by time.
df = (
    table.to_pandas()
    .assign(market_id=lambda frame: frame["market_id"].astype(str))
    .sort_values("timestamp")
    .reset_index(drop=True)
)

print("rows for user in train:", len(df))
assert len(df) >= 2, "Not enough rows for this user in train."

# ---- find 2 consecutive trades where historical pnl feature changes ----
# Non-numeric / missing pnl values are treated as 0.0 for the comparison.
pnl = (
    pd.to_numeric(df["user_historical_pnl_before"], errors="coerce")
    .fillna(0.0)
    .astype(float)
    .values
)
d = np.abs(np.diff(pnl))
idxs = np.flatnonzero(d > 1e-6)
assert len(idxs) > 0, "Did not find any consecutive trades with a pnl feature change for this user (in train)."

# First position where the feature differs between row i and row i + 1.
i = int(idxs[0])
t1, t2 = df.iloc[i], df.iloc[i + 1]

m1, m2 = t1["market_id"], t2["market_id"]
end1, end2 = market_end_ts.get(m1), market_end_ts.get(m2)

def _print_trade(header, trade, market_id, market_end):
    """Print one trade row in the fixed report layout used by this cell."""
    print(header)
    print("  trade_uid:", trade["trade_uid"])
    print("  ts:", ts_human(trade["timestamp"]))
    print("  market_id:", market_id)
    print("  market_end:", "UNKNOWN" if market_end is None else ts_human(market_end))
    print("  price:", float(trade["price"]), "edge:", float(trade["edge"]))
    print("  user_historical_pnl_before:", float(trade["user_historical_pnl_before"]))


print("\nExample: two consecutive trades (time-ordered for this user) where user_historical_pnl_before changes\n")

_print_trade("Trade A", t1, m1, end1)
_print_trade("\nTrade B", t2, m2, end2)

# Raw (un-coerced) feature values are used for the printed delta.
pnl_before_a = float(t1["user_historical_pnl_before"])
pnl_before_b = float(t2["user_historical_pnl_before"])
print("\nDelta user_historical_pnl_before:", pnl_before_b - pnl_before_a)

# Optional: which user-traded markets ended between those two trade timestamps?
ts_a = int(t1["timestamp"])
ts_b = int(t2["timestamp"])
user_markets = df["market_id"].unique()

# Pair every traded market with its known end time (None when the lookup has no entry),
# keep those ending in (ts_a, ts_b], and order them by end time.
market_end_pairs = ((mid, market_end_ts.get(mid)) for mid in user_markets)
ended_between = sorted(
    ((mid, et) for mid, et in market_end_pairs if et is not None and ts_a < et <= ts_b),
    key=lambda pair: pair[1],
)

print("\nMarkets (that this user traded in train) with end_ts in (TradeA.ts, TradeB.ts]:", len(ended_between))
# Show at most ten entries; an ellipsis marks a truncated list.
for mid, et in ended_between[:10]:
    print(" ", mid, ts_human(et))
if len(ended_between) > 10:
    print("  ...")


sample file: part-000002.parquet
picked user_id: 0x9f47f1fcb1701bf9eaf31236ad39875e5d60af93
rows for user in train: 259641

Example: two consecutive trades (time-ordered for this user) where user_historical_pnl_before changes

Trade A
  trade_uid: b91fe3581abe2e82dd7961eceef0ea13aa6b7619
  ts: 2023-06-08 17:00:26 UTC
  market_id: 0x1d3e4f4b00c4d5ec6c11b469442a3e827b70d5be386d04941a960ba8ec380b56
  market_end: 2023-06-09 00:00:00 UTC
  price: 0.61000013000013 edge: -0.61000013000013
  user_historical_pnl_before: 0.0

Trade B
  trade_uid: 671bf7e1f25b329bda475df0737c24247e06c81b
  ts: 2023-06-09 10:31:40 UTC
  market_id: 0x397395ad3f3cb595fa310507354787c40ff65efd385efbc4b6bf6632dfc1a3c6
  market_end: 2023-09-01 00:00:00 UTC
  price: 0.5399999988897191 edge: 0.46000000111028094
  user_historical_pnl_before: -0.61000013000013

Delta user_historical_pnl_before: -0.61000013000013

Markets (that this user traded in train) with end_ts in (TradeA.ts, TradeB.ts]: 3
  0x1d3e4f4b00c4d5ec6c11b46944

In [24]:
import numpy as np
import pandas as pd
import pyarrow.dataset as ds
from pathlib import Path
from datetime import datetime, timezone
from bisect import bisect_right, bisect_left

# Root folder that holds the per-split parquet sub-folders (this cell reads "train" below).
DATA_DIR = Path(r"C:\Users\nimro\PolyQuant-features\data")

def ts_human(ts: int) -> str:
    """Format epoch seconds as ``YYYY-MM-DD HH:MM:SS UTC``.

    ``ts`` is passed through ``int()`` first, so any fractional part is dropped.
    """
    moment = datetime.fromtimestamp(int(ts), tz=timezone.utc)
    return moment.strftime("%Y-%m-%d %H:%M:%S UTC")

# Load this user's history from the TRAIN folder.
# NOTE(review): only DATA_DIR/"train" is read here, so trades this user made in
# val/test markets are not part of df_all — confirm that is intended.
dataset = ds.dataset(str(DATA_DIR/"train"), format="parquet")
cols = ["trade_uid","user_id","market_id","timestamp","price","edge","user_historical_pnl_before"]
table = dataset.to_table(columns=cols, filter=(ds.field("user_id") == user_id))

# Normalise dtypes (missing / non-numeric pnl becomes 0.0) and order trades by time.
df_all = (
    table.to_pandas()
    .assign(
        market_id=lambda frame: frame["market_id"].astype(str),
        timestamp=lambda frame: frame["timestamp"].astype(np.int64),
        user_historical_pnl_before=lambda frame: (
            pd.to_numeric(frame["user_historical_pnl_before"], errors="coerce")
            .fillna(0.0)
            .astype(float)
        ),
    )
    .sort_values("timestamp")
    .reset_index(drop=True)
)

# first trade time per market (for eligibility)
first_ts = df_all.groupby("market_id")["timestamp"].min().to_dict()

# closures list: (end_ts, market_id, first_trade_ts), ordered by end_ts;
# markets without a known end time are left out.
closures = sorted(
    (int(market_end_ts[mid]), mid, int(ft))
    for mid, ft in first_ts.items()
    if market_end_ts.get(mid) is not None
)
# Parallel list of end times, used as the bisect key.
end_ts_sorted = [closure[0] for closure in closures]

len(df_all), len(closures)

def eligible_closures_between(t0: int, t1: int):
    """Return closures with ``t0 < end_ts <= t1`` whose first trade is at or before ``t0``.

    Relies on the module-level ``closures`` list (tuples of
    ``(end_ts, market_id, first_trade_ts)`` sorted by ``end_ts``) and its
    parallel key list ``end_ts_sorted``. Each result is a
    ``(market_id, end_ts, first_trade_ts)`` tuple, in ``closures`` order.
    """
    # bisect_right on both bounds selects the half-open interval (t0, t1].
    start = bisect_right(end_ts_sorted, t0)
    stop = bisect_right(end_ts_sorted, t1)
    return [
        (market_id, end_ts, first_trade_ts)
        for end_ts, market_id, first_trade_ts in closures[start:stop]
        if first_trade_ts <= t0
    ]

pnl = df_all["user_historical_pnl_before"].values
ts = df_all["timestamp"].values

eps = 1e-9  # pnl differences up to this size count as "unchanged"
expected_stable, suspicious = [], []

# Classify each consecutive trade pair that has NO eligible closure in between:
# unchanged pnl -> expected_stable, changed pnl -> suspicious.
# Pairs with at least one eligible closure are not classified.
for i in range(len(df_all) - 1):
    t0, t1 = int(ts[i]), int(ts[i + 1])
    d = float(pnl[i + 1] - pnl[i])
    ends = eligible_closures_between(t0, t1)
    if ends:
        continue
    (expected_stable if abs(d) <= eps else suspicious).append(i)

print("expected_stable pairs:", len(expected_stable))
print("suspicious pairs:", len(suspicious))

expected_stable[:5], suspicious[:5]

def show_pair(i: int):
    """Print a report for the consecutive trades at rows ``i`` and ``i + 1`` of ``df_all``.

    Shows each trade's time, market id, market end (``UNKNOWN`` when the market
    is missing from ``market_end_ts``) and pnl feature, then the pnl delta and
    up to ten eligible market closures between the two trade timestamps.
    """
    first, second = df_all.iloc[i], df_all.iloc[i + 1]
    t0 = int(first["timestamp"])
    t1 = int(second["timestamp"])
    ends = eligible_closures_between(t0, t1)
    d = float(second["user_historical_pnl_before"] - first["user_historical_pnl_before"])

    def mend(mid):
        # Human-readable market end time, or a placeholder when unknown.
        et = market_end_ts.get(mid)
        return "UNKNOWN" if et is None else ts_human(et)

    for label, stamp, trade in (("Trade A:", t0, first), ("Trade B:", t1, second)):
        print(label, ts_human(stamp), "market_end:", mend(trade["market_id"]))
        print("  market_id:", trade["market_id"])
        print("  pnl_before:", float(trade["user_historical_pnl_before"]))
    print("Delta pnl_before:", d)

    print("Eligible market closings in (A.ts, B.ts]:", len(ends))
    for mid, et, ft in ends[:10]:
        print(" ", mid, "end:", ts_human(et), "first_trade:", ts_human(ft))
    if len(ends) > 10:
        print("  ...")
    print("-" * 60)

# Print up to three example pairs from each category.
for heading, pair_indices in (
    ("=== Expected stable examples (no eligible closure, pnl unchanged) ===", expected_stable),
    ("\n=== Suspicious examples (no eligible closure, pnl changed) ===", suspicious),
):
    print(heading)
    for i in pair_indices[:3]:
        show_pair(i)


expected_stable pairs: 255288
suspicious pairs: 4264
=== Expected stable examples (no eligible closure, pnl unchanged) ===
Trade A: 2023-06-07 21:47:40 UTC market_end: 2023-09-01 00:00:00 UTC
  market_id: 0x397395ad3f3cb595fa310507354787c40ff65efd385efbc4b6bf6632dfc1a3c6
  pnl_before: 0.0
Trade B: 2023-06-07 23:44:54 UTC market_end: 2023-07-30 00:00:00 UTC
  market_id: 0x04bd2ccadea2c997bb946292f72a9dbbc86a937655e537ad824b0f3380dac750
  pnl_before: 0.0
Delta pnl_before: 0.0
Eligible market closings in (A.ts, B.ts]: 0
------------------------------------------------------------
Trade A: 2023-06-07 23:44:54 UTC market_end: 2023-07-30 00:00:00 UTC
  market_id: 0x04bd2ccadea2c997bb946292f72a9dbbc86a937655e537ad824b0f3380dac750
  pnl_before: 0.0
Trade B: 2023-06-08 06:58:51 UTC market_end: 2023-09-01 00:00:00 UTC
  market_id: 0x397395ad3f3cb595fa310507354787c40ff65efd385efbc4b6bf6632dfc1a3c6
  pnl_before: 0.0
Delta pnl_before: 0.0
Eligible market closings in (A.ts, B.ts]: 0
----------------