In [11]:
from pathlib import Path
import os
import pyarrow.parquet as pq
import pandas as pd
import numpy as np

In [5]:
synth_dir = Path(os.environ.get("SYNTH_DIR", "/datasets/PleIAs_synth"))
shards = sorted(synth_dir.glob("synth_*.parquet"))
assert shards, f"No shards found in {synth_dir}"

In [6]:
first = shards[0]
pf = pq.ParquetFile(first)
print("first shard:", first.name)
print("rows:", pf.metadata.num_rows)
print("schema:", pf.schema)

first shard: synth_001.parquet
rows: 154674
schema: <pyarrow._parquet.ParquetSchema object at 0xe92d80275b00>
required group field_id=-1 schema {
  optional binary field_id=-1 synth_id (String);
  optional binary field_id=-1 language (String);
  optional binary field_id=-1 exercise (String);
  optional binary field_id=-1 model (String);
  optional binary field_id=-1 query (String);
  optional binary field_id=-1 query_seed_url (String);
  optional binary field_id=-1 query_seed_text (String);
  optional binary field_id=-1 additional_seed_url (String);
  optional binary field_id=-1 seed_license (String);
  optional binary field_id=-1 constraints (String);
  optional binary field_id=-1 script (String);
  optional binary field_id=-1 synthetic_reasoning (String);
  optional binary field_id=-1 synthetic_answer (String);
  optional int64 field_id=-1 words;
}



In [None]:
cols = [
    "synth_id","language","exercise","model",
    "query","query_seed_text","constraints",
    "synthetic_reasoning","synthetic_answer","words",
]

preview = pq.read_table(first, columns=cols).slice(0, 3).to_pandas()
preview

Unnamed: 0,synth_id,language,exercise,model,query,query_seed_text,constraints,synthetic_reasoning,synthetic_answer,words
0,memorization_german_10_150696,de,memorization,qwen-3-8b-memorization,Wie genau interagieren die Wärmegradienten zwi...,Climate\n\nThe sea climate has warm waters and...,,"**Query parsing**: ""Wie genau interagieren...""...",Die Wärmegradienten zwischen dem kalten nordwe...,564
1,memorization_94_6343,en,memorization,qwen-3-8b-memorization,hey um can someone tell me why the decagon has...,Symmetry\n\nThe regular decagon has Dih symmet...,,"Query: ""why the decagon has like so many diffe...",You're right to notice that the decagon has mu...,577
2,memorization_82_52457,en,memorization,qwen-3-8b-memorization,Which animal has more poison - the salamander ...,Amphibians\n\nSome salamanders can extrude sha...,,"### 1. Query Parsing\n\n""Which animal has more...",Both animals you're describing have developed ...,443


In [8]:
n_shards = 5
sample_per_shard = 4000

dfs = []
for p in shards[:n_shards]:
    t = pq.read_table(p, columns=cols).slice(0, sample_per_shard)
    d = t.to_pandas()
    d["shard"] = p.name
    dfs.append(d)

df = pd.concat(dfs, ignore_index=True)
print("sample rows:", len(df), "from shards:", n_shards)

sample rows: 20000 from shards: 5


In [9]:
def miss_frac(s: pd.Series) -> float:
    if s.dtype == "object":
        return float(s.isna().mean())
    return float(s.isna().mean())

for c in ["query_seed_text","constraints","synthetic_reasoning","synthetic_answer"]:
    print(f"{c:20s} missing_frac={miss_frac(df[c]):.3f}")

query_seed_text      missing_frac=0.000
constraints          missing_frac=0.000
synthetic_reasoning  missing_frac=0.000
synthetic_answer     missing_frac=0.000


In [10]:
print("\nTop exercise:")
display(df["exercise"].value_counts().head(15))

print("\nTop language:")
display(df["language"].value_counts().head(15))

print("\nTop model:")
display(df["model"].value_counts().head(15))


Top exercise:


exercise
memorization           18152
mcq                      414
math exercise            333
constrained writing      316
rag                      206
math mcq                 177
editing                  159
creative writing         148
cooking                   95
Name: count, dtype: int64


Top language:


language
en    16177
es      649
fr      642
de      641
it      628
pl      620
nl      306
la      303
pt        5
uk        1
Name: count, dtype: int64


Top model:


model
qwen-3-8b-memorization                              18152
qwen-3-8b-mcq                                         414
deepseek-prover-drafter+deepseek-prover-solving       333
qwen-3-8b-constrained-writing                         316
qwen-3-8b-rag                                         206
qwen-3-8b-mcq-math+deepseek-prover-8b-solving         177
qwen-3-8b-editing                                     159
qwen-3-8b-creative-writing                            148
qwen-3-8b-memorization+seed rewriting with Qwen3       95
Name: count, dtype: int64

In [12]:
def len_stats(series: pd.Series, name: str):
    s = series.fillna("").astype(str)
    q = s.str.len().quantile([0.5, 0.9, 0.95, 0.99]).to_dict()
    mx = int(s.str.len().max())
    print(f"{name:20s} p50={int(q[0.5])} p90={int(q[0.9])} p95={int(q[0.95])} p99={int(q[0.99])} max={mx}")

for c in ["query","query_seed_text","synthetic_reasoning","synthetic_answer"]:
    len_stats(df[c], c)

query                p50=159 p90=433 p95=560 p99=1015 max=8279
query_seed_text      p50=702 p90=2470 p95=3358 p99=5579 max=14211
synthetic_reasoning  p50=2252 p90=3256 p95=3583 p99=4425 max=10058
synthetic_answer     p50=1417 p90=1902 p95=2091 p99=4496 max=15320


In [17]:
import tiktoken

enc = tiktoken.get_encoding("o200k_base")

In [18]:
sample = df.sample(n=min(2000, len(df)), random_state=0).copy()

def toklen(x: str) -> int:
    return len(enc.encode(x or ""))

sample["prompt_toks"] = (
    sample["query"].astype(str).map(toklen)
    + sample["query_seed_text"].fillna("").astype(str).map(toklen)
)
sample["reasoning_toks"] = sample["synthetic_reasoning"].fillna("").astype(str).map(toklen)
sample["answer_toks"] = sample["synthetic_answer"].fillna("").astype(str).map(toklen)

display(sample[["prompt_toks","reasoning_toks","answer_toks"]].describe(percentiles=[.5,.9,.95,.99]))
display(
    sample.groupby("exercise")[["prompt_toks","reasoning_toks","answer_toks"]]
    .quantile(0.9)
    .sort_values("answer_toks", ascending=False)
    .head(20)
)


Unnamed: 0,prompt_toks,reasoning_toks,answer_toks
count,2000.0,2000.0,2000.0
mean,275.882,426.319,270.9565
std,254.310702,221.744223,163.589353
min,9.0,0.0,4.0
50%,191.0,466.5,255.0
90%,600.1,669.0,354.1
95%,783.35,731.0,414.1
99%,1270.16,929.02,994.03
max,2112.0,1327.0,2880.0


Unnamed: 0_level_0,prompt_toks,reasoning_toks,answer_toks
exercise,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
creative writing,514.0,1245.5,1839.0
constrained writing,603.0,0.0,1207.6
editing,1353.6,0.0,685.0
rag,713.0,728.0,592.0
memorization,585.4,663.0,341.0
math exercise,868.2,620.0,328.6
cooking,612.2,775.8,271.2
mcq,646.0,632.0,29.0
math mcq,747.1,1053.9,6.0


In [20]:
top_ex = df["exercise"].value_counts().head(8).index.tolist()

for ex in top_ex:
    row = df[df["exercise"] == ex].sample(1, random_state=0).iloc[0]
    print("\n" + "="*100)
    print("exercise:", ex, "| id:", row["synth_id"], "| lang:", row["language"], "| model:", row["model"])
    print("\nQUERY:\n", (row["query"] or "")[:900])
    seed = row["query_seed_text"] or ""
    if seed:
        print("\nSEED (trunc):\n", seed[:900])
    cons = row["constraints"] or ""
    if cons:
        print("\nCONSTRAINTS (trunc):\n", cons[:900])
    print("\nREASONING (trunc):\n", (row["synthetic_reasoning"] or "")[:900])
    print("\nANSWER (trunc):\n", (row["synthetic_answer"] or "")[:900])




exercise: memorization | id: memorization_specialized_18_1_4222 | lang: en | model: qwen-3-8b-memorization

QUERY:
 What will happen to the planet if we don't reduce meat consumption to less than a billion meat-eaters?

SEED (trunc):
 We must change our diet. The planet can't support billions of meat-eaters.

REASONING (trunc):
 Query: "What will happen to the planet if we don't reduce meat consumption to less than a billion meat-eaters?"

Parse components:
- "don't reduce" → assume current consumption patterns continue
- "less than a billion meat-eaters" → threshold claim needing verification
- "what will happen" → predictive analysis

### 1. Threshold Validation

"Less than a billion meat-eaters" - questionable premise.

Global population ≈7.9B (2024). Current meat-eaters ≈3.3B (est). 
→ Billion meat-eaters = 27% of population? Or billion total people?

Semantic ambiguity. Response suggests billion total people, not meat-eaters specifically.

Need to verify: What threshold actually 

In [21]:
import re
import pandas as pd

# assumes your sampled dataframe is named `df` and has columns:
# ["exercise","synthetic_reasoning","synthetic_answer","constraints"]

def _norm(s: str) -> str:
    return re.sub(r"\s+", " ", (s or "").strip()).lower()

# 1) Empty reasoning fraction by exercise
reasoning_empty = df["synthetic_reasoning"].fillna("").astype(str).str.strip().eq("")
empty_reasoning_by_ex = (
    df.assign(reasoning_empty=reasoning_empty)
    .groupby("exercise")["reasoning_empty"]
    .agg(["count", "mean"])
    .rename(columns={"mean": "empty_reasoning_frac"})
    .sort_values("empty_reasoning_frac", ascending=False)
)
display(empty_reasoning_by_ex)

# 2) constraints "result:" mismatch rate (very rough)
# We look for patterns like "result:15" or "result:-74.5" (case-insensitive).
_result_re = re.compile(r"(?i)\bresult\s*:\s*([^\n\r,;]+)")

def extract_expected_result(constraints: str):
    m = _result_re.search(constraints or "")
    if not m:
        return None
    return m.group(1).strip()

def result_mismatch(expected: str | None, answer: str) -> bool | None:
    if expected is None:
        return None
    exp = _norm(expected)
    ans = _norm(answer)

    # if expected is a single letter (mcq), try to match "A" etc.
    if re.fullmatch(r"[a-d]", exp):
        # consider "A." / "A)" / "A" / "Answer: A" as match
        return (re.search(rf"\b{re.escape(exp)}\b", ans) is None)

    # numeric-ish: just require the expected substring appears somewhere
    # (handles "15", "-74.5%", etc. but is not a full verifier)
    return (exp not in ans)

expected = df["constraints"].astype(str).map(extract_expected_result)
has_expected = expected.notna()

mismatch = []
for exp, ans in zip(expected.tolist(), df["synthetic_answer"].fillna("").astype(str).tolist()):
    mm = result_mismatch(exp, ans)
    mismatch.append(mm)

df_tmp = df.assign(expected_result=expected, has_expected=has_expected, result_mismatch=mismatch)

summary = {
    "rows_total": len(df_tmp),
    "rows_with_result_constraint": int(df_tmp["has_expected"].sum()),
    "result_constraint_frac": float(df_tmp["has_expected"].mean()),
    "mismatch_count": int(df_tmp["result_mismatch"].fillna(False).sum()),
    "mismatch_frac_given_result": float(
        (df_tmp.loc[df_tmp["has_expected"], "result_mismatch"].fillna(False).mean())
        if df_tmp["has_expected"].any()
        else 0.0
    ),
}

Unnamed: 0_level_0,count,empty_reasoning_frac
exercise,Unnamed: 1_level_1,Unnamed: 2_level_1
constrained writing,316,1.0
editing,159,1.0
math mcq,177,0.096045
creative writing,148,0.067568
memorization,18152,0.00011
cooking,95,0.0
math exercise,333,0.0
mcq,414,0.0
rag,206,0.0


  "mismatch_count": int(df_tmp["result_mismatch"].fillna(False).sum()),
  (df_tmp.loc[df_tmp["has_expected"], "result_mismatch"].fillna(False).mean())


In [22]:


df_tmp[df_tmp["has_expected"] & df_tmp["result_mismatch"].fillna(False)][
    ["exercise","synth_id","expected_result","synthetic_answer","constraints"]
].head(10)

  df_tmp[df_tmp["has_expected"] & df_tmp["result_mismatch"].fillna(False)][


Unnamed: 0,exercise,synth_id,expected_result,synthetic_answer,constraints
118,math exercise,synth_math_29_2664,16020,"**16,020 pounds**\n\nThis is a multi-step word...",result:16020
348,math exercise,synth_math_59_14090,-2709,"**Answer: -2,709 units remain to be analyzed**...",result:-2709
380,math exercise,synth_math_61_3746,-195.45,**Answer: -$195.45**\n\nThis is an optimizatio...,result:-195.45
989,math exercise,synth_math_66_13865,20520,"**Answer: 20,520 units**\n\nThis problem invol...",result:20520
1163,math exercise,synth_math_57_11322,114240,"**114,240 cycles**\n\nThis is a **scaling prob...",result:114240
1451,math mcq,mcq_math_3_236012,672882,"A. 672,882",result:672882
1671,math exercise,synth_math_80_14169,-20,**The studio's net financial outcome for the c...,result:-20
1737,math exercise,synth_math_80_1475,2941,"**The total cost for the entire order is 2,941...",result:2941
1927,math exercise,synth_math_87_14426,1166083.1,"**The final converted area is 1,166,083.1 hect...",result:1166083.1
2059,math mcq,mcq_math_2_133349,Failed after maximum attempts,A. `dL/dt = r × F`,result:Failed after maximum attempts


In [23]:
import re
import pandas as pd
import math

_result_re = re.compile(r"(?i)\bresult\s*:\s*([^\n\r,;]+)")
_num_re = re.compile(r"[-+]?\d[\d,]*(?:\.\d+)?")

def extract_expected_result(constraints: str):
    m = _result_re.search(constraints or "")
    return m.group(1).strip() if m else None

def first_answer_line(ans: str) -> str:
    for line in (ans or "").splitlines():
        line = line.strip()
        if line:
            return line
    return ""

def parse_first_number(s: str):
    m = _num_re.search(s or "")
    if not m:
        return None
    return float(m.group(0).replace(",", ""))

def expected_is_failed(expected: str) -> bool:
    return (expected or "").strip().lower().startswith("failed after maximum attempts")

def result_mismatch(expected: str | None, answer: str, atol=1e-6, rtol=1e-6) -> bool | None:
    if expected is None:
        return None
    if expected_is_failed(expected):
        return True  # treat as unusable / drop

    exp_num = parse_first_number(expected)
    ans_line = first_answer_line(answer)
    ans_num = parse_first_number(ans_line)

    # if we can compare as numbers, do so
    if exp_num is not None and ans_num is not None:
        return not math.isclose(exp_num, ans_num, rel_tol=rtol, abs_tol=atol)

    # fallback: normalized substring match on first line
    exp_norm = re.sub(r"\s+", " ", expected.strip()).lower()
    ans_norm = re.sub(r"\s+", " ", ans_line.strip()).lower()
    return exp_norm not in ans_norm

expected = df["constraints"].astype(str).map(extract_expected_result)
mismatch = [
    result_mismatch(exp, ans)
    for exp, ans in zip(expected.tolist(), df["synthetic_answer"].fillna("").astype(str).tolist())
]

df_check = df.assign(expected_result=expected, result_mismatch=mismatch)
has_expected = df_check["expected_result"].notna()

summary = {
    "rows_total": len(df_check),
    "rows_with_result_constraint": int(has_expected.sum()),
    "result_constraint_frac": float(has_expected.mean()),
    "mismatch_frac_given_result": float(df_check.loc[has_expected, "result_mismatch"].fillna(False).mean()) if has_expected.any()
else 0.0,
}


  "mismatch_frac_given_result": float(df_check.loc[has_expected, "result_mismatch"].fillna(False).mean()) if has_expected.any()


In [24]:
summary

{'rows_total': 20000,
 'rows_with_result_constraint': 511,
 'result_constraint_frac': 0.02555,
 'mismatch_frac_given_result': 0.12524461839530332}