## Processing the Data

In [5]:
from pathlib import Path
import json, re, random
from collections import Counter
from typing import Dict, Any, List, Tuple

RAW_PATH = Path("data/raw/ori_pqal.json")
OUT_DIR  = Path("data/processed")
SEED = 42
SPLIT_RATIOS = (0.80, 0.10, 0.10)
VALID_DECISIONS = {"yes", "no", "maybe"}
random.seed(SEED)

In [7]:
with RAW_PATH.open("r", encoding="utf-8") as f:
    raw = json.load(f)

len(raw), list(raw.keys())[:3]

(1000, ['21645374', '16418930', '9488747'])

In [8]:
# peek a single item (replace with a real key from above)
sample_key = next(iter(raw.keys()))
raw[sample_key]

{'QUESTION': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?',
 'CONTEXTS': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.',
  'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early stages of PCD (EPCD)

In [10]:
def normalize(text: str) -> str:
    return re.sub(r"\s+", " ", (text or "")).strip()

def join_contexts(contexts):
    contexts_norm = [normalize(c) for c in (contexts or []) if isinstance(c, str) and c.strip()]
    return "\n\n".join(contexts_norm)

def generative_record(pid: str, item: Dict[str, Any]) -> Dict[str, Any]:
    return {
        "id": pid,
        "question": normalize(item.get("QUESTION", "")),
        "context": join_contexts(item.get("CONTEXTS", [])),
        "answer": normalize(item.get("LONG_ANSWER", "")),
        "year": item.get("YEAR", None),
        "labels": item.get("LABELS", []),
        "final_decision": normalize(item.get("final_decision", "")).lower(),
    }

def classification_record(pid: str, item: Dict[str, Any]) -> Dict[str, Any]:
    fd = normalize(item.get("final_decision", "")).lower()
    return {
        "id": pid,
        "question": normalize(item.get("QUESTION", "")),
        "context": join_contexts(item.get("CONTEXTS", [])),
        "target": fd if fd in {"yes","no","maybe"} else None,
        "year": item.get("YEAR", None),
        "labels": item.get("LABELS", []),
    }

In [12]:
gen_records, clf_records = [], []
for pid, item in raw.items():
    if isinstance(item, dict):
        gen_records.append(generative_record(pid, item))
        clf_records.append(classification_record(pid, item))

len(gen_records), len(clf_records), gen_records[0]

(1000,
 1000,
 {'id': '21645374',
  'question': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?',
  'context': 'Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.\n\nThe following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cel

In [14]:
def drop_incorrect_data(records, mode: str):
    out = []
    for r in records:
        if not r.get("question") or not r.get("context"):
            continue
        if mode == "gen" and not r.get("answer"):
            continue
        if mode == "clf" and r.get("target") not in VALID_DECISIONS:
            continue
        out.append(r)

    return out

def dedupe_by_id(records):
    seen, out = set(), []
    for r in records:
        rid = r.get("id")
        if rid and rid not in seen:
            seen.add(rid)
            out.append(r)

    return out

gen_records = dedupe_by_id(drop_incorrect_data(gen_records, mode="gen"))
clf_records = dedupe_by_id(drop_incorrect_data(clf_records, mode="clf"))

len(gen_records), len(clf_records)


(1000, 1000)

In [15]:
Counter([r["final_decision"] for r in gen_records]), Counter([r["target"] for r in clf_records])

(Counter({'yes': 552, 'no': 338, 'maybe': 110}),
 Counter({'yes': 552, 'no': 338, 'maybe': 110}))

In [17]:
def stratified_split(records: List[Dict[str, Any]], key: str):
    buckets = {}
    for r in records:
        buckets.setdefault(r.get(key, "unkown"), []).append(r)
    
    train, val, test = [], [], []
    for label, items in buckets.items():
        random.shuffle(items)
        n = len(items)
        n_train = int(SPLIT_RATIOS[0]*n)
        n_val = int(SPLIT_RATIOS[1]*n)
        train += items[:n_train]
        val += items[n_train:n_train+n_val]
        test += items[n_train+n_val:]

    random.shuffle(train)
    random.shuffle(val)
    random.shuffle(test)
    return train, val, test


gen_train, gen_val, gen_test = stratified_split(gen_records, key="final_decision")
clf_train, clf_val, clf_test = stratified_split(clf_records, key="target")

len(gen_train), len(gen_val), len(gen_test), len(clf_train), len(clf_val), len(clf_test)

(799, 99, 102, 799, 99, 102)

In [None]:
def save_json_processed(path: Path, rows):
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")
        
save_json_processed(OUT_DIR / "generative" / "train.jsonl", gen_train)
save_json_processed(OUT_DIR / "generative" / "val.jsonl",   gen_val)
save_json_processed(OUT_DIR / "generative" / "test.jsonl",  gen_test)

save_json_processed(OUT_DIR / "classification" / "train.jsonl", clf_train)
save_json_processed(OUT_DIR / "classification" / "val.jsonl",   clf_val)
save_json_processed(OUT_DIR / "classification" / "test.jsonl",  clf_test)

OUT_DIR.resolve()

In [19]:
def quick_stats(name, rows, label_field=None):
    print(f"[{name}] n={len(rows)}")
    if label_field:
        print(" label dist:", dict(Counter([r.get(label_field) for r in rows])))
    avg_q = sum(len(r.get("question","")) for r in rows)/max(1,len(rows))
    avg_c = sum(len(r.get("context","")) for r in rows)/max(1,len(rows))
    print(f" avg question chars: {avg_q:.1f} | avg context chars: {avg_c:.1f}")

quick_stats("GEN train", gen_train, "final_decision")
quick_stats("GEN val",   gen_val,   "final_decision")
quick_stats("GEN test",  gen_test,  "final_decision")

quick_stats("CLF train", clf_train, "target")
quick_stats("CLF val",   clf_val,   "target")
quick_stats("CLF test",  clf_test,  "target")


[GEN train] n=799
 label dist: {'yes': 441, 'no': 270, 'maybe': 88}
 avg question chars: 94.0 | avg context chars: 1341.6
[GEN val] n=99
 label dist: {'no': 33, 'yes': 55, 'maybe': 11}
 avg question chars: 95.1 | avg context chars: 1372.7
[GEN test] n=102
 label dist: {'yes': 56, 'no': 35, 'maybe': 11}
 avg question chars: 94.8 | avg context chars: 1331.3
[CLF train] n=799
 label dist: {'no': 270, 'yes': 441, 'maybe': 88}
 avg question chars: 94.9 | avg context chars: 1329.1
[CLF val] n=99
 label dist: {'yes': 55, 'maybe': 11, 'no': 33}
 avg question chars: 93.0 | avg context chars: 1392.6
[CLF test] n=102
 label dist: {'no': 35, 'yes': 56, 'maybe': 11}
 avg question chars: 90.0 | avg context chars: 1409.6
