In [67]:
import json

# Read the whole QA dataset into `data` (later cells in this notebook use it).
with open("qa_dataset_single.json", "r", encoding="utf-8") as dataset_file:
    data = json.load(dataset_file)

# Report the dataset size, then pretty-print the first record as a sample.
print("Total QA pairs:", len(data))
print("Example item:")
example_item = data[0]
print(json.dumps(example_item, indent=2, ensure_ascii=False))


Total QA pairs: 236
Example item:
{
  "id": "q_0000",
  "chunk_id": "Potato Mine_See Also_0_3bc6f05c188b",
  "question": "What is the recurring character in Plants vs. Zombies?",
  "answer": "Cherry Bomb",
  "source_title": "Potato Mine",
  "source_text": "Game Entity: Potato Mine\nSection: See Also\nContent:\n* Popcorn\n* Primal Potato Mine\n* Escape Root\n|  |  |  |\n| --- | --- | --- |\n| V **·** E | **Recurring characters** |  |\n| Plants | |\n| --- | --- |\n| *Plants vs. Zombies* | Peashooter **·** Sunflower **·** Cherry Bomb **·** Wall-nut **·** **Potato Mine** **·** Snow Pea **·** Chomper **·** Repeater **·** Puff-shroom **·** Sun-shroom **·** Fume-shroom **·** Grave Buster **·** Hypno-shroom **·** Scaredy-shroom **·** Ice-shroom **·** Doom-shroom **·** Lily Pad **·** Squash **·** Threepeater **·** Tan"
}


In [52]:
# Reload the dataset from disk so the ids are assigned to what is stored there.
with open("qa_dataset_single.json", "r", encoding="utf-8") as src:
    data = json.load(src)

print("Before adding id, sample item:")
print(data[0].keys())

# Give every record a positional id: q_0000, q_0001, ... (zero-padded to 4 digits).
for i in range(len(data)):
    data[i]["id"] = f"q_{i:04d}"

# NOTE: this overwrites the input file in place with the id-augmented records.
with open("qa_dataset_single.json", "w", encoding="utf-8") as dst:
    json.dump(data, dst, ensure_ascii=False, indent=2)

print("After adding id, sample item:")
print(data[0])


Before adding id, sample item:
dict_keys(['chunk_id', 'source_title', 'question', 'answer', 'source_text'])
After adding id, sample item:
{'chunk_id': 'Potato Mine_See Also_0_3bc6f05c188b', 'source_title': 'Potato Mine', 'question': 'What is the recurring character in Plants vs. Zombies?', 'answer': 'Cherry Bomb', 'source_text': 'Game Entity: Potato Mine\nSection: See Also\nContent:\n* Popcorn\n* Primal Potato Mine\n* Escape Root\n|  |  |  |\n| --- | --- | --- |\n| V **·** E | **Recurring characters** |  |\n| Plants | |\n| --- | --- |\n| *Plants vs. Zombies* | Peashooter **·** Sunflower **·** Cherry Bomb **·** Wall-nut **·** **Potato Mine** **·** Snow Pea **·** Chomper **·** Repeater **·** Puff-shroom **·** Sun-shroom **·** Fume-shroom **·** Grave Buster **·** Hypno-shroom **·** Scaredy-shroom **·** Ice-shroom **·** Doom-shroom **·** Lily Pad **·** Squash **·** Threepeater **·** Tan', 'id': 'q_0000'}


In [None]:
import re

def is_trivial_qa(item):
    """Return True when a QA pair matches one of the hand-written "trivial" patterns.

    Both the question and the answer are stripped and lower-cased before
    matching. A pair is considered trivial when any of these holds:

    * the question starts with one of the "what is the name of ..." prefixes;
    * the question contains "which section" and "find", and the answer is one
      of a small set of generic section names (or "?");
    * the question contains both "view the gallery" and "game entity";
    * the question contains "where can one view" and the answer contains
      "visit this page".

    Args:
        item: mapping with string values under "question" and "answer".

    Returns:
        bool: True if the pair should be treated as trivial, else False.
    """
    question = item["question"].strip().lower()
    answer = item["answer"].strip().lower()

    # str.startswith accepts a tuple, so one call checks every prefix.
    name_prefixes = (
        "what is the name of the game",
        "what is the name of the entity",
        "what is the name of the section",
        "what is the name of this level",
        "what is the name of this mode",
    )
    if question.startswith(name_prefixes):
        return True

    generic_section_answers = {"waves", "trivia", "overview", "gallery", "description", "?"}
    # The "can you find" test is subsumed by the "find" test; kept as in the original.
    mentions_find = "find" in question or "can you find" in question
    if "which section" in question and mentions_find and answer in generic_section_answers:
        return True

    if "view the gallery" in question and "game entity" in question:
        return True

    return "where can one view" in question and "visit this page" in answer


In [69]:
len_before = len(data)

# Split `data` into the pairs to keep and the pairs flagged as trivial,
# preserving the original order inside each list.
kept, removed = [], []
for item in data:
    (removed if is_trivial_qa(item) else kept).append(item)

print("Before filter:", len_before)
print("After filter (kept):", len(kept))
print("Removed (candidate):", len(removed))


Before filter: 236
After filter (kept): 236
Removed (candidate): 0


In [None]:
# (label, key) pairs printed for each previewed item, in display order.
preview_fields = (
    ("id:", "id"),
    ("source_title:", "source_title"),
    ("question:", "question"),
    ("answer:", "answer"),
)

print("Preview of removed items (first 10):")
for x in removed[:10]:
    print("-" * 80)
    # .get() prints None for a missing field instead of raising.
    for label, key in preview_fields:
        print(label, x.get(key))

Preview of removed items (first 10):


In [None]:
import json
import re
from collections import Counter
from pathlib import Path
from statistics import quantiles
from typing import List, Dict, Any

# Input dataset (a JSON array). main() in this cell reads it and later
# overwrites it with the cleaned records.
DATA_PATH = Path("qa_dataset_single.json")
# JSONL output: the annotated items that main() did NOT classify as suspicious.
ANNOTATED_PATH = Path("qa_dataset_annotated.jsonl")
# JSONL output: the annotated items that main() classified as suspicious.
SUSPICIOUS_PATH = Path("qa_suspicious.jsonl")

# Tokens dropped by content_tokens() before the answer/context overlap is computed.
STOPWORDS = {
    "the", "a", "an", "of", "to", "and", "or", "in", "on", "at",
    "for", "from", "with", "by", "as", "is", "are", "was", "were",
    "it", "this", "that", "these", "those", "you", "your", "my",
}

# Tokens that has_pronoun_ambiguity() counts as context-dependent references.
# Besides pronouns, the set also holds words such as "last", "previous",
# "before" and "same".
PRONOUNS = {
    "this", "that", "it", "they", "them", "there", "here",
    "last", "previous", "before", "same",
}

# Tokens whose presence in a question makes has_pronoun_ambiguity() treat the
# question as naming an entity, i.e. not ambiguous.
ENTITY_HINTS = {
    "zombie", "zombies", "plant", "plants", "level", "day",
    "modern", "heian", "peashooter", "sunflower", "world",
    "stage", "wave", "flag",
}

def load_qa(path: Path) -> List[Dict[str, Any]]:
    """Read the UTF-8 JSON file at ``path`` and return its decoded content.

    Args:
        path: location of the JSON file.

    Returns:
        Whatever the JSON document decodes to (annotated as a list of dicts).
    """
    return json.loads(path.read_text(encoding="utf-8"))


def save_jsonl(path: Path, items: List[Dict[str, Any]]) -> None:
    """Write ``items`` to ``path`` as JSON Lines (one JSON object per line).

    The file is opened in write mode, so existing content is replaced.
    Non-ASCII characters are written as-is (UTF-8), not escaped.

    Args:
        path: destination file.
        items: records to serialize, one per output line.
    """
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in items
        )


def normalize_text(s: str) -> str:
    """Lower-case ``s`` and reduce it to space-separated ``[a-z0-9]`` words.

    Every run of characters other than a-z, 0-9 or a space becomes a single
    space; whitespace runs are then collapsed and the ends trimmed.

    Args:
        s: text to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    lowered = s.lower()
    alnum_only = re.sub(r"[^a-z0-9 ]+", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()


def tokenize(s: str) -> List[str]:
    """Split ``s`` into normalized word tokens.

    The text is passed through normalize_text() and then split on whitespace.

    Args:
        s: text to tokenize.

    Returns:
        List of non-empty tokens, in order of appearance.
    """
    # str.split() with no separator never yields empty strings, so no extra
    # filtering is needed.
    return normalize_text(s).split()


def content_tokens(tokens: List[str]) -> List[str]:
    """Return ``tokens`` with every member of STOPWORDS removed.

    Order and duplicates of the remaining tokens are preserved.
    """
    remaining = []
    for token in tokens:
        if token not in STOPWORDS:
            remaining.append(token)
    return remaining

def compute_overlap_ratio(answer: str, context: str) -> float:
    """Fraction of the answer's content tokens that also occur in the context.

    Both strings are tokenized and stripped of stopwords. Repeated answer
    tokens are counted each time they appear; context tokens are treated as
    a set.

    Args:
        answer: answer text.
        context: source text the answer is compared against.

    Returns:
        A value in [0.0, 1.0]; 0.0 when the answer has no content tokens.
    """
    answer_terms = content_tokens(tokenize(answer))
    context_vocab = set(content_tokens(tokenize(context)))

    if len(answer_terms) == 0:
        return 0.0

    matched = [term for term in answer_terms if term in context_vocab]
    return len(matched) / len(answer_terms)


def has_pronoun_ambiguity(question: str) -> bool:
    """Tell whether ``question`` uses a reference word without naming an entity.

    Returns True when at least one token of the question is in PRONOUNS and
    no token is in ENTITY_HINTS; otherwise False.
    """
    token_set = set(tokenize(question))

    uses_reference_word = not token_set.isdisjoint(PRONOUNS)
    names_entity = not token_set.isdisjoint(ENTITY_HINTS)

    return uses_reference_word and not names_entity


def is_meta_like_question(question: str) -> bool:
    """Tell whether ``question`` contains one of the page/navigation phrases.

    The check is a case-insensitive substring match against a fixed list of
    phrases (e.g. "this page", "which section", "click").

    Args:
        question: question text.

    Returns:
        True if any phrase occurs in the lower-cased question, else False.
    """
    meta_keywords = [
        "this page", "visit this page", "view the gallery",
        "gallery of the game entity", "gallery of the entity",
        "which section", "in which section", "page of the game entity",
        "link to this", "click", "see it here"
    ]
    lowered = question.lower()
    for phrase in meta_keywords:
        if phrase in lowered:
            return True
    return False


def is_answer_too_short(answer: str, a_tokens: List[str], overlap_ratio: float) -> bool:
    """Decide whether a one- or two-token answer should be flagged as too short.

    Answers with more than two tokens are never flagged. A shorter answer is
    flagged unless one of these exemptions applies:

    * its normalized form consists only of digits and spaces;
    * the raw answer contains a "%" sign;
    * its normalized form is exactly "yes" or "no";
    * ``overlap_ratio`` is at least 0.6.

    Args:
        answer: raw answer text.
        a_tokens: tokens of the answer.
        overlap_ratio: answer/context overlap value supplied by the caller.

    Returns:
        True if the answer is short and not exempt, else False.
    """
    if len(a_tokens) > 2:
        return False

    normalized = normalize_text(answer)
    is_exempt = (
        re.fullmatch(r"[0-9 ]+", normalized) is not None
        or "%" in answer
        or normalized in {"yes", "no"}
        or overlap_ratio >= 0.6
    )
    return not is_exempt

def analyze_qa_items(items: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach heuristic quality metrics and flags to every QA item.

    Each item dict is modified in place: an "analysis" dict (token counts,
    overlap ratio, number of questions sharing the item's chunk) and a
    "flags" list are added. The same dict objects are returned in a new list.

    Flags, in the order they are appended:
        q_too_short       - question has fewer than 4 tokens
        a_too_short       - see is_answer_too_short()
        low_overlap / medium_overlap / good_overlap
                          - overlap ratio < 0.3 / < 0.6 / otherwise
        pronoun_ambiguous - see has_pronoun_ambiguity()
        meta_like         - see is_meta_like_question()
        chunk_dense       - the item's chunk has at least `dense_threshold`
                            questions

    Args:
        items: records with "chunk_id", "question", "answer", "source_text".

    Returns:
        List of the same (now annotated) item dicts, in input order.
    """
    chunk_counter = Counter(item["chunk_id"] for item in items)

    # Default threshold; with at least 4 distinct chunks it is derived from
    # the third quartile of questions-per-chunk, but never drops below 5.
    dense_threshold = 5
    if len(chunk_counter) >= 4:
        third_quartile = quantiles(list(chunk_counter.values()), n=4)[2]
        dense_threshold = max(5, int(round(third_quartile)) + 1)

    print(f"[Info] Chunk dense threshold: >= {dense_threshold}")

    annotated = []
    for item in items:
        question, answer, context = item["question"], item["answer"], item["source_text"]

        question_tokens = tokenize(question)
        answer_tokens = tokenize(answer)
        overlap = compute_overlap_ratio(answer, context)
        chunk_size = chunk_counter[item["chunk_id"]]

        # Exactly one overlap bucket applies to every item.
        if overlap < 0.3:
            overlap_flag = "low_overlap"
        elif overlap < 0.6:
            overlap_flag = "medium_overlap"
        else:
            overlap_flag = "good_overlap"

        # (flag, applies?) pairs listed in the order flags must appear.
        candidate_flags = [
            ("q_too_short", len(question_tokens) < 4),
            ("a_too_short", is_answer_too_short(answer, answer_tokens, overlap)),
            (overlap_flag, True),
            ("pronoun_ambiguous", has_pronoun_ambiguity(question)),
            ("meta_like", is_meta_like_question(question)),
            ("chunk_dense", chunk_size >= dense_threshold),
        ]

        # "analysis" is inserted before "flags" so serialized key order is stable.
        item["analysis"] = {
            "q_len": len(question_tokens),
            "a_len": len(answer_tokens),
            "overlap_ratio": overlap,
            "chunk_size": chunk_size,
        }
        item["flags"] = [name for name, applies in candidate_flags if applies]
        annotated.append(item)

    return annotated


def main():
    """Annotate the QA dataset, split off suspicious items, and persist the results.

    Steps:
      1. Load the records from DATA_PATH and annotate them with
         analyze_qa_items().
      2. Treat an item as suspicious when its flags include "low_overlap",
         "pronoun_ambiguous" or "meta_like"; all other items are "cleaned".
      3. Write the cleaned annotated items to ANNOTATED_PATH and the
         suspicious ones to SUSPICIOUS_PATH (both JSONL).
      4. Overwrite DATA_PATH with the cleaned items, reduced to their
         original fields (no "analysis"/"flags").
      5. Print how often each flag occurs across all annotated items.

    Raises:
        KeyError: if a cleaned item lacks one of the fields copied in step 4
            (e.g. "id").
    """
    items = load_qa(DATA_PATH)
    print("Loaded items:", len(items))

    annotated = analyze_qa_items(items)

    # Partition in a single pass. The previous `item not in suspicious` filter
    # rescanned the suspicious list for every item and compared dicts by
    # value (quadratic); the resulting lists are the same.
    suspicious_flags = {"low_overlap", "pronoun_ambiguous", "meta_like"}
    suspicious = []
    cleaned = []
    for item in annotated:
        if suspicious_flags.intersection(item["flags"]):
            suspicious.append(item)
        else:
            cleaned.append(item)

    save_jsonl(ANNOTATED_PATH, cleaned)
    print(f"Saved annotated (cleaned) → {ANNOTATED_PATH}, count={len(cleaned)}")

    save_jsonl(SUSPICIOUS_PATH, suspicious)
    print(f"Saved suspicious → {SUSPICIOUS_PATH}, count={len(suspicious)}")

    # Strip the analysis fields again and put "id" first in the stored records.
    cleaned_plain = [
        {
            "id": item["id"],
            "chunk_id": item["chunk_id"],
            "question": item["question"],
            "answer": item["answer"],
            "source_title": item["source_title"],
            "source_text": item["source_text"],
        }
        for item in cleaned
    ]
    # NOTE: destructive - the input dataset file is replaced by the cleaned subset.
    DATA_PATH.write_text(json.dumps(cleaned_plain, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[FINAL] Overwritten cleaned dataset → {DATA_PATH} (count={len(cleaned_plain)})")

    print("\nFlag summary (based on original annotated):")
    flag_counter = Counter(flag for item in annotated for flag in item["flags"])
    for f, c in flag_counter.most_common():
        print(f"  {f}: {c}")


if __name__ == "__main__":
    main()


In [None]:
from pprint import pprint

# Persist both halves of the trivial-question split produced earlier.
outputs = (
    ("qa_dataset_filtered.json", kept),
    ("qa_removed_preview.json", removed),
)
for out_name, records in outputs:
    with open(out_name, "w", encoding="utf-8") as f:
        json.dump(records, f, ensure_ascii=False, indent=2)

print("Saved qa_dataset_filtered.json and qa_removed_preview.json.")

# Read the filtered file back to confirm what was actually written.
with open("qa_dataset_filtered.json", "r", encoding="utf-8") as f:
    filtered = json.load(f)

print("Filtered QA count:", len(filtered))
pprint(filtered[0])


Saved qa_dataset_filtered.json and qa_removed_preview.json.
Filtered QA count: 236
{'answer': 'Cherry Bomb',
 'chunk_id': 'Potato Mine_See Also_0_3bc6f05c188b',
 'id': 'q_0000',
 'question': 'What is the recurring character in Plants vs. Zombies?',
 'source_text': 'Game Entity: Potato Mine\n'
                'Section: See Also\n'
                'Content:\n'
                '* Popcorn\n'
                '* Primal Potato Mine\n'
                '* Escape Root\n'
                '|  |  |  |\n'
                '| --- | --- | --- |\n'
                '| V **·** E | **Recurring characters** |  |\n'
                '| Plants | |\n'
                '| --- | --- |\n'
                '| *Plants vs. Zombies* | Peashooter **·** Sunflower **·** '
                'Cherry Bomb **·** Wall-nut **·** **Potato Mine** **·** Snow '
                'Pea **·** Chomper **·** Repeater **·** Puff-shroom **·** '
                'Sun-shroom **·** Fume-shroom **·** Grave Buster **·** '
                'Hypno-shro

In [None]:
import os

# Promote the filtered dataset to be the working dataset, replacing the old
# file. This used to be the IPython automagic alias `mv`, which is not Python
# and fails when automagic is disabled or on platforms without that alias;
# os.replace does the same overwrite-rename portably.
os.replace("qa_dataset_filtered.json", "qa_dataset_single.json")

In [None]:
import json
import re
from collections import Counter
from pathlib import Path
from typing import List, Dict, Any

# Redeclares the dataset path with the same value as the earlier analysis cell.
DATA_PATH = Path("qa_dataset_single.json")

def load_qa(path: Path) -> List[Dict[str, Any]]:
    """Read the UTF-8 JSON file at ``path`` and return its decoded content.

    Note: a function with this name is also defined in an earlier cell of
    this notebook; whichever cell runs last wins.
    """
    with path.open("r", encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)

def save_jsonl(path: Path, items: List[Dict[str, Any]]):
    """Write ``items`` to ``path`` as JSON Lines, replacing existing content.

    Each record is serialized with ``ensure_ascii=False`` and terminated by a
    newline.

    Note: a function with this name is also defined in an earlier cell of
    this notebook; whichever cell runs last wins.
    """
    with path.open("w", encoding="utf-8") as sink:
        rows = (json.dumps(record, ensure_ascii=False) + "\n" for record in items)
        for row in rows:
            sink.write(row)

def normalize_text(s: str) -> str:
    """Lower-case ``s`` and reduce it to space-separated ``[a-z0-9]`` words.

    Two substitutions are applied in order, each replacing its matches with a
    single space: first runs of characters other than a-z, 0-9 or space, then
    runs of whitespace. The result is stripped.

    Note: a function with this name is also defined in an earlier cell of
    this notebook; whichever cell runs last wins.
    """
    text = s.lower()
    for pattern in (r"[^a-z0-9 ]+", r"\s+"):
        text = re.sub(pattern, " ", text)
    return text.strip()

def tokenize(s: str) -> List[str]:
    """Return the whitespace-separated tokens of ``normalize_text(s)``.

    Note: a function with this name is also defined in an earlier cell of
    this notebook; whichever cell runs last wins.
    """
    tokens = []
    for piece in normalize_text(s).split():
        if piece:
            tokens.append(piece)
    return tokens


In [None]:
import json
from collections import Counter, defaultdict
from pathlib import Path

# JSONL paths, same values as declared in the earlier analysis cell. Only
# SUSPICIOUS_PATH is read by main() in this cell.
ANNOTATED_PATH = Path("qa_dataset_annotated.jsonl")
SUSPICIOUS_PATH = Path("qa_suspicious.jsonl")

def load_jsonl(path: Path):
    """Read a UTF-8 JSON Lines file and return its records as a list.

    Lines that are empty or contain only whitespace are skipped; every other
    line is decoded with ``json.loads``.

    Args:
        path: location of the JSONL file.

    Returns:
        List of decoded records, in file order.
    """
    with path.open("r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def main():
    """Summarize the suspicious QA file and print sample items per flag.

    Loads SUSPICIOUS_PATH, counts how many items carry each of the flags
    "low_overlap", "pronoun_ambiguous" and "meta_like", then prints up to 20
    example items for each of those flags. An item carrying several of the
    flags is counted, and may be shown, under each of them.

    Note: this definition replaces the `main` defined in the earlier analysis
    cell once this cell has been run.
    """
    suspicious = load_jsonl(SUSPICIOUS_PATH)
    print("Total suspicious QA:", len(suspicious))

    interesting_flags = {"low_overlap", "pronoun_ambiguous", "meta_like"}
    flag_counter = Counter()
    examples_by_flag = defaultdict(list)

    # One pass gathers both the counts and the (capped) example lists.
    for item in suspicious:
        for flag in item["flags"]:
            if flag not in interesting_flags:
                continue
            flag_counter[flag] += 1
            if len(examples_by_flag[flag]) < 20:
                examples_by_flag[flag].append(item)

    print("Flag counts in suspicious set:")
    for f, c in flag_counter.most_common():
        print(f"  {f}: {c}")

    # Sections appear in the order each flag was first encountered.
    for f, items in examples_by_flag.items():
        print("\n" + "=" * 20, f, "=" * 20)
        for it in items:
            print("id:", it.get("id"))
            print("question:", it["question"])
            print("answer:", it["answer"])
            print("flags:", it["flags"])
            print("-" * 60)

if __name__ == "__main__":
    main()


Total suspicious QA: 38
Flag counts in suspicious set:
  pronoun_ambiguous: 31
  meta_like: 7
  low_overlap: 3

id: q_0008
question: Where should the Spikerocks be placed according to this strategy?
answer: In front of the pre-placed Spikeweeds
flags: ['good_overlap', 'pronoun_ambiguous']
------------------------------------------------------------
id: q_0019
question: What does the player get when they use the Sun-Shroom?
answer: Sun-shroom
flags: ['good_overlap', 'pronoun_ambiguous']
------------------------------------------------------------
id: q_0020
question: [Grape Power] Which set does it belong to?
answer: Colossal
flags: ['good_overlap', 'pronoun_ambiguous']
------------------------------------------------------------
id: q_0036
question: In which section can you find strategies for this game?
answer: Strategies
flags: ['good_overlap', 'pronoun_ambiguous', 'meta_like']
------------------------------------------------------------
id: q_0046
question: How many dinosaurs are th

In [None]:
import json

# Rewrite qa_suspicious.jsonl so that "id" is the first key of every record.
path = "qa_suspicious.jsonl"

with open(path, "r", encoding="utf-8") as fin:
    lines = fin.readlines()

# Build the complete output BEFORE reopening the file for writing. Opening
# with "w" truncates immediately, so parsing inside the write block would
# leave the file partially written if any line were malformed or lacked "id".
reordered_lines = []
for line in lines:
    line = line.strip()
    if not line:
        continue  # blank lines are dropped from the rewritten file

    obj = json.loads(line)

    # "id" first, then the remaining keys in their original order.
    new_obj = {"id": obj["id"], **{k: v for k, v in obj.items() if k != "id"}}
    reordered_lines.append(json.dumps(new_obj, ensure_ascii=False) + "\n")

with open(path, "w", encoding="utf-8") as fout:
    fout.writelines(reordered_lines)

print("Done. Output saved to:", path)


Done. Output saved to: qa_suspicious.jsonl
