# In [1]:
# file: make_llm_events_jsonl.py
import json, sys, argparse, pathlib

def stream_jsonl(path: pathlib.Path):
    """Lazily yield one parsed JSON value per non-blank line of *path*.

    The file is decoded as UTF-8. A leading byte-order mark, if present, is
    discarded so it cannot break parsing of the first record. Lines that are
    empty after stripping surrounding whitespace are skipped.

    Args:
        path: Location of the JSON Lines file to read.

    Yields:
        The value returned by ``json.loads`` for each non-blank line, in
        file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and the 1-based physical line number of the bad line.
    """
    # "utf-8-sig" decodes exactly like "utf-8" except that it drops a leading
    # BOM; str.strip() does not remove U+FEFF, so json.loads would reject it.
    with path.open("r", encoding="utf-8-sig") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            # Parse inside the try, yield outside it, so exceptions thrown
            # into the generator by the consumer are never intercepted here.
            try:
                record = json.loads(line)
            except json.JSONDecodeError as err:
                raise json.JSONDecodeError(
                    f"{err.msg} (in {path}, file line {lineno})",
                    err.doc,
                    err.pos,
                ) from err
            yield record

def write_jsonl(records, out_path: pathlib.Path):
    """Serialize *records* to *out_path* as JSON Lines, one value per line.

    Missing parent directories of *out_path* are created first. The file is
    opened in write mode (replacing existing content) and encoded as UTF-8;
    non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        records: Iterable of JSON-serializable values; consumed once, in order.
        out_path: Destination file.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as sink:
        # The generator is consumed lazily, so records are serialized and
        # written one at a time, after the file has been opened.
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )

def extract_events_titles_descriptions(obj):
    """Yield a ``{"title", "description"}`` dict for each event in *obj*.

    *obj* is read through ``obj.get("events")``; a missing or falsy value is
    treated as "no events". For each event, a missing or falsy ``title`` /
    ``description`` becomes ``""``; otherwise the value is stripped of
    surrounding whitespace.

    Args:
        obj: Mapping that may carry an ``"events"`` iterable of mappings.

    Yields:
        A new dict with exactly the keys ``"title"`` and ``"description"``,
        in that order, one per event.
    """
    wanted_fields = ("title", "description")
    for event in obj.get("events") or []:
        # `or ""` maps None/empty values to an empty string before stripping.
        yield {name: (event.get(name) or "").strip() for name in wanted_fields}

# In [10]:
# Input pattern (resolved against the current working directory) and the
# destination file for the de-duplicated events.
in_glob = "./markets/2024-10-23.jsonl"
out = "./markets.jsonl"

# Collect the matching files in sorted order so the processing order is stable.
in_paths = list(pathlib.Path().glob(in_glob))
in_paths.sort()
if not in_paths:
    print(f"No files matched: {in_glob}", file=sys.stderr)
    sys.exit(2)

seen = set()       # titles already emitted
out_records = []   # first event seen for each distinct title, in input order

for p in in_paths:
    for obj in stream_jsonl(p):
        for ev in extract_events_titles_descriptions(obj):
            # De-duplicate on title; switch to ev["id"] if that field is
            # ever included in the extracted record.
            key = ev["title"]
            if key not in seen:
                seen.add(key)
                out_records.append(ev)

write_jsonl(out_records, pathlib.Path(out))
