usage: ipykernel_launcher.py [-h] [--limit LIMIT] [--samples SAMPLES]
ipykernel_launcher.py: error: unrecognized arguments: -f /Users/vikkyfury/Library/Jupyter/runtime/kernel-e200e57c-425b-48e7-9264-d3c058f1953d.json


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [12]:
def read_json_safe(raw: bytes) -> Optional[Dict[str, Any]]:
    try:
        return orjson.loads(raw)
    except Exception:
        try:
            return json.loads(raw.decode("utf-8", errors="ignore"))
        except Exception:
            return None

def extract_plain_text(example: Dict[str, Any]) -> str:
    # 1) common direct fields
    for k in ["transcript", "text"]:
        v = example.get(k)
        if isinstance(v, str):
            return v
    # 2) list-like turns
    for k in ["dialog", "dialogue", "turns", "utterances"]:
        v = example.get(k)
        if isinstance(v, list):
            parts = []
            for item in v:
                if isinstance(item, dict):
                    for tk in ("text", "utterance", "content"):
                        tv = item.get(tk)
                        if isinstance(tv, str):
                            parts.append(tv)
                elif isinstance(item, str):
                    parts.append(item)
            if parts:
                return "\n".join(parts)
    # 3) fallback: join any reasonable strings
    strings = []
    for k, v in example.items():
        if isinstance(v, str) and 5 <= len(v) <= 20000:
            strings.append(v)
    return "\n".join(strings)

def iter_zip_json_records_with_names(zip_path: Path):
    """Yield (member_name, json_obj) pairs for each JSON inside the zip."""
    with zipfile.ZipFile(zip_path, "r") as zf:
        for name in zf.namelist():
            if not name.endswith(".json"):
                continue
            try:
                with zf.open(name) as f:
                    raw = f.read()
                obj = read_json_safe(raw)
                if not obj:
                    logging.warning(f"SKIP malformed JSON: {name}")
                    continue
                yield name, obj
            except Exception as e:
                logging.warning(f"SKIP error reading {name}: {e}")
                continue

def norm(s: Optional[str]) -> str:
    return " ".join(str(s or "").strip().split())

def is_target(domain: Optional[str], topic: Optional[str], fname: str) -> bool:
    """True iff this record belongs to the exact Medicare_inbound (USA- USA) bucket."""
    d = norm(domain)
    t = norm(topic)
    if d == TARGET_LABEL or t == TARGET_LABEL:
        return True
    return TARGET_LABEL in fname


In [13]:
LIMIT = 5000    # adjust as needed; set higher for more rows
SAMPLES = 50    # how many raw .txt samples to dump for quick reading

rows = []
skipped = 0
sample_dir = PROCESSED_DIR / "medicare_inbound_samples"
sample_dir.mkdir(parents=True, exist_ok=True)

for zname in ZIP_FILES:
    print(f"Downloading {zname} ...")
    local = hf_hub_download(REPO_ID, filename=zname, repo_type="dataset")
    zpath = Path(local)
    print(f"Scanning {zpath.name} ...")

    for fname, rec in tqdm(iter_zip_json_records_with_names(zpath), desc=f"scan {zpath.name}"):
        domain = rec.get("domain") or rec.get("industry") or rec.get("category")
        topic  = rec.get("topic")  or rec.get("subtopic")

        if not is_target(domain, topic, fname):
            continue

        text = extract_plain_text(rec)
        if not text:
            skipped += 1
            continue

        row = {
            "source_zip": zpath.name,
            "source_file": fname,
            "domain": domain,
            "topic": topic,
            "text": text[:50000]  # safety cap
        }
        rows.append(row)

        if len(rows) <= SAMPLES:
            out_name = f"sample_{len(rows):04d}__{Path(fname).stem[:60]}.txt"
            (sample_dir / out_name).write_text(text[:100000], encoding="utf-8")

        if len(rows) >= LIMIT:
            break
    if len(rows) >= LIMIT:
        break

if not rows:
    print("No matches found for EXACT label 'Medicare_inbound (USA- USA)'. Confirm the label in metadata/file paths.")
else:
    df = pd.DataFrame(rows)
    out_parquet = PROCESSED_DIR / "medicare_inbound_usaUSA.parquet"
    out_csv = PROCESSED_DIR / "medicare_inbound_usaUSA_preview.csv"
    df.to_parquet(out_parquet, index=False)
    df.head(200).to_csv(out_csv, index=False)

    print(f"Saved {len(rows)} rows -> {out_parquet}")
    print(f"Preview (first 200) -> {out_csv}")
    print(f"Raw text samples ({min(SAMPLES,len(rows))}) -> {sample_dir}")
    print(f"Skipped empty-text records: {skipped}")


Downloading medicare_inbound.zip ...
Scanning medicare_inbound.zip ...


scan medicare_inbound.zip: 0it [00:00, ?it/s]

No matches found for EXACT label 'Medicare_inbound (USA- USA)'. Confirm the label in metadata/file paths.


In [14]:
if 'df' in locals():
    display(df.head(3))
    print("\nDomain values (top 5):")
    print(df['domain'].value_counts(dropna=False).head(5))
    print("\nTopic values (top 5):")
    print(df['topic'].value_counts(dropna=False).head(5))
else:
    print("Nothing collected yet. Re-run the extraction cell above.")


Nothing collected yet. Re-run the extraction cell above.
