# PII NER Baseline Models

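End-to-end experiments for MiniLM (currently the `microsoft/MiniLM-L12-H384-uncased` checkpoint), ELECTRA-small, and MobileBERT on the noisy STT PII dataset. Only the MiniLM entry is active in the model list below; the ELECTRA-small and MobileBERT entries are commented out. Assumes `data/train.jsonl` and `data/dev.jsonl` already exist.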


In [11]:
import os
import sys
import json
import re
import subprocess
import shlex
from pathlib import Path
import pandas as pd

# Project root. Defaults to the original checkout location; set the
# PII_NER_BASE_DIR environment variable to run the notebook from a different
# checkout without editing this cell.
BASE_DIR = Path(
    os.environ.get("PII_NER_BASE_DIR", "/Users/kolosus/Downloads/pii_ner_assignment")
).expanduser().resolve()
DATA_DIR = BASE_DIR / "data"
OUT_DIR = BASE_DIR / "out"
# Created eagerly so later cells can write models, predictions and summaries.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Sanity report: confirms the resolved root, the interpreter in use, and that
# the two dataset files the rest of the notebook reads are present.
print("Base dir:", BASE_DIR)
print("Python:", sys.version)
print("Train file exists:", (DATA_DIR / "train.jsonl").exists())
print("Dev file exists:", (DATA_DIR / "dev.jsonl").exists())


Base dir: /Users/kolosus/Downloads/pii_ner_assignment
Python: 3.13.2 (v3.13.2:4f8bb3947cf, Feb  4 2025, 11:51:10) [Clang 15.0.0 (clang-1500.3.9.4)]
Train file exists: True
Dev file exists: True


In [12]:
def run_cmd(cmd, cwd=None, env=None, timeout=None):
    """Run a command (without a shell), echo its output, and return its stdout.

    Args:
        cmd: Either a single command-line string, which is tokenised with
            ``shlex.split``, or an already-split sequence of arguments.
        cwd: Working directory for the child process. Falls back to
            ``BASE_DIR`` when falsy.
        env: Environment mapping handed to ``subprocess.run``; ``None`` inherits
            the current environment.
        timeout: Seconds to wait before ``subprocess.run`` raises
            ``subprocess.TimeoutExpired``; ``None`` waits indefinitely.

    Returns:
        The captured standard output of the command as text.

    Raises:
        RuntimeError: If the command exits with a non-zero return code.
    """
    if isinstance(cmd, str):
        argv = shlex.split(cmd)
        shown = cmd
    else:
        # Pre-split argument lists avoid any quoting concerns for the caller.
        argv = [str(part) for part in cmd]
        shown = shlex.join(argv)

    print(f"\n>> {shown}\n")
    process = subprocess.run(
        argv,
        cwd=str(cwd or BASE_DIR),
        env=env,
        capture_output=True,
        text=True,
        timeout=timeout,
    )
    print(process.stdout)
    # Surface stderr even on success: warnings and progress output written
    # there would otherwise be silently discarded.
    if process.stderr:
        print(process.stderr, file=sys.stderr)
    if process.returncode != 0:
        raise RuntimeError(
            f"Command failed with exit code {process.returncode}: {shown}"
        )
    return process.stdout


def parse_eval_stdout(stdout: str):
    """Pull the metrics out of the evaluation script's text report.

    Args:
        stdout: Full text printed by the evaluation command.

    Returns:
        A dict with ``per_entity`` (label -> ``{"P", "R", "F1"}`` floats),
        ``macro_f1``, and the ``pii_*`` / ``non_*`` precision, recall and F1
        values. Any summary line that is not found yields ``None`` entries;
        ``per_entity`` is empty when no per-label lines match.
    """

    def _prf(found):
        # Groups 1..3 carry P, R and F1; an absent line maps to three Nones.
        if not found:
            return (None, None, None)
        return tuple(float(found.group(idx)) for idx in range(1, 4))

    entity_scores = {}
    for raw_line in stdout.splitlines():
        hit = re.match(r"^([A-Z_]+)\s+P=(\d+\.\d+)\s+R=(\d+\.\d+)\s+F1=(\d+\.\d+)", raw_line.strip())
        if hit is None:
            continue
        label, precision, recall, f1 = hit.groups()
        entity_scores[label] = {"P": float(precision), "R": float(recall), "F1": float(f1)}

    macro_hit = re.search(r"Macro-F1:\s+(\d+\.\d+)", stdout)
    if macro_hit:
        macro_value = float(macro_hit.group(1))
    else:
        macro_value = None

    pii_p, pii_r, pii_f1 = _prf(
        re.search(r"PII-only metrics:\s+P=(\d+\.\d+)\s+R=(\d+\.\d+)\s+F1=(\d+\.\d+)", stdout)
    )
    non_p, non_r, non_f1 = _prf(
        re.search(r"Non-PII metrics:\s+P=(\d+\.\d+)\s+R=(\d+\.\d+)\s+F1=(\d+\.\d+)", stdout)
    )

    return {
        "per_entity": entity_scores,
        "macro_f1": macro_value,
        "pii_P": pii_p,
        "pii_R": pii_r,
        "pii_F1": pii_f1,
        "non_P": non_p,
        "non_R": non_r,
        "non_F1": non_f1,
    }


def parse_latency_stdout(stdout: str):
    """Read the p50 and p95 latencies (in ms) from the latency script's output.

    Args:
        stdout: Full text printed by the latency command.

    Returns:
        A ``(p50, p95)`` tuple of floats; an entry is ``None`` when its line is
        not present in ``stdout``.
    """

    def _millis(pattern):
        # First occurrence wins; the numeric capture is converted to float.
        found = re.search(pattern, stdout)
        if not found:
            return None
        return float(found.group(1))

    median_ms = _millis(r"p50:\s+([\d\.]+)\s*ms")
    tail_ms = _millis(r"p95:\s+([\d\.]+)\s*ms")
    return median_ms, tail_ms


In [20]:
# Dataset paths and hyperparameters shared by every command issued below.
common_args = dict(
    train=str(DATA_DIR / "train.jsonl"),
    dev=str(DATA_DIR / "dev.jsonl"),
    batch_size=8,
    epochs=4,
    lr=3.431218917561423e-05,
    max_length=192,
)

# (checkpoint name, sub-directory of OUT_DIR) for each model to run.
# NOTE(review): the active checkpoint is the L12 MiniLM variant while its output
# directory is named "minilm_l6h384" - confirm which one is intended.
model_specs = [
    ("microsoft/MiniLM-L12-H384-uncased", "minilm_l6h384"),
    # ("google/electra-small-discriminator", "electra_small"),
    # ("google/mobilebert-uncased", "mobilebert"),
]

models = [
    {"name": checkpoint, "out_dir": str(OUT_DIR / subdir)}
    for checkpoint, subdir in model_specs
]



In [21]:
# Fine-tune every configured model by invoking the project's training script.
for model_cfg in models:
    # Build the command as an argument list and serialise it with shlex.join so
    # that paths containing spaces or quotes survive run_cmd's shlex.split.
    train_argv = [
        "python3", str(BASE_DIR / "src/train.py"),
        "--model_name", str(model_cfg["name"]),
        "--train", str(common_args["train"]),
        "--dev", str(common_args["dev"]),
        "--out_dir", str(model_cfg["out_dir"]),
        "--batch_size", str(common_args["batch_size"]),
        "--epochs", str(common_args["epochs"]),
        "--lr", str(common_args["lr"]),
        "--max_length", str(common_args["max_length"]),
    ]
    cmd = shlex.join(train_argv)
    run_cmd(cmd)


>> python3 /Users/kolosus/Downloads/pii_ner_assignment/src/train.py       --model_name microsoft/MiniLM-L12-H384-uncased       --train /Users/kolosus/Downloads/pii_ner_assignment/data/train.jsonl       --dev /Users/kolosus/Downloads/pii_ner_assignment/data/dev.jsonl       --out_dir /Users/kolosus/Downloads/pii_ner_assignment/out/minilm_l6h384       --batch_size 8       --epochs 4       --lr 3.431218917561423e-05       --max_length 192

Epoch 1 average loss: 1.5836
Epoch 2 average loss: 0.4442
Epoch 3 average loss: 0.2859
Epoch 4 average loss: 0.2463
Saved model + tokenizer to /Users/kolosus/Downloads/pii_ner_assignment/out/minilm_l6h384



In [29]:
# Maps each model name to the JSON file holding its dev-set predictions.
pred_paths = {}

for model_cfg in models:
    # Predictions are written next to the model directory, named after it.
    model_dir = Path(model_cfg["out_dir"])
    pred_path = str(model_dir.parent / f"dev_pred_{model_dir.name}.json")
    # Build the command as an argument list and serialise it with shlex.join so
    # that paths containing spaces or quotes survive run_cmd's shlex.split.
    predict_argv = [
        "python3", str(BASE_DIR / "src/predict.py"),
        "--model_dir", str(model_cfg["out_dir"]),
        "--input", str(common_args["dev"]),
        "--output", pred_path,
        "--max_length", str(common_args["max_length"]),
        "--device", "cpu",
    ]
    cmd = shlex.join(predict_argv)
    run_cmd(cmd)
    pred_paths[model_cfg['name']] = pred_path



>> python3 /Users/kolosus/Downloads/pii_ner_assignment/src/predict.py       --model_dir /Users/kolosus/Downloads/pii_ner_assignment/out/minilm_l6h384       --input /Users/kolosus/Downloads/pii_ner_assignment/data/dev.jsonl       --output /Users/kolosus/Downloads/pii_ner_assignment/out/dev_pred_minilm_l6h384.json       --max_length 192       --device cpu

Wrote predictions for 180 utterances to /Users/kolosus/Downloads/pii_ner_assignment/out/dev_pred_minilm_l6h384.json



In [30]:
# One record per model: parsed metrics plus the raw evaluation report.
eval_results = []

for model_cfg in models:
    pred_path = pred_paths[model_cfg["name"]]
    # Build the command as an argument list and serialise it with shlex.join so
    # that paths containing spaces or quotes survive run_cmd's shlex.split.
    eval_argv = [
        "python3", str(BASE_DIR / "src/eval_span_f1.py"),
        "--gold", str(common_args["dev"]),
        "--pred", str(pred_path),
    ]
    cmd = shlex.join(eval_argv)
    stdout = run_cmd(cmd)
    parsed = parse_eval_stdout(stdout)
    eval_results.append({
        "model": model_cfg["name"],
        "out_dir": model_cfg["out_dir"],
        # Scalar metrics first, then the nested per-entity breakdown.
        **{k: v for k, v in parsed.items() if k != "per_entity"},
        "per_entity": parsed["per_entity"],
        "eval_stdout": stdout,
    })

# Compact comparison table, best PII F1 first (displayed as the cell output).
pd.DataFrame([
    {
        "model": r["model"],
        "macro_f1": r["macro_f1"],
        "pii_P": r["pii_P"],
        "pii_R": r["pii_R"],
        "pii_F1": r["pii_F1"],
    }
    for r in eval_results
]).sort_values("pii_F1", ascending=False)




>> python3 /Users/kolosus/Downloads/pii_ner_assignment/src/eval_span_f1.py       --gold /Users/kolosus/Downloads/pii_ner_assignment/data/dev.jsonl       --pred /Users/kolosus/Downloads/pii_ner_assignment/out/dev_pred_minilm_l6h384.json

Per-entity metrics:
CITY            P=0.804 R=0.891 F1=0.845
CREDIT_CARD     P=0.944 R=0.944 F1=0.944
DATE            P=1.000 R=1.000 F1=1.000
EMAIL           P=0.923 R=0.649 F1=0.762
LOCATION        P=0.882 R=0.833 F1=0.857
PERSON_NAME     P=0.782 R=0.782 F1=0.782
PHONE           P=0.946 R=0.946 F1=0.946

Macro-F1: 0.877

PII-only metrics: P=0.905 R=0.856 F1=0.880
Non-PII metrics: P=0.824 R=0.875 F1=0.848



Unnamed: 0,model,macro_f1,pii_P,pii_R,pii_F1
0,microsoft/MiniLM-L12-H384-uncased,0.877,0.905,0.856,0.88


In [31]:
# One record per model: parsed p50/p95 latency plus the raw script output.
latency_results = []

for model_cfg in models:
    # Build the command as an argument list and serialise it with shlex.join so
    # that paths containing spaces or quotes survive run_cmd's shlex.split.
    latency_argv = [
        "python3", str(BASE_DIR / "src/measure_latency.py"),
        "--model_dir", str(model_cfg["out_dir"]),
        "--input", str(common_args["dev"]),
        "--runs", "50",
        "--max_length", str(common_args["max_length"]),
        "--device", "cpu",
    ]
    cmd = shlex.join(latency_argv)
    stdout = run_cmd(cmd)
    p50, p95 = parse_latency_stdout(stdout)
    latency_results.append({
        "model": model_cfg["name"],
        "p50_ms": p50,
        "p95_ms": p95,
        "latency_stdout": stdout,
    })

# Latency table, lowest p95 first (displayed as the cell output).
pd.DataFrame([
    {"model": r["model"], "p50_ms": r["p50_ms"], "p95_ms": r["p95_ms"]}
    for r in latency_results
]).sort_values("p95_ms")



>> python3 /Users/kolosus/Downloads/pii_ner_assignment/src/measure_latency.py       --model_dir /Users/kolosus/Downloads/pii_ner_assignment/out/minilm_l6h384       --input /Users/kolosus/Downloads/pii_ner_assignment/data/dev.jsonl       --runs 50       --max_length 192       --device cpu

Latency over 50 runs (batch_size=1):
  p50: 9.38 ms
  p95: 10.30 ms



Unnamed: 0,model,p50_ms,p95_ms
0,microsoft/MiniLM-L12-H384-uncased,9.38,10.3


In [32]:
# Index latency records by model name, keeping the first record seen per model.
latency_by_model = {}
for latency_row in latency_results:
    latency_by_model.setdefault(latency_row["model"], latency_row)

# Join quality metrics with latency; models without a latency record get None.
summary_rows = []
for eval_row in eval_results:
    matched_latency = latency_by_model.get(eval_row["model"], {})
    combined = {
        key: eval_row[key]
        for key in ("model", "macro_f1", "pii_P", "pii_R", "pii_F1")
    }
    combined["p50_ms"] = matched_latency.get("p50_ms")
    combined["p95_ms"] = matched_latency.get("p95_ms")
    combined["out_dir"] = eval_row["out_dir"]
    summary_rows.append(combined)

# Rank by PII F1, breaking ties on macro F1 (both descending).
summary_df = pd.DataFrame(summary_rows).sort_values(
    by=["pii_F1", "macro_f1"], ascending=False
)

In [33]:
# Persist the ranked summary table as CSV (without the DataFrame index).
summary_csv = OUT_DIR / "summary_first_models.csv"
summary_df.to_csv(summary_csv, index=False)
print("Saved summary:", summary_csv)

# Persist the per-entity breakdown as JSON, keyed by model name.
per_entity_json = OUT_DIR / "per_entity_first_models.json"
per_entity_by_model = {}
for result in eval_results:
    per_entity_by_model[result["model"]] = result["per_entity"]
with per_entity_json.open("w", encoding="utf-8") as handle:
    json.dump(per_entity_by_model, handle, indent=2)
print("Saved per-entity metrics:", per_entity_json)
summary_df


Saved summary: /Users/kolosus/Downloads/pii_ner_assignment/out/summary_first_models.csv
Saved per-entity metrics: /Users/kolosus/Downloads/pii_ner_assignment/out/per_entity_first_models.json


Unnamed: 0,model,macro_f1,pii_P,pii_R,pii_F1,p50_ms,p95_ms,out_dir
0,microsoft/MiniLM-L12-H384-uncased,0.877,0.905,0.856,0.88,9.38,10.3,/Users/kolosus/Downloads/pii_ner_assignment/ou...


In [34]:
# The summary is already ranked, so its first row is the best model (if any).
if summary_df.empty:
    best_model = None
else:
    best_model = summary_df.iloc[0]["model"]

if not best_model:
    print("No results captured yet.")
else:
    # Look up the raw reports that were captured for the winning model.
    best_eval = next(row for row in eval_results if row["model"] == best_model)
    best_lat = next((row for row in latency_results if row["model"] == best_model), None)
    print("Best model:", best_model)
    print("\nEvaluation output:\n", best_eval["eval_stdout"])
    # Latency may not have been measured for this model; skip it in that case.
    if best_lat:
        print("Latency output:\n", best_lat["latency_stdout"])


Best model: microsoft/MiniLM-L12-H384-uncased

Evaluation output:
 Per-entity metrics:
CITY            P=0.804 R=0.891 F1=0.845
CREDIT_CARD     P=0.944 R=0.944 F1=0.944
DATE            P=1.000 R=1.000 F1=1.000
EMAIL           P=0.923 R=0.649 F1=0.762
LOCATION        P=0.882 R=0.833 F1=0.857
PERSON_NAME     P=0.782 R=0.782 F1=0.782
PHONE           P=0.946 R=0.946 F1=0.946

Macro-F1: 0.877

PII-only metrics: P=0.905 R=0.856 F1=0.880
Non-PII metrics: P=0.824 R=0.875 F1=0.848

Latency output:
 Latency over 50 runs (batch_size=1):
  p50: 9.38 ms
  p95: 10.30 ms



Run cells sequentially after ensuring dependencies from `requirements.txt` are installed (e.g., `pip install -r requirements.txt`). Training and evaluation logs are captured inline for documentation.
