In [16]:
pip install datasets

^C
[31mERROR: Operation cancelled by user[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [17]:
# Clean slate: RAM/VRAM and deterministic runs
import os, gc, random, torch
import numpy as np
from pathlib import Path

# Free Python + CUDA memory
for name in list(globals().keys()):
    if name not in ["os","gc","random","torch","np","numpy","Path"]:
        del globals()[name]
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

# Reproducibility
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Helpful env toggles for Jupyter
os.environ["TOKENIZERS_PARALLELISM"] = "false"   # avoid fork/threads deadlocks
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"  # we'll control AMP explicitly later

# Project paths
RUN_DIR = Path("runs/v2_safe_contract")
RUN_DIR.mkdir(parents=True, exist_ok=True)

print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    dev = torch.cuda.get_device_name(0)
    free, total = torch.cuda.mem_get_info()
    print("GPU:", dev)
    print(f"VRAM free/total: {free/1e9:.2f} GB / {total/1e9:.2f} GB")

Torch: 2.4.0+cu121
CUDA available: True
GPU: NVIDIA GeForce RTX 2080 Ti
VRAM free/total: 6.82 GB / 11.35 GB


In [18]:
# Load dataset + basic EDA (English only, placeholder vocab)
from datasets import load_dataset, Dataset
from collections import Counter
import re, json, pandas as pd

DATASET_ID = "ai4privacy/pii-masking-200k" 

ds = load_dataset(DATASET_ID)
full = ds["train"]

print("Columns:", full.column_names[:20])

# Keep only english content
full_en = full.filter(lambda x: str(x["language"]).lower().startswith("en"))

# Keep only the columns we need
needed_cols = ["source_text","target_text"]
missing = [c for c in needed_cols if c not in full_en.column_names]
assert not missing, f"Missing columns: {missing}"

def norm_space(t): 
    return re.sub(r"\s+", " ", t.strip()) if isinstance(t, str) else t
    
# add the normalised column
full_en = full_en.map(lambda ex: {"_src_norm": norm_space(ex["source_text"])})

# convert -> drop -> back to HF Dataset
df = full_en.to_pandas()
before = len(df)
df = df.drop_duplicates(subset="_src_norm", keep="first").drop(columns=["_src_norm"])
after = len(df)
full_en = Dataset.from_pandas(df, preserve_index=False)
print(f"Rows (English): {before} -> {after} after dedupe")

# Build placeholder vocabulary from target_text: tokens like [FIRSTNAME]
PH_RE = re.compile(r"\[([A-Za-z0-9_]+)\]")
def extract_labels(txt):
    return PH_RE.findall(txt or "")

label_ctr = Counter()
lengths_src, lengths_tgt = [], []

for ex in full_en.select(range(min(20000, len(full_en)))):  # sample up to 20k for speed
    for lab in extract_labels(ex["target_text"]):
        label_ctr[lab] += 1
    lengths_src.append(len(ex["source_text"].split()))
    lengths_tgt.append(len(ex["target_text"].split()))

label_df = pd.DataFrame(
    sorted(label_ctr.items(), key=lambda x: x[1], reverse=True),
    columns=["label","count"]
)
print("Top 15 placeholders:\n", label_df.head(15))

# Basic length stats to guide seq_len choice
def pct(a, q): 
    i = int(len(a)*q); 
    return sorted(a)[i if i < len(a) else -1]
print(f"Source tokens: n={len(lengths_src)}, median={int(np.median(lengths_src))}, p90={pct(lengths_src,0.90)}, p95={pct(lengths_src,0.95)}")
print(f"Target tokens: n={len(lengths_tgt)}, median={int(np.median(lengths_tgt))}, p90={pct(lengths_tgt,0.90)}, p95={pct(lengths_tgt,0.95)}")

# Save vocab for later formatting checks
LABEL_VOCAB_PATH = RUN_DIR/"label_vocab_dataset.txt"
with open(LABEL_VOCAB_PATH, "w", encoding="utf-8") as f:
    for lab, _ in sorted(label_ctr.items(), key=lambda x: (-x[1], x[0])):
        f.write(lab + "\n")
print("Saved label vocab to:", LABEL_VOCAB_PATH)

# Peek a couple samples for sanity
for i in [0,1,2]:
    ex = full_en[i]
    print(f"\n--- Sample {i} ---")
    print("SRC:", ex["source_text"][:300])
    print("TGT:", ex["target_text"][:300])

Columns: ['source_text', 'target_text', 'privacy_mask', 'span_labels', 'mbert_text_tokens', 'mbert_bio_labels', 'id', 'language', 'set']
Rows (English): 43501 → 43501 after dedupe
Top 15 placeholders:
             label  count
0       FIRSTNAME   6155
1        LASTNAME   2150
2            DATE   2059
3           EMAIL   1817
4          PREFIX   1590
5          AMOUNT   1483
6        USERNAME   1379
7        JOBTITLE   1358
8             URL   1353
9         JOBAREA   1326
10    ACCOUNTNAME   1314
11           TIME   1313
12  ACCOUNTNUMBER   1307
13     MIDDLENAME   1307
14           CITY   1295
Source tokens: n=20000, median=24, p90=36, p95=40
Target tokens: n=20000, median=23, p90=34, p95=37
Saved label vocab to: runs/v2_safe_contract/label_vocab_dataset.txt

--- Sample 0 ---
SRC: A student's assessment was found on device bearing IMEI: 06-184755-866851-3. The document falls under the various topics discussed in our Optimization curriculum. Can you please collect it?
TGT: A student's 

In [19]:
# Step 2A split into train / val / test (80 / 10 / 10)
from datasets import Dataset

VAL_FRACTION  = 0.10
TEST_FRACTION = 0.10
SEED = 42

# First carve out test
split1 = full_en.train_test_split(test_size=TEST_FRACTION, seed=SEED)
tmp_train, test_raw = split1["train"], split1["test"]

# Now carve val from the remaining pool so final ratios are ~80/10/10
val_size = VAL_FRACTION / (1.0 - TEST_FRACTION)  
split2 = tmp_train.train_test_split(test_size=val_size, seed=SEED)
train_raw, val_raw = split2["train"], split2["test"]

print("Split sizes:", len(train_raw), len(val_raw), len(test_raw))

Split sizes: 34800 4350 4351


In [20]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [21]:
# Step 2B format to chat with <safe>…</safe> as PLAIN STRINGS
from transformers import AutoTokenizer

BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False, local_files_only=True)

# pad token safety
if tok.pad_token is None and tok.eos_token is not None:
    tok.pad_token = tok.eos_token

# Llama 3.2 chat special tokens (strings)
BOS = tok.bos_token or "<|begin_of_text|>"
START = "<|start_header_id|>"
END = "<|end_header_id|>"
EOT = "<|eot_id|>"

SAFE_OPEN, SAFE_CLOSE = "<safe>", "</safe>"

SYSTEM_RULE = (
    "You are a redactor. Return the EXACT input text with only PII spans replaced by dataset placeholders. "
    "Do NOT change any other words, punctuation, or casing. If unsure, keep. "
    "Wrap the final output inside <safe> and </safe>."
)

def format_llama3_chat(system_text: str, user_text: str, assistant_text: str) -> str:
    # Manual Llama-3 chat template as a STRING (no tokenization here)
    return (
        f"{BOS}"
        f"{START}system{END}\n{system_text}\n{EOT}"
        f"{START}user{END}\n{user_text}\n{EOT}"
        f"{START}assistant{END}\n{assistant_text}"
    )

def to_chat_text(ex):
    assistant = f"{SAFE_OPEN}{ex['target_text']}{SAFE_CLOSE}"
    text = format_llama3_chat(SYSTEM_RULE, ex["source_text"], assistant)
    return {"text": text}

# Rebuild formatted datasets as STRINGS
train_ds = train_raw.map(to_chat_text, remove_columns=train_raw.column_names)
val_ds   = val_raw.map(to_chat_text,   remove_columns=val_raw.column_names)
test_ds  = test_raw.map(to_chat_text,  remove_columns=test_raw.column_names)

print("train/val/test (formatted):", len(train_ds), len(val_ds), len(test_ds))
print("\nExample formatted record (first ~1400 chars):\n")
print(train_ds[0]["text"][:1400])


Map:   0%|          | 0/34800 [00:00<?, ? examples/s]

Map:   0%|          | 0/4350 [00:00<?, ? examples/s]

Map:   0%|          | 0/4351 [00:00<?, ? examples/s]

train/val/test (formatted): 34800 4350 4351

Example formatted record (first ~1400 chars):

<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a redactor. Return the EXACT input text with only PII spans replaced by dataset placeholders. Do NOT change any other words, punctuation, or casing. If unsure, keep. Wrap the final output inside <safe> and </safe>.
<|eot_id|><|start_header_id|>user<|end_header_id|>
The survey conducted on Intersex shows an alarming percentage experiencing early symptoms of PCOS. It's prevalent especially in the age group of 39 years old. Focusing on these demographic could be vital in future prevention.
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
<safe>The survey conducted on [GENDER] shows an alarming percentage experiencing early symptoms of PCOS. It's prevalent especially in the age group of [AGE]. Focusing on these demographic could be vital in future prevention.</safe>


In [22]:
# Step 3A tag variants + helpers
SAFE_OPEN  = "<safe>"
SAFE_CLOSE = "</safe>"

def make_tag_candidates(tag: str, tok):
    variants = [tag, " " + tag, "\n" + tag, "\r\n" + tag]
    uniq = []
    for v in variants:
        ids = tok.encode(v, add_special_tokens=False)
        if ids and ids not in uniq:
            uniq.append(ids)
    return uniq

open_cands  = make_tag_candidates(SAFE_OPEN, tok)
close_cands = make_tag_candidates(SAFE_CLOSE, tok)

def find_subseq(hay, needle):
    H, N = len(hay), len(needle)
    if N == 0 or H < N: return -1
    for i in range(H - N + 1):
        if hay[i:i+N] == needle:
            return i
    return -1

def find_any_tag(hay, candidates):
    """Return (pos, length) for the first matching candidate, else (-1, 0)."""
    best_pos, best_len = -1, 0
    for cand in candidates:
        pos = find_subseq(hay, cand)
        if pos != -1 and (best_pos == -1 or pos < best_pos):
            best_pos, best_len = pos, len(cand)
    return best_pos, best_len


In [23]:
# Step 3B Precise SafeCollator- map string span to token span via prefix vs (prefix+target)
from typing import List, Dict, Any

SEQ_LEN = 512
SAFE_OPEN, SAFE_CLOSE = "<safe>", "</safe>"

class SafeCollator:
    def __init__(self, tokenizer, max_len=SEQ_LEN):
        self.tok = tokenizer
        self.max_len = max_len

    def _ids(self, s: str):
        return self.tok(
            s,
            add_special_tokens=False,
            truncation=True,
            max_length=self.max_len,
        )["input_ids"]

    def __call__(self, batch: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        texts = [b["text"] for b in batch]
        enc = self.tok(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
            add_special_tokens=False,
        )
        input_ids = enc["input_ids"]
        attn = enc["attention_mask"]
        labels = input_ids.clone()

        for i, text in enumerate(texts):
            # Find char spans (use the LAST <safe> to avoid the mention in the system prompt)
            open_idx = text.rfind(SAFE_OPEN)
            if open_idx == -1:
                labels[i, :] = -100
                continue
            start_idx = open_idx + len(SAFE_OPEN)
            end_idx = text.find(SAFE_CLOSE, start_idx)
            if end_idx == -1:
                end_idx = len(text)

            prefix = text[:start_idx]
            target = text[start_idx:end_idx]

            p_ids  = self._ids(prefix)
            pt_ids = self._ids(prefix + target)

            p_len  = len(p_ids)
            pt_len = len(pt_ids)
            L = int(attn[i].sum().item())  

            if p_len >= L:
                labels[i, :] = -100
                continue

            start_tok = p_len
            end_tok   = min(L, pt_len) 

\            labels[i, :start_tok] = -100
            labels[i, end_tok:]   = -100

        enc["labels"] = labels
        return enc

collator = SafeCollator(tok, max_len=SEQ_LEN)
print("Collator ready (precise prefix vs prefix+target).")


Collator ready (precise prefix vs prefix+target).


In [24]:
# Step 3C- sanity check the masking on a couple of samples
batch = [train_ds[i] for i in range(2)]
batch_out = collator(batch)

print("input_ids shape:", tuple(batch_out["input_ids"].shape))
print("labels    shape:", tuple(batch_out["labels"].shape))

for i in range(len(batch)):
    keep_mask = (batch_out["labels"][i] != -100)
    kept = batch_out["input_ids"][i][keep_mask].tolist()
    print(f"\n--- Example {i} tokens_with_loss:", int(keep_mask.sum().item()))
    print("Decoded text: ",tok.decode(kept))

input_ids shape: (2, 156)
labels    shape: (2, 156)

--- Example 0 tokens_with_loss: 42
Decoded text:   survey conducted on [GENDER] shows an alarming percentage experiencing early symptoms of PCOS. It's prevalent especially in the age group of [AGE]. Focusing on these demographic could be vital in future prevention.</

--- Example 1 tokens_with_loss: 34
Decoded text:  FIRSTNAME], the diagnostic device located at [NEARBYGPSCOORDINATE] identified you as [AGE]. The scan data confirmed the [EYECOLOR].


In [25]:
pip install peft accelerate

Note: you may need to restart the kernel to use updated packages.


In [26]:
# Step 4A - QLoRA model load
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
SEQ_LEN = 512

# Safety free CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache(); torch.cuda.ipc_collect()

# Tokenizer 
tok = AutoTokenizer.from_pretrained(BASE_MODEL, use_fast=False, local_files_only=True)
if tok.pad_token is None and tok.eos_token is not None:
    tok.pad_token = tok.eos_token

# 4-bit quant config
bnb_cfg = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load base in 4-bit (use eager attention on 20-series GPUs)
model_base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    quantization_config=bnb_cfg,
    attn_implementation="eager",
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=False,
    local_files_only=True,   # set False if you need to pull from hub
)
model_base.config.use_cache = False  # training

# Prepare for k-bit training BEFORE adding LoRA
model_base = prepare_model_for_kbit_training(model_base, use_gradient_checkpointing=False)

# LoRA config - modest, stable
lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05, bias="none",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type="CAUSAL_LM",
)

model = get_peft_model(model_base, lora_cfg)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,} ({100*trainable/total:.3f}%)")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Trainable params: 24,313,856 / 1,827,777,536 (1.330%)


In [27]:
# Step 4B-mini: fast warmup
from transformers import TrainingArguments, Trainer

# 1) Use a small random subset
N_WARMUP = 6000
train_ds_small = train_ds.shuffle(seed=42).select(range(min(N_WARMUP, len(train_ds))))
print("Warmup subset size:", len(train_ds_small))

# 2) Shorten seq length and re-instantiate the collator
SEQ_LEN = 320  # shorter context => faster; still far above p95 length for this dataset
collator = SafeCollator(tok, max_len=SEQ_LEN)

# 3) Training args tuned for speed
args_fast = TrainingArguments(
    output_dir=str(RUN_DIR/"checkpoints_fast"),
    num_train_epochs=1,                 # single quick epoch
    max_steps=0,                        # keep 0; OR set e.g. 400 to hard-cap runtime
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,     # effective batch = 16
    logging_steps=50,
    save_strategy="no",
    fp16=True, bf16=False,              # if FP16 scaler complains, set fp16=False and optim="adamw_torch"
    optim="paged_adamw_8bit",
    gradient_checkpointing=False,
    max_grad_norm=0.3,
    report_to="none",
    seed=42,
    group_by_length=False,              # we tokenize in collator; keep this False
    dataloader_num_workers=0,
    remove_unused_columns=False,        # <-- important so 'text' reaches the collator
)

trainer_fast = Trainer(
    model=model,                        # LoRA-wrapped model from 4A
    args=args_fast,
    train_dataset=train_ds_small,
    eval_dataset=None,
    data_collator=collator,
    tokenizer=tok,
)

print("Starting v2 fast warmup…")
res = trainer_fast.train()
print("Final training loss:", getattr(res, "training_loss", None))

ADAPTER_DIR = RUN_DIR/"adapter_v2_fast"
trainer_fast.model.save_pretrained(str(ADAPTER_DIR))
tok.save_pretrained(str(ADAPTER_DIR))
print("Saved adapter to:", ADAPTER_DIR)

Warmup subset size: 6000
Starting v2 fast warmup…


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
50,0.3048
100,0.0681
150,0.0484
200,0.0349
250,0.0288
300,0.0243
350,0.0227


Final training loss: 0.07227263100941976
Saved adapter to: runs/v2_safe_contract/adapter_v2_fast


In [30]:
# 5A - helper + a few tests
SAFE_OPEN, SAFE_CLOSE = "<safe>", "</safe>"
BOS = tok.bos_token or "<|begin_of_text|>"
START = "<|start_header_id|>"
END = "<|end_header_id|>"
EOT = "<|eot_id|>"

SYSTEM_RULE = (
    "You are a redactor. Return the EXACT input text with only PII spans replaced by dataset placeholders. "
    "Do NOT change any other words, punctuation, or casing. If unsure, keep. "
    "Wrap the final output inside <safe> and </safe>."
)

def build_prompt(user_text: str) -> str:
    return (
        f"{BOS}"
        f"{START}system{END}\n{SYSTEM_RULE}\n{EOT}"
        f"{START}user{END}\n{user_text}\n{EOT}"
        f"{START}assistant{END}\n{SAFE_OPEN}"
    )

@torch.no_grad()
def redact_safe(user_text: str, max_new_tokens=96) -> str:
    model.eval(); model.config.use_cache = True
    prompt = build_prompt(user_text)
    inputs = tok(prompt, return_tensors="pt").to(model.device)
    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        eos_token_id=tok.eos_token_id,
        pad_token_id=tok.pad_token_id,
    )
    decoded = tok.decode(out[0], skip_special_tokens=True)
    # extract text between <safe> and </safe>
    s = decoded.rfind(SAFE_OPEN)
    if s != -1:
        s += len(SAFE_OPEN)
        e = decoded.find(SAFE_CLOSE, s)
        piece = decoded[s:e if e != -1 else None]
    else:
        piece = decoded[len(prompt):]
    return piece.strip()

tests = [
    "Hi, I am Vishal Shinde. Email me at vishal@example.com and call +1 415 555 0199.",
    "Card 4111 1111 1111 1111 expires 12/26, CVV 123.",
    "Server at 10.0.0.5 uses API key sk_live_ABC123XYZ456 for uploads.",
    "IMEI: 06-184755-866851-3 belongs to this device.",
    "My SSN is 123-45-6789. Do not store it.",
    'Hi Vishal, The phone with IMEI: 105879097227517  that you chose for trading in has passed our checks. '
    'You can send the package to us at 3000 Berkeley Avenue, AZ. If you have any questions, feel free to call us at '
    '+16328963421 or email us at phoneguy@gmail.com. Reply with not interested for the above email.'
]

for i, t in enumerate(tests, 1):
    print(f"\n--- Test {i} ---")
    print("IN :", t)
    print("OUT:", redact_safe(t))


--- Test 1 ---
IN : Hi, I am Vishal Shinde. Email me at vishal@example.com and call +1 415 555 0199.




OUT: Hi, I am [FIRSTNAME] [LASTNAME]. Email me at [EMAIL] and call [PHONENUMBER].

--- Test 2 ---
IN : Card 4111 1111 1111 1111 expires 12/26, CVV 123.
OUT: Card [CREDITCARDCVV] [CREDITCARDNUMBER] expires [EXPIRYDATE], CVV [CREDITCARDCVV].

--- Test 3 ---
IN : Server at 10.0.0.5 uses API key sk_live_ABC123XYZ456 for uploads.
OUT: Server at [IPV4] uses API key [CREDITCARDISSUER]_[CREDITCARDNUMBER] for uploads.

--- Test 4 ---
IN : IMEI: 06-184755-866851-3 belongs to this device.
OUT: IMEI: [PHONEIMEI] belongs to this device.

--- Test 5 ---
IN : My SSN is 123-45-6789. Do not store it.
OUT: My SSN is [SSN]. Do not store it.

--- Test 6 ---
IN : Hi Vishal, The phone with IMEI: 105879097227517  that you chose for trading in has passed our checks. You can send the package to us at 3000 Berkeley Avenue, AZ. If you have any questions, feel free to call us at +16328963421 or email us at phoneguy@gmail.com. Reply with not interested for the above email.
OUT: Hi [FIRSTNAME], The phone with IMEI:

In [31]:
# 5B - batched eval: exact match, placeholder micro-F1, formatting errors
PLACEHOLDER_RE = re.compile(r"\[([A-Za-z0-9_]+)\]")

def extract_labels(text): 
    return [m.group(1) for m in PLACEHOLDER_RE.finditer(text or "")]

def count_map(labels):
    c = Counter()
    for l in labels: c[l]+=1
    return c

# Allowed placeholders
ALLOWED = set([ln.strip() for ln in open(Path("runs/v2_safe_contract")/"label_vocab_dataset.txt", "r", encoding="utf-8") if ln.strip()])

def formatting_errors(text):
    errs = []
    if (text or "").count("[") != (text or "").count("]"):
        errs.append("bracket_mismatch")
    unk = sorted(set(l for l in extract_labels(text) if l not in ALLOWED))
    if unk: errs.append("unknown:" + "|".join(unk))
    return errs

def prf(tp, fp, fn):
    P = tp/(tp+fp) if (tp+fp) else 0.0
    R = tp/(tp+fn) if (tp+fn) else 0.0
    F = 2*P*R/(P+R) if (P+R) else 0.0
    return P, R, F

N_EVAL = 300
BATCH  = 4
MAX_NEW = 96

rows = test_raw.shuffle(seed=42).select(range(min(N_EVAL, len(test_raw))))
srcs  = [r["source_text"] for r in rows]
golds = [r["target_text"] for r in rows]

preds=[]
for i in range(0, len(srcs), BATCH):
    batch = srcs[i:i+BATCH]
    for s in batch:
        preds.append(redact_safe(s, max_new_tokens=MAX_NEW))

# metrics
exact, fmt_err = 0, 0
tp=Counter(); fp=Counter(); fn=Counter()

for pred, gold in zip(preds, golds):
    if pred == gold: exact += 1
    if formatting_errors(pred): fmt_err += 1
    pC = count_map(extract_labels(pred))
    gC = count_map(extract_labels(gold))
    labs = set(pC)|set(gC)
    for l in labs:
        tp[l] += min(pC.get(l,0), gC.get(l,0))
        fp[l] += max(pC.get(l,0)-gC.get(l,0), 0)
        fn[l] += max(gC.get(l,0)-pC.get(l,0), 0)

P_micro, R_micro, F_micro = prf(sum(tp.values()), sum(fp.values()), sum(fn.values()))
print(f"\nEvaluated {len(preds)} examples")
print(f"Exact match rate: {exact/len(preds):.3f}")
print(f"Placeholder micro-F1: P={P_micro:.3f} R={R_micro:.3f} F1={F_micro:.3f}")
print(f"Formatting error rate: {fmt_err/len(preds):.3f}")


Evaluated 300 examples
Exact match rate: 0.670
Placeholder micro-F1: P=0.911 R=0.896 F1=0.903
Formatting error rate: 0.000


In [32]:
tests = [
    "Hi, I am Vishal Shinde. Email me at vishal@example.com and call +1 415 555 0199.",
    "Card 4111 1111 1111 1111 expires 12/26, CVV 123.",
    "Server at 10.0.0.5 uses API key sk_live_ABC123XYZ456 for uploads.",
    "IMEI: 06-184755-866851-3 belongs to this device.",
    "My SSN is 123-45-6789. Do not store it.",
    'Hi Vishal, The phone with IMEI: 105879097227517  that you chose for trading in has passed our checks. '
    'You can send the package to us at 3000 Berkeley Avenue, AZ. If you have any questions, feel free to call us at '
    '+16328963421 or email us at phoneguy@gmail.com. Reply with not interested for the above email.'
]

for i, t in enumerate(tests, 1):
    print(f"\n--- Test {i} ---")
    print("IN :", t)
    print("OUT:", redact_safe(t))


--- Test 1 ---
IN : Hi, I am Vishal Shinde. Email me at vishal@example.com and call +1 415 555 0199.
OUT: Hi, I am [FIRSTNAME] [LASTNAME]. Email me at [EMAIL] and call [PHONENUMBER].

--- Test 2 ---
IN : Card 4111 1111 1111 1111 expires 12/26, CVV 123.
OUT: Card [CREDITCARDCVV] [CREDITCARDNUMBER] expires [EXPIRYDATE], CVV [CREDITCARDCVV].

--- Test 3 ---
IN : Server at 10.0.0.5 uses API key sk_live_ABC123XYZ456 for uploads.
OUT: Server at [IPV4] uses API key [CREDITCARDISSUER]_[CREDITCARDNUMBER] for uploads.

--- Test 4 ---
IN : IMEI: 06-184755-866851-3 belongs to this device.
OUT: IMEI: [PHONEIMEI] belongs to this device.

--- Test 5 ---
IN : My SSN is 123-45-6789. Do not store it.
OUT: My SSN is [SSN]. Do not store it.

--- Test 6 ---
IN : Hi Vishal, The phone with IMEI: 105879097227517  that you chose for trading in has passed our checks. You can send the package to us at 3000 Berkeley Avenue, AZ. If you have any questions, feel free to call us at +16328963421 or email us at phoneg