# Step 1 - Double Check proper enviroment setup

* write this in terminal "conda activate pii
python -m ipykernel install --user --name=pii --display-name "Python (pii)"

In [9]:
import sys
print(sys.executable)

/opt/miniconda3/envs/pii/bin/python


In [10]:
# Quick check that key packages are available
import datasets
import transformers
import torch
import pandas

print(f"datasets version: {datasets.__version__}")
print(f"transformers version: {transformers.__version__}")
print(f"torch version: {torch.__version__}")
print(f"torch version: {pandas.__version__}")

datasets version: 4.4.1
transformers version: 4.57.1
torch version: 2.9.1
torch version: 2.3.3


In [11]:
from datasets import get_dataset_config_names, get_dataset_split_names
from huggingface_hub import list_repo_files

# See what files are actually in the repository
files = list_repo_files("tursunait/deberta-pii-synth", repo_type="dataset")
print("Files in the repository:")
for f in files:
    print(f"  {f}")

Files in the repository:
  .gitattributes
  README.md
  test/data-00000-of-00001.arrow
  test/dataset_info.json
  test/state.json
  train/.gitattributes
  train/README.md
  train/data-00000-of-00002.arrow
  train/data-00001-of-00002.arrow
  train/dataset_info.json
  train/state.json
  val/data-00000-of-00001.arrow
  val/dataset_info.json
  val/state.json


# Step 2: Load and Analyse Dataset

* Issue: This confirms the issue: train uses Arrow format, but validation and test use JSON format! This is a misconfiguration in the dataset on Hugging Face itself.The dataset IS actually all in Arrow format, but there's a configuration issue preventing it from loading automatically. So use the exact code below not the one given by Tursunai.

In [12]:
from datasets import load_dataset

# Load by explicitly pointing to the arrow files
ds = load_dataset(
    "arrow",
    data_files={
        "train": "hf://datasets/tursunait/deberta-pii-synth/train/data-*.arrow",
        "validation": "hf://datasets/tursunait/deberta-pii-synth/val/data-*.arrow",
        "test": "hf://datasets/tursunait/deberta-pii-synth/test/data-*.arrow"
    }
)

# Dataset already split
train = ds["train"]
val = ds["validation"]
test = ds["test"]

print("Train size:", len(train))
print("Val size:", len(val))
print("Test size:", len(test))
print("\nFirst example:")
print(train[0])

Train size: 96000
Val size: 12000
Test size: 12000

First example:
{'text': 'Jack Phillips fr7m Hancock-Melendez uqes card 4843127370283685 on 1994-08-27.', 'spans': [{'end': 13, 'label': 'PERSON', 'start': 0}, {'end': 35, 'label': 'ORG', 'start': 19}, {'end': 62, 'label': 'CREDIT_CARD', 'start': 46}, {'end': 76, 'label': 'DATE', 'start': 66}], 'input_ids': [1, 20907, 7431, 6664, 406, 119, 19632, 12, 20201, 22192, 1717, 1343, 293, 1886, 2929, 3897, 1092, 5352, 3083, 2517, 3367, 4531, 15, 8148, 12, 3669, 12, 2518, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [13]:
train[0]

{'text': 'Jack Phillips fr7m Hancock-Melendez uqes card 4843127370283685 on 1994-08-27.',
 'spans': [{'end': 13, 'label': 'PERSON', 'start': 0},
  {'end': 35, 'label': 'ORG', 'start': 19},
  {'end': 62, 'label': 'CREDIT_CARD', 'start': 46},
  {'end': 76, 'label': 'DATE', 'start': 66}],
 'input_ids': [1,
  20907,
  7431,
  6664,
  406,
  119,
  19632,
  12,
  20201,
  22192,
  1717,
  1343,
  293,
  1886,
  2929,
  3897,
  1092,
  5352,
  3083,
  2517,
  3367,
  4531,
  15,
  8148,
  12,
  3669,
  12,
  2518,
  4,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [14]:
val[0]

{'text': 'Kelsey Fitzpatrick from Bass PLC used card 4572683644100983306 on 2001-07-12.',
 'spans': [{'end': 18, 'label': 'PERSON', 'start': 0},
  {'end': 32, 'label': 'ORG', 'start': 24},
  {'end': 62, 'label': 'CREDIT_CARD', 'start': 43},
  {'end': 76, 'label': 'DATE', 'start': 66}],
 'input_ids': [1,
  530,
  523,
  4169,
  20842,
  31,
  12554,
  221,
  6447,
  341,
  1886,
  42346,
  2481,
  6361,
  36520,
  1866,
  5208,
  246,
  32230,
  15,
  5155,
  12,
  3570,
  12,
  1092,
  4,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [15]:
test[0]

{'text': 'meeting on 2006-05-04. Call 7525572089 if late.',
 'spans': [{'end': 21, 'label': 'DATE', 'start': 11},
  {'end': 38, 'label': 'PHONE', 'start': 28}],
 'input_ids': [1,
  1794,
  16382,
  15,
  3503,
  12,
  2546,
  12,
  3387,
  4,
  3310,
  3337,
  1244,
  4390,
  844,
  5046,
  114,
  628,
  4,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

In [None]:
import numpy as np
print("üîç Analyzing FULL training dataset...")

# Check ALL training examples
lengths = [len(example['input_ids']) for example in train]

print(f"\nüìä Sequence Length Statistics:")
print(f"Total examples: {len(train):,}")
print(f"Average length: {np.mean(lengths):.1f} tokens")
print(f"Median length: {np.median(lengths):.1f} tokens")
print(f"Min length: {np.min(lengths)} tokens")
print(f"Max length: {np.max(lengths)} tokens")
print(f"Std deviation: {np.std(lengths):.1f}")

# Distribution analysis
print(f"\nüìà Length Distribution:")
print(f"Sequences < 50 tokens: {sum(1 for l in lengths if l < 50):,} ({sum(1 for l in lengths if l < 50)/len(lengths)*100:.1f}%)")
print(f"Sequences 50-100 tokens: {sum(1 for l in lengths if 50 <= l < 100):,} ({sum(1 for l in lengths if 50 <= l < 100)/len(lengths)*100:.1f}%)")
print(f"Sequences 100-200 tokens: {sum(1 for l in lengths if 100 <= l < 200):,} ({sum(1 for l in lengths if 100 <= l < 200)/len(lengths)*100:.1f}%)")
print(f"Sequences 200-512 tokens: {sum(1 for l in lengths if 200 <= l < 512):,} ({sum(1 for l in lengths if 200 <= l < 512)/len(lengths)*100:.1f}%)")
print(f"Sequences 512+ tokens: {sum(1 for l in lengths if l >= 512):,} ({sum(1 for l in lengths if l >= 512)/len(lengths)*100:.1f}%)")

print("\n‚è±Ô∏è Time Estimation:")
print(f"With batch_size=8: ~{len(train)//8:,} steps per epoch")
print(f"Estimated time at ~0.5 sec/step: {(len(train)//8 * 0.5)/3600:.1f} hours per epoch")

üîç Analyzing FULL training dataset...


# Step 3: Build Model

## 3.1 - Load Roberta-base
- Roberta is a pre-trained language model (like a smart AI that already understands language). You need to:

    - Load the base model called "roberta-base"
    - Load it specifically for token classification (labeling each word/token)\
    - The model needs to know how many labels exist (like PERSON, ORG, EMAIL, etc.)
    - Also load the tokenizer (converts text to numbers the model understands)

- Think of it like: You're taking a smart student (Roberta) who already knows English, and now you're going to teach them to specifically identify PII in text.


## 3.2 - Baseline Model: Fixed Hyperparameter

### A. Train and Validation

In [None]:
# Step 1: Call Moddl and Assign the Labels

import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification
import sys
import os

# Add parent directory to path
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

# Import label configuration
from pii_synth.config_and_labels import LABEL2ID, ID2LABEL

# Step 1: Call the model - RoBERTa (Most stable and popular for NER)
model_name = "roberta-base"

num_labels = len(ID2LABEL)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=ID2LABEL,
    label2id=LABEL2ID
)

In [None]:
# Step 2: Create SMALL subsets for fast hyperparameter tuning - IF WE USE FULL MODEL IT BREAKS
train_subset = train.select(range(6000))
val_subset = val.select(range(600))

print(f"Train subset: {len(train_subset)} examples")
print(f"Val subset: {len(val_subset)} examples")

In [None]:
# Step 3: Random Search on key Hyperparameter

In [None]:
# Step 4: Pick Best Hyperparameter and Retrain Model

In [None]:

# Step 3: Training configuration - STABLE SETTINGS -- 
training_args = TrainingArguments(
    output_dir="./results_baseline_model",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="no",
    report_to="none",
    max_grad_norm=1.0,
    warmup_ratio=0.1,
    weight_decay=0.01,
    dataloader_num_workers=0,
)

# Step 4: Data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# Step 5: Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=val_subset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)



üîÑ Loading roberta-base...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


‚úÖ Model loaded with 33 labels
üìä Creating small subsets for fast tuning...
Train subset: 3000 examples
Val subset: 500 examples

üöÄ Starting training with roberta-base...
‚è±Ô∏è  This should work without overflow errors!


Step,Training Loss,Validation Loss


‚úÖ Training complete!

üìà Evaluating...


Validation results: {'eval_loss': 0.02069513127207756, 'eval_runtime': 25.2524, 'eval_samples_per_second': 19.8, 'eval_steps_per_second': 2.495, 'epoch': 2.0}


In [None]:
# Step 6: Train
print(f"\n Starting training with {model_name}...")
trainer.train()
print("‚úÖ Training complete!")

# Step 7: Evaluate
print("\nüìà Evaluating...")
results = trainer.evaluate()
print(f"Validation results: {results}")

### B. Test

In [None]:
import numpy as np

# (Optional) use smaller test subset
test_eval = test.select(range(600))   # or test for full set

# 1) Run predictions on the test set
test_predictions = trainer.predict(test_eval)

logits = test_predictions.predictions   # [batch, seq_len, num_labels]
labels = test_predictions.label_ids     # [batch, seq_len]

# 2) Convert logits ‚Üí predicted label IDs
pred_ids = np.argmax(logits, axis=-1)

# 3) Remove padding tokens (label == -100)
all_preds = []
all_labels = []

for p, t in zip(pred_ids, labels):
    mask = t != -100
    all_preds.append(p[mask])
    all_labels.append(t[mask])

# Flatten for simple metrics
all_preds_flat = np.concatenate(all_preds)
all_labels_flat = np.concatenate(all_labels)

print("Test prediction step done.")




Test prediction step done.


### C. Performance Evaluation - METRICS

In [None]:
from sklearn.metrics import accuracy_score
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# 1) Token-level accuracy (mostly meaningless for NER but fine to report)
accuracy = accuracy_score(all_labels_flat, all_preds_flat)
print("Token-level Accuracy:", accuracy)

# 2) Convert label IDs ‚Üí text labels for seqeval
id2label = ID2LABEL   # your mapping from earlier

grouped_preds = []
grouped_labels = []

for p_seq, t_seq in zip(all_preds, all_labels):
    grouped_preds.append([id2label[int(i)] for i in p_seq])
    grouped_labels.append([id2label[int(i)] for i in t_seq])

# 3) Print real NER metrics
print("\nPrecision:", precision_score(grouped_labels, grouped_preds))
print("Recall:", recall_score(grouped_labels, grouped_preds))
print("F1 Score:", f1_score(grouped_labels, grouped_preds))

print("\nDetailed classification report:")
print(classification_report(grouped_labels, grouped_preds))


Token-level Accuracy: 0.9984779299847792

Precision: 0.9882943143812709
Recall: 0.9924433249370277
F1 Score: 0.9903644742354419

Detailed classification report:
              precision    recall  f1-score   support

     ADDRESS       0.98      1.00      0.99        95
 CREDIT_CARD       0.99      1.00      1.00       113
        DATE       1.00      1.00      1.00       219
       EMAIL       0.99      0.99      0.99       142
         ORG       0.98      0.98      0.98       193
      PERSON       0.98      0.99      0.98       236
       PHONE       1.00      1.00      1.00       145
         SSN       0.98      0.98      0.98        48

   micro avg       0.99      0.99      0.99      1191
   macro avg       0.99      0.99      0.99      1191
weighted avg       0.99      0.99      0.99      1191



