# Step 1 - Double Check proper enviroment setup

* write this in terminal "conda activate pii
python -m ipykernel install --user --name=pii --display-name "Python (pii)"

In [1]:
import sys
print(sys.executable)

/opt/miniconda3/envs/pii/bin/python


In [2]:
# Quick check that key packages are available
import datasets
import transformers
import torch
import pandas

print(f"datasets version: {datasets.__version__}")
print(f"transformers version: {transformers.__version__}")
print(f"torch version: {torch.__version__}")
print(f"torch version: {pandas.__version__}")

datasets version: 4.4.1
transformers version: 4.57.1
torch version: 2.9.1
torch version: 2.3.3


In [3]:
# from datasets import get_dataset_config_names, get_dataset_split_names
# from huggingface_hub import list_repo_files

# # See what files are actually in the repository
# files = list_repo_files("tursunait/deberta-pii-synth", repo_type="dataset")
# print("Files in the repository:")
# for f in files:
#     print(f"  {f}")

In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification
import sys
import os

# Add parent directory to path
parent_dir = os.path.dirname(os.getcwd())
sys.path.append(parent_dir)

# ============================================================================
# CRITICAL: Force reimport of config to get AGE
# ============================================================================

# Remove cached import
if 'pii_synth.config_and_labels' in sys.modules:
    del sys.modules['pii_synth.config_and_labels']

# Fresh import
from pii_synth.config_and_labels import LABEL_LIST, ENTITY_TYPES

# ============================================================================
# Verify labels
# ============================================================================

print(f"ENTITY_TYPES: {ENTITY_TYPES}")
print(f"Number of entity types: {len(ENTITY_TYPES)}")
print(f"Total labels: {len(LABEL_LIST)}")
print(f"Expected: 1 + {len(ENTITY_TYPES)} √ó 4 = {1 + len(ENTITY_TYPES) * 4}")

if len(ENTITY_TYPES) == 9 and len(LABEL_LIST) == 37:
    print("‚úÖ Config is correct!")
else:
    print("‚ùå Config mismatch!")
    
# ============================================================================
# Create label mappings
# ============================================================================

ID2LABEL = {i: label for i, label in enumerate(LABEL_LIST)}
LABEL2ID = {label: i for i, label in enumerate(LABEL_LIST)}

num_labels = len(LABEL_LIST)

print(f"\nnum_labels for model: {num_labels}")

ENTITY_TYPES: ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'PERSON', 'ORG', 'ADDRESS', 'DATE', 'AGE']
Number of entity types: 9
Total labels: 37
Expected: 1 + 9 √ó 4 = 37
‚úÖ Config is correct!

num_labels for model: 37


# Step 2: Load and Analyse Dataset

* Issue: This confirms the issue: train uses Arrow format, but validation and test use JSON format! This is a misconfiguration in the dataset on Hugging Face itself.The dataset IS actually all in Arrow format, but there's a configuration issue preventing it from loading automatically. So use the exact code below not the one given by Tursunai.

In [5]:
from datasets import load_dataset

# # Load by explicitly pointing to the arrow files
# ds = load_dataset(
#     "arrow",
#     data_files={
#         "train": "hf://datasets/tursunait/deberta-pii-synth/train/data-*.arrow",
#         "validation": "hf://datasets/tursunait/deberta-pii-synth/val/data-*.arrow",
#         "test": "hf://datasets/tursunait/deberta-pii-synth/test/data-*.arrow"
#     }
# )

# NEW CODE (loads from local):

# # Dataset already split
# train = ds["train"]
# val = ds["validation"]
# test = ds["test"]

# print("Train size:", len(train))
# print("Val size:", len(val))
# print("Test size:", len(test))
# print("\nFirst example:")
# print(train[0])

In [6]:
from datasets import load_from_disk

train = load_from_disk("data/processed/train")
val = load_from_disk("data/processed/val")
test = load_from_disk("data/processed/test")

print("Train size:", len(train))
print("Val size:", len(val))
print("Test size:", len(test))
print("\nFirst example:")
print(train[0])


Train size: 96000
Val size: 12000
Test size: 12000

First example:
{'text': "DON'T SHACE buX uZrE's 1970rodney.lewis'S coMatctD sarahperez@aol.com /g2118x174 / ssn 0651734596", 'spans': [{'end': 39, 'label': 'PERSON', 'start': 23}, {'end': 69, 'label': 'EMAIL', 'start': 51}, {'end': 80, 'label': 'PHONE', 'start': 72}, {'end': 97, 'label': 'SSN', 'start': 87}], 'input_ids': [1, 42737, 108, 565, 4584, 15949, 10306, 1000, 1717, 1301, 338, 717, 18, 6200, 10774, 2596, 4, 459, 605, 354, 108, 104, 1029, 30121, 3894, 495, 579, 36000, 2379, 13208, 1039, 102, 1168, 4, 175, 1589, 571, 176, 21369, 1178, 29221, 1589, 579, 22617, 321, 3506, 30664, 1898, 5607, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 18, 18, 18, 18, 18, 19, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 3, 0, 0, 5, 6, 6, 7, 0, 0, 0, 9, 10, 10, 10, 11, 

In [7]:
from datasets import load_from_disk

# Load data
train = load_from_disk("data/processed/train")

# Check first example
sample = train[0]
print(f"Sample labels: {sample['labels']}")

# Find max label
all_max_labels = []
for i in range(min(100, len(train))):
    labels = [l for l in train[i]['labels'] if l != -100]
    if labels:
        all_max_labels.append(max(labels))

print(f"\nMax label ID found in first 100 examples: {max(all_max_labels)}")
print(f"This means we need at least {max(all_max_labels) + 1} labels")

# Check what's in config
from pii_synth.config_and_labels import LABEL_LIST, ENTITY_TYPES

print(f"\nENTITY_TYPES: {ENTITY_TYPES}")
print(f"Number of entity types: {len(ENTITY_TYPES)}")
print(f"LABEL_LIST length: {len(LABEL_LIST)}")
print(f"\nExpected labels: 1 (O) + {len(ENTITY_TYPES)} entities √ó 4 (BILOU) = {1 + len(ENTITY_TYPES) * 4}")

Sample labels: [-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 17, 18, 18, 18, 18, 18, 19, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 3, 0, 0, 5, 6, 6, 7, 0, 0, 0, 9, 10, 10, 10, 11, -100]

Max label ID found in first 100 examples: 36
This means we need at least 37 labels

ENTITY_TYPES: ['EMAIL', 'PHONE', 'SSN', 'CREDIT_CARD', 'PERSON', 'ORG', 'ADDRESS', 'DATE', 'AGE']
Number of entity types: 9
LABEL_LIST length: 37

Expected labels: 1 (O) + 9 entities √ó 4 (BILOU) = 37


In [8]:
import numpy as np
print("üîç Analyzing FULL training dataset...")

# Check ALL training examples
lengths = [len(example['input_ids']) for example in train]

print(f"\nüìä Sequence Length Statistics:")
print(f"Total examples: {len(train):,}")
print(f"Average length: {np.mean(lengths):.1f} tokens")
print(f"Median length: {np.median(lengths):.1f} tokens")
print(f"Min length: {np.min(lengths)} tokens")
print(f"Max length: {np.max(lengths)} tokens")
print(f"Std deviation: {np.std(lengths):.1f}")

# Distribution analysis
print(f"\nüìà Length Distribution:")
print(f"Sequences < 50 tokens: {sum(1 for l in lengths if l < 50):,} ({sum(1 for l in lengths if l < 50)/len(lengths)*100:.1f}%)")
print(f"Sequences 50-100 tokens: {sum(1 for l in lengths if 50 <= l < 100):,} ({sum(1 for l in lengths if 50 <= l < 100)/len(lengths)*100:.1f}%)")
print(f"Sequences 100-200 tokens: {sum(1 for l in lengths if 100 <= l < 200):,} ({sum(1 for l in lengths if 100 <= l < 200)/len(lengths)*100:.1f}%)")
print(f"Sequences 200-512 tokens: {sum(1 for l in lengths if 200 <= l < 512):,} ({sum(1 for l in lengths if 200 <= l < 512)/len(lengths)*100:.1f}%)")
print(f"Sequences 512+ tokens: {sum(1 for l in lengths if l >= 512):,} ({sum(1 for l in lengths if l >= 512)/len(lengths)*100:.1f}%)")

print("\n‚è±Ô∏è Time Estimation:")
print(f"With batch_size=8: ~{len(train)//8:,} steps per epoch")
print(f"Estimated time at ~0.5 sec/step: {(len(train)//8 * 0.5)/3600:.1f} hours per epoch")

üîç Analyzing FULL training dataset...

üìä Sequence Length Statistics:
Total examples: 96,000
Average length: 46.0 tokens
Median length: 28.0 tokens
Min length: 5 tokens
Max length: 294 tokens
Std deviation: 48.4

üìà Length Distribution:
Sequences < 50 tokens: 74,534 (77.6%)
Sequences 50-100 tokens: 9,231 (9.6%)
Sequences 100-200 tokens: 10,113 (10.5%)
Sequences 200-512 tokens: 2,122 (2.2%)
Sequences 512+ tokens: 0 (0.0%)

‚è±Ô∏è Time Estimation:
With batch_size=8: ~12,000 steps per epoch
Estimated time at ~0.5 sec/step: 1.7 hours per epoch


# Step 3: Build Model

## 3.1 - Load DeBERTa-base
- DeBERTa is a pre-trained language model (like a smart AI that already understands language). You need to:

    - Load the base model called "microsoft/deberta-base"
    - Load it specifically for token classification (labeling each word/token)\
    - The model needs to know how many labels exist (like PERSON, ORG, EMAIL, etc.)
    - Also load the tokenizer (converts text to numbers the model understands)

- Think of it like: You're taking a smart student (DeBERTa) who already knows English, and now you're going to teach them to specifically identify PII in text.


## 3.2 - Baseline Model: Fixed Hyperparameter

### A. Train and Validation

In [9]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification
import os


# Step 1: Call the model - RoBERTa (Most stable and popular for NER)
# ============================================================================
# Initialize model
# ============================================================================

model_name = "roberta-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Force CPU
device = torch.device("cpu")

# Create model with correct number of labels
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=num_labels,  # Should be 37
    id2label=ID2LABEL,
    label2id=LABEL2ID,
    ignore_mismatched_sizes=True  # Important!
).to(device)

print(f"\n‚úÖ Model created with {num_labels} labels")
print(f"Model config num_labels: {model.config.num_labels}")


# Step 2: Create SMALL subsets for fast hyperparameter tuning - IF WE USE FULL MODEL IT BREAKS
train_subset = train.select(range(6000))
val_subset = val.select(range(600))

print(f"Train subset: {len(train_subset)} examples")
print(f"Val subset: {len(val_subset)} examples")

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



‚úÖ Model created with 37 labels
Model config num_labels: 37
Train subset: 6000 examples
Val subset: 600 examples


In [10]:

# Step 3: Training configuration - STABLE SETTINGS -- 
training_args = TrainingArguments(
    output_dir="./results_baseline_model",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="no",
    report_to="none",
    max_grad_norm=1.0,
    warmup_ratio=0.1,
    weight_decay=0.01,
    dataloader_num_workers=0,
    use_cpu=True # ‚Üê Add this to run without cpu issues
)


In [11]:
# Step 4: Data Collator
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    padding=True,
    max_length=512
)

print("‚úÖ Data collator created")

‚úÖ Data collator created


In [12]:
# Use regular Trainer
trainer = Trainer(  # Changed this line!
    model=model,
    args=training_args,
    train_dataset=train_subset,
    eval_dataset=val_subset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(  # Changed this line!


In [13]:
# Step 6: Train
print(f"\n Starting training with {model_name}...")
trainer.train()
print("‚úÖ Training complete!")





 Starting training with roberta-base...


Step,Training Loss,Validation Loss
200,0.2178,0.167246


‚úÖ Training complete!


In [14]:
# Step 7: Evaluate
print("\nüìà Evaluating...")
results = trainer.evaluate()
print(f"Validation results: {results}")


üìà Evaluating...


Validation results: {'eval_loss': 0.12513229250907898, 'eval_runtime': 23.2881, 'eval_samples_per_second': 25.764, 'eval_steps_per_second': 3.221, 'epoch': 2.0}


In [15]:
# Save your trained model properly
print("Saving your trained model...")

# Create a directory for your model
model_save_path = "./trained_model"
os.makedirs(model_save_path, exist_ok=True)

# Save the model and tokenizer
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"‚úÖ Model saved to: {model_save_path}")

# Verify the files were created
model_files = os.listdir(model_save_path)
print(f"üìÅ Model directory contains: {model_files}")

Saving your trained model...
‚úÖ Model saved to: ./trained_model
üìÅ Model directory contains: ['model.safetensors', 'tokenizer_config.json', 'special_tokens_map.json', 'config.json', 'tokenizer.json', 'merges.txt', 'training_args.bin', 'vocab.json']


### B. Test

In [16]:
import numpy as np

# (Optional) use smaller test subset
test_eval = test.select(range(600))   # or test for full set

# 1) Run predictions on the test set
test_predictions = trainer.predict(test_eval)

logits = test_predictions.predictions   # [batch, seq_len, num_labels]
labels = test_predictions.label_ids     # [batch, seq_len]

# 2) Convert logits ‚Üí predicted label IDs
pred_ids = np.argmax(logits, axis=-1)

# 3) Remove padding tokens (label == -100)
all_preds = []
all_labels = []

for p, t in zip(pred_ids, labels):
    mask = t != -100
    all_preds.append(p[mask])
    all_labels.append(t[mask])

# Flatten for simple metrics
all_preds_flat = np.concatenate(all_preds)
all_labels_flat = np.concatenate(all_labels)

print("Test prediction step done.")


Test prediction step done.


### C. Performance Evaluation - METRICS

In [17]:
from sklearn.metrics import accuracy_score
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

# 1) Token-level accuracy (mostly meaningless for NER but fine to report)
accuracy = accuracy_score(all_labels_flat, all_preds_flat)
print("Token-level Accuracy:", accuracy)

# 2) Convert label IDs ‚Üí text labels for seqeval
id2label = ID2LABEL   # your mapping from earlier

grouped_preds = []
grouped_labels = []

for p_seq, t_seq in zip(all_preds, all_labels):
    grouped_preds.append([id2label[int(i)] for i in p_seq])
    grouped_labels.append([id2label[int(i)] for i in t_seq])

# 3) Print real NER metrics
print("\nPrecision:", precision_score(grouped_labels, grouped_preds))
print("Recall:", recall_score(grouped_labels, grouped_preds))
print("F1 Score:", f1_score(grouped_labels, grouped_preds))

print("\nDetailed classification report:")
print(classification_report(grouped_labels, grouped_preds))


Token-level Accuracy: 0.9689050985110543

Precision: 0.8337874659400545
Recall: 0.865874363327674
F1 Score: 0.84952803997779

Detailed classification report:




              precision    recall  f1-score   support

     ADDRESS       0.79      0.70      0.74       122
         AGE       0.66      0.76      0.71        95
 CREDIT_CARD       0.87      0.93      0.90        74
        DATE       0.93      0.93      0.93       258
       EMAIL       0.91      0.93      0.92       267
         ORG       0.93      0.94      0.94        87
      PERSON       0.81      0.86      0.84       448
       PHONE       0.81      0.85      0.83       294
         SSN       0.72      0.80      0.76       122

   micro avg       0.83      0.87      0.85      1767
   macro avg       0.83      0.86      0.84      1767
weighted avg       0.84      0.87      0.85      1767

