In [None]:
import json
from gliner import GLiNER
from transformers import AutoTokenizer
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

model = GLiNER.from_pretrained("knowledgator/gliner-pii-base-v1.0")
tokenizer = AutoTokenizer.from_pretrained("knowledgator/gliner-pii-base-v1.0")

PII_LABELS = [
    "name", "email", "username", "id number", "phone number", "url", "street address"
]

# Mapping competition labels to GLiNER-compatible labels
label_map = {
    "NAME_STUDENT": "name",
    "EMAIL": "email",
    "USERNAME": "username",
    "ID_NUM": "id number",
    "PHONE_NUM": "phone number",
    "URL_PERSONAL": "url",
    "STREET_ADDRESS": "street address",
}


Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

In [5]:
import os

data_filename = "mixtral-8x7b-v1.json"
data_path = os.path.join(".",data_filename)
with open(data_path, "r") as f:
    data = json.load(f)

for i, entry in enumerate(data[:1]):
    print(f"\n📝 Entry {i + 1}")
    print("-" * 40)
    for key, value in entry.items():
        # Truncate long values for readability
        display_value = value
        if isinstance(value, str) and len(value) > 300:
            display_value = value[:300] + "..."
        elif isinstance(value, list) and len(value) > 10:
            display_value = value[:10] + ["..."]

        print(f"{key}: {display_value}")


📝 Entry 1
----------------------------------------
document: dtduupvzgt
full_text: Tiburce Evans, https://www.instagram.com/tiburce-evans, pin NO bLBeoRIe
001-691-518-9820x5621

Introduction - Identifying the Challenge:

In my role as a User Experience Designer at a technology startup in San Francisco, I encountered a complex challenge that required a thoughtful and innovative sol...
tokens: ['Tiburce', 'Evans', ',', 'https://www.instagram.com/tiburce-evans', ',', 'pin', 'NO', 'bLBeoRIe', '\n', '001', '...']
labels: ['B-NAME_STUDENT', 'I-NAME_STUDENT', 'O', 'B-URL_PERSONAL', 'O', 'O', 'O', 'B-ID_NUM', 'O', 'B-PHONE_NUM', '...']
trailing_whitespace: [True, False, True, False, True, True, True, False, False, False, '...']


In [6]:
import json
from sklearn.model_selection import train_test_split


# Split: 80% train, 20% test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Optionally save the splits
with open("train_split.json", "w") as f:
    json.dump(train_data, f, indent=2)

with open("test_split.json", "w") as f:
    json.dump(test_data, f, indent=2)

print(f"✅ Split complete: {len(train_data)} train / {len(test_data)} test entries")


✅ Split complete: 1884 train / 471 test entries


In [16]:
def reconstruct_text(tokens, whitespaces):
    return "".join([t + (" " if w else "") for t, w in zip(tokens, whitespaces)])

def extract_labeled_spans(tokens, whitespaces, labels):
    spans = []
    current = None
    start_char = 0

    for idx, (token, label, has_space) in enumerate(zip(tokens, labels, whitespaces)):
        end_char = start_char + len(token)

        if label.startswith("B-"):
            if current:
                spans.append(current)
            current = {"start": start_char, "end": end_char, "label": label_map[label[2:]]}

        elif label.startswith("I-") and current:
            current["end"] = end_char

        elif current:
            spans.append(current)
            current = None

        start_char = end_char + (1 if has_space else 0)

    if current:
        spans.append(current)

    return spans


In [17]:
def evaluate_gliner_on_json(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)

    correct = 0
    predicted_total = 0
    gold_total = 0

    for entry in tqdm(data):
        tokens = entry["tokens"]
        whitespaces = entry["trailing_whitespace"]
        labels = entry["labels"]

        text = reconstruct_text(tokens, whitespaces)
        gold_spans = extract_labeled_spans(tokens, whitespaces, labels)

        pred_spans = model.predict_entities(text, PII_LABELS)

        pred_set = set((s["start"], s["end"], s["label"].lower()) for s in pred_spans)
        gold_set = set((s["start"], s["end"], s["label"].lower()) for s in gold_spans)

        correct += len(pred_set & gold_set)
        predicted_total += len(pred_set)
        gold_total += len(gold_set)

    precision = correct / predicted_total if predicted_total else 0
    recall = correct / gold_total if gold_total else 0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0

    print(f"\n🔍 Precision: {precision:.3f}")
    print(f"🎯 Recall: {recall:.3f}")
    print(f"🏆 F1 Score: {f1:.3f}")


In [18]:
evaluate_gliner_on_json(os.path.join(".","test_split.json"))


  0%|          | 0/471 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 471/471 [04:11<00:00,  1.87it/s]


🔍 Precision: 0.918
🎯 Recall: 0.386
🏆 F1 Score: 0.543





In [None]:
import json
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, classification_report
from tqdm import tqdm
import os
from difflib import SequenceMatcher

PII_LABELS = [
    "name", "email", "username", "id number", "phone number", "url", "street address"
]

label_map = {
    "NAME_STUDENT": "name",
    "EMAIL": "email",
    "USERNAME": "username",
    "ID_NUM": "id number",
    "PHONE_NUM": "phone number",
    "URL_PERSONAL": "url",
    "STREET_ADDRESS": "street address",
}

def reconstruct_text(tokens, whitespaces):
    return "".join([t + (" " if w else "") for t, w in zip(tokens, whitespaces)])

def extract_labeled_spans(tokens, whitespaces, labels):
    spans = []
    current = None
    start_char = 0

    for idx, (token, label, has_space) in enumerate(zip(tokens, labels, whitespaces)):
        end_char = start_char + len(token)

        if label.startswith("B-"):
            if current:
                spans.append(current)
            pii_label = label[2:]
            if pii_label in label_map:
                current = {"start": start_char, "end": end_char, "label": label_map[pii_label]}
            else:
                current = None

        elif label.startswith("I-") and current:
            current["end"] = end_char

        elif current:
            spans.append(current)
            current = None

        start_char = end_char + (1 if has_space else 0)

    if current:
        spans.append(current)

    return spans

def evaluate_gliner_on_json(file_path, model):
    with open(file_path, "r") as f:
        data = json.load(f)

    all_labels = []
    all_preds = []

    print("\n🔍 Showing one example entry...\n")

    for i, entry in enumerate(tqdm(data)):
        tokens = entry["tokens"]
        whitespaces = entry["trailing_whitespace"]
        labels = entry["labels"]

        text = reconstruct_text(tokens, whitespaces)
        gold_spans = extract_labeled_spans(tokens, whitespaces, labels)
        pred_spans = model.predict_entities(text, PII_LABELS)

        # Show first example
        if i == 0:
            print("📝 Text:")
            print(text)
            print("\n✅ Gold:")
            for s in gold_spans:
                print(f"{s['label']} => '{text[s['start']:s['end']]}'")
            print("\n🔮 Predicted:")
            for s in pred_spans:
                print(f"{s['label']} => '{text[s['start']:s['end']]}'")

        # Match prediction to gold spans by overlap or exact text match
        matched = set()
        for pred in pred_spans:
            pred_text = text[pred['start']:pred['end']]
            pred_label = pred['label'].lower()
            matched_flag = False

            for gold in gold_spans:
                gold_text = text[gold['start']:gold['end']]
                gold_label = gold['label'].lower()

                # exact match on span + label
                if pred_text == gold_text and pred_label == gold_label:
                    all_labels.append(gold_label)
                    all_preds.append(pred_label)
                    matched.add((gold['start'], gold['end'], gold_label))
                    matched_flag = True
                    break

            if not matched_flag:
                all_labels.append("none")         # false positive
                all_preds.append(pred_label)

        # Remaining unmatched gold spans are false negatives
        for gold in gold_spans:
            gold_key = (gold['start'], gold['end'], gold['label'].lower())
            if gold_key not in matched:
                all_labels.append(gold['label'].lower())
                all_preds.append("none")         # not detected

    print("\n📊 Evaluation Results:")
    print(classification_report(all_labels, all_preds, zero_division=0))
    print("\n📉 Confusion Matrix:")
    labels_sorted = sorted(set(all_labels + all_preds))
    print("Labels:", labels_sorted)
    # After the confusion_matrix() call
    cm = confusion_matrix(all_labels, all_preds, labels=labels_sorted)

    plt.figure(figsize=(12, 8))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels_sorted, yticklabels=labels_sorted, cmap='Blues')

    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title("Confusion Matrix")
    plt.tight_layout()
    plt.show()
evaluate_gliner_on_json(os.path.join(".","test_split.json"), model)



🔍 Showing one example entry...



  0%|          | 1/471 [00:00<07:06,  1.10it/s]

📝 Text:
Margot Cecil; +1-951-835-8837x13881, 54026 Jane Parkways
New Brendan, CA 85498

Identifying the Challenge

As a User Experience (UX) Designer, I often encounter complex challenges that require innovative solutions to improve the digital experiences of various user groups. Recently, I faced a significant challenge while designing a new feature for a popular mobile application used by millions of people worldwide. The application aimed to provide an accessible platform for users to create and share content. However, our user research revealed that a substantial portion of our users struggled with navigating through the app and creating content due to its intricate design and complex functionalities.

Moreover, these usability issues were more prevalent among older adult users and those with little to no experience using similar apps. Consequently, addressing these challenges was critical to ensure that the application remained inclusive and accessible to the diverse user populati

100%|██████████| 471/471 [04:14<00:00,  1.85it/s]


📊 Evaluation Results:
                precision    recall  f1-score   support

         email       0.93      0.56      0.70       560
     id number       0.65      0.09      0.15       467
          name       0.96      0.53      0.68      1629
          none       0.00      0.00      0.00       139
  phone number       0.94      0.40      0.56       478
street address       0.83      0.27      0.41       464
           url       1.00      0.01      0.03       605
      username       0.49      0.05      0.10       454

      accuracy                           0.33      4796
     macro avg       0.73      0.24      0.33      4796
  weighted avg       0.85      0.33      0.44      4796


📉 Confusion Matrix:
Labels: ['email', 'id number', 'name', 'none', 'phone number', 'street address', 'url', 'username']
[[313   0   0 247   0   0   0   0]
 [  0  41   0 426   0   0   0   0]
 [  0   0 865 764   0   0   0   0]
 [ 23  22  32   0  12  25   0  25]
 [  0   0   0 288 190   0   0   0]
 [  0 


