# Pre-process original data

In [1]:
import json
import os
from tqdm import tqdm

# Label map from PII-DD to GLiNER-compatible
label_map = {
    "NAME_STUDENT": "name",
    "EMAIL": "email",
    "USERNAME": "username",
    "ID_NUM": "id number",
    "PHONE_NUM": "phone number",
    "URL_PERSONAL": "url",
    "STREET_ADDRESS": "street address"
}

# Load JSON data
input_path = os.path.join(".data/", "mixtral-8x7b-v1.json")  # Update if needed
output_path = os.path.join(".data/", "preprocessed_pii_dd.json")
if os.path.exists(output_path):
    os.remove(output_path)
    print(f"Removed existing file at {output_path}")
    output_path = os.path.join(".data/", "preprocessed_pii_dd.json")
    print(f"New output path set to {output_path}")



with open(input_path, "r") as f:
    data = json.load(f)

# Helper to reconstruct full text from tokens and spacing
def reconstruct_text(tokens, whitespaces):
    return "".join([t + (" " if w else "") for t, w in zip(tokens, whitespaces)])

# Extract labeled spans in GLiNER format
def extract_labeled_spans(tokens, whitespaces, labels):
    spans = []
    current = None
    start_char = 0

    for token, label, has_space in zip(tokens, labels, whitespaces):
        end_char = start_char + len(token)

        if label.startswith("B-"):
            if current:
                spans.append(current)
            mapped = label_map.get(label[2:])
            if mapped:
                current = {"start": start_char, "end": end_char, "label": mapped}
            else:
                current = None

        elif label.startswith("I-") and current:
            current["end"] = end_char

        elif current:
            spans.append(current)
            current = None

        start_char = end_char + (1 if has_space else 0)

    if current:
        spans.append(current)

    return spans

# Process and convert
converted_data = []
for entry in tqdm(data):
    text = reconstruct_text(entry["tokens"], entry["trailing_whitespace"])
    entities = extract_labeled_spans(entry["tokens"], entry["trailing_whitespace"], entry["labels"])
    converted_data.append({"text": text, "entities": entities})

# Save to JSON
with open(output_path, "w") as f:
    json.dump(converted_data, f, indent=2)

# Print example
print("\n🔎 Sample Entry from Preprocessed Dataset:")
print(json.dumps(converted_data[0], indent=2))


Removed existing file at .data/preprocessed_pii_dd.json
New output path set to .data/preprocessed_pii_dd.json


100%|██████████| 2355/2355 [00:00<00:00, 4186.12it/s]



🔎 Sample Entry from Preprocessed Dataset:
{
  "text": "Tiburce Evans, https://www.instagram.com/tiburce-evans, pin NO bLBeoRIe\n001-691-518-9820x5621\n\nIntroduction - Identifying the Challenge:\n\nIn my role as a User Experience Designer at a technology startup in San Francisco, I encountered a complex challenge that required a thoughtful and innovative solution. Our team was tasked with redesigning the user experience of our mobile application to better meet the needs of a diverse user group, spanning various age ranges, cultures, ethnicities, and abilities. This challenge was significant due to the wide array of user needs and preferences to consider, as well as the potential impact on overall customer satisfaction and conversion rates.\n\nSelection of the Tool or Approach:\n\nTo address this complex challenge, I chose to apply the human-centered design (HCD) approach, a methodology that emphasizes empathy, collaboration, and iteration throughout the design process. I selected this

# Augmented data creation

# Combination of data

In [None]:

# with open("data/preprocessed_pii_dd.json") as f:
#     real_data = json.load(f)

# with open("data/augmented_name_url_id.json") as f:
#     augmented_data = json.load(f)

# combined = real_data + augmented_data

# with open("data/gilner_data_set.json", "w") as f:
#     json.dump(combined, f, indent=2)



# Split data

In [14]:
import json
import os
from sklearn.model_selection import train_test_split

preprocessed_path = os.path.join('.data', 'preprocessed_pii_dd.json')

with open(preprocessed_path, 'r') as f:
    preprocessed_data = json.load(f)

def write_split(pathname, payload):
    if os.path.exists(pathname):
        os.remove(pathname)
        print(f'Removed existing file at {pathname}')
    os.makedirs(os.path.dirname(pathname), exist_ok=True)
    with open(pathname, 'w') as f:
        json.dump(payload, f, indent=2)
    print(f'Wrote {len(payload)} records to {pathname}')

train_data, temp_data = train_test_split(preprocessed_data, test_size=0.2, random_state=42)
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

train_data_path = os.path.join('.data', 'train_split.json')
val_data_path = os.path.join('.data', 'val_split.json')
test_data_path = os.path.join('.data', 'test_split.json')

write_split(train_data_path, train_data)
write_split(val_data_path, val_data)
write_split(test_data_path, test_data)

print(f'✅ Split complete: {len(train_data)} train / {len(val_data)} val / {len(test_data)} test entries')


Removed existing file at .data/train_split.json
Wrote 1884 records to .data/train_split.json
Removed existing file at .data/val_split.json
Wrote 235 records to .data/val_split.json
Removed existing file at .data/test_split.json
Wrote 236 records to .data/test_split.json
✅ Split complete: 1884 train / 235 val / 236 test entries


In [20]:
import torch
from gliner import GLiNERConfig, GLiNER
from gliner.training import Trainer, TrainingArguments
from gliner.data_processing.collator import DataCollatorWithPadding, DataCollator
from gliner.utils import load_config_as_namespace
from gliner.data_processing import WordsSplitter, GLiNERDataset


# Load model

In [26]:
from gliner import GLiNER
from transformers import AutoTokenizer
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")  

model = GLiNER.from_pretrained("knowledgator/gliner-pii-base-v1.0")
tokenizer = AutoTokenizer.from_pretrained("knowledgator/gliner-pii-base-v1.0")

# use it for better performance, it mimics original implementation but it's less memory efficient
data_collator = DataCollator(model.config, data_processor=model.data_processor, prepare_labels=True)


model.to(device)

PII_LABELS = [
    "name", "email", "username", "id number", "phone number", "url", "street address"
]

# Mapping competition labels to GLiNER-compatible labels
label_map = {
    "NAME_STUDENT": "name",
    "EMAIL": "email",
    "USERNAME": "username",
    "ID_NUM": "id number",
    "PHONE_NUM": "phone number",
    "URL_PERSONAL": "url",
    "STREET_ADDRESS": "street address",
}

all_labels = PII_LABELS

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

In [33]:
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
import os
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from gliner import GLiNER
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

# Move model to device
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_steps = 500
batch_size = 8
data_size = len(train_data)
num_batches = data_size // batch_size
num_epochs = max(1, num_steps // num_batches)

# Custom dataset wrapper to be used with GLiNER
class GLiNERPyTorchDataset(torch.utils.data.Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        entry = self.data[idx]
        return {
            "text": entry["text"],
            "ner": [(e["start"], e["end"], e["label"]) for e in entry.get("entities", [])],
            "labels": self.labels
        }


# Create DataLoaders
train_dataset = GLiNERPyTorchDataset(train_data, all_labels)
val_dataset = GLiNERPyTorchDataset(val_data, all_labels)

# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=batch_size)


# train_dataset = GLiNERDataset(train_data, label_list=all_labels, tokenizer=tokenizer)
# val_dataset = GLiNERDataset(val_data, label_list=all_labels, tokenizer=tokenizer)

data_collator = DataCollator(
    model.config,
    data_processor=model.data_processor,
    prepare_labels=True
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=data_collator)

# Optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Training loop
def train(model, train_loader, val_loader, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        print(f"\n🚀 Epoch {epoch+1}/{num_epochs}")
        total_loss = 0.0

        for batch in tqdm(train_loader):
            inputs = data_collator(batch)
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)

            outputs = model(**inputs)
            loss = outputs.loss

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"\n📉 Average training loss: {avg_loss:.4f}")

        evaluate(model, val_loader)

# Evaluation loop
def evaluate(model, val_loader):
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in val_loader:
            inputs = data_collator(batch)
            for k, v in inputs.items():
                if isinstance(v, torch.Tensor):
                    inputs[k] = v.to(device)

            outputs = model(**inputs)
            total_loss += outputs.loss.item()

    avg_val_loss = total_loss / len(val_loader)
    print(f"✅ Validation loss: {avg_val_loss:.4f}")

# Run training loop
train(model, train_loader, val_loader, optimizer, num_epochs=num_epochs)

# Save the model
os.makedirs(".models/gliner_finetuned", exist_ok=True)
model.save_pretrained(".models/gliner_finetuned")



🚀 Epoch 1/2


  0%|          | 0/236 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]


KeyError: 'tokenized_text'

In [None]:
# Setup
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
model = GLiNER.from_pretrained("knowledgator/gliner-pii-base-v1.0").to(device)
tokenizer = AutoTokenizer.from_pretrained("knowledgator/gliner-pii-base-v1.0")

# Assuming train_data is a list of dicts with "text" and "entities"
all_labels = list({ent["label"] for example in train_data for ent in example["entities"]})
model.set_labels(all_labels)

# Create PyTorch-compatible dataset
from gliner.data import GLiNERDataset, DataCollator

train_dataset = GLiNERDataset(train_data, label_list=all_labels, tokenizer=tokenizer)
val_dataset = GLiNERDataset(val_data, label_list=all_labels, tokenizer=tokenizer)

data_collator = DataCollator(
    model.config,
    data_processor=model.data_processor,
    prepare_labels=True
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=data_collator)

# Define optimizer
optimizer = optim.AdamW(model.parameters(), lr=1e-5)

# Training loop
num_epochs = 3
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch in loop:
        for k in batch:
            batch[k] = batch[k].to(device)

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())

    avg_loss = total_loss / len(train_loader)
    print(f"📉 Epoch {epoch+1} avg loss: {avg_loss:.4f}")

# ✅ Save the fine-tuned model
model.save_pretrained("./models/gliner-finetuned-adamw")


# Train data

In [None]:

num_steps = 500
batch_size = 8
data_size = len(train_data)
num_batches = data_size // batch_size
num_epochs = max(1, num_steps // num_batches)

# Define training arguments
args = TrainingArguments(
    output_dir=".models/gliner_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    report_to="none",  # disables wandb/huggingface logging
)

# Create trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=model.data_processor.transformer_tokenizer,
    data_collator=data_collator,

)

# 🚀 Start training
trainer.train()

# ✅ Save your model
model.save_pretrained(".models/gliner_finetuned")


  trainer = Trainer(


KeyError: 'ner'