# Pre-process original data

In [3]:
import json
import os
from tqdm import tqdm

# Translation table from PII-DD entity types (BIO prefix already removed)
# to the GLiNER-compatible label names written to the output file.
_PII_DD_TO_GLINER = (
    ("NAME_STUDENT", "name"),
    ("EMAIL", "email"),
    ("USERNAME", "username"),
    ("ID_NUM", "id number"),
    ("PHONE_NUM", "phone number"),
    ("URL_PERSONAL", "url"),
    ("STREET_ADDRESS", "street address"),
)
label_map = dict(_PII_DD_TO_GLINER)

# Input dataset and the location the converted records are written to.
input_path = os.path.join("data/", "mixtral-8x7b-v1.json")  # Update if needed
output_path = os.path.join("data/", "preprocessed_pii_dd.json")

# JSON files are UTF-8; name the encoding explicitly so the read does not
# depend on the platform's locale default (which is not UTF-8 everywhere).
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Helper to reconstruct full text from tokens and spacing
def reconstruct_text(tokens, whitespaces):
    """Join tokens back into one string.

    A single space is appended after every token whose paired whitespace
    flag is truthy. Tokens and flags are paired with ``zip``, so surplus
    items in the longer sequence are ignored.
    """
    return "".join(
        token + " " if followed_by_space else token
        for token, followed_by_space in zip(tokens, whitespaces)
    )

# Extract labeled spans in GLiNER format
def extract_labeled_spans(tokens, whitespaces, labels, mapping=None):
    """Convert token-level BIO labels into character-offset entity spans.

    Offsets refer to the text obtained by concatenating the tokens with one
    space after every token whose whitespace flag is truthy (the layout
    ``reconstruct_text`` produces). ``end`` is exclusive.

    Args:
        tokens: Token strings, in document order.
        whitespaces: Per-token flags; truthy means a space follows the token.
        labels: Per-token BIO tags such as ``"B-EMAIL"``, ``"I-EMAIL"``, ``"O"``.
        mapping: Optional dict from entity type (tag without the ``B-``/``I-``
            prefix) to the output label. Defaults to the module-level
            ``label_map``. Entity types with no (or a falsy) mapping are dropped.

    Returns:
        A list of ``{"start": int, "end": int, "label": str}`` dicts in the
        order the spans occur. The three inputs are walked with ``zip``, so
        surplus items in a longer sequence are ignored.
    """
    if mapping is None:
        mapping = label_map

    spans = []
    current = None        # span being built, or None when outside an entity
    current_type = None   # raw entity type of `current`, used to validate I- tags
    start_char = 0

    for token, label, has_space in zip(tokens, labels, whitespaces):
        end_char = start_char + len(token)

        continues_current = (
            current is not None
            and label.startswith("I-")
            and label[2:] == current_type
        )

        if continues_current:
            current["end"] = end_char
        else:
            # Any other tag ends the open span: "O", a new "B-", or an "I-" of
            # a different entity type (which must not stretch this span over
            # a token that belongs to another entity).
            if current is not None:
                spans.append(current)
                current = None
                current_type = None

            if label.startswith("B-"):
                entity_type = label[2:]
                mapped = mapping.get(entity_type)
                if mapped:
                    current = {"start": start_char, "end": end_char, "label": mapped}
                    current_type = entity_type

        # Advance past the token and its optional trailing space.
        start_char = end_char + (1 if has_space else 0)

    # Flush a span that runs to the end of the document.
    if current is not None:
        spans.append(current)

    return spans

# Build one output record per input entry: the rebuilt text together with
# the character-offset entity spans derived from its token labels.
converted_data = []
for entry in tqdm(data):
    tokens, spacing = entry["tokens"], entry["trailing_whitespace"]
    text = reconstruct_text(tokens, spacing)
    entities = extract_labeled_spans(tokens, spacing, entry["labels"])
    converted_data.append(dict(text=text, entities=entities))

# Write the converted records as indented JSON.
with open(output_path, "w") as f:
    f.write(json.dumps(converted_data, indent=2))

# Show the first converted record as a quick sanity check.
print("\n🔎 Sample Entry from Preprocessed Dataset:")
first_record = converted_data[0]
print(json.dumps(first_record, indent=2))


100%|██████████| 2355/2355 [00:00<00:00, 4483.45it/s]



🔎 Sample Entry from Preprocessed Dataset:
{
  "text": "Tiburce Evans, https://www.instagram.com/tiburce-evans, pin NO bLBeoRIe\n001-691-518-9820x5621\n\nIntroduction - Identifying the Challenge:\n\nIn my role as a User Experience Designer at a technology startup in San Francisco, I encountered a complex challenge that required a thoughtful and innovative solution. Our team was tasked with redesigning the user experience of our mobile application to better meet the needs of a diverse user group, spanning various age ranges, cultures, ethnicities, and abilities. This challenge was significant due to the wide array of user needs and preferences to consider, as well as the potential impact on overall customer satisfaction and conversion rates.\n\nSelection of the Tool or Approach:\n\nTo address this complex challenge, I chose to apply the human-centered design (HCD) approach, a methodology that emphasizes empathy, collaboration, and iteration throughout the design process. I selected this

# Augmented data creation

# Combination of data

In [None]:
# Read both datasets with an explicit UTF-8 encoding so the result does not
# depend on the platform's locale default.
with open("data/preprocessed_pii_dd.json", encoding="utf-8") as f:
    real_data = json.load(f)

with open("data/augmented_name_url_id.json", encoding="utf-8") as f:
    augmented_data = json.load(f)

# Concatenate the two loaded collections: preprocessed entries first, then
# the augmented ones.
combined = real_data + augmented_data

with open("data/gilner_data_set.json", "w", encoding="utf-8") as f:
    json.dump(combined, f, indent=2)

# Split data

In [None]:
import json
from sklearn.model_selection import train_test_split

preprocessed_path = os.path.join("data/", "gilner_data_set.json")

# Load the combined dataset. The split must run on the loaded records:
# passing the path string itself would split its characters, not the data.
with open(preprocessed_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Split: 80% train, 20% test. A fixed random_state keeps the split reproducible.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Optionally save the splits
with open("data/train_split.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f, indent=2)

with open("data/test_split.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f, indent=2)

print(f"✅ Split complete: {len(train_data)} train / {len(test_data)} test entries")
