In [1]:
from gliner import GLiNER
import nltk
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer

# Sentence-splitting data used by `sent_tokenize`.
# NLTK 3.8.2+ loads the "punkt_tab" resource instead of the pickled "punkt"
# models, so both are fetched to keep this cell working on old and new NLTK.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load model and tokenizer from the same checkpoint so that token counts
# computed with `tokenizer` refer to the vocabulary the model was trained with.
model_name = "knowledgator/gliner-pii-base-v1.0"
model = GLiNER.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

[nltk_data] Downloading package punkt to /Users/valencia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

In [2]:
# Entity labels passed to the PII model, grouped by category.

# Personal identity: full / first / last names, healthcare provider names,
# date of birth, age, gender identifiers and marital status.
personal_labels = [
    "name", "first name", "last name", "name medical professional",
    "dob", "age", "gender", "marital status",
]

# Contact details: email, phone, IP address, URL, and the parts of a postal
# address (full address, street, city, state/province, country, ZIP code).
contact_labels = [
    "email address", "phone number", "ip address", "url",
    "location address", "location street", "location city",
    "location state", "location country", "location zip",
]

# Financial data: account / bank / routing numbers, credit card number with
# its expiration date and CVV, Social Security Numbers, monetary amounts.
financial_labels = [
    "account number", "bank account", "routing number",
    "credit card", "credit card expiration", "cvv",
    "ssn", "money",
]

# Healthcare data: conditions, procedures, drugs and dosage, blood types,
# injuries, facility names, healthcare numbers and medical codes.
healthcare_labels = [
    "condition", "medical process", "drug", "dose", "blood type",
    "injury", "organization medical facility",
    "healthcare number", "medical code",
]

# Identifiers and credentials: passport and driver's license numbers,
# usernames, passwords, vehicle IDs.
id_labels = [
    "passport number", "driver license",
    "username", "password", "vehicle id",
]

# Flat list of every label, in category order.
labels = [
    *personal_labels,
    *contact_labels,
    *financial_labels,
    *healthcare_labels,
    *id_labels,
]

In [3]:
# Token budget per model call: `redact_text_pipeline` chunks any text whose
# token count exceeds this, and `chunk_sentences` uses it as its default limit.
MAX_TOKENS = 512
# Label set the pipeline passes to `redact_with_gliner`; this is the same list
# object as `labels`, not a copy.
LABELS = labels

In [4]:
# Sample input standing in for an LLM prompt that contains PII
text = "Hi, my name is Sarah and my email is sarah@example.com. I was born on January 1st, 1990 and live in Los Angeles."

# Ask the model for every span that matches one of the configured labels
entities = model.predict_entities(text, labels)

# Swap each detected span for a "[LABEL]" placeholder. Spans are handled from
# the highest start offset down, so the offsets of spans still to be replaced
# are not shifted by earlier replacements.
for ent in sorted(entities, key=lambda e: e['start'], reverse=True):
    start, end = ent['start'], ent['end']
    placeholder = "[" + ent['label'].upper().replace(" ", "_") + "]"
    text = text[:start] + placeholder + text[end:]

print("Redacted output:\n", text)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Redacted output:
 Hi, my name is [NAME] and my email is [EMAIL_ADDRESS]. I was born on [DOB] and live in [LOCATION_CITY].


In [5]:


def chunk_sentences(text, max_tokens=MAX_TOKENS):
    """Split ``text`` into chunks whose token counts stay within ``max_tokens``.

    The text is split into sentences with NLTK's ``sent_tokenize`` and the
    sentences are packed greedily, in order, into chunks joined by a single
    space. Token counts come from the module-level ``tokenizer`` (special
    tokens excluded) and are summed piece by piece, so the count of a joined
    chunk is an estimate rather than a re-tokenization of the chunk.

    A sentence that is longer than ``max_tokens`` on its own is broken into
    whitespace-separated words which are packed the same way, so it does not
    produce an over-sized chunk (whitespace inside such a sentence is
    normalized to single spaces). Only a single word that by itself exceeds
    the budget still yields an over-sized chunk.

    Args:
        text: The text to split.
        max_tokens: Maximum number of tokens allowed per chunk.

    Returns:
        A list of chunk strings in original order; empty if ``text`` contains
        no sentences.

    Raises:
        ValueError: If ``max_tokens`` is not positive.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")

    def token_len(piece):
        return len(tokenizer.encode(piece, add_special_tokens=False))

    # Units to pack: whole sentences where they fit the budget, otherwise the
    # individual words of the over-long sentence.
    pieces = []
    for sentence in sent_tokenize(text):
        sentence_len = token_len(sentence)
        if sentence_len <= max_tokens:
            pieces.append((sentence, sentence_len))
        else:
            pieces.extend((word, token_len(word)) for word in sentence.split())

    chunks = []
    current_chunk = []
    current_tokens = 0
    for piece, piece_len in pieces:
        # Close the running chunk before it would overflow; an empty chunk
        # always accepts the piece so that progress is guaranteed.
        if current_chunk and current_tokens + piece_len > max_tokens:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_tokens = 0
        current_chunk.append(piece)
        current_tokens += piece_len

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

def redact_with_gliner(text_chunk, labels):
    """Return ``text_chunk`` with each predicted entity replaced by a tag.

    Entities are predicted by the module-level ``model`` for the given
    ``labels``. Each entity's character span is replaced with its label,
    upper-cased, with spaces turned into underscores and wrapped in square
    brackets (e.g. ``"email address"`` becomes ``[EMAIL_ADDRESS]``).
    """
    spans = model.predict_entities(text_chunk, labels)
    # Work from the highest start offset down so that earlier offsets remain
    # valid while the string is being rewritten.
    spans_right_to_left = sorted(spans, key=lambda span: span["start"], reverse=True)

    redacted = text_chunk
    for span in spans_right_to_left:
        placeholder = "[" + span["label"].upper().replace(" ", "_") + "]"
        before = redacted[:span["start"]]
        after = redacted[span["end"]:]
        redacted = before + placeholder + after
    return redacted

def redact_text_pipeline(text):
    """Redact PII from ``text``, chunking by sentence when it is too long.

    The token count of ``text`` (special tokens excluded) is printed first.
    If it is within ``MAX_TOKENS`` the whole text is redacted in one call;
    otherwise the text is split with ``chunk_sentences``, each chunk is
    printed and redacted in turn, and the redacted chunks are joined with a
    single space. The redacted result is printed and returned.
    """
    def count_tokens(piece):
        return len(tokenizer.encode(piece, add_special_tokens=False))

    total_tokens = count_tokens(text)
    print(f"\n📏 Total token count: {total_tokens}")

    if total_tokens > MAX_TOKENS:
        print("⚠️ Chunking by sentence (via NLTK) to respect token limit...\n")

        redacted_parts = []
        for number, piece in enumerate(chunk_sentences(text), start=1):
            print(f"\n--- Chunk {number} (tokens: {count_tokens(piece)}) ---\n{piece}")
            redacted_parts.append(redact_with_gliner(piece, LABELS))

        result = " ".join(redacted_parts)
        print("\n✅ Final Redacted Output:\n", result)
        return result

    # Short input: a single model call covers the whole text.
    print("✅ No chunking needed.")
    result = redact_with_gliner(text, LABELS)
    print("\n🔒 Redacted Output:\n", result)
    return result


In [6]:
# Example usage: one paragraph of sample PII, repeated ten times so that the
# input exceeds the token limit and the chunking branch is exercised.
paragraph = (
    "My name is John Doe. I live at 123 Apple St. My SSN is 123-45-6789 "
    "and my email is john.doe@example.com. I was born on July 1st, 1990. "
    "My credit card is 4111-1111-1111-1111. "
)
long_text = paragraph * 10

redact_text_pipeline(long_text)


📏 Total token count: 580
⚠️ Chunking by sentence (via NLTK) to respect token limit...


--- Chunk 1 (tokens: 509) ---
My name is John Doe. I live at 123 Apple St. My SSN is 123-45-6789 and my email is john.doe@example.com. I was born on July 1st, 1990. My credit card is 4111-1111-1111-1111. My name is John Doe. I live at 123 Apple St. My SSN is 123-45-6789 and my email is john.doe@example.com. I was born on July 1st, 1990. My credit card is 4111-1111-1111-1111. My name is John Doe. I live at 123 Apple St. My SSN is 123-45-6789 and my email is john.doe@example.com. I was born on July 1st, 1990. My credit card is 4111-1111-1111-1111. My name is John Doe. I live at 123 Apple St. My SSN is 123-45-6789 and my email is john.doe@example.com. I was born on July 1st, 1990. My credit card is 4111-1111-1111-1111. My name is John Doe. I live at 123 Apple St. My SSN is 123-45-6789 and my email is john.doe@example.com. I was born on July 1st, 1990. My credit card is 4111-1111-1111-1111. My name is 

'My name is [NAME]. I live at [LOCATION_ADDRESS]. My SSN is [SSN] and my email is [EMAIL_ADDRESS]. I was born on July 1st, 1990. My credit card is [CREDIT_CARD]. My name is [NAME]. I live at [LOCATION_ADDRESS]. My SSN is [SSN] and my email is [EMAIL_ADDRESS]. I was born on July 1st, 1990. My credit card is [CREDIT_CARD]. My name is [NAME]. I live at [LOCATION_ADDRESS]. My SSN is [SSN] and my email is [EMAIL_ADDRESS]. I was born on July 1st, 1990. My credit card is [CREDIT_CARD]. My name is [NAME]. I live at [LOCATION_ADDRESS]. My SSN is [SSN] and my email is [EMAIL_ADDRESS]. I was born on July 1st, 1990. My credit card is [CREDIT_CARD]. My name is [NAME]. I live at [LOCATION_ADDRESS]. My SSN is [SSN] and my email is [EMAIL_ADDRESS]. I was born on July 1st, 1990. My credit card is [CREDIT_CARD]. My name is [NAME]. I live at [LOCATION_ADDRESS]. My SSN is [SSN] and my email is [EMAIL_ADDRESS]. I was born on July 1st, 1990. My credit card is [CREDIT_CARD]. My name is [NAME]. I live at [LOC

In [None]:
import kagglehub

# Fetch the latest version of the dataset and report the path it was saved under
dataset_handle = "nbroad/pii-dd-mistral-generated"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
from pathlib import Path

import pandas as pd

# Read the CSV from the directory the dataset was downloaded to (`path`, set
# by the kagglehub download cell) instead of a placeholder location.
# NOTE(review): "train.csv" is assumed to be the file name inside the
# downloaded dataset — confirm against the files under `path`.
train_csv = Path(path) / "train.csv"
df = pd.read_csv(train_csv)
print(df.columns)
print(df.iloc[0])
