# loading saved model

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Locations of the saved artifacts, named so they are easy to spot and change.
# NOTE(review): the tokenizer and the model are read from different directories
# ("./saved_model3" vs "./saved_model") — confirm this mismatch is intentional.
TOKENIZER_DIR = "./saved_model3"
MODEL_DIR = "./saved_model"

# Restore the tokenizer and the token-classification model from disk.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR)
model = AutoModelForTokenClassification.from_pretrained(MODEL_DIR)

In [7]:
from transformers import pipeline

# Build the NER pipeline around the loaded model and tokenizer.
# aggregation_strategy="none" is passed so predictions are not grouped.
nlp = pipeline(
    task="ner",
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="none",
)



Device set to use cpu


# Generating entity tags for the sequence to be masked

In [58]:
# Sample text to run through the pipeline; any sentence can be used here.
sequence = "John Doe's email is john.doe@example.com. You can reach him 24/7, or reach out to Alan Turing in Data Science Department."

# Run the NER pipeline on the sample text.
ner_results = nlp(sequence)

# Print one line per prediction. The label is read from 'entity' when present,
# otherwise from 'entity_group', otherwise it is shown as 'N/A'.
for entity in ner_results:
    if 'entity' in entity:
        entity_label = entity['entity']
    else:
        entity_label = entity.get('entity_group', 'N/A')
    print(f"Entity: {entity['word']}, Label: {entity_label}")



Entity: john, Label: B-PER
Entity: doe, Label: I-PER
Entity: john, Label: B-EMAIL
Entity: ., Label: B-EMAIL
Entity: doe, Label: B-EMAIL
Entity: @, Label: B-EMAIL
Entity: example, Label: B-EMAIL
Entity: ., Label: B-EMAIL
Entity: com, Label: B-EMAIL
Entity: ., Label: B-EMAIL
Entity: alan, Label: B-PER
Entity: turing, Label: I-PER


# masking the sentence based on model predictions

In [60]:
def mask_entities(model_output, sentence):
    """Replace person names and e-mail addresses in ``sentence`` with mask tags.

    Neighbouring predictions that belong to the same entity are merged into a
    single character span, and every span is replaced as a whole:

    * person names (a ``B-PER`` or ``I-PER`` prediction followed by any number
      of ``I-PER`` predictions) become ``[B-PER][I-PER]``, or just ``[B-PER]``
      when the span is a single character long;
    * e-mail addresses (a run of ``B-EMAIL`` predictions) become ``[B-EMAIL]``.

    Two predictions are only merged when nothing but whitespace separates them
    in ``sentence``, so ordinary words between two entities are never masked.
    Predictions with any other label are ignored.

    Args:
        model_output: List of prediction dicts, each providing an ``'entity'``
            label and ``'start'`` / ``'end'`` character offsets into
            ``sentence``. Assumed to be ordered by ``'start'`` — TODO confirm
            for producers other than the pipeline used in this notebook.
        sentence: The text the predictions refer to.

    Returns:
        The masked sentence. Text outside the detected spans is returned
        unchanged (including repeated characters and repeated spaces).
    """
    pieces = []   # untouched text chunks and mask tags, in output order
    cursor = 0    # index of the first character of `sentence` not yet emitted
    total = len(model_output)

    i = 0
    while i < total:
        entity = model_output[i]
        entity_type = entity['entity']

        # Decide which label continues the span opened by this prediction.
        if entity_type in ('B-PER', 'I-PER'):
            continuation = 'I-PER'
        elif entity_type == 'B-EMAIL':
            continuation = 'B-EMAIL'
        else:
            i += 1
            continue

        start_idx = entity['start']
        end_idx = entity['end']

        # Absorb the following predictions while they carry the continuation
        # label AND are textually adjacent (only whitespace in between).
        while i + 1 < total:
            following = model_output[i + 1]
            if following['entity'] != continuation:
                break
            if sentence[end_idx:following['start']].strip():
                break
            end_idx = max(end_idx, following['end'])
            i += 1
        i += 1

        # Guard against overlapping or empty spans so no text is emitted twice.
        start_idx = max(start_idx, cursor)
        if end_idx <= start_idx:
            continue

        if continuation == 'I-PER':
            # A one-character name only has room for the "begin" tag.
            mask = '[B-PER]' if end_idx - start_idx == 1 else '[B-PER][I-PER]'
        else:
            mask = '[B-EMAIL]'

        pieces.append(sentence[cursor:start_idx])
        pieces.append(mask)
        cursor = end_idx

    # Whatever follows the last masked span is copied through verbatim.
    pieces.append(sentence[cursor:])
    return ''.join(pieces)

In [61]:
# Mask the sample sentence using the predictions produced for it above.
masked_sentence = mask_entities(ner_results, sequence)

In [62]:
# Show the masked sentence.
print(masked_sentence)

[B-PER][I-PER]'s email is [B-EMAIL] You can reach him 24/7, or reach out to [B-PER][I-PER] in Data Science Department.
