#### For Sentence

In [16]:
!pip install gensim nltk




In [17]:
# Import required libraries
import gensim.downloader as api
from gensim.models import KeyedVectors
import nltk
from nltk.corpus import stopwords
import re

# Download necessary NLTK data:
#   - 'stopwords': the corpus read later through stopwords.words('english')
#   - 'punkt' / 'punkt_tab': presumably the tokenizer resources that
#     nltk.word_tokenize relies on (which one is needed depends on the
#     installed NLTK release — TODO confirm)
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [18]:
# Load the pre-trained Word2Vec model through gensim's downloader API.
# NOTE(review): presumably a large download on first use that is cached
# afterwards — confirm before re-running on a fresh machine.
model = api.load('word2vec-google-news-300')

# Load stopwords from NLTK; kept as a set because the cells below only use it
# for membership tests (`word not in stop_words`).
stop_words = set(stopwords.words('english'))


In [19]:
# Function for text cleaning and preprocessing
def preprocess_text(text):
    """Normalise ``text`` and split it into tokens.

    The text is lower-cased, every character that is neither an ASCII letter
    nor whitespace is dropped (this removes digits as well as punctuation),
    and the remainder is tokenized with ``nltk.word_tokenize``.

    Args:
        text: Raw input string.

    Returns:
        The tokens returned by ``nltk.word_tokenize`` for the cleaned text.
    """
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return nltk.word_tokenize(letters_only)


In [20]:
# Function to find semantically similar words
def find_similar_words(word, topn=5):
    """Return the ``topn`` nearest neighbours of ``word`` in the global ``model``.

    Args:
        word: Token to look up.
        topn: Number of neighbours requested from ``model.most_similar``.

    Returns:
        Whatever ``model.most_similar(word, topn=topn)`` returns when ``word``
        is a key of ``model.key_to_index``; otherwise a human-readable
        "not in vocabulary" message (a ``str``), which callers detect with
        ``isinstance(..., str)``.
    """
    # Guard clause: unknown words are reported with a message, not an exception.
    if word not in model.key_to_index:
        return f"Word '{word}' not in vocabulary"
    return model.most_similar(word, topn=topn)

# Function to compute semantic distance (cosine similarity)
def semantic_distance(word1, word2):
    """Return the similarity score of two words according to the global ``model``.

    Despite the name, the value returned is ``model.similarity(word1, word2)``
    (a similarity, not a distance).

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        The result of ``model.similarity`` when both words are keys of
        ``model.key_to_index``; otherwise a "not in vocabulary" message string.
    """
    vocabulary = model.key_to_index
    # Bail out with a message as soon as either word is unknown.
    if word1 not in vocabulary or word2 not in vocabulary:
        return f"One or both words '{word1}' and '{word2}' not in vocabulary"
    return model.similarity(word1, word2)


In [21]:
# Function to transform the sentence
def transform_sentence(sentence, rank=1):
    """Replace every non-stop-word of ``sentence`` by its ``rank``-th nearest neighbour.

    The sentence is tokenized with ``preprocess_text``. Stop words, words that
    ``find_similar_words`` reports as out of vocabulary, and words for which
    fewer than ``rank`` neighbours are available are kept as they are.

    Args:
        sentence: Input sentence.
        rank: 1-based position of the neighbour to substitute
            (1 = most similar, 2 = second most similar, ...).

    Returns:
        The transformed tokens joined by single spaces.

    Raises:
        ValueError: If ``rank`` is smaller than 1 (there is no such neighbour;
            previously this surfaced as an ``IndexError`` deep inside the loop).
    """
    if rank < 1:
        raise ValueError(f"rank must be a positive integer, got {rank!r}")

    transformed_words = []
    for word in preprocess_text(sentence):
        replacement = word
        if word not in stop_words:
            similar_words = find_similar_words(word, topn=rank)
            # A str result is the "not in vocabulary" message. A list shorter
            # than `rank` has no rank-th entry, so the word is left untouched
            # instead of indexing past the end.
            if not isinstance(similar_words, str) and len(similar_words) >= rank:
                replacement = similar_words[rank - 1][0]
        transformed_words.append(replacement)
    return ' '.join(transformed_words)


In [22]:
# Demo inputs: the sentence to transform and which neighbour to substitute.
input_sentence = "President greet the press in chicago"
rank = 2  # 1 -> most similar word, 2 -> second most similar, and so on

# Show the five nearest neighbours of every non-stop-word in the sentence.
words = preprocess_text(input_sentence)
for word in words:
    if word in stop_words:
        continue
    similar_words = find_similar_words(word, topn=5)
    # A str result is the "not in vocabulary" message and is printed as is.
    print(
        similar_words
        if isinstance(similar_words, str)
        else f"Similar words for '{word}': {[w for w, _ in similar_words]}"
    )

# Substitute each non-stop-word by its rank-th neighbour.
transformed_sentence = transform_sentence(input_sentence, rank)

# Report the sentence before and after the substitution.
print(f"Original sentence: {input_sentence}")
print(f"Transformed sentence: {transformed_sentence}")


Similar words for 'president': ['President', 'chairman', 'vice_president', 'chief_executive', 'CEO']
Similar words for 'greet': ['greets', 'greeting', 'Greet', 'greeted', 'warmly_greeted']
Similar words for 'press': ['media', 'reporters', 'hastily_convened_press', 'breifing', 'news']
Similar words for 'chicago': ['baltimore', 'denver', 'nyc', 'atlanta', 'springfield']
Original sentence: President greet the press in chicago
Transformed sentence: chairman greeting the reporters in denver


### New method for anonymization

In [9]:
# Install required packages
!pip install numpy gensim torch transformers


Defaulting to user installation because normal site-packages is not writeable


In [10]:
import numpy as np
import gensim.downloader as api

# Load GloVe embeddings through gensim's downloader API.
# NOTE(review): the recorded run of this cell ended with
# "HTTP Error 403: Forbidden" — presumably raised by this download, in which
# case `glove_model` was never defined for the cells below; confirm and re-run.
glove_model = api.load("glove-wiki-gigaword-100")  # You can choose a different model if needed

def get_embedding(word):
    """Look up the embedding of ``word`` in the global ``glove_model``.

    Args:
        word: Key to look up.

    Returns:
        ``glove_model[word]`` when ``word in glove_model``; ``None`` otherwise.
    """
    return glove_model[word] if word in glove_model else None


HTTPError: HTTP Error 403: Forbidden

In [None]:
def anonymize_numerical(value, epsilon=1.0, sensitivity=1.0):
    """Perturb ``value`` with zero-centred Laplace noise.

    The noise is drawn from ``np.random.laplace`` with scale
    ``sensitivity / epsilon``; with the default ``sensitivity`` of 1 this is
    the original ``1 / epsilon`` scale.

    Args:
        value: Number to perturb.
        epsilon: Must be > 0; smaller values mean a larger noise scale.
        sensitivity: Non-negative multiplier of the noise scale.

    Returns:
        ``value`` plus one noise sample.

    Raises:
        ValueError: If ``epsilon`` is not strictly positive (this also rejects
            NaN) or ``sensitivity`` is negative.
    """
    # Reject bad parameters up front instead of failing with a bare
    # ZeroDivisionError or an error raised from inside numpy.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be > 0, got {epsilon!r}")
    if sensitivity < 0:
        raise ValueError(f"sensitivity must be >= 0, got {sensitivity!r}")
    noise = np.random.laplace(0, sensitivity / epsilon)
    return value + noise


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def find_similar_word(word, distance_threshold=0.10):
    """Return the closest vocabulary word lying at least ``distance_threshold`` away.

    Distance is ``1 - cosine similarity`` between the embedding of ``word`` and
    each row of ``glove_model.vectors``. Among the entries whose distance is
    ``>= distance_threshold`` the one with the smallest distance wins (the
    earliest vocabulary entry on ties). ``word`` itself is never a candidate,
    so a threshold of 0 cannot hand the input straight back.

    All distances are computed in one vectorised numpy pass rather than one
    similarity call per vocabulary entry.

    Args:
        word: Word to replace.
        distance_threshold: Minimum cosine distance a replacement must have.

    Returns:
        The selected vocabulary word, or ``word`` unchanged when it has no
        embedding or no entry satisfies the threshold.
    """
    word_embedding = get_embedding(word)
    if word_embedding is None:
        return word  # Unknown word: nothing to compare against

    vectors = glove_model.vectors
    query = np.asarray(word_embedding)

    # Cosine similarity of the query against every vocabulary row at once.
    dots = np.asarray(vectors @ query, dtype=float)
    scale = np.asarray(
        np.linalg.norm(vectors, axis=1) * np.linalg.norm(query), dtype=float
    )
    # Rows with a zero norm get similarity 0 instead of a division by zero.
    similarities = np.divide(dots, scale, out=np.zeros_like(dots), where=scale > 0)
    distances = 1.0 - similarities

    eligible = distances >= distance_threshold
    own_index = glove_model.key_to_index.get(word)
    if own_index is not None:
        eligible[own_index] = False  # never propose the input word itself
    if not eligible.any():
        return word

    # Smallest distance among the eligible rows; argmin keeps the first on ties.
    best = int(np.argmin(np.where(eligible, distances, np.inf)))
    return glove_model.index_to_key[best]


In [None]:
def anonymize_string(input_string, distance=0.10):
    """Anonymize ``input_string`` numerically or semantically.

    Input that parses as a finite number is perturbed with
    ``anonymize_numerical``; everything else is replaced through
    ``find_similar_word``. ``float()`` also accepts spellings such as "nan",
    "inf" and "infinity"; those are treated as words, because adding noise to
    a non-finite value would return it effectively unchanged.

    Args:
        input_string: Text to anonymize.
        distance: Minimum distance forwarded to ``find_similar_word``.

    Returns:
        A number (value plus noise) for finite numeric input, otherwise the
        word returned by ``find_similar_word``.
    """
    import math  # local import keeps this definition self-contained

    # Only the parse is guarded, so a ValueError raised while anonymizing is
    # not mistaken for "not a number".
    try:
        value = float(input_string)
    except ValueError:
        value = None

    if value is None or not math.isfinite(value):
        # If not numerical, proceed with string anonymization
        return find_similar_word(input_string, distance)
    return anonymize_numerical(value)

# Example usage: a numerical input first, then a string input
for sample in ("42", "example"):
    print(anonymize_string(sample))


### Code to be added to the main masking model

In [None]:
from typing import List
import numpy as np
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity

# Load GloVe embeddings through gensim's downloader API.
# NOTE(review): the functions below read this module-level `glove_model`;
# presumably the download needs network access on first use — confirm.
glove_model = api.load("glove-wiki-gigaword-100")  # You can choose a different model if needed

def get_embedding(word):
    """Fetch the vector stored for ``word`` in the global ``glove_model``.

    Args:
        word: Key to look up.

    Returns:
        ``glove_model[word]`` if ``word in glove_model``, else ``None``.
    """
    # Out-of-vocabulary words are signalled with None rather than an exception.
    if word not in glove_model:
        return None
    return glove_model[word]

def anonymize_numerical(value, epsilon=1.0, sensitivity=1.0):
    """Perturb ``value`` with zero-centred Laplace noise.

    The noise is drawn from ``np.random.laplace`` with scale
    ``sensitivity / epsilon``; with the default ``sensitivity`` of 1 this is
    the original ``1 / epsilon`` scale.

    Args:
        value: Number to perturb.
        epsilon: Must be > 0; smaller values mean a larger noise scale.
        sensitivity: Non-negative multiplier of the noise scale.

    Returns:
        ``value`` plus one noise sample.

    Raises:
        ValueError: If ``epsilon`` is not strictly positive (this also rejects
            NaN) or ``sensitivity`` is negative.
    """
    # Reject bad parameters up front instead of failing with a bare
    # ZeroDivisionError or an error raised from inside numpy.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be > 0, got {epsilon!r}")
    if sensitivity < 0:
        raise ValueError(f"sensitivity must be >= 0, got {sensitivity!r}")
    noise = np.random.laplace(0, sensitivity / epsilon)
    return value + noise

def find_similar_word(word, distance_threshold=0.10):
    """Return the closest vocabulary word lying at least ``distance_threshold`` away.

    Distance is ``1 - cosine similarity`` between the embedding of ``word`` and
    each row of ``glove_model.vectors``. Among the entries whose distance is
    ``>= distance_threshold`` the one with the smallest distance wins (the
    earliest vocabulary entry on ties). ``word`` itself is never a candidate,
    so a threshold of 0 cannot hand the input straight back.

    All distances are computed in one vectorised numpy pass rather than one
    similarity call per vocabulary entry.

    Args:
        word: Word to replace.
        distance_threshold: Minimum cosine distance a replacement must have.

    Returns:
        The selected vocabulary word, or ``word`` unchanged when it has no
        embedding or no entry satisfies the threshold.
    """
    word_embedding = get_embedding(word)
    if word_embedding is None:
        return word  # Unknown word: nothing to compare against

    vectors = glove_model.vectors
    query = np.asarray(word_embedding)

    # Cosine similarity of the query against every vocabulary row at once.
    dots = np.asarray(vectors @ query, dtype=float)
    scale = np.asarray(
        np.linalg.norm(vectors, axis=1) * np.linalg.norm(query), dtype=float
    )
    # Rows with a zero norm get similarity 0 instead of a division by zero.
    similarities = np.divide(dots, scale, out=np.zeros_like(dots), where=scale > 0)
    distances = 1.0 - similarities

    eligible = distances >= distance_threshold
    own_index = glove_model.key_to_index.get(word)
    if own_index is not None:
        eligible[own_index] = False  # never propose the input word itself
    if not eligible.any():
        return word

    # Smallest distance among the eligible rows; argmin keeps the first on ties.
    best = int(np.argmin(np.where(eligible, distances, np.inf)))
    return glove_model.index_to_key[best]

def anonymize_pii1(text_input: str, result: List[RecognizerResult], attributedb: AttributeDB, cat: str) -> str:
    """
    Mask the text PIIs according to disclosure proportion:
    - If the entity is present in the category (CATEGORY_ENTITIES), apply its defined masking method.
    - If the entity is not in the category, replace each detected occurrence with an
      anonymized version of its own text (noise for numbers, a semantically similar
      word otherwise). When no substitute differs from the detected text, fall back
      to the placeholder ``<ENTITY_TYPE>`` so the original never passes through unchanged.

    Args:
        text_input: Text containing the detected PII.
        result: Analyzer results locating the PII spans in ``text_input``.
        attributedb: Source of the per-entity masking plans
            (``get_attr_obj_from_type(entity).masking_plan``).
        cat: Category key looked up in ``dcfg.CATEGORY_ENTITIES``.

    Returns:
        The anonymized text.
    """
    # Get the list of allowed entities for the given category
    category_entities = dcfg.CATEGORY_ENTITIES.get(cat, [])

    def _make_replacer(entity_type: str):
        """Build the per-occurrence callback for one out-of-category entity type."""
        def _replace(pii_text: str) -> str:
            # Anonymize the detected text itself, not the entity type label.
            candidate = anonymize_entity(text_input, pii_text)
            # An unchanged result (e.g. out-of-vocabulary text) would leak the PII.
            return candidate if candidate != pii_text else f"<{entity_type}>"
        return _replace

    operator_options = {}
    for entity in dcfg.ENTITIES:
        if entity in category_entities:
            # Apply category-specific masking plan
            operator_options[entity] = OperatorConfig(
                "custom", {"lambda": attributedb.get_attr_obj_from_type(entity).masking_plan}
            )
        else:
            # A "custom" operator is used here as well so the callback receives
            # the text of each match; a "replace" operator with a precomputed
            # value could only ever see the entity type name.
            operator_options[entity] = OperatorConfig(
                "custom", {"lambda": _make_replacer(entity)}
            )

    # Anonymize the input text based on PII detection
    anonymizer = AnonymizerEngine()
    anonymized_text = anonymizer.anonymize(
        text=text_input,
        analyzer_results=result,  # type: ignore
        operators=operator_options
    ).text

    return anonymized_text

def anonymize_entity(text_input: str, entity: str) -> str:
    """
    Anonymize the entity based on whether it's numerical or a regular string.

    Text that parses as a finite number is perturbed with ``anonymize_numerical``
    and returned as a string; everything else goes through ``find_similar_word``.
    ``float()`` also accepts spellings such as "nan", "inf" and "infinity"; those
    are treated as words, because adding noise to a non-finite value would
    return it effectively unchanged.

    Args:
        text_input: Full text the entity came from (not used by this function).
        entity: Text to anonymize.

    Returns:
        The anonymized replacement string.
    """
    import math  # local import keeps this definition self-contained

    # Only the parse is guarded, so a ValueError raised while anonymizing is
    # not mistaken for "not a number".
    try:
        value = float(entity)
    except ValueError:
        value = None

    if value is None or not math.isfinite(value):
        # If not numerical, proceed with string anonymization
        return find_similar_word(entity)  # Anonymize semantically
    return str(anonymize_numerical(value))  # Anonymize numerically
