# Sanitize a text

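Sanitize a small paragraph word by word: add noise to each word's GloVe embedding (glove.6B.100d.pkl) and replace the word with the vocabulary word nearest to the noisy embedding.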

In [None]:
import pickle
import os
from os.path import join
import re
import numpy as np
from cupyx.scipy.spatial import distance
import cupy as cp
from pathlib import Path
import sys
# Add the main directory to sys.path to be able to import config
sys.path.append(str(Path.cwd().parent))
from config import ROOT_DIR
from utils.dx import sample_noise_vectors

# PARAMS
# Metric name handed to distance.cdist for the nearest-neighbour lookup.
distance_metric = "euclidean"
# Folder containing the pickled GloVe files; taken from the project config.
glove_data_folderpath = ROOT_DIR
# END PARAMS
# Map every supported embedding dimension to its pickled GloVe 6B file.
# Each of these files covers 400000 words.
glove_dimension_to_filename = {
    dimension: f"glove.6B.{dimension}d.pkl"
    for dimension in (50, 100, 200, 300)
}

Prepare vocabulary

In [None]:
# Embedding size to work with; it selects which GloVe file is loaded.
hidden_size = 100
glove_filepath = join(glove_data_folderpath, glove_dimension_to_filename[hidden_size])
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open(glove_filepath, "rb") as f:
    glove = pickle.load(f)

# Assumes glove is a dict of word -> vector (TODO confirm): entry i of
# id_to_words and row i of vocab_embs then describe the same word.
id_to_words = list(glove.keys())
words_to_id = {word: index for index, word in enumerate(id_to_words)}
vocab_embs = np.array(list(glove.values()))
del glove  # Release the raw mapping to save RAM

Prepare the text

In [None]:
# Example paragraph to sanitize.
text = (
    "Maria Gonzalez, a patient at Riverside Clinic, was diagnosed with depression on March 5, 2023. "
    "She currently lives at 789 Oak Drive, San Francisco. "
    "Maria has been prescribed medication and is undergoing weekly therapy sessions."
)

# Split on whitespace runs, commas and dots. The capture group keeps the
# delimiters in the result, so punctuation survives as its own token.
delimiter_pattern = re.compile(r'(\s+|,|\.)')

# Drop the empty and whitespace-only pieces produced by the split.
split_text = [piece for piece in delimiter_pattern.split(text) if piece.strip()]

# Tokens that are copied to the output without being sanitized.
sanitization_excluded = [",", "."]

Sanitize

In [None]:
# Passed as `epsilon` to sample_noise_vectors (presumably the privacy
# budget that controls the noise scale; see utils.dx to confirm).
epsilon = 10

# Move the vocabulary matrix to the GPU once. Handing the NumPy array to
# cdist inside the loop would repeat the host-to-device conversion for
# every word.
vocab_embs_gpu = cp.asarray(vocab_embs)

sanitized_text_split = []
for word in split_text:
    # Punctuation tokens are copied through unchanged.
    if word in sanitization_excluded:
        sanitized_text_split.append(word)
        continue
    try:
        word_id = words_to_id[word.lower()]
    except KeyError:
        # Skip only the unknown word. Stopping here would silently truncate
        # the output, and copying the raw word would leak it unsanitized.
        print(f"{word.lower()} is not in the vocabulary.")
        continue

    word_emb = vocab_embs[word_id]
    noise = sample_noise_vectors(dimension=hidden_size,
                                 shape1=1,
                                 shape2=1,
                                 epsilon=epsilon)[0][0]
    # Adding noise to embeddings
    noisy_embedding = word_emb + noise

    # Convert the noisy embedding back to text via nearest neighbour:
    # take the vocabulary row with the smallest distance to it.
    distances = distance.cdist([noisy_embedding], vocab_embs_gpu, distance_metric)[0]
    # .get() copies the index back to the host; int() makes it a plain
    # Python integer for list indexing.
    noisy_word_id = int(distances.argmin().get())
    noisy_word = id_to_words[noisy_word_id]
    sanitized_text_split.append(noisy_word)

' '.join(sanitized_text_split)