# Close Neighbors $d_\mathcal{X}$-privacy frequencies

In [None]:
import pickle
from os.path import join
import numpy as np
from collections import Counter
from pathlib import Path
import sys
# Add the main directory to sys.path to be able to import config
sys.path.append(str(Path.cwd().parent))
from config import ROOT_DIR
from utils.dx import sample_noise_vectors, noisy_embeddings_to_ids
from utils.tools import rank_neighbors

# PARAMS
distance_metric = "euclidean" # Metric to use for the distances
distances_dtype = np.float16 # Precision of the distances

glove_variant = "6B" #"Twitter" or "6B"

glove_data_folderpath = ROOT_DIR

# END PARAMS
# Map every embedding dimension available for the selected GloVe variant to the
# name of the pickle file that stores it inside glove_data_folderpath.
if glove_variant == "6B":
    glove_dimension_to_filename = {
        50: "glove.6B.50d.pkl", # 400000 words
        100: "glove.6B.100d.pkl", # 400000 words
        200: "glove.6B.200d.pkl", # 400000 words
        300: "glove.6B.300d.pkl", # 400000 words
    }
elif glove_variant == "Twitter":
    glove_dimension_to_filename = {
        25: "glove.twitter.27B.25d.pkl", #1,193,513 words
        50: "glove.twitter.27B.50d.pkl", #1,193,513 words
        100: "glove.twitter.27B.100d.pkl", #1,193,513 words
        200: "glove.twitter.27B.200d.pkl", #1,193,513 words
    }
else:
    # Fail here with a clear message instead of hitting a NameError on
    # glove_dimension_to_filename in a later cell.
    raise ValueError(
        f'Unknown glove_variant {glove_variant!r}: expected "6B" or "Twitter"'
    )
fit_dtype = np.uint32 # Integer size sufficient to encode the number of words in the vocabularies

# Average several words

Load GloVe in a specific dimension

In [None]:
hidden_size = 300
# Pickle file holding the GloVe table for the chosen dimension.
glove_filepath = join(glove_data_folderpath, glove_dimension_to_filename[hidden_size])
# NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
with open(glove_filepath, "rb") as glove_file:
    glove = pickle.load(glove_file)

# Stack the stored vectors into one matrix, one row per entry of the table
# (presumably a word -> vector mapping; rows follow its iteration order).
vocab_embs = np.array(list(glove.values()))
vocab_size = len(vocab_embs)
del glove # Save RAM

Select *number_of_words* random words and rank their neighbors according to their distance with the word in the embedding space.

In [None]:
number_of_words = 5000
# Draw vocabulary indices uniformly in [0, vocab_size). The draws are
# independent, so the same word can be selected more than once.
words_ids = np.random.randint(vocab_size, size=number_of_words)
words_embs = np.take(vocab_embs, words_ids, axis=0)

# Neighbor ranking of the whole vocabulary relative to each selected word.
words_neighbors_ranked = rank_neighbors(words_embs, vocab_embs, distance_metric)

Add noise to the embeddings of the words following the $d_\mathcal{X}$-privacy mechanism and count which neighbor was chosen, represented by its rank in the neighbor list of the initial word.

In [None]:
epsilons = list(range(1, 31)) # Parameter
neighbor_counted_occurences = {}
# Row selector used to pick one entry per sampled word; it does not depend on
# epsilon, so it is built once.
row_indices = np.arange(number_of_words)

for epsilon in epsilons:
    noise = sample_noise_vectors(dimension=hidden_size,
                                 shape1=1,
                                 shape2=number_of_words,
                                 epsilon=epsilon)[0]
    # Adding noise to embeddings. The sum allocates a new array, so words_embs
    # is left untouched and no explicit copy is needed.
    noisy_embeddings = words_embs + noise

    # Convert embedding back to text via Nearest neighbor
    noisy_word_ids = noisy_embeddings_to_ids(noisy_embeddings, vocab_embs, distance_metric)

    # For every sampled word i, read words_neighbors_ranked[i, noisy_word_ids[i]]:
    # the rank of the word chosen by the mechanism in the neighbor list of word i.
    noisy_word_ids_ranks = words_neighbors_ranked[row_indices, noisy_word_ids]

    # Histogram of the ranks: entry k is the number of times rank k was selected.
    # np.bincount replaces a Counter followed by vocab_size Python-level lookups.
    # Assumes the ranks are non-negative integers below vocab_size - TODO confirm
    # against rank_neighbors.
    rank_counts = np.bincount(np.asarray(noisy_word_ids_ranks).astype(np.intp),
                              minlength=vocab_size)
    neighbor_counted_occurences[epsilon] = rank_counts[:vocab_size].tolist()

Results are stored in *neighbor_counted_occurences*, which is a dictionary where the keys are integers representing the value of epsilon. The dictionary associates each epsilon with a list, where list[i] contains the number of times the i-th neighbor was chosen as the replacement of a word. 

# Average several words (with post-processing fix)

Post-processing step after we have found the nearest word $\mathbf{x}^*$ to the noisy embedding $\mathbf{w}^*$. We sort the nearest neighbors of $\mathbf{x}^*$ and output a neighbor with probability proportional to $\exp(- c \epsilon d_\text{NN}(\mathbf{x}^*, \mathbf{x}))$. More specifically, any word $\mathbf{x} \in \mathcal{D}$ is output with probability:
$$\frac{\exp(- c \epsilon d_\text{NN}(\mathbf{x}^*, \mathbf{x}))}{\sum_{\mathbf{x}' \in \mathcal{D}} \exp(- c \epsilon d_\text{NN}(\mathbf{x}^*, \mathbf{x}'))},
$$
where $c$ is a constant that controls how many neighbors are likely to be selected. A higher value such as $c > 1$ means that the mechanism will output one of the first few neighbors with high probability, while a lower value such as $c = 0.01$ means that more neighbors are likely to be output, with a probability that still decreases exponentially as we move away from the original word. The product $c \epsilon$ plays the same role as the inverse temperature in the softmax function.

In [None]:
# Define the new nearest neighbor search function.
# It requires an already computed matrix of ranked neighbors.
# Used for faster tests of different dx_constant for the same embeddings without 
# having to recompute the ranks of the neighbors.
def noisy_embeddings_to_ids_dxfix_lazy(
        words_embeddings: np.ndarray,
        vocabulary: np.ndarray,
        neighbors_ranked: np.ndarray,
        dx_constant: float,
        epsilon: float
    ) -> np.ndarray:
    """Sample one vocabulary index per row of ``neighbors_ranked``.

    Row ``i`` of ``neighbors_ranked`` is turned into a probability distribution
    ``p[i, j] ∝ exp(-dx_constant * epsilon * neighbors_ranked[i, j])`` and one
    index is drawn from it with ``np.random.choice``. The draws of different
    rows are independent.

    Args:
        words_embeddings: Only its first dimension is read; it gives the
            number of indices to sample.
        vocabulary: Only its first dimension is read; it gives the number of
            candidate indices.
        neighbors_ranked: Precomputed matrix with one row per word and one
            column per vocabulary entry (each row must have
            ``vocabulary.shape[0]`` entries, as required by
            ``np.random.choice``).
        dx_constant: The constant ``c`` of the post-processing formula.
        epsilon: The privacy parameter used to generate the noise.

    Returns:
        A 1-D ``int64`` array holding the sampled index of every row.
    """
    number_of_words = words_embeddings.shape[0]
    vocab_size = vocabulary.shape[0]

    # Work in float64 from the start: negating or scaling an unsigned-integer
    # rank matrix with an integer constant could overflow or raise otherwise.
    scale = -float(dx_constant) * float(epsilon)
    logits = np.multiply(neighbors_ranked, scale, dtype=np.float64)

    # Softmax stabilisation: shifting every row by its maximum leaves the
    # normalised probabilities unchanged but guarantees that at least one
    # entry equals exp(0) = 1, so a row can never underflow to all zeros.
    logits -= logits.max(axis=-1, keepdims=True)

    # The buffer is reused in place to avoid extra full-size temporaries.
    probabilities = np.exp(logits, out=logits)
    probabilities /= probabilities.sum(axis=-1, keepdims=True)

    return np.fromiter(
        (np.random.choice(vocab_size, p=probabilities[i]) for i in range(number_of_words)),
        dtype=np.int64,
        count=number_of_words,
    )

Load GloVe in a specific dimension

In [None]:
hidden_size = 300
# Pickle file holding the GloVe table for the chosen dimension.
glove_filepath = join(glove_data_folderpath, glove_dimension_to_filename[hidden_size])
# NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
with open(glove_filepath, "rb") as glove_file:
    glove = pickle.load(glove_file)

# Stack the stored vectors into one matrix, one row per entry of the table
# (presumably a word -> vector mapping; rows follow its iteration order).
vocab_embs = np.array(list(glove.values()))
vocab_size = len(vocab_embs)
del glove # Save RAM

Select *number_of_words* random words and rank their neighbors according to their distance with the word in the embedding space.

In [None]:
number_of_words = 5000
# Draw vocabulary indices uniformly in [0, vocab_size). The draws are
# independent, so the same word can be selected more than once.
words_ids = np.random.randint(vocab_size, size=number_of_words)
words_embs = np.take(vocab_embs, words_ids, axis=0)

# Neighbor ranking of the whole vocabulary relative to each selected word.
words_neighbors_ranked = rank_neighbors(words_embs, vocab_embs, distance_metric)

Add noise to the embeddings of the words following the $d_\mathcal{X}$-privacy mechanism. Apply the post-processing described in the paper and count which neighbor was chosen, represented by its rank in the neighbor list of the initial word.

In [None]:
epsilons = list(range(1, 71)) # Parameter
# 0.001, 0.002, ..., 0.099. Built from integer steps instead of
# np.arange(0.001, 0.1, 0.001): with a non-integer step np.arange can include
# or drop the end point and accumulates rounding error, which makes the values
# unreliable as dictionary keys.
dx_constants = [round(step * 0.001, 3) for step in range(1, 100)] # Parameter
neighbor_counted_occurences = {}
# Row selector used to pick one entry per sampled word; it does not depend on
# epsilon or dx_constant, so it is built once.
row_indices = np.arange(number_of_words)

for epsilon in epsilons:
    noise = sample_noise_vectors(dimension=hidden_size,
                                 shape1=1,
                                 shape2=number_of_words,
                                 epsilon=epsilon)[0]
    # Adding noise to embeddings. The sum allocates a new array, so words_embs
    # is left untouched and no explicit copy is needed.
    noisy_embeddings = words_embs + noise

    # We first find the nearest neighbors of each of the noisy embeddings, called the "pivots" here
    pivot_noisy_word_ids = noisy_embeddings_to_ids(noisy_embeddings, vocab_embs, distance_metric)
    pivot_noisy_word_embeddings = vocab_embs[pivot_noisy_word_ids]

    # Then, we apply the post-processing fix proposed in the paper, by sampling a neighbor
    # of each pivot according to the formula above. Finally, we count the number of times
    # the k-th neighbor has been chosen and store it in neighbor_counted_occurences.
    neighbor_counted_occurences[epsilon] = {}

    # Rank the words in the vocabulary according to their distance with each of the pivots.
    # The ranking only depends on epsilon, so it is shared by all the dx_constant values.
    pivot_noisy_word_embeddings_neighbors_ranked = rank_neighbors(pivot_noisy_word_embeddings, vocab_embs, distance_metric)
    for dx_constant in dx_constants:
        noisy_words_ids = noisy_embeddings_to_ids_dxfix_lazy(pivot_noisy_word_embeddings, vocab_embs, pivot_noisy_word_embeddings_neighbors_ranked, dx_constant, epsilon)

        # For every sampled word i, read words_neighbors_ranked[i, noisy_words_ids[i]]:
        # the rank of the finally selected word in the neighbor list of the ORIGINAL word i.
        noisy_word_ids_ranks = words_neighbors_ranked[row_indices, noisy_words_ids]

        # Histogram of the ranks: entry k is the number of times rank k was selected.
        # np.bincount replaces a Counter followed by vocab_size Python-level lookups,
        # which was repeated for every (epsilon, dx_constant) pair.
        # Assumes the ranks are non-negative integers below vocab_size - TODO confirm
        # against rank_neighbors.
        rank_counts = np.bincount(np.asarray(noisy_word_ids_ranks).astype(np.intp),
                                  minlength=vocab_size)
        neighbor_counted_occurences[epsilon][dx_constant] = rank_counts[:vocab_size].tolist()

Results are stored in *neighbor_counted_occurences*, which is a dictionary where the keys are integers representing the value of epsilon. The dictionary associates each epsilon with another dictionary, where the keys are floats representing the value of the constant $c$ in the post-processing fix. This sub-dictionary associates each $c$ with a list, where list[i] contains the number of times the i-th neighbor was chosen as the replacement of a word.