# Distance to neighbors

In [None]:
import pickle
from os.path import join
import numpy as np
from cupyx.scipy.spatial import distance
import cupy as cp
from pathlib import Path
import sys
# Add the main directory to sys.path to be able to import config
sys.path.append(str(Path.cwd().parent))
from config import ROOT_DIR
from utils.tools import compute_distances, argsort_chunked

# PARAMS
# Folder in which the pickled word2vec vectors are looked up
word2vec_data_folderpath = ROOT_DIR

# Metric name handed to the distance computations below
distance_metric = "euclidean"

# Storage types: half precision for the distances, and an unsigned 32-bit integer
# (max 4,294,967,295) wide enough to hold any index into the vocabulary
distances_dtype = np.float16
fit_dtype = np.uint32

## Distance to two arbitrary neighbors

In [None]:
# Number of words over which the statistics are averaged
sample_size = 5000
# Ranks of the two neighbors to compare: x first, then y
x_rank, y_rank = 1, 2

Load word2vec

In [None]:
# Read the pickled word2vec model from disk.
# NOTE: pickle.load can execute arbitrary code, so only point this at a trusted file.
with open(
    join(word2vec_data_folderpath, "GoogleNews-vectors-negative300.pkl"),
    "rb",
) as f:
    word2vec = pickle.load(f)

# Stack every embedding into a single GPU array; one row per vocabulary entry
vocab_embs = cp.array(list(word2vec.values()))
vocab_size = len(vocab_embs)

# The model itself is no longer needed once its vectors are on the GPU: free host RAM
del word2vec

Draw *sample_size* random words, compute their distances to the entire vocabulary, and rank their neighbors by distance.

In [None]:
# Draw sample_size random vocabulary indices in [0, vocab_size).
# NOTE: randint samples independently, so the same word can be drawn more than once.
words_ids = np.random.randint(0, vocab_size, size=sample_size)
words_embeddings = vocab_embs[words_ids]

# Distances between the sampled words and the whole vocabulary, stored as distances_dtype.
# Presumably one row per sampled word and one column per vocabulary entry — confirm in
# utils.tools.compute_distances.
distances = compute_distances(words_embeddings, vocab_embs, distance_metric, dtype=distances_dtype)

# For each word, get a sorted list of their neighbors.
# Presumably vocabulary indices ordered by increasing distance and stored as fit_dtype —
# confirm in utils.tools.argsort_chunked.
word_neighbors = argsort_chunked(distances, fit_dtype)

# For each word, get a sorted list of the distances with the entire vocabulary.
# Instead of sorting again, benefit from word_neighbors.
# Doing distances[word_neighbors] here would not work as word_neighbors is a 2D array and would
# result in numpy advanced indexing
# take_along_axis instead picks, row by row, the entries of distances at the positions listed
# in the matching row of word_neighbors.
sorted_distances = np.take_along_axis(distances, word_neighbors, axis=-1)

For each word, compute the distance to its *x*-th and *y*-th neighbors, and the distance between these two neighbors.

In [None]:
# Distance from every sampled word to its neighbor of rank x and of rank y
distances_to_x = sorted_distances[:, x_rank]
distances_to_y = sorted_distances[:, y_rank]

# Vocabulary ids of both neighbors, kept as columns and paired up row by row
x_neighbors = word_neighbors[:, x_rank:x_rank+1]
y_neighbors = word_neighbors[:, y_rank:y_rank+1]
x_and_y_neighbors = np.concatenate((x_neighbors, y_neighbors), axis=1)

# Distance between neighbor x and neighbor y, one pair at a time
distances_between_x_and_y = np.empty((sample_size), dtype=distances_dtype)
for i in range(sample_size):
    x_id, y_id = x_and_y_neighbors[i]
    # cdist is used on two single-row slices because cupyx.scipy.spatial.distance.euclidean
    # has a bug: https://github.com/cupy/cupy/issues/8288
    distances_between_x_and_y[i] = distance.cdist(
        vocab_embs[x_id:x_id+1],
        vocab_embs[y_id:y_id+1],
        distance_metric,
    ).item()

In [None]:
# Report the averages over the sample.
# The first two lines print half of the mean distance to neighbor x and to neighbor y.
print(f"Average half distance to neighbor {x_rank} = {distances_to_x.mean()/2}")
print(f"Average half distance to neighbor {y_rank} = {distances_to_y.mean()/2}")
# The last line averages (d_y**2 - d_x**2) / (2 * d_xy) over the sample, where d_x and d_y are
# the distances to neighbors x and y and d_xy is the distance between those two neighbors.
# "eq19" is the author's label; the equation it refers to is not shown in this notebook.
# NOTE(review): a pair with d_xy == 0 (e.g. two identical embeddings) would divide by zero and
# turn the mean into inf/nan — confirm this cannot happen with the loaded vocabulary.
print(f"Average eq19 for neighbor {x_rank} and {y_rank} = {(((distances_to_y**2)-(distances_to_x**2))/(2*distances_between_x_and_y)).mean()}")