In [160]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import cdist
from scipy.spatial.distance import cosine

In [4]:
def load_glove_embeddings(file_path):
    """Read a whitespace-separated embedding text file into a dictionary.

    Each non-blank line is expected to hold a token followed by its vector
    components, all separated by whitespace.

    Args:
        file_path: Path of the UTF-8 encoded embedding file.

    Returns:
        dict mapping each token (str) to its vector as a list of floats.
        If a token occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank / whitespace-only line (e.g. a trailing empty line):
                # there is no token to index, so skip it instead of crashing.
                continue
            word = values[0]
            embeddings[word] = [float(component) for component in values[1:]]
    return embeddings

# Location of the pre-trained GloVe vectors (relative to the notebook)
glove_file_path = 'glove.6B.300d.txt'

# Parse the whole file into a {word: vector} dictionary
embeddings = load_glove_embeddings(glove_file_path)

# Look up the raw vector for "norway"; .get() yields None when the word is absent
norway_embedding = embeddings.get('norway')

# Report the lookup result (a missing or empty vector takes the "not found" branch)
if not norway_embedding:
    print("'norway' not found in embeddings.")
else:
    print("Embedding for 'norway':")
    print(norway_embedding)

Embedding for 'norway':
[-0.10675, 0.1479, -0.50551, 0.47089, 0.3117, 0.15165, 0.19078, 0.27304, -0.024549, -1.2821, -0.21861, -0.39795, 0.22596, 0.30201, -0.33422, 0.25108, 0.32763, 0.39699, -0.71871, 0.0096784, 0.11126, 0.099761, -0.27434, 0.40397, -0.3468, -0.11817, -0.3358, -0.47028, -1.0238, -0.022385, -0.19229, -0.51256, 0.13575, 0.42843, -0.29873, -0.39356, 0.57573, -0.11213, -0.26791, 0.28042, -0.88649, -0.085926, 0.17979, -0.23622, -0.40873, -0.38531, 0.3385, -0.20835, 0.18581, 0.015005, -0.64442, 0.2628, 0.35952, -0.41501, -0.322, 0.68475, -0.22952, 0.10886, -0.2761, 0.35901, -0.81549, 0.94224, -0.31344, -0.43145, 0.25273, 0.055572, -0.099283, 0.55193, 0.44542, -0.20776, -0.90083, -0.19906, 0.26436, 0.064958, -0.47981, 0.11094, -0.085895, -0.052858, 0.31741, 0.10706, 0.13175, -0.61744, -0.33871, 0.176, 0.32555, -0.30336, -0.10891, -0.37193, -0.21215, -0.96199, -0.088204, -0.33672, 0.25825, -0.13834, 0.012352, -0.68369, 0.10282, 0.094525, -0.57033, 0.28754, 0.10308, -0.28742, 

In [5]:
def normalize_embeddings(embeddings):
    """Scale every embedding to unit Euclidean (L2) length.

    Args:
        embeddings: dict mapping a word to its vector (list or array of numbers).

    Returns:
        A new dict mapping each word to its vector divided by the vector's
        L2 norm. Words whose norm is not strictly positive (for example
        all-zero vectors) are left out of the result.
    """
    unit_vectors = {}
    for token, raw_vector in embeddings.items():
        length = np.linalg.norm(raw_vector)
        if not length > 0:
            # Cannot be rescaled to unit length; drop the word
            continue
        unit_vectors[token] = raw_vector / length
    return unit_vectors

# Build the unit-length version of every loaded embedding
normalized_embeddings = normalize_embeddings(embeddings)

# Get the normalized embedding for "norway" (None if the word is not present)
norway_embedding = normalized_embeddings.get('norway')
# print(norway_embedding)

In [6]:
def get_top_similarities(target_word, word_list, embeddings, top_n=20):
    """Rank candidate words by cosine similarity to a target word.

    Args:
        target_word: Word whose neighbours are wanted.
        word_list: Iterable of candidate words to score.
        embeddings: dict mapping a word to its vector.
        top_n: Maximum number of results to return.

    Returns:
        None when ``target_word`` has no embedding; otherwise a list of
        ``(word, similarity)`` tuples sorted from most to least similar and
        truncated to ``top_n`` entries. Candidates without an embedding are
        skipped. The target word itself is scored like any other candidate
        if it appears in ``word_list``.
    """
    anchor = embeddings.get(target_word)
    if anchor is None:
        return None

    # scipy's `cosine` is a distance, so similarity = 1 - distance
    scores = {
        candidate: 1 - cosine(anchor, embeddings.get(candidate))
        for candidate in word_list
        if embeddings.get(candidate) is not None
    }

    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Test the function on European country names.
# NOTE(review): multi-word entries ('north macedonia', 'san marino',
# 'united kingdom') are only scored if that exact string is a key in the
# embedding dict; otherwise get_top_similarities silently skips them.
european_countries = [
    'albania', 'andorra', 'austria',
    'belarus', 'belgium', 'bosnia', 'bulgaria',
    'croatia', 'cyprus', 'czech',
    'denmark', 'estonia',
    'finland', 'france',
    'germany', 'greece',
    'hungary',
    'iceland', 'ireland', 'italy',
    'latvia', 'liechtenstein', 'lithuania', 'luxembourg',
    'malta', 'moldova', 'monaco', 'montenegro',
    'netherlands', 'north macedonia', 'norway',
    'poland', 'portugal',
    'romania', 'russia',
    'san marino', 'serbia', 'slovakia', 'slovenia', 'spain', 'sweden', 'switzerland',
    'turkey',
    'ukraine', 'united kingdom',
    'vatican',
]

# Most similar countries to "norway" (default top_n=20)
top_20_similarities = get_top_similarities(
    target_word='norway',
    word_list=european_countries,
    embeddings=normalized_embeddings,
)
print(top_20_similarities)

[('norway', 1), ('denmark', 0.6980988523058034), ('sweden', 0.6567031150559826), ('iceland', 0.6321000420842228), ('finland', 0.6197740583127542), ('netherlands', 0.49975032779171835), ('estonia', 0.49594574438870165), ('switzerland', 0.49237581412474896), ('austria', 0.4910478696010585), ('lithuania', 0.4776603139255837), ('germany', 0.47481855105552884), ('latvia', 0.4747955043159663), ('poland', 0.44408460155662943), ('hungary', 0.4434215764866545), ('bulgaria', 0.4430403688270663), ('portugal', 0.4363678236887445), ('slovakia', 0.4297746516021328), ('belgium', 0.41912721993294366), ('liechtenstein', 0.40841938161504554), ('luxembourg', 0.4071672836820489)]


In [7]:
# Read world_cities.csv and lowercase the city names in a single chained step
world_cities = (
    pd.read_csv('world_cities.csv')
    .assign(city=lambda frame: frame['city'].str.lower())
)
world_cities.head()

Unnamed: 0,city
0,tokyo
1,jakarta
2,delhi
3,guangzhou
4,mumbai


In [8]:
# Plain Python list of the (lowercased) city names
city_list = world_cities['city'].tolist()

# Cities most similar to "dhaka" (default top_n=20)
top_20_similarities = get_top_similarities(
    target_word='dhaka',
    word_list=city_list,
    embeddings=normalized_embeddings,
)
print(top_20_similarities)

[('dhaka', 1), ('lahore', 0.6045195884667484), ('delhi', 0.5801741189727798), ('kathmandu', 0.5767808345438777), ('karachi', 0.5673440561332525), ('kolkata', 0.5412403045629106), ('sylhet', 0.5246002691956407), ('hyderabad', 0.5140027152100624), ('rajshahi', 0.5130482827425025), ('harare', 0.5125273361371179), ('khulna', 0.5081745278471346), ('multan', 0.5008812303132791), ('bangalore', 0.4937680822881667), ('peshawar', 0.4899602890251177), ('bangkok', 0.489741201151789), ('mymensingh', 0.47754347426052635), ('johannesburg', 0.4765021598599348), ('ahmedabad', 0.47437400329212487), ('lucknow', 0.4700376385918963), ('rawalpindi', 0.46969229529531953)]


The `cluster_embeddings` function performs K-means clustering on a given set of word embeddings to group them into a specified number of clusters. 
Entries whose embedding is `None` are filtered out first, and the K-means algorithm is then run on the remaining vectors. 
Each word is then assigned to a cluster based on the K-means output. 
The function returns a dictionary where each key is a cluster label and each value is a list of words belonging to that cluster. 
In the example usage, the function is applied to the subset of the embeddings that correspond to city names, and the resulting clusters are printed.

In [258]:
def cluster_embeddings(embeddings, num_clusters):
    """Group words into clusters by running K-means on their embeddings.

    Args:
        embeddings: dict mapping a word to its vector; entries whose value
            is None are ignored.
        num_clusters: Number of clusters passed to KMeans as ``n_clusters``.

    Returns:
        dict mapping each K-means label to the list of words assigned to it.
        Keys appear in the order in which each label is first encountered.
    """
    # Keep words and vectors aligned by position
    vocabulary = [token for token, vector in embeddings.items() if vector is not None]
    matrix = [embeddings[token] for token in vocabulary]

    # Fixed random_state so repeated runs give the same partition
    model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    model.fit(matrix)

    # labels_[i] is the cluster of vocabulary[i]
    grouped = {}
    for token, cluster_id in zip(vocabulary, model.labels_):
        grouped.setdefault(cluster_id, []).append(token)

    return grouped

# Keep only the cities that have a normalized embedding
city_embeddings = {}
for city in city_list:
    if city in normalized_embeddings:
        city_embeddings[city] = normalized_embeddings[city]

# Example usage
num_clusters = 100  # Change this to the desired number of clusters
clusters = cluster_embeddings(city_embeddings, num_clusters)

# Print one line per cluster: its label followed by its member cities
for label in clusters:
    words = clusters[label]
    print(f"Cluster {label}: {', '.join(words)}")

Cluster 69: tokyo, osaka, nagoya, yokohama, fukuoka, sapporo, kawasaki, kobe, kyoto, saitama, hiroshima, sendai, chiba, sakai, niigata, kumamoto, shizuoka, adachi, kawaguchi, ichikawa, oita, kanazawa, fukuyama, kashiwa, aomori, toyama, nagasaki, gifu, miyazaki, saki, okazaki, nagano, nara, nakano, akita, morioka, fukushima, ibaraki, mito, ichihara, fukui, hiratsuka, soka, yamagata, fuji, matsumoto, ota, arakawa, yamaguchi, hino, sakura, hitachi, iwata, matsuzaka, noda, ueda, dawei, fujita, ashikaga, koga, narita, saida, iizuka, sano, tokai, komatsu, ikeda, shibuya, sakata, iida, shibata, wako, nikko, miki, yoshikawa, tanabe, kashima, saiki, asahi, ito, hashimoto, oda, shirakawa, murakami, mizuho, sakurai, izumi, takeo, yuki, nakagawa, hamada, midori, hikari, ono, takashima, kikuchi, masuda, kobayashi, kasai, miura, kato, ozu, masaki, tamura, goto, shinjo, chuo, takahashi, yoshida, maki, saito, manga, ogawa, nagai, tosa, onda, ishii, aso, oji, sanga, kitajima, furukawa, murayama, matsuu

This code calculates the maximum pairwise distance between the centroids of a given set of clusters. Each cluster is represented by a list of words, and their corresponding embeddings are used to compute the centroid. The process is as follows:

1. **Calculate Centroids:** For each cluster, the embeddings of the words in the cluster are retrieved and used to calculate the centroid by taking the mean of the embeddings along each dimension.

2. **Compute Pairwise Distances:** The Euclidean distances between all pairs of centroids are computed to form a matrix of distances.

3. **Find Maximum Distance:** The maximum distance in the matrix is identified, representing the largest pairwise distance between any two cluster centroids.

4. **Print Result:** The maximum pairwise distance is printed as the sensitivity measure (`sensitivity_inter`).

In [259]:
# Centroid (component-wise mean embedding) of every cluster, keyed by cluster label
centroids = {
    label: np.mean([city_embeddings[word] for word in words], axis=0)
    for label, words in clusters.items()
}

# Fix an ordering of the labels and compute all centroid-to-centroid Euclidean distances
centroid_labels = list(centroids)
centroid_vectors = [centroids[label] for label in centroid_labels]
distances = euclidean_distances(centroid_vectors, centroid_vectors)

# The largest entry of the distance matrix is taken as the sensitivity
sensitivity_inter = distances.max()

print(f"The maximum pairwise distance between cluster centroids is: {sensitivity_inter}")

The maximum pairwise distance between cluster centroids is: 1.0172011256229572


In this modified code, instead of computing the distances between centroids, 
we use the cdist function from `scipy.spatial.distance` to compute all pairwise distances between the members of two different clusters. 
We then take the maximum of these distances as the distance between the two clusters. 
The maximum of all these cluster-to-cluster distances is then used as sensitivity_inter.

In [260]:
# Calculate centroids of each cluster.
# Only the ordering of the keys is needed below; the centroid values
# themselves are not used by the member-to-member distance computation.
centroids = {}
for label, words in clusters.items():
    cluster_vectors = [city_embeddings[word] for word in words]
    centroids[label] = np.mean(cluster_vectors, axis=0)

# Build every cluster's member matrix once, instead of rebuilding both lists
# for each (i, j) pair inside the double loop.
cluster_order = list(centroids.keys())
member_vectors = [
    np.asarray([city_embeddings[word] for word in clusters[label]])
    for label in cluster_order
]

# max_distances[i, j] is the largest Euclidean distance between any member of
# the i-th cluster and any member of the j-th cluster, where i and j are
# positions in the iteration order of `clusters` (see `cluster_order`), NOT the
# cluster labels themselves. The diagonal is left at 0.
n_clusters_found = len(cluster_order)
max_distances = np.zeros((n_clusters_found, n_clusters_found))

for i in range(n_clusters_found):
    # Euclidean distance is symmetric, so each unordered pair is computed once
    # and mirrored across the diagonal.
    for j in range(i + 1, n_clusters_found):
        distances = cdist(member_vectors[i], member_vectors[j], metric='euclidean')
        max_distances[i, j] = max_distances[j, i] = np.max(distances)

# Find the maximum distance
sensitivity_inter = np.max(max_distances)

print(f"The maximum pairwise distance between any two clusters is: {sensitivity_inter}")


The maximum pairwise distance between any two clusters is: 1.7514913100634877


This code defines a function `compute_largest_intracluster_distance` that calculates the largest Euclidean distance between any two points within each cluster of a given set of clusters. The function iterates over each cluster, retrieves the embeddings for the words in the cluster, computes the pairwise distances between all points in the cluster using the `pdist` function with the Euclidean metric, and then finds the maximum distance. This maximum distance represents the largest intracluster distance for that cluster. The results are stored in a dictionary `sensitivity_intra` where the keys are the cluster labels and the values are the largest intracluster distances. The function returns this dictionary, and the results are printed for each cluster.

In [261]:
from scipy.spatial.distance import pdist, squareform

def compute_largest_intracluster_distance(clusters, embeddings):
    """Compute the largest within-cluster Euclidean distance for every cluster.

    Args:
        clusters: dict mapping a cluster label to the list of its words.
        embeddings: dict mapping a word to its vector; every word listed in
            ``clusters`` must be present.

    Returns:
        dict mapping each cluster label to the largest Euclidean distance
        between any two of its members, or 0 for clusters with fewer than
        two members.
    """
    def _largest_pairwise_distance(members):
        # Largest pairwise Euclidean distance among the members' vectors
        points = [embeddings[member] for member in members]
        if len(points) <= 1:
            # No pair exists, so the largest distance is defined as 0
            return 0
        return max(pdist(points, metric='euclidean'))

    return {
        cluster_id: _largest_pairwise_distance(members)
        for cluster_id, members in clusters.items()
    }

# Largest within-cluster Euclidean distance, one entry per cluster label
sensitivity_intra = compute_largest_intracluster_distance(clusters, normalized_embeddings)

# Print one line per cluster
for label in sensitivity_intra:
    max_distance = sensitivity_intra[label]
    print(f"Cluster {label}: Largest intracluster distance = {max_distance}")

Cluster 69: Largest intracluster distance = 1.4601717415369153
Cluster 7: Largest intracluster distance = 1.423194090980196
Cluster 90: Largest intracluster distance = 1.3988365044139837
Cluster 68: Largest intracluster distance = 1.3043284470413257
Cluster 97: Largest intracluster distance = 1.525722252915953
Cluster 25: Largest intracluster distance = 1.4461332423759907
Cluster 28: Largest intracluster distance = 1.440031300192401
Cluster 88: Largest intracluster distance = 1.4779192448068106
Cluster 79: Largest intracluster distance = 1.4177800818537605
Cluster 52: Largest intracluster distance = 1.444309965051001
Cluster 72: Largest intracluster distance = 1.4815757396934381
Cluster 49: Largest intracluster distance = 1.4299047080936989
Cluster 76: Largest intracluster distance = 1.463746771812121
Cluster 18: Largest intracluster distance = 1.4618697834943395
Cluster 41: Largest intracluster distance = 1.4018486738104858
Cluster 84: Largest intracluster distance = 1.409994850706792

In [262]:
# Given a word, find the cluster to which it belongs

def find_word_cluster(word, clusters):
    """Return the label of the first cluster that contains ``word``.

    Args:
        word: Word to look up.
        clusters: dict mapping a cluster label to a collection of words.

    Returns:
        The label of the first cluster (in dict iteration order) whose
        members include ``word``, or None when no cluster contains it.
    """
    matching_labels = (
        cluster_id for cluster_id, members in clusters.items() if word in members
    )
    return next(matching_labels, None)

# Example usage: locate the cluster that contains the target word
target_word = 'paris'
target_cluster = find_word_cluster(target_word, clusters)
if target_cluster is None:
    print(f"The word '{target_word}' does not belong to any cluster.")
else:
    print(f"The word '{target_word}' belongs to cluster {target_cluster}.")

The word 'paris' belongs to cluster 25.


This function implements the exponential mechanism from differential privacy to probabilistically select a cluster from a given set of clusters. The selection is based on the negative distance between the clusters, with the goal of selecting a cluster that is similar to a specified target cluster. The function takes as input the clusters, a matrix of distances between clusters, the label of the target cluster, and a privacy parameter epsilon. It calculates the utility of each cluster as the negative distance from the target cluster, then computes the selection probabilities using the exponential mechanism, taking into account the sensitivity of the distance metric. Finally, it selects and returns a cluster based on these probabilities. The example usage demonstrates how to use this function to select a cluster similar to a given target cluster.

In [315]:
def exponential_mechanism(clusters, distances, selected_label, epsilon=1):
    """Randomly pick a cluster label, favouring clusters close to a target cluster.

    Each cluster's utility is the negative distance to the target cluster and
    its selection weight is ``exp(epsilon * utility / (2 * sensitivity))``,
    where the sensitivity is the largest entry of ``distances``.

    Args:
        clusters: dict mapping a cluster label to its members. Only the keys
            and their iteration order are used.
        distances: square matrix of cluster-to-cluster distances whose
            row/column ``i`` belongs to the ``i``-th key of ``clusters`` in
            iteration order (positions, not label values).
        selected_label: Label (a key of ``clusters``) of the target cluster.
        epsilon: Privacy parameter; larger values concentrate the draw on the
            closest clusters.

    Returns:
        One of the keys of ``clusters``, drawn at random with numpy's global
        random state. The target cluster itself is a possible outcome.

    Raises:
        ValueError: if ``selected_label`` is not a key of ``clusters`` or if
            ``distances`` is not a square matrix with one row per cluster.
    """
    labels = list(clusters.keys())
    if selected_label not in clusters:
        raise ValueError(f"Unknown cluster label: {selected_label!r}")

    distances = np.asarray(distances, dtype=float)
    if distances.shape != (len(labels), len(labels)):
        raise ValueError(
            f"distances must have shape {(len(labels), len(labels))}, got {distances.shape}"
        )

    # Translate the label into its row position: the matrix is laid out in the
    # iteration order of `clusters`, which generally differs from label values.
    target_row = labels.index(selected_label)

    # Utility of each cluster = negative distance to the target cluster
    utilities = -distances[target_row]

    # Sensitivity is taken to be the largest distance in the matrix
    sensitivity = np.max(distances)

    if sensitivity > 0:
        scores = utilities * epsilon / (2 * sensitivity)
    else:
        # All distances are zero: every cluster is equally good, draw uniformly
        scores = np.zeros(len(labels))

    # Shifting by the maximum leaves the normalized probabilities unchanged but
    # guarantees that at least one weight equals exp(0) = 1 (no 0/0 on underflow).
    scores = scores - np.max(scores)
    probabilities = np.exp(scores)
    probabilities /= np.sum(probabilities)

    # Select a cluster probabilistically
    selected_cluster = np.random.choice(labels, p=probabilities)

    return selected_cluster

# Example usage
# The label of the cluster for which we want to select a similar cluster
selected_label = target_cluster
selected_cluster = exponential_mechanism(
    clusters,
    max_distances,
    selected_label,
    epsilon=20,
)
print(f"Selected cluster: {selected_cluster}")


Selected cluster: 25


This code defines a function `exponential_mechanism_for_word` that uses the exponential mechanism from differential privacy to probabilistically select a word from a given cluster. The selection is based on the negative cosine distance between the target word's embedding and the embeddings of the words in the cluster. The function calculates the utilities for each word in the cluster, computes the probabilities using the exponential mechanism, and then randomly selects a word based on these probabilities. The sensitivity argument (`sensitivity_intra[selected_cluster]` in the example usage) scales the utilities and therefore controls how strongly the selection is influenced by the distances; note that it is the largest *Euclidean* distance within the selected cluster, whereas the utilities are *cosine* distances. The function returns the index of the selected word, which is then used to retrieve the selected word from the cluster.

In [322]:
def exponential_mechanism_for_word(selected_cluster_embeddings, target_word_embedding, sensitivity_intra, epsilon=1):
    """Randomly pick a word index, favouring words close to the target embedding.

    Each candidate's utility is the negative cosine distance to the target
    and its selection weight is
    ``exp(epsilon * utility / (2 * sensitivity_intra))``.

    Args:
        selected_cluster_embeddings: Non-empty sequence of candidate vectors.
        target_word_embedding: Vector of the word being replaced.
        sensitivity_intra: Scalar used as the sensitivity in the exponent.
            When it is not strictly positive (e.g. 0 for a single-word
            cluster) the draw is uniform over the candidates.
        epsilon: Privacy parameter; larger values concentrate the draw on the
            candidates closest to the target.

    Returns:
        Index into ``selected_cluster_embeddings`` of the chosen candidate,
        drawn with numpy's global random state.

    Raises:
        ValueError: if ``selected_cluster_embeddings`` is empty.
    """
    if len(selected_cluster_embeddings) == 0:
        raise ValueError("selected_cluster_embeddings must contain at least one embedding")

    # Utility of each candidate = negative cosine distance to the target
    # NOTE(review): the callers in this notebook pass a Euclidean-based
    # sensitivity while the utility is a cosine distance; confirm that this
    # pairing is intended.
    utilities = np.array([
        -cosine(target_word_embedding, word_embedding)
        for word_embedding in selected_cluster_embeddings
    ])

    if sensitivity_intra > 0:
        scores = epsilon * utilities / (2 * sensitivity_intra)
    else:
        # Zero sensitivity would divide by zero and produce NaN probabilities;
        # treat every candidate as equally likely instead.
        scores = np.zeros(len(utilities))

    # Shifting by the maximum leaves the normalized probabilities unchanged but
    # guarantees that at least one weight equals exp(0) = 1 (no 0/0 on underflow).
    scores = scores - np.max(scores)
    probabilities = np.exp(scores)
    probabilities /= np.sum(probabilities)

    # Randomly select a word based on the probabilities
    selected_index = np.random.choice(len(utilities), p=probabilities)
    return selected_index

# Example usage
# Embedding of the word to be replaced
target_word_embedding = normalized_embeddings[target_word]

# Embeddings of every word in the cluster chosen in the previous step
selected_cluster_embeddings = [
    normalized_embeddings[word] for word in clusters[selected_cluster]
]

# Draw an index into the selected cluster, then map it back to the word
selected_index = exponential_mechanism_for_word(
    selected_cluster_embeddings,
    target_word_embedding,
    sensitivity_intra[selected_cluster],
    epsilon=1,
)
selected_word = clusters[selected_cluster][selected_index]

print(f"Selected word: {selected_word}")


Selected word: macedonia
