In [3]:
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import cdist, cosine, pdist
from sklearn.cluster import KMeans

In [4]:
def load_glove_embeddings(file_path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first token is used as
    the word and the remaining tokens are parsed as floats. Blank or
    whitespace-only lines (for example a stray trailing newline) are skipped
    instead of raising ``IndexError``.

    Args:
        file_path: Path handed to ``open``; the file is read as UTF-8 text.

    Returns:
        dict mapping each word (str) to its vector as a list of floats. If a
        word occurs more than once, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing on this line; indexing values[0] would fail.
                continue
            word, *components = values
            # Kept as a plain list so callers can still truth-test the vector.
            embeddings[word] = [float(component) for component in components]
    return embeddings

# Location of the GloVe text file to parse
glove_file_path = 'glove.6B.300d.txt'

# Parse the file into a {word: vector} dictionary
embeddings = load_glove_embeddings(glove_file_path)

# Sanity check: look up a single word (None when it is not in the file)
norway_embedding = embeddings.get('norway')

if not norway_embedding:
    print("'norway' not found in embeddings.")
else:
    print("Embedding for 'norway':")
    print(norway_embedding)

Embedding for 'norway':
[-0.10675, 0.1479, -0.50551, 0.47089, 0.3117, 0.15165, 0.19078, 0.27304, -0.024549, -1.2821, -0.21861, -0.39795, 0.22596, 0.30201, -0.33422, 0.25108, 0.32763, 0.39699, -0.71871, 0.0096784, 0.11126, 0.099761, -0.27434, 0.40397, -0.3468, -0.11817, -0.3358, -0.47028, -1.0238, -0.022385, -0.19229, -0.51256, 0.13575, 0.42843, -0.29873, -0.39356, 0.57573, -0.11213, -0.26791, 0.28042, -0.88649, -0.085926, 0.17979, -0.23622, -0.40873, -0.38531, 0.3385, -0.20835, 0.18581, 0.015005, -0.64442, 0.2628, 0.35952, -0.41501, -0.322, 0.68475, -0.22952, 0.10886, -0.2761, 0.35901, -0.81549, 0.94224, -0.31344, -0.43145, 0.25273, 0.055572, -0.099283, 0.55193, 0.44542, -0.20776, -0.90083, -0.19906, 0.26436, 0.064958, -0.47981, 0.11094, -0.085895, -0.052858, 0.31741, 0.10706, 0.13175, -0.61744, -0.33871, 0.176, 0.32555, -0.30336, -0.10891, -0.37193, -0.21215, -0.96199, -0.088204, -0.33672, 0.25825, -0.13834, 0.012352, -0.68369, 0.10282, 0.094525, -0.57033, 0.28754, 0.10308, -0.28742, 

In [5]:
def normalize_embeddings(embeddings):
    """Scale every vector to unit Euclidean length.

    Args:
        embeddings: dict mapping word -> vector (list or array of numbers).

    Returns:
        A new dict mapping word -> vector divided by its L2 norm. Words whose
        norm is not strictly positive (e.g. all-zero vectors) are left out.
    """
    unit_vectors = {}
    for token in embeddings:
        raw_vector = embeddings[token]
        length = np.linalg.norm(raw_vector)
        if not length > 0:
            # A zero-length vector has no direction to normalize.
            continue
        unit_vectors[token] = raw_vector / length
    return unit_vectors

# Build the unit-length version of every vector once; later cells read
# from normalized_embeddings.
normalized_embeddings = normalize_embeddings(embeddings)

# Get the normalized embedding for "norway" (None if the word is absent).
# Note: this rebinds norway_embedding, which previously held the raw vector.
norway_embedding = normalized_embeddings.get('norway')
# Uncomment to inspect the normalized vector:
# print(norway_embedding)

In [6]:
def get_top_similarities(target_word, word_list, embeddings, top_n=20):
    """Rank candidate words by cosine similarity to a target word.

    Args:
        target_word: Word whose embedding is the reference point.
        word_list: Iterable of candidate words; candidates without an
            embedding (missing or None) are ignored.
        embeddings: dict mapping word -> vector.
        top_n: Maximum number of results to return.

    Returns:
        None when ``target_word`` has no embedding; otherwise a list of
        ``(word, similarity)`` tuples sorted from most to least similar and
        truncated to ``top_n`` entries.
    """
    anchor = embeddings.get(target_word)
    if anchor is None:
        return None

    # scipy's cosine() is a distance, so similarity is its complement.
    scores = {
        candidate: 1 - cosine(anchor, embeddings[candidate])
        for candidate in word_list
        if embeddings.get(candidate) is not None
    }

    ranked = list(scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Test the function on a hand-written list of European country names.
# NOTE: load_glove_embeddings splits each line on whitespace and uses the
# first token as the key, so multi-word entries such as 'north macedonia',
# 'san marino' and 'united kingdom' can never match a key and are silently
# skipped by get_top_similarities.
european_countries = [
    'albania', 'andorra', 'austria', 'belarus', 'belgium', 'bosnia', 'bulgaria', 'croatia',
    'cyprus', 'czech', 'denmark', 'estonia', 'finland', 'france', 'germany', 'greece',
    'hungary', 'iceland', 'ireland', 'italy', 'latvia', 'liechtenstein', 'lithuania',
    'luxembourg', 'malta', 'moldova', 'monaco', 'montenegro', 'netherlands', 'north macedonia',
    'norway', 'poland', 'portugal', 'romania', 'russia', 'san marino', 'serbia', 'slovakia',
    'slovenia', 'spain', 'sweden', 'switzerland', 'turkey', 'ukraine', 'united kingdom', 'vatican'
]

# The target itself is in the list, so it appears first in the ranking.
top_20_similarities = get_top_similarities('norway', european_countries, normalized_embeddings)
print(top_20_similarities)

[('norway', 1), ('denmark', 0.6980988523058034), ('sweden', 0.6567031150559826), ('iceland', 0.6321000420842228), ('finland', 0.6197740583127542), ('netherlands', 0.49975032779171835), ('estonia', 0.49594574438870165), ('switzerland', 0.49237581412474896), ('austria', 0.4910478696010585), ('lithuania', 0.4776603139255837), ('germany', 0.47481855105552884), ('latvia', 0.4747955043159663), ('poland', 0.44408460155662943), ('hungary', 0.4434215764866545), ('bulgaria', 0.4430403688270663), ('portugal', 0.4363678236887445), ('slovakia', 0.4297746516021328), ('belgium', 0.41912721993294366), ('liechtenstein', 0.40841938161504554), ('luxembourg', 0.4071672836820489)]


In [7]:
# Load world_cities.csv and lowercase the 'city' column in one chained
# expression, so the names can be compared against the embedding keys.
world_cities = pd.read_csv('world_cities.csv').assign(
    city=lambda frame: frame['city'].str.lower()
)
world_cities.head()

Unnamed: 0,city
0,tokyo
1,jakarta
2,delhi
3,guangzhou
4,mumbai


In [8]:
# Convert the (already lowercased) city column to a plain Python list;
# it is reused by the clustering cells below.
city_list = world_cities['city'].tolist()

# Rank all cities by cosine similarity to 'dhaka'.
# Note: this rebinds top_20_similarities from the country example above.
top_20_similarities = get_top_similarities('dhaka', city_list, normalized_embeddings)
print(top_20_similarities)

[('dhaka', 1), ('lahore', 0.6045195884667484), ('delhi', 0.5801741189727798), ('kathmandu', 0.5767808345438777), ('karachi', 0.5673440561332525), ('kolkata', 0.5412403045629106), ('sylhet', 0.5246002691956407), ('hyderabad', 0.5140027152100624), ('rajshahi', 0.5130482827425025), ('harare', 0.5125273361371179), ('khulna', 0.5081745278471346), ('multan', 0.5008812303132791), ('bangalore', 0.4937680822881667), ('peshawar', 0.4899602890251177), ('bangkok', 0.489741201151789), ('mymensingh', 0.47754347426052635), ('johannesburg', 0.4765021598599348), ('ahmedabad', 0.47437400329212487), ('lucknow', 0.4700376385918963), ('rawalpindi', 0.46969229529531953)]


In [11]:
# This function performs K-means clustering on a given set of word embeddings to group them into a specified number of clusters. 
# It uses the K-means algorithm to cluster the vectors of all words whose embedding is not None. 
# Each word is then assigned to a cluster based on the K-means output. 
# The function returns a dictionary where each key is a cluster label and each value is a list of words belonging to that cluster. 
# In the example usage, the function is applied to a subset of the embeddings that correspond to city names, and the resulting clusters are printed.

def cluster_embeddings(embeddings, num_clusters):
    """Group words into clusters by running K-means on their vectors.

    Args:
        embeddings: dict mapping word -> vector. Entries whose vector is
            None are ignored.
        num_clusters: Number of clusters requested from K-means.

    Returns:
        dict mapping cluster label (plain ``int``) -> list of the words
        assigned to that cluster, in the iteration order of ``embeddings``.
        An empty dict is returned when there is no usable vector, instead of
        handing an empty sample set to K-means.

    Raises:
        ValueError: Propagated from ``KMeans.fit`` for inputs it rejects
            (presumably including fewer vectors than ``num_clusters`` --
            confirm against the scikit-learn documentation).
    """
    # Keep only the words that actually have a vector.
    words = [word for word, vector in embeddings.items() if vector is not None]
    if not words:
        # Nothing to cluster; fitting K-means on zero samples would raise.
        return {}
    vectors = [embeddings[word] for word in words]

    # Fixed random_state keeps the assignment reproducible across runs.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    kmeans.fit(vectors)

    # labels_ is aligned with `vectors`, and therefore with `words`.
    clusters = {}
    for word, label in zip(words, kmeans.labels_):
        # int() turns the NumPy integer label into a plain Python key.
        clusters.setdefault(int(label), []).append(word)

    return clusters

# Keep only the normalized vectors whose word is also a city name
city_embeddings = dict(
    (city, normalized_embeddings[city])
    for city in city_list
    if city in normalized_embeddings
)

# Example usage
num_clusters = 500  # Change this to the desired number of clusters
clusters = cluster_embeddings(city_embeddings, num_clusters)

# Show every cluster label followed by its member cities
for label, words in clusters.items():
    print(f"Cluster {label}: {', '.join(words)}")


Cluster 17: tokyo, seoul, moscow, pyongyang, warsaw, sofia, kazan, vladimir, duma, petersburg, imperial
Cluster 357: jakarta, surabaya, medan, malang, semarang, palembang, makassar, bogor, pekanbaru, padang, denpasar, samarinda, jambi, surakarta, manado, bandung, yogyakarta, jayapura, mataram, bengkulu, ambon, bali, bam, bambang, batavia, sari
Cluster 468: delhi, mumbai, dhaka, karachi, lahore, faisalabad, colombo, rawalpindi, peshawar, multan, sharjah, srinagar, islamabad, quetta, jammu
Cluster 92: guangzhou, shanghai, shenzhen, chengdu, xi'an, chongqing, dongguan, tianjin, nanjing, shenyang, dalian, pudong, changchun, xiamen, zhongshan, chaoyang, zhuhai, baoshan, jilin, sanya, dongfeng, panda
Cluster 270: manila, beijing, bangkok, hanoi, singapore, taipei, macau, hong, xinhua, falun, kong, commerce, china
Cluster 409: cairo, khartoum, giza, alexandria, amman, algiers, dubai, damascus, beirut, aleppo, doha, tripoli, suez, aswan, luxor, bahrain, lebanon, jordan, palestine, ahram, mena,

In [14]:
# This code segment is preparing data for hierarchical clustering of city embeddings based on cosine similarity.
# It starts by defining the desired average size for the clusters. 
# Then, it filters the embeddings to include only those corresponding to a predefined list of cities. 
# The embeddings are converted into a list of vectors, 
# which are then used to compute a condensed pairwise distance matrix using cosine distance, 
# providing the basis for clustering the cities.

# Target mean number of cities per cluster
avg_cluster_size = 20

# Keep only the cities that have a vector in the raw (un-normalized) embeddings
city_embeddings = dict(
    (city, embeddings[city]) for city in city_list if city in embeddings
)

# Parallel lists: cities[i] is the name belonging to vectors[i]
cities = list(city_embeddings)
vectors = list(city_embeddings.values())

# Condensed (one entry per pair) cosine distance matrix
cosine_dist_matrix = pdist(vectors, metric='cosine')

In [15]:
# This code segment performs hierarchical clustering on the condensed cosine distance matrix 
# of city embeddings and assigns each city to a cluster.

# Build the average-linkage hierarchy from the condensed cosine distances
Z = linkage(cosine_dist_matrix, method='average')

# Derive the cluster count from the desired average size (never below 1)
num_cities = len(cities)
num_clusters = max(1, num_cities // avg_cluster_size)

# Cut the hierarchy so that at most num_clusters flat clusters remain
cluster_labels = fcluster(Z, t=num_clusters, criterion='maxclust')

# Group city names by their cluster label
clusters = {}
for city, label in zip(cities, cluster_labels):
    clusters.setdefault(label, []).append(city)

# Show every cluster label followed by its member cities
for label, cluster_cities in clusters.items():
    print(f"Cluster {label}: {', '.join(cluster_cities)}")

Cluster 770: tokyo, jakarta, manila, shanghai, seoul, beijing, bangkok, shenzhen, hanoi, singapore, pudong, pyongyang, taipei, vientiane, macau, hong, xinhua, kong, central, southeast, center, bank, china, casino
Cluster 229: delhi, mumbai, dhaka, kolkata, karachi, bangalore, chennai, lahore, hyderabad, pune, ahmedabad, lucknow, kanpur, nagpur, rawalpindi, indore, peshawar, multan, bhopal, patna, bilaspur, agra, jabalpur, varanasi, srinagar, aligarh, gwalior, islamabad, raipur, quetta, thiruvananthapuram, kochi, gorakhpur, mangalore, agartala, ujjain, jhansi, jammu, gaya, mathura, bhagalpur, bihar, imphal, haridwar, shimla, daman, guna, shillong, dimapur, rishikesh, gangtok, kohima, muzaffarabad, dhar, goa, faizabad, vrindavan, sagar, diu, panaji, manali, allahabad, mussoorie, ahmadabad, nagar, sabha
Cluster 472: guangzhou, chengdu, xi'an, chongqing, baoding, linyi, dongguan, tianjin, wuhan, nanyang, hangzhou, foshan, zhoukou, ganzhou, heze, quanzhou, nanjing, jining, fuyang, shenyang,

In [19]:
# This code defines a function that merges small clusters with larger clusters 
# based on the cosine distance between the centroids of the clusters. 
# If a cluster is smaller than a specified minimum size, it is merged into the closest other cluster (which may itself be small). 

def merge_small_clusters(clusters, city_embeddings, min_cluster_size):
    """Fold undersized clusters into their nearest neighbouring cluster.

    Every cluster with fewer than ``min_cluster_size`` members is merged into
    the other cluster whose centroid is closest in cosine distance. Clusters
    are processed in the iteration order of ``clusters``.

    Differences from a naive single pass:
      * The input is never mutated: member lists are copied first, so the
        caller's ``clusters`` is unchanged and re-running gives the same result.
      * After a merge the receiving cluster's centroid is recomputed, so later
        distance comparisons use up-to-date centroids.
      * A cluster that was small at the start but has since absorbed enough
        members to reach ``min_cluster_size`` is kept, not merged away.

    Args:
        clusters: dict mapping cluster label -> list of city names.
        city_embeddings: dict mapping city name -> vector; must contain every
            city that appears in ``clusters``.
        min_cluster_size: Clusters with fewer members than this are merged.

    Returns:
        A new dict mapping label -> list of city names. If only one cluster
        remains it is returned as is, even when it is below the minimum size.

    Raises:
        KeyError: If a city in ``clusters`` has no entry in ``city_embeddings``.
    """
    def centroid_of(members):
        # Mean vector of all member embeddings.
        return np.mean([city_embeddings[city] for city in members], axis=0)

    # Copy each member list so .extend() below cannot touch the caller's data.
    merged_clusters = {label: list(members) for label, members in clusters.items()}
    centroids = {label: centroid_of(members) for label, members in merged_clusters.items()}

    # Snapshot of the clusters that start out below the minimum size.
    small_clusters = [
        label for label, members in merged_clusters.items()
        if len(members) < min_cluster_size
    ]

    for label in small_clusters:
        if len(merged_clusters[label]) >= min_cluster_size:
            # Grew large enough by absorbing earlier small clusters.
            continue

        other_labels = [other for other in merged_clusters if other != label]
        if not other_labels:
            # Nothing left to merge into.
            continue

        # One cdist call yields the distance to every other centroid.
        distance_row = cdist(
            [centroids[label]],
            [centroids[other] for other in other_labels],
            metric='cosine',
        )[0]
        distances = dict(zip(other_labels, distance_row))
        closest_cluster = min(distances, key=distances.get)

        # Move the members over, then refresh the receiver's centroid.
        merged_clusters[closest_cluster].extend(merged_clusters.pop(label))
        del centroids[label]
        centroids[closest_cluster] = centroid_of(merged_clusters[closest_cluster])

    return merged_clusters

# Example usage: merge every cluster with fewer than min_cluster_size cities.
# `clusters` and `city_embeddings` here are the ones produced by the
# hierarchical-clustering cells above.
min_cluster_size = 20
merged_clusters = merge_small_clusters(clusters, city_embeddings, min_cluster_size)

# Print the merged clusters, one line per cluster label
for label, cluster_cities in merged_clusters.items():
    print(f"Cluster {label}: {', '.join(cluster_cities)}")

Cluster 770: tokyo, jakarta, manila, shanghai, seoul, beijing, bangkok, shenzhen, hanoi, singapore, pudong, pyongyang, taipei, vientiane, macau, hong, xinhua, kong, central, southeast, center, bank, china, casino, enterprise, commerce
Cluster 229: delhi, mumbai, dhaka, kolkata, karachi, bangalore, chennai, lahore, hyderabad, pune, ahmedabad, lucknow, kanpur, nagpur, rawalpindi, indore, peshawar, multan, bhopal, patna, bilaspur, agra, jabalpur, varanasi, srinagar, aligarh, gwalior, islamabad, raipur, quetta, thiruvananthapuram, kochi, gorakhpur, mangalore, agartala, ujjain, jhansi, jammu, gaya, mathura, bhagalpur, bihar, imphal, haridwar, shimla, daman, guna, shillong, dimapur, rishikesh, gangtok, kohima, muzaffarabad, dhar, goa, faizabad, vrindavan, sagar, diu, panaji, manali, allahabad, mussoorie, ahmadabad, nagar, sabha
Cluster 472: guangzhou, chengdu, xi'an, chongqing, baoding, linyi, dongguan, tianjin, wuhan, nanyang, hangzhou, foshan, zhoukou, ganzhou, heze, quanzhou, nanjing, jin