In [2]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import euclidean_distances

from sklearn import metrics
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score

In [None]:
import gensim.downloader as api

# Pretrained word vectors, looked up by name through gensim's downloader.
# `model` is a notebook-level global: get_verbs_and_embeddings indexes it as
# model[verb], so this cell has to run before that function is called.
# NOTE(review): api.load presumably downloads and caches the vectors on first
# use -- confirm the download size and cache location before a fresh run.
model = api.load("word2vec-google-news-300")

In [None]:
def pca(X, num):
  """Standardize ``X`` and project it onto ``num`` principal components.

  Args:
    X: Feature matrix handed to ``StandardScaler.fit_transform``.
    num: Value passed to ``PCA`` as ``n_components``.

  Returns:
    The output of ``PCA.fit_transform`` on the standardized matrix.
  """
  standardized = StandardScaler().fit_transform(X)
  reducer = PCA(n_components=num)
  return reducer.fit_transform(standardized)

In [None]:
def get_verbs_and_embeddings(verbs):
    """De-duplicate ``verbs`` and look up one embedding per remaining verb.

    Args:
        verbs: Iterable of hashable verb keys; duplicates are allowed.

    Returns:
        Tuple ``(unique_verbs, verb_embeddings)``. ``unique_verbs`` is a list
        in first-occurrence order and ``verb_embeddings`` is the ``np.array``
        built from ``model[verb]`` for each of them, in the same order.

    Raises:
        Whatever ``model[verb]`` raises for a verb it does not contain
        (presumably ``KeyError`` for the gensim vectors -- confirm).
    """
    # dict.fromkeys keeps first-seen order. The previous list(set(...)) order
    # for strings changes between kernel runs (hash randomization), which made
    # the verb/row order -- and everything indexed by it -- irreproducible.
    unique_verbs = list(dict.fromkeys(verbs))

    # `model` is the notebook-level word-vector global.
    verb_embeddings = np.array([model[verb] for verb in unique_verbs])

    return unique_verbs, verb_embeddings

In [3]:
def get_labels(X, n, m, l):
  """Cluster ``X`` agglomeratively and return the label assigned to each row.

  Args:
    X: Data passed to ``AgglomerativeClustering.fit_predict``.
    n: Forwarded as ``n_clusters``.
    m: Forwarded as ``metric``.
    l: Forwarded as ``linkage``.

  Returns:
    The result of ``fit_predict`` on ``X``.
  """
  clusterer = AgglomerativeClustering(linkage=l, metric=m, n_clusters=n)
  return clusterer.fit_predict(X)

In [None]:
def dunn_index(data, labels):
    """Compute the Dunn index of a clustering (larger is better).

    The value is the smallest Euclidean distance between two points of
    different clusters divided by the largest within-cluster point-to-point
    distance (cluster diameter).

    Args:
        data: 2-D array-like with one row per sample.
        labels: 1-D array-like of cluster labels, one per row of ``data``.
            Labels may be arbitrary values; they need not be ``0..k-1``.

    Returns:
        The Dunn index as a NumPy float. If every cluster has zero diameter
        the division yields ``inf`` (NumPy semantics), as before.

    Raises:
        ValueError: If fewer than two distinct labels are present.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)

    # Split once, by actual label value. The earlier implementation masked
    # with `labels == i` for i in range(k), which silently selected the wrong
    # (or empty) rows whenever the labels were not exactly 0..k-1.
    clusters = [data[labels == label] for label in np.unique(labels)]
    if len(clusters) < 2:
        raise ValueError("dunn_index requires at least two clusters")

    max_diameter = max(euclidean_distances(points).max() for points in clusters)

    min_separation = min(
        euclidean_distances(clusters[a], clusters[b]).min()
        for a in range(len(clusters))
        for b in range(a + 1, len(clusters))
    )

    return min_separation / max_diameter

def xie_beni_index(data, labels):
    """Compute a Xie-Beni style compactness/separation ratio (smaller is better).

    Numerator: sum over clusters of the mean Euclidean distance from the
    cluster's points to its centroid. Denominator: number of samples times
    the smallest distance between two *different* centroids.

    NOTE(review): the textbook Xie-Beni index uses squared distances in both
    terms; the non-squared form used by this notebook is kept unchanged --
    confirm which definition is intended.

    Args:
        data: 2-D array-like with one row per sample.
        labels: 1-D array-like of cluster labels, one per row of ``data``.

    Returns:
        The ratio as a NumPy float; ``inf`` if two centroids coincide.

    Raises:
        ValueError: If fewer than two distinct labels are present.
    """
    data = np.asarray(data)
    labels = np.asarray(labels)

    unique_labels = np.unique(labels)
    if len(unique_labels) < 2:
        raise ValueError("xie_beni_index requires at least two clusters")

    clusters = [data[labels == label] for label in unique_labels]
    centroids = np.array([points.mean(axis=0) for points in clusters])

    intra_dists = [
        np.mean(euclidean_distances(points, centroid.reshape(1, -1)))
        for points, centroid in zip(clusters, centroids)
    ]

    # Use only the strict upper triangle of the centroid distance matrix.
    # Taking the minimum over the full matrix always picked a diagonal entry
    # (each centroid's zero distance to itself), so the denominator was 0 and
    # the index was inf/nan for every clustering.
    centroid_dists = euclidean_distances(centroids)
    min_inter_dist = centroid_dists[np.triu_indices(len(centroids), k=1)].min()

    return np.sum(intra_dists) / (len(data) * min_inter_dist)

In [None]:
def get_best_parameters(verbs, verb_embeddings):
    """Grid-search PCA size and agglomerative-clustering settings.

    For every combination of PCA component count (10..230 step 10), number of
    clusters (5..15), distance metric and linkage, the embeddings are reduced
    with ``pca``, clustered with ``get_labels`` and scored with five validity
    indices (Calinski-Harabasz, silhouette, Davies-Bouldin, Dunn, Xie-Beni).
    The indices are then normalized *across all evaluated combinations* and
    averaged to pick a winner.

    Args:
        verbs: Unused; kept so existing callers keep working.
        verb_embeddings: Embedding matrix passed to ``pca``.

    Returns:
        Tuple ``(n_pca, n_clusters, metric, linkage)`` of the best-scoring
        combination, or ``''`` if no combination could be evaluated.
    """
    n_clusters = range(5, 16)
    metric = ['euclidean', 'cosine', 'manhattan']
    linkage = ['ward', 'complete', 'average', 'single']
    n_pca = range(10, 231, 10)

    # Column order of every score row below, with its direction:
    # +1 -> larger is better (CH, silhouette, Dunn),
    # -1 -> smaller is better (Davies-Bouldin, Xie-Beni).
    orientation = np.array([1.0, 1.0, -1.0, 1.0, -1.0])

    candidates = []
    score_rows = []

    for i in n_pca:
        # The projection depends only on the component count, so compute it
        # once per `i` instead of once per (clusters, metric, linkage) combo.
        data = pca(verb_embeddings, i)

        for j in n_clusters:
            for k in metric:
                for l in linkage:
                    try:
                        # Cluster the PCA projection itself. Previously the raw
                        # embeddings were clustered, so `i` never influenced
                        # the labels that were being scored.
                        labels = get_labels(data, j, k, l)
                        row = [
                            metrics.calinski_harabasz_score(data, labels),
                            metrics.silhouette_score(data, labels),
                            metrics.davies_bouldin_score(data, labels),
                            dunn_index(data, labels),
                            xie_beni_index(data, labels),
                        ]
                    except ValueError:
                        # Invalid combinations (e.g. ward linkage with a
                        # non-euclidean metric) are skipped; any other error
                        # is a real problem and is allowed to surface.
                        continue

                    candidates.append((i, j, k, l))
                    score_rows.append(row)

    if not candidates:
        return ''

    # Flip lower-is-better columns so that "larger is better" holds everywhere.
    scores = np.array(score_rows, dtype=float) * orientation

    # Non-finite entries get the worst finite value seen for that metric; a
    # column with no finite value at all carries no information and becomes 0.
    for col in range(scores.shape[1]):
        column = scores[:, col]  # view: assignments write into `scores`
        finite = np.isfinite(column)
        column[~finite] = column[finite].min() if finite.any() else 0.0

    # z-score each metric across configurations. The old code z-scored the
    # five metrics of ONE configuration against each other, whose mean is
    # always ~0, so the "best" configuration was decided by rounding noise.
    spread = scores.std(axis=0)
    spread[spread == 0] = 1.0
    normalized = (scores - scores.mean(axis=0)) / spread

    combined = normalized.mean(axis=1)

    # argmax returns the first maximum, matching the earlier strict `>` update.
    return candidates[int(np.argmax(combined))]

In [None]:
import numpy as np
from sklearn.metrics.pairwise import euclidean_distances

def get_cluster_info(cluster_labels, words, embeddings):
    """Summarize each cluster by its member words and its most central word.

    The "centroid" reported for a cluster is the member whose mean Euclidean
    distance to all members of that cluster is smallest.

    Args:
        cluster_labels: Array of cluster labels, one per word.
        words: Sequence of words, indexable by integer position.
        embeddings: Array of embeddings, row-aligned with ``words``.

    Returns:
        Dict mapping each distinct label to
        ``{'words': [...], 'centroid': word}``.
    """
    summary = {}

    for cluster_id in np.unique(cluster_labels):
        member_idx = np.nonzero(cluster_labels == cluster_id)[0]
        members = [words[idx] for idx in member_idx]

        # Row-wise mean of the pairwise distances within this cluster.
        pairwise = euclidean_distances(embeddings[member_idx])
        avg_dist = pairwise.mean(axis=1)

        summary[cluster_id] = {
            'words': members,
            'centroid': members[np.argmin(avg_dist)],
        }

    return summary