In [None]:
import os

import pandas as pd
import numpy as np
import umap.umap_ as umap
import torchtext

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Seed NumPy's global (legacy) random number generator so the run is repeatable
np.random.seed(seed=42)

In [None]:
def cluster_similarities(embedding_name: str,
                         cache_path: str,
                         input_file: str,
                         result_type: str,
                         save_name: str,
                         output_dir: str = 'results_clustering') -> None:
    """
    Cluster the words most similar to teenager either based on mean similarity or effect size.

    The words are read from the first column of ``input_file``, embedded with the
    given pretrained vectors and clustered with k-means. The number of clusters is
    chosen from 5..10 by the highest cosine silhouette score. Two files are written
    to ``output_dir``:

    * ``{save_name}_{result_type}_clusters.txt`` - the words grouped per cluster,
      each prefixed with its 1-based row position in the input file.
    * ``{save_name}_{result_type}_clusters.tsv`` - one row per word with its 2-D
      UMAP coordinates and cluster label.

    Args:
        embedding_name: File name of the pretrained vectors, passed to
            ``torchtext.vocab.Vectors`` as ``name``.
        cache_path: Directory passed to ``torchtext.vocab.Vectors`` as ``cache``.
        input_file: Header-less CSV whose first column holds the target words.
        result_type: Label of the ranking used (e.g. 'means', 'effect_sizes');
            only used in the output header and file names.
        save_name: Prefix for the output header and file names.
        output_dir: Directory receiving the output files; created if missing.

    Raises:
        ValueError: If the input holds too few words to evaluate at least
            5 clusters (the silhouette score needs n_clusters <= n_words - 1).
    """

    # Load appropriate vectors
    vectors = torchtext.vocab.Vectors(name=embedding_name, cache=cache_path)

    # Load results
    teen_sims = pd.read_csv(input_file, header=None, index_col=0, on_bad_lines='warn')

    # Get target words and embeddings
    # NOTE(review): words missing from the vocabulary presumably receive
    # torchtext's unknown-token vector - confirm this is acceptable here.
    words = teen_sims.index.values
    embeddings = np.stack([vectors.get_vecs_by_tokens(word).numpy() for word in words])

    # silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1,
    # so cap the candidate range instead of crashing on short word lists.
    n_words = len(words)
    candidate_ks = range(5, min(10, n_words - 1) + 1)
    if len(candidate_ks) == 0:
        raise ValueError(
            f'Need at least 6 words to evaluate 5 or more clusters, got {n_words}'
        )

    # Silhouette scores lie in [-1, 1]; start below that range so a candidate is
    # always selected even when every score is zero or negative.
    best_score, best_k, best_kmeans = -np.inf, None, None

    # Iterate through number of clusters, keeping the best fitted model so the
    # clustering that is written out is exactly the one that was scored.
    for k in candidate_ks:
        kmeans = KMeans(n_clusters=k, random_state=0, n_init="auto").fit(embeddings)
        score = silhouette_score(embeddings, kmeans.labels_, metric='cosine')

        if score > best_score:
            best_score, best_k, best_kmeans = score, k, kmeans

    if best_kmeans is None:
        # Only reachable if every score compared false (e.g. NaN).
        raise ValueError('Could not compute a valid silhouette score for any cluster count')

    labels = best_kmeans.labels_

    # Collect the words of each cluster, numbered by their 1-based input position
    report_lines = [f'{save_name}_{result_type}: {best_k} clusters']
    for cluster_id in range(best_k):
        report_lines.append(f'Cluster {cluster_id}:')
        report_lines.extend(
            f'\t{pos + 1}: {words[pos]}' for pos in np.flatnonzero(labels == cluster_id)
        )

    # Make sure the output directory exists before writing anything
    os.makedirs(output_dir, exist_ok=True)
    file_stem = os.path.join(output_dir, f'{save_name}_{result_type}_clusters')

    # Write to file (explicit UTF-8 so non-ASCII words survive on any platform)
    with open(f'{file_stem}.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join(report_lines))

    # UMAP for dimensionality reduction
    reducer = umap.UMAP(n_components=2, metric='cosine', random_state=0)
    umap_embedding = reducer.fit_transform(embeddings)

    # Create tsv with word, dim1, dim2, cluster
    tsv_rows = ['word\tdim1\tdim2\tcluster\n']
    tsv_rows.extend(
        f'{word}\t{point[0]}\t{point[1]}\t{label}\n'
        for word, point, label in zip(words, umap_embedding, labels)
    )

    with open(f'{file_stem}.tsv', 'w', encoding='utf-8') as f:
        f.writelines(tsv_rows)

In [None]:
# Run the clustering for every embedding model and both ranking types.
# Each entry: (vector file name, input-file stem under ./swe_results, output prefix).
embedding_runs = [
    ('glove.840B.300d.txt', 'glove_840B', 'Glove_840B'),
    ('crawl-300d-2M.vec', 'ft_2m', 'FT_2M'),
    ('nepali_glove_vectors.txt', 'glove_ne', 'Nepali_Glove'),
    ('cc.ne.300.vec', 'ft_ne_cc', 'Nepali_FT_CC'),
]

for embedding_name, input_stem, save_name in embedding_runs:
    # Same order as before: mean similarities first, then effect sizes
    for result_type in ('means', 'effect_sizes'):
        cluster_similarities(
            embedding_name=embedding_name,
            cache_path='./.vector_cache',
            input_file=f'./swe_results/{input_stem}_teenager_{result_type}.csv',
            result_type=result_type,
            save_name=save_name,
        )