# Clustering algorithm

In [140]:
import torch
import pandas as pd
import json
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.metrics import silhouette_score
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os

## 1. Load the data and the embeddings

In [141]:
# --- Run configuration: everything a reader may want to tune ---
FILE_TO_READ = './data/augmented/axolotl.dev.ru.tsv'
EMBEDDING_TYPE = 'concatenated' # 'examples', 'glosses' or 'concatenated'
PRINT_WORDS = False
CLUSTERING_METHOD = 'AffinityPropagation' # 'KMeans' or 'AffinityPropagation'
SCORE = 'deviation'

# --- Values derived from the input path ---
# The language code is the second-to-last dot-separated token of the path.
language = FILE_TO_READ.rsplit('.', 2)[-2]
# The embeddings file shares the input's base name, minus its last extension.
filename = FILE_TO_READ.rsplit('/', 1)[-1].rpartition('.')[0]
embeddings_file = f"./embeddings/{EMBEDDING_TYPE}/{filename}.json"
(language, embeddings_file)

('ru', './embeddings/concatenated/axolotl.dev.ru.json')

In [142]:
# Read the tab-separated usage table configured in FILE_TO_READ,
# then print a per-column summary (non-null counts and dtypes).
df = pd.read_csv(
    FILE_TO_READ,
    sep='\t',
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2862 entries, 0 to 2861
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              2862 non-null   object 
 1   word                  2862 non-null   object 
 2   orth                  2862 non-null   object 
 3   sense_id              1257 non-null   object 
 4   gloss                 2862 non-null   object 
 5   example               2862 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  2862 non-null   object 
 8   period                2862 non-null   object 
dtypes: float64(1), object(8)
memory usage: 201.4+ KB


In [143]:
# Load the precomputed embeddings from their JSON file.
with open(embeddings_file, mode='r') as json_file:
    embeddings_list = json.load(json_file)

embeddings = torch.tensor(embeddings_list)
# There must be exactly one embedding per row of `df`.
assert len(df) == embeddings.shape[0], "Embeddings count must be the same as the df length"
embeddings.shape

torch.Size([2862, 1536])

In [144]:
# Store one embedding tensor per row, in the same order as `embeddings`.
df['embedding'] = [row for row in embeddings]
# Sanity check: the first stored tensor is element-wise equal to the first embedding.
assert bool((df.loc[0, 'embedding'] == embeddings[0]).all())

## 2. Clustering algorithm with the embeddings 

In [145]:
def get_silhouette_score(tensors, labels):
    """Return the Euclidean silhouette score of a labelling.

    Args:
        tensors: iterable of objects exposing ``.flatten().numpy()``
            (one per sample).
        labels: cluster label of each sample, in the same order as ``tensors``.

    Returns:
        The value returned by ``silhouette_score`` for the flattened samples.
    """
    rows = []
    for tensor in tensors:
        # Each sample becomes one flat NumPy row of the feature matrix.
        rows.append(tensor.flatten().numpy())
    feature_matrix = np.array(rows)
    return silhouette_score(feature_matrix, labels=labels, metric='euclidean')

def get_new_score(tensors, labels):
    """Return the mean, over clusters, of the spread of member-to-centroid distances.

    For every distinct label, the centroid of the member vectors is computed,
    then the Euclidean distance of each member to that centroid, and finally
    the standard deviation of those distances. The score is the mean of these
    per-cluster standard deviations; lower values are better.

    Args:
        tensors: iterable of vectors (anything ``np.asarray`` can convert),
            one per sample. Iterated by position, so a pandas Series with a
            non-default index is handled correctly.
        labels: cluster label of each sample, in the same order as ``tensors``.

    Returns:
        The score as a float, or ``nan`` when there are no samples.

    Raises:
        ValueError: if ``tensors`` and ``labels`` have different lengths.
    """
    # One flat float row per sample.
    X = np.array([np.asarray(tensor, dtype=float).ravel() for tensor in tensors])
    labels_np = np.asarray(labels)
    if len(X) != len(labels_np):
        raise ValueError("tensors and labels must have the same length")
    if len(X) == 0:
        return float("nan")

    per_cluster_std = []
    for label in np.unique(labels_np):
        members = X[labels_np == label]
        centroid = members.mean(axis=0)
        # Euclidean distance of every member to its own cluster centroid.
        distances = np.linalg.norm(members - centroid, axis=1)
        per_cluster_std.append(np.std(distances))
    # Mean of the per-cluster standard deviations; this value should be minimized.
    return np.mean(per_cluster_std)

def compute_metric(tensors, labels, method="silhouette_score"):
    """Score a labelling with the metric selected by ``method``.

    ``"silhouette_score"`` dispatches to ``get_silhouette_score``; any other
    value dispatches to ``get_new_score``.
    """
    scorer = get_silhouette_score if method == "silhouette_score" else get_new_score
    return scorer(tensors, labels)

def KMeans_clustering(df):
    """Cluster the rows of ``df`` with KMeans, choosing the number of clusters by score.

    Candidate cluster counts run from the number of distinct known senses up to
    (but excluding) that number plus the count of rows without a sense, clamped
    to the valid KMeans range ``[1, len(df)]``. If that leaves no candidate
    (e.g. every row already has a sense), the number of known senses is used.
    Each candidate is scored with ``compute_metric`` using the global ``SCORE``:
    the silhouette score is maximised, any other score (the deviation score) is
    minimised. Candidates whose scoring fails are skipped instead of winning.

    Args:
        df: DataFrame with an ``embedding`` column (one vector per row) and a
            ``sense_id`` column (missing for unlabelled rows).

    Returns:
        A copy of ``df`` with an added ``cluster`` column holding the labels
        of the selected KMeans run. The input frame is not modified.
    """
    df = df.copy()
    vectors = df['embedding'].tolist()
    n_samples = len(vectors)
    min_senses = df['sense_id'].nunique()
    max_senses = min_senses + df['sense_id'].isnull().sum()

    # KMeans needs 1 <= n_clusters <= n_samples.
    candidates = [n for n in range(min_senses, max_senses) if 1 <= n <= n_samples]
    if not candidates:
        # No unlabelled rows (or no usable candidate): fall back to the known sense count.
        candidates = [max(1, min(min_senses, n_samples))]

    # Only the silhouette score is "higher is better"; the deviation score is minimised.
    higher_is_better = SCORE == "silhouette_score"

    best_n = candidates[0]
    best_labels = None
    best_score = None
    for n in candidates:
        labels = KMeans(n_clusters=n, random_state=0, n_init='auto').fit(vectors).labels_
        if best_labels is None:
            # Fallback so a result exists even if no candidate can be scored.
            best_labels = labels
        if n == 1 and higher_is_better:
            # The silhouette is undefined for a single cluster; use a neutral 0.
            # TODO: confirm that 0 is the right placeholder here.
            score = 0
        else:
            try:
                score = compute_metric(df['embedding'], labels, SCORE)
            except Exception:
                # Happens e.g. with glosses, which may have exactly the same
                # embedding. An unscorable candidate must not be selected as best.
                continue
        if np.isnan(score):
            continue
        if best_score is None:
            is_better = True
        elif higher_is_better:
            is_better = score > best_score
        else:
            is_better = score < best_score
        if is_better:
            best_score, best_n, best_labels = score, n, labels

    if PRINT_WORDS:
        print("Best number of clusters:", best_n, f"[{min_senses}-{max_senses}]")
    df['cluster'] = best_labels
    return df

def AffinityPropagation_clustering(df):
    """Cluster the rows of ``df`` with Affinity Propagation.

    Args:
        df: DataFrame with an ``embedding`` column (one vector per row).

    Returns:
        A copy of ``df`` with an added ``cluster`` column holding the fitted
        labels. The input frame is not modified.
    """
    df = df.copy()
    # A fixed seed keeps runs reproducible, matching the KMeans path (random_state=0).
    ap = AffinityPropagation(random_state=0)
    df['cluster'] = ap.fit(df['embedding'].tolist()).labels_
    return df

def clustering(df, method="AffinityPropagation"):
    """Cluster one word's usages and turn the clusters into predicted sense ids.

    The rows are clustered with KMeans when ``method == "KMeans"`` and with
    Affinity Propagation otherwise. Every cluster that contains at least one
    row with a known ``sense_id`` is renamed to the sense of its first such
    row (in row order); clusters without any labelled row keep their numeric
    cluster id. The result overwrites ``sense_id``.

    Args:
        df: DataFrame with ``sense_id`` and ``embedding`` columns.
        method: ``"KMeans"`` or ``"AffinityPropagation"`` (the fallback for
            any other value).

    Returns:
        A DataFrame with ``sense_id`` replaced by the predicted sense and the
        helper columns ``cluster`` and ``embedding`` removed.
    """
    if method == "KMeans":
        df_cl = KMeans_clustering(df)
    else:
        df_cl = AffinityPropagation_clustering(df)

    # Map each cluster id to the sense of its first labelled member.
    labelled = df_cl['sense_id'].notna()
    cluster_to_sense = {}
    for cluster_id, sense in zip(df_cl.loc[labelled, 'cluster'], df_cl.loc[labelled, 'sense_id']):
        cluster_to_sense.setdefault(cluster_id, sense)

    # Build the predictions as a new Series instead of writing strings into the
    # integer ``cluster`` column, which pandas deprecates as an incompatible-dtype set.
    predicted = df_cl['cluster'].map(lambda cluster_id: cluster_to_sense.get(cluster_id, cluster_id))

    return df_cl.assign(sense_id=predicted).drop(columns=['cluster', 'embedding'])

In [146]:
# Cluster every target word independently and collect the per-word results.
# The pieces are concatenated once at the end: growing a DataFrame with
# pd.concat inside the loop copies all accumulated rows on every iteration.
clustered_groups = []
for word, group in df.groupby('word'):
    if PRINT_WORDS:
        print(f"{word}: ", end="")
    # Pass a copy so the clustering helpers never write into a slice of `df`.
    group_cl = clustering(group.copy(), method=CLUSTERING_METHOD)
    clustered_groups.append(group_cl)
    if len(group) != len(group_cl):
        print(f"{len(group)} != {len(group_cl)} for word {word}")

result_df = pd.concat(clustered_groups, ignore_index=True) if clustered_groups else pd.DataFrame()

# groupby reorders rows by word; restore the row order of the input file.
result_df = (
    result_df
    .set_index('usage_id')
    .reindex(df['usage_id'])
    .reset_index()
)

result_df

Unnamed: 0,usage_id,word,orth,sense_id,gloss,example,indices_target_token,date,period
0,dev_ru_0,могильник,могильникъ,mogil'nik_UYTE5-B076I,вят. слово из Вятской губернии арх. слово из А...,"могилки, кладбище",,old,old
1,dev_ru_1,могильник,могильникъ,mogil'nik_C3GhETZc5Vs,ярс. слово из Ярославской губернии крупный коч...,В окрестностях Ярославля можно встретить много...,,old,old
2,dev_ru_2,могильник,могильникъ,mogil'nik_KNs3eVn3pFY,арх. слово из Архангельской губернии походный ...,В музее была выставлена коллекция архангельски...,,old,old
3,dev_ru_3,могильник,могильникъ,mogil'nik_KNs3eVn3pFY,"Могильник - это место, где находятся могилы ил...","Орёл изображает реку Халзан, его голова ― скал...",,new,new
4,dev_ru_4,могильник,могильникъ,mogil'nik_ahboIs9hMMk,орнитол. хищная птица рода крупных степных орлов,Орел могильник,,old,old
...,...,...,...,...,...,...,...,...,...
2857,augmented_dev_ru_3292,горбатый,горбатый,gorbatyj_7iVqG7pI9R8,"разг. тот, у кого есть горб","В книге описывается горбатый старый ведьмак, к...",,old,old
2858,augmented_dev_ru_3293,горячка,горячка,gorjachka_2yFyKaEbJEk,"разг. о человеке горячего, вспыльчивого нрава","""Он всегда был горячкой и не мог сдержать свои...",,old,old
2859,augmented_dev_ru_3294,горячка,горячка,gorjachka_2yFyKaEbJEk,"разг. о человеке горячего, вспыльчивого нрава","""Его горячка иногда приводила к непредсказуемы...",,old,old
2860,augmented_dev_ru_3295,горячка,горячка,gorjachka_uFKY5ZkBVB0,"устар. прост. болезнь, сопровождаемая высокой ...","""У него началась горячка, и он сразу же лег в ...",,old,old


In [147]:
# Predictions are stored in one directory per (embedding type, clustering method, score) run.
output_dir = f'./predictions/{EMBEDDING_TYPE}-{CLUSTERING_METHOD}-{SCORE}'
# exist_ok=True replaces the check-then-create pattern, so re-running the cell
# (or a concurrent run) cannot fail on an already existing directory.
os.makedirs(output_dir, exist_ok=True)

output_path = f'{output_dir}/{filename}.tsv'
result_df.to_csv(output_path, sep='\t', index=False)
print(f'Saved to: {output_path}')

Saved to: ./predictions/concatenated-AffinityPropagation-deviation/axolotl.dev.ru.tsv
