# Clustering algorithm

In [1]:
import torch
import pandas as pd
import json
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.metrics import silhouette_score
import numpy as np

## 1. Load the data and the embeddings

In [2]:
FILE_TO_READ = './data/dev-testing/axolotl.dev.ru.tsv'
EMBEDDING_TYPE = 'glosses' # 'examples', 'glosses' or 'concatenated'
CLUSTERING_METHOD = 'KMeans' # 'KMeans' or 'AffinityPropagation'

language = FILE_TO_READ.split('.')[-2]
filename = FILE_TO_READ.split('/')[-1].split('.')[0:-1]
filename = '.'.join(filename)
embeddings_file = f"./embeddings/{EMBEDDING_TYPE}/{filename}.json"
language, embeddings_file

('ru', './embeddings/glosses/axolotl.dev.ru.json')

In [3]:
df = pd.read_csv(FILE_TO_READ, sep='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 0 to 2025
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              2026 non-null   object 
 1   word                  2026 non-null   object 
 2   orth                  2026 non-null   object 
 3   sense_id              421 non-null    object 
 4   gloss                 421 non-null    object 
 5   example               1912 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  2026 non-null   object 
 8   period                2026 non-null   object 
dtypes: float64(1), object(8)
memory usage: 142.6+ KB


In [4]:
with open(embeddings_file, 'r') as json_file:
    embeddings_list = json.load(json_file)

embeddings = torch.tensor(embeddings_list)
assert embeddings.shape[0] == df.shape[0], "Embeddings count must be the same as the df length"
embeddings.size()

torch.Size([2026, 768])

In [5]:
df['embedding'] = list(embeddings)
assert all(df['embedding'][0] == embeddings[0])

## 2. Clustering algorithm with the embeddings 

In [6]:
def get_silhouette_score(tensors, labels):
    X = np.array([tensor.flatten().numpy() for tensor in tensors])
    score = silhouette_score(X, labels=labels, metric='euclidean') # TODO: it's failing when it's only 1 cluster
    return score

def KMeans_clustering(df):
    best_score = -1
    best_n = 0
    min_senses = df['sense_id'].nunique()
    max_senses = min_senses + df['sense_id'].isnull().sum()
    print(f"min_senses = {min_senses}, max_senses = {max_senses}")
    for n in range(min_senses,max_senses):
        kmeans = KMeans(n_clusters=n, random_state=0, n_init='auto')
        kmeans.fit(df['embedding'].tolist())
        df[f'clusters_{n}'] = None
        df[f'clusters_{n}'] = kmeans.labels_
        silhouette_avg = get_silhouette_score(df['embedding'], df[f'clusters_{n}'])
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_n = n
    print("Best number of clusters:", best_n)
    df['clusters'] = df[f'clusters_{best_n}']
    df = df.drop(columns=[f'clusters_{n}' for n in range(min_senses,max_senses)])
    return df

def AffinityPropagation_clustering(df):
    ap = AffinityPropagation()
    clusters = ap.fit(df['embedding'].tolist())
    df['clusters'] = None
    df['clusters'] = clusters.labels_
    return df

def clustering(df, method="AffinityPropagation"):
    if method == "KMeans":
        return KMeans_clustering(df)
    elif method == "AffinityPropagation":
        return AffinityPropagation_clustering(df)
    else:
        print("Invalid clustering method")
        return None

counter = 0
for word, group in df.groupby('word'):
    print("word:", word)
    result_df = clustering(group, method=CLUSTERING_METHOD)
    counter += 1
    if counter == 5:
        break

word: ангажировать
min_senses = 2, max_senses = 10


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Best number of clusters: 5
word: аренда
min_senses = 5, max_senses = 7
Best number of clusters: 5
word: арт
min_senses = 1, max_senses = 4


ValueError: Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)

In [None]:
result_df

Unnamed: 0,usage_id,word,orth,sense_id,gloss,example,indices_target_token,date,period,embedding,clusters
1569,dev_ru_1967,база,база,,,"Вот остатки крепких ворот, вот основание бойни...",,new,new,"[tensor(0.1194), tensor(-0.1630), tensor(0.039...",2
1570,dev_ru_1968,база,база,baza_NIHxr_tv2X0,"основание какого-либо сооружения, колонны","крона, вершина, овершье, маковка, ладонь.",,old,old,"[tensor(0.1194), tensor(-0.1630), tensor(0.039...",2
1571,dev_ru_1969,база,база,,,Без хлеба-то не проживёшь. «Ушла на базу» ― по...,,new,new,"[tensor(-0.0337), tensor(-0.0413), tensor(-0.0...",2
1572,dev_ru_1970,база,база,,,"По официальному сообщению, в посёлке находилас...",,new,new,"[tensor(-0.0467), tensor(-1.1443), tensor(0.49...",0
1573,dev_ru_1971,база,база,,,"Коли уж заговорили о проходимости, отметим, чт...",,new,new,"[tensor(-0.3025), tensor(-0.1200), tensor(0.66...",1
1574,dev_ru_1972,база,база,,,Программируемые пользователями вентильные матр...,,new,new,"[tensor(0.0727), tensor(-0.0803), tensor(0.129...",2
1575,dev_ru_1973,база,база,,,Стратегические планы на 2005 финансовый год вы...,,new,new,"[tensor(0.0727), tensor(-0.0803), tensor(0.129...",2
1576,dev_ru_1974,база,база,,,"Похоже, России, оставшейся без детского спорта...",,new,new,"[tensor(0.0230), tensor(-0.1985), tensor(-0.05...",2
1577,dev_ru_1975,база,база,,,Согласно оспариваемым нормам налоговая база на...,,new,new,"[tensor(-0.1536), tensor(-0.7836), tensor(0.36...",2
1578,dev_ru_1976,база,база,,,"Показано, что это― эффективный метод организац...",,new,new,"[tensor(-0.1536), tensor(-0.7836), tensor(0.36...",2
