# Clustering algorithm

In [22]:
import torch
import pandas as pd
import json
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.metrics import silhouette_score
import numpy as np

## 1. Load the data and the embeddings

In [23]:
# Input dataset and which embedding variant to cluster on.
FILE_TO_READ = './data/dev-testing/axolotl.dev.ru.tsv'
EMBEDDING_TYPE = 'concatenated' # 'examples', 'glosses' or 'concatenated'

# Language code = second-to-last dot-separated field of the path (…<lang>.tsv).
language = FILE_TO_READ.rsplit('.', 2)[-2]
# Base name of the file with its directory and final extension removed.
filename = FILE_TO_READ.split('/')[-1].rpartition('.')[0]
# The embeddings file shares the dataset's base name, stored per embedding type.
embeddings_file = f"./embeddings/{EMBEDDING_TYPE}/{filename}.json"
language, embeddings_file

('ru', './embeddings/concatenated/axolotl.dev.ru.json')

In [24]:
# Load the tab-separated dataset and print a summary of its columns.
df = pd.read_csv(FILE_TO_READ, delimiter='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 0 to 2025
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              2026 non-null   object 
 1   word                  2026 non-null   object 
 2   orth                  2026 non-null   object 
 3   sense_id              962 non-null    object 
 4   gloss                 962 non-null    object 
 5   example               1912 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  2026 non-null   object 
 8   period                2026 non-null   object 
dtypes: float64(1), object(8)
memory usage: 142.6+ KB


In [25]:
# Read the precomputed embeddings from JSON and turn them into a tensor.
with open(embeddings_file, mode='r') as fh:
    embeddings_list = json.load(fh)

embeddings = torch.tensor(embeddings_list)
# The embeddings are attached to df row by row later, so the counts must match.
assert embeddings.shape[0] == len(df), "Embeddings count must be the same as the df length"
embeddings.shape

torch.Size([2026, 1536])

In [26]:
# Iterating the embeddings tensor yields one row at a time; store one per df row.
df['embedding'] = [row for row in embeddings]
# Sanity check: the first stored vector is element-wise equal to the first row.
assert bool((df.loc[0, 'embedding'] == embeddings[0]).all())

## 2. Clustering algorithm with the embeddings 

In [27]:
def get_silhouette_score(tensors, labels):
    """Return the Euclidean silhouette score of ``labels`` over ``tensors``.

    Args:
        tensors: iterable of objects supporting ``.flatten().numpy()``
            (torch tensors in this notebook), one per sample.
        labels: cluster label for each sample, in the same order.

    Returns:
        The value returned by ``sklearn.metrics.silhouette_score``.
    """
    # Flatten every tensor so each sample becomes one row of the matrix.
    rows = []
    for sample in tensors:
        rows.append(sample.flatten().numpy())
    feature_matrix = np.array(rows)
    return silhouette_score(feature_matrix, labels=labels, metric='euclidean')

def KMeans_clustering(df):
    """Cluster one word's usages with KMeans, choosing k by silhouette score.

    Candidate cluster counts are ``range(min_senses, max_senses)``, where
    ``min_senses`` is the number of distinct non-null ``sense_id`` values and
    ``max_senses`` adds one potential sense per row whose ``sense_id`` is null.
    Only candidates with ``2 <= k <= n_samples - 1`` are scored, because the
    silhouette coefficient is undefined outside that range. When no candidate
    can be scored (for example every row already has a sense, or there is at
    most one known sense), a single KMeans run with ``min_senses`` clusters,
    clamped to ``[1, n_samples]``, is used instead.

    Args:
        df: DataFrame with a ``sense_id`` column and an ``embedding`` column
            holding one equal-length vector per row.

    Returns:
        A copy of ``df`` with an extra ``clusters`` column of KMeans labels.
        The frame passed in is left untouched.
    """
    # Work on a copy: the caller typically passes a groupby slice, and the
    # input frame should not pick up scratch columns.
    df = df.copy()

    min_senses = df['sense_id'].nunique()
    max_senses = min_senses + int(df['sense_id'].isnull().sum())
    print(f"min_senses = {min_senses}, max_senses = {max_senses}")

    n_samples = len(df)
    if n_samples == 0:
        # Nothing to cluster; still return the expected column.
        df['clusters'] = np.array([], dtype=int)
        return df

    # Build the feature matrix once instead of once per candidate k.
    X = np.stack([np.asarray(vec).ravel() for vec in df['embedding']])

    best_score = -1
    best_n = 0
    best_labels = None
    for n in range(min_senses, max_senses):
        if not 2 <= n <= n_samples - 1:
            # k of 0 is rejected by KMeans; k of 1 or n_samples cannot be scored.
            continue
        labels = KMeans(n_clusters=n, random_state=0, n_init='auto').fit(X).labels_
        if not 2 <= len(np.unique(labels)) <= n_samples - 1:
            # Duplicate points can make KMeans return fewer distinct clusters
            # than requested; such a partition has no silhouette score.
            continue
        silhouette_avg = silhouette_score(X, labels=labels, metric='euclidean')
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_n = n
            best_labels = labels

    if best_labels is None:
        # No candidate could be scored: fall back to the known sense count.
        best_n = min(max(min_senses, 1), n_samples)
        best_labels = KMeans(n_clusters=best_n, random_state=0, n_init='auto').fit(X).labels_

    print("Best number of clusters:", best_n)
    df['clusters'] = best_labels
    return df

def AffinityPropagation_clustering(df):
    """Cluster one word's usages with Affinity Propagation.

    The number of clusters is chosen by the algorithm itself. The estimator is
    seeded (``random_state=0``, matching the KMeans path) so repeated runs of
    the notebook give the same labels.

    Args:
        df: DataFrame with an ``embedding`` column holding one vector per row.

    Returns:
        A copy of ``df`` with an extra ``clusters`` column holding the fitted
        ``labels_``. The frame passed in is left untouched.
    """
    # Copy first: the caller typically passes a groupby slice, which should
    # not be mutated.
    df = df.copy()
    ap = AffinityPropagation(random_state=0)
    # NOTE(review): per the scikit-learn docs, a non-converged fit labels every
    # sample -1; callers may want to check for that.
    df['clusters'] = ap.fit(df['embedding'].tolist()).labels_
    return df

def clustering(df, method="AffinityPropagation"):
    """Run the clustering routine selected by ``method`` on ``df``.

    Args:
        df: DataFrame forwarded unchanged to the selected routine.
        method: ``"KMeans"`` or ``"AffinityPropagation"`` (the default).

    Returns:
        Whatever the selected routine returns, or ``None`` (after printing a
        message) when ``method`` is not one of the two supported names.
    """
    if method == "AffinityPropagation":
        return AffinityPropagation_clustering(df)
    if method == "KMeans":
        return KMeans_clustering(df)
    # Unknown method: report it and hand back None rather than raising.
    print("Invalid clustering method")
    return None

# Try the KMeans path on the first five words only.
# result_df ends up holding the clustered frame of the last word processed.
counter = 0
for counter, (word, group) in enumerate(df.groupby('word'), start=1):
    print("word:", word)
    result_df = clustering(group, method="KMeans")
    if counter == 5:
        break

word: ангажировать
min_senses = 2, max_senses = 7
Best number of clusters: 2
word: аренда
min_senses = 3, max_senses = 7
Best number of clusters: 3
word: арт
min_senses = 2, max_senses = 4
Best number of clusters: 2
word: афера
min_senses = 2, max_senses = 5
Best number of clusters: 2
word: база
min_senses = 5, max_senses = 12
Best number of clusters: 7


In [28]:
result_df

Unnamed: 0,usage_id,word,orth,sense_id,gloss,example,indices_target_token,date,period,embedding,clusters
1569,dev_ru_1967,база,база,baza_NIHxr_tv2X0,"основание какого-либо сооружения, колонны","Вот остатки крепких ворот, вот основание бойни...",,new,new,"[tensor(0.2524), tensor(-1.1011), tensor(1.721...",5
1570,dev_ru_1968,база,база,baza_NIHxr_tv2X0,"основание какого-либо сооружения, колонны","крона, вершина, овершье, маковка, ладонь.",,old,old,"[tensor(0.1645), tensor(-0.0147), tensor(0.037...",6
1571,dev_ru_1969,база,база,baza_eEvyLNzXfJM,"склад для хранения товаров, материалов",Без хлеба-то не проживёшь. «Ушла на базу» ― по...,,new,new,"[tensor(0.4100), tensor(-0.4858), tensor(0.930...",0
1572,dev_ru_1970,база,база,,,"По официальному сообщению, в посёлке находилас...",,new,new,"[tensor(1.2819), tensor(-0.4796), tensor(-0.02...",3
1573,dev_ru_1971,база,база,,,"Коли уж заговорили о проходимости, отметим, чт...",,new,new,"[tensor(-0.1217), tensor(0.2573), tensor(1.318...",2
1574,dev_ru_1972,база,база,baza_R8rL46tI7P0,совокупность материальных или технических сред...,Программируемые пользователями вентильные матр...,,new,new,"[tensor(0.4052), tensor(-0.1405), tensor(1.085...",0
1575,dev_ru_1973,база,база,,,Стратегические планы на 2005 финансовый год вы...,,new,new,"[tensor(0.7596), tensor(0.0082), tensor(1.3154...",4
1576,dev_ru_1974,база,база,,,"Похоже, России, оставшейся без детского спорта...",,new,new,"[tensor(0.7274), tensor(0.5597), tensor(-0.566...",0
1577,dev_ru_1975,база,база,,,Согласно оспариваемым нормам налоговая база на...,,new,new,"[tensor(0.9098), tensor(-0.2220), tensor(1.903...",1
1578,dev_ru_1976,база,база,,,"Показано, что это― эффективный метод организац...",,new,new,"[tensor(0.9874), tensor(-0.6350), tensor(0.887...",1
