# Clustering algorithm

In [1]:
import torch
import pandas as pd
import json
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

## 1. Load the data and the embeddings

In [5]:
FILE_TO_READ = './data/test/axolotl.test.ru.tsv'
EMBEDDING_TYPE = 'examples' # 'examples' or 'glosses'

language = FILE_TO_READ.split('.')[-2]
filename = FILE_TO_READ.split('/')[-1].split('.')[0:-1]
filename = '.'.join(filename)
embeddings_file = f"./embeddings/{EMBEDDING_TYPE}/{filename}.json"
language, embeddings_file

('ru', './embeddings/examples/axolotl.test.ru.json')

In [3]:
df = pd.read_csv(FILE_TO_READ, sep='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              2126 non-null   object 
 1   word                  2126 non-null   object 
 2   orth                  2126 non-null   object 
 3   sense_id              424 non-null    object 
 4   gloss                 424 non-null    object 
 5   example               1990 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  2126 non-null   object 
 8   period                2126 non-null   object 
dtypes: float64(1), object(8)
memory usage: 149.6+ KB


In [7]:
with open(embeddings_file, 'r') as json_file:
    embeddings_list = json.load(json_file)

embeddings = torch.tensor(embeddings_list)
embeddings.size()

torch.Size([2126, 768])

## 2. Clustering algorithm with the embeddings 

In [119]:
def cosine_distance(X, centroids):
    return 1 - cosine_similarity(X, centroids)

def get_silhouette_score(tensors, labels):
    # Convert the torch tensors to numpy arrays and reshape them
    X = np.array([tensor.flatten().numpy() for tensor in tensors])
    # Calculate the silhouette score
    score = silhouette_score(X, labels=labels, metric='euclidean')

    return score

def KMeans_clustering(df):
    best_score = -1
    best_n = 0
    for n in range(2,5):
        kmeans = KMeans(n_clusters=n, random_state=0, metric =cosine_distance)
        kmeans.fit(df['embedding'].tolist())
        df[f'clusters_{n}'] = None
        df[f'clusters_{n}'] = kmeans.labels_
        silhouette_avg = get_silhouette_score(df['embedding'], df[f'clusters_{n}'])
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_n = n
    return df, best_n


In [120]:
for label, group in df.groupby('word'):
    print("label:", label)
    result_df, best_n = KMeans_clustering(group)
    print("best_n:", best_n)
    print(df)
    break

label: миро


TypeError: KMeans.__init__() got an unexpected keyword argument 'metric'

In [None]:
result_df

Unnamed: 0,usage_id,word,orth,sense_id,gloss,example,indices_target_token,date,period,embedding,clusters_2,clusters_3,clusters_4
0,train_ru_0,миро,миро,miro_f2cnblJ7iBg,"Миро, мvро ср., благовонное масло, пахучая мас...","Определение слова миро: Миро, мvро ср., благов...",,old,old,"[tensor(-0.5822), tensor(-0.4083), tensor(0.40...",0,2,3
1,train_ru_1,миро,миро,miro_8igZuzZK97Q,"религ. жидкое ароматическое масло, освящаемое ...","Затем ключарь приглашает, чтобы женщины вышли ...",,new,new,"[tensor(0.4642), tensor(-0.2129), tensor(0.321...",0,2,2
2,train_ru_2,миро,миро,miro_8igZuzZK97Q,"религ. жидкое ароматическое масло, освящаемое ...",Они везде одним мvром мазаны. Мvром покрыт (т....,,old,old,"[tensor(-0.3496), tensor(-0.1741), tensor(-0.3...",0,0,0
3,train_ru_3,миро,миро,miro_o98UfpSoYH4,"религ. жидкость, иногда чудесным образом выдел...","Мощи святого хранятся здесь до сих пор, продол...",,new,new,"[tensor(0.0417), tensor(-0.7488), tensor(-0.00...",0,2,2
4,train_ru_4,миро,миро,miro_o98UfpSoYH4,"религ. жидкость, иногда чудесным образом выдел...","Иосиф увидел, что по иконе Богородицы текли ст...",,new,new,"[tensor(0.0759), tensor(0.0134), tensor(0.2199...",1,1,1
