# Clustering algorithm

In [145]:
import torch
import pandas as pd
import json
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.metrics import silhouette_score
import numpy as np
import warnings
# NOTE(review): this silences every warning category from here on, so any
# deprecation or convergence warnings emitted by the cells below are hidden.
warnings.filterwarnings('ignore')

## 1. Load the data and the embeddings

In [146]:
# Run configuration: everything a reader might want to change lives here.
FILE_TO_READ = './data/dev-testing/axolotl.dev.fi.tsv'  # input TSV file
EMBEDDING_TYPE = 'glosses'  # one of 'examples', 'glosses' or 'concatenated'
PRINT_WORDS = False  # when True, print each word and its chosen number of clusters
CLUSTERING_METHOD = 'KMeans'  # one of 'KMeans' or 'AffinityPropagation'

# Language code: the dot-separated component right before the file extension.
language = FILE_TO_READ.rsplit('.', 2)[-2]
# File name without its directory and without its final extension.
filename = FILE_TO_READ.rpartition('/')[2].rpartition('.')[0]
# The embeddings file is looked up under the same base name, per embedding type.
embeddings_file = f"./embeddings/{EMBEDDING_TYPE}/{filename}.json"
language, embeddings_file

('fi', './embeddings/concatenated/axolotl.test.fi.json')

In [147]:
# Read the tab-separated input file and print a summary of its columns.
df = pd.read_csv(FILE_TO_READ, delimiter='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7224 entries, 0 to 7223
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   usage_id              7224 non-null   object
 1   word                  7224 non-null   object
 2   orth                  7224 non-null   object
 3   sense_id              3960 non-null   object
 4   gloss                 7224 non-null   object
 5   example               7224 non-null   object
 6   indices_target_token  7224 non-null   object
 7   date                  7224 non-null   int64 
 8   period                7224 non-null   object
dtypes: int64(1), object(8)
memory usage: 508.1+ KB


In [148]:
# Read the embeddings JSON file and turn its content into a tensor.
with open(embeddings_file, 'r') as handle:
    embeddings_list = json.loads(handle.read())

embeddings = torch.tensor(embeddings_list)
# There must be exactly one embedding per row of the input frame.
assert len(df) == embeddings.shape[0], "Embeddings count must be the same as the df length"
embeddings.shape

JSONDecodeError: Expecting ',' delimiter: line 1 column 126627656 (char 126627655)

In [None]:
# Store one embedding tensor per row, in the same order as the rows of df.
df['embedding'] = [vector for vector in embeddings]
# Sanity check: the first stored vector equals the first embedding.
assert all(embeddings[0] == df['embedding'][0])

## 2. Clustering algorithm with the embeddings 

In [None]:
def get_silhouette_score(tensors, labels):
    """Score a clustering of embeddings with the silhouette coefficient.

    Args:
        tensors: iterable of torch tensors, one per sample; each one is
            flattened to a 1-D vector before scoring.
        labels: cluster label of every sample, in the same order as ``tensors``.

    Returns:
        The value returned by ``sklearn.metrics.silhouette_score`` computed
        with the Euclidean metric.
    """
    flattened = [tensor.flatten().numpy() for tensor in tensors]
    features = np.array(flattened)
    return silhouette_score(features, labels=labels, metric='euclidean')

def KMeans_clustering(df):
    """Cluster the rows of ``df`` with KMeans, picking k by silhouette score.

    Candidate values of k start at the number of distinct non-null
    ``sense_id`` values (never below 1) and stop before that number plus the
    count of rows whose ``sense_id`` is null. When that range is empty (for
    example, every row already has a ``sense_id``), a single KMeans run with
    k equal to the lower bound is used instead of failing.

    Args:
        df: DataFrame with a ``sense_id`` column (nulls allowed) and an
            ``embedding`` column holding one vector per row.

    Returns:
        A new DataFrame with the columns of ``df`` plus a ``cluster`` column
        holding the KMeans label of each row. ``df`` itself is not modified.
    """
    n_known_senses = df['sense_id'].nunique()
    n_unlabelled = df['sense_id'].isnull().sum()
    # KMeans needs at least one cluster, so the search never starts below 1.
    min_senses = max(1, n_known_senses)
    max_senses = n_known_senses + n_unlabelled

    candidates = list(range(min_senses, max_senses))
    if not candidates:
        # Nothing to search over: fall back to one run so a labelling is
        # always produced (k is capped by the number of rows).
        candidates = [min(min_senses, len(df))]

    # Loop-invariant: build the list of vectors once.
    vectors = df['embedding'].tolist()

    best_score = -1
    best_n = None
    best_labels = None
    for n in candidates:
        kmeans = KMeans(n_clusters=n, random_state=0, n_init='auto')
        kmeans.fit(vectors)
        labels = kmeans.labels_

        if n > 1:
            try:
                silhouette_avg = get_silhouette_score(df['embedding'], labels)
            except ValueError:
                # silhouette_score rejects labelings with an invalid number of
                # distinct labels. This happens with glosses because they may
                # have exactly the same embedding. Such a k is deliberately
                # given a very high score so that it is selected.
                silhouette_avg = 1e6
        else:
            # TODO: is 0 really the right score for a single cluster?
            silhouette_avg = 0

        # The first candidate is always accepted, so a result exists even if
        # every score is as low as the initial best_score.
        if best_n is None or silhouette_avg > best_score:
            best_score = silhouette_avg
            best_n = n
            best_labels = labels

    if PRINT_WORDS:
        print("Best number of clusters:", best_n, f"[{min_senses}-{max_senses}]")

    # Return a new frame instead of adding temporary columns to the input.
    return df.assign(cluster=best_labels)

def AffinityPropagation_clustering(df):
    """Cluster the rows of ``df`` with Affinity Propagation.

    Args:
        df: DataFrame with an ``embedding`` column holding one vector per row.

    Returns:
        A new DataFrame with the columns of ``df`` plus a ``cluster`` column
        holding the label assigned to each row. ``df`` itself is not modified.
    """
    # Fixed seed (the same value KMeans_clustering uses) so that repeated runs
    # of the notebook produce the same clusters.
    ap = AffinityPropagation(random_state=0)
    ap.fit(df['embedding'].tolist())
    return df.assign(cluster=ap.labels_)

def clustering(df, method="AffinityPropagation"):
    """Cluster the rows of ``df`` and write the resulting labels to ``sense_id``.

    Rows are clustered on their ``embedding`` column. Every cluster that
    contains at least one row with a known ``sense_id`` is renamed to the
    ``sense_id`` of its first such row (in row order); clusters without any
    labelled row keep their integer cluster id. The final label of each row
    replaces its ``sense_id``.

    NOTE(review): because a whole cluster takes the sense of its first
    labelled row, a row that already had a different ``sense_id`` but landed
    in that cluster has its original value overwritten. Confirm this is the
    intended behaviour.

    Args:
        df: DataFrame with ``sense_id`` and ``embedding`` columns.
        method: ``"KMeans"`` selects KMeans_clustering; any other value
            selects AffinityPropagation_clustering.

    Returns:
        A new DataFrame with the same rows as ``df``, ``sense_id`` rewritten
        and the ``cluster`` and ``embedding`` columns removed.
    """
    # Work on a copy so the caller's frame is never modified.
    if method == "KMeans":
        df_cl = KMeans_clustering(df.copy())
    else:
        df_cl = AffinityPropagation_clustering(df.copy())

    # Map each cluster id to the sense_id of its first labelled row.
    labelled_rows = df_cl.loc[df_cl['sense_id'].notna()]
    first_per_cluster = labelled_rows.drop_duplicates(subset='cluster', keep='first')
    cluster_to_sense = dict(zip(first_per_cluster['cluster'], first_per_cluster['sense_id']))

    # Build the final labels in one pass. A plain list is used so that string
    # sense ids and integer cluster ids can coexist without writing strings
    # into an integer column.
    df_cl['sense_id'] = [
        cluster_to_sense.get(cluster_id, cluster_id)
        for cluster_id in df_cl['cluster']
    ]

    return df_cl.drop(columns=['cluster', 'embedding'])

In [None]:
# Cluster the usages of each word independently and collect the results.
clustered_groups = []
for word, group in df.groupby('word'):
    if PRINT_WORDS:
        print(f"{word}: ", end="")
    group_cl = clustering(group, method=CLUSTERING_METHOD)
    # Clustering must neither add nor drop rows.
    if len(group) != len(group_cl):
        print(f"{len(group)} != {len(group_cl)} for word {word}")
    clustered_groups.append(group_cl)

# Concatenate once at the end instead of growing a frame inside the loop.
result_df = pd.concat(clustered_groups, ignore_index=True)

# groupby returns the words in sorted order; restore the row order of the
# original df by aligning on usage_id.
result_df = (
    result_df
    .set_index('usage_id')
    .reindex(df['usage_id'])
    .reset_index()
)

result_df

Unnamed: 0,usage_id,word,orth,sense_id,gloss,example,indices_target_token,date,period
0,test_fi_0,palaus,palaus,4,"""Palaus"" tässä esimerkissä tarkoittaa paluuta ...",Tobian palaus cotia murhellisten wanhembainsa ...,7:13,1750,new
1,test_fi_1,palaus,palaus,4,Palaus tarkoittaa tässä yhteydessä paluuta tai...,"Teidän Cuning:sen Maj:tinne palaus, Teidän ja ...",28:34,1750,new
2,test_fi_2,palaus,palauxesta,palaus_CWRkn3_kCjQ,paluu,[Seuraava teksti] On opetuslasten palauxesta J...,34:44,1600,old
3,test_fi_3,palaus,Palaus,4,"""Palaus"" tarkoittaa tässä esimerkissä siirtymi...","ettei sencallainen Sijrtyminen, Palaus ja Känd...",32:38,1700,new
4,test_fi_4,palaus,palaus,palaus_ef0RFR9a4Ac,"kääntymys, hengellinen kääntyminen",anna minulle yxi oikea catumus ia synnistä palaus,43:49,1543,old
...,...,...,...,...,...,...,...,...,...
7219,arificial_test_fi_7241,mutka,mutcain,mutka_OOpbXCuf97s,laulun sävelkulusta,Kappaleen sävelkulku kulkee tasaisesti eteenpä...,24:31,1600,old
7220,arificial_test_fi_7242,mutka,mutcain,mutka_OOpbXCuf97s,laulun sävelkulusta,"Laulun sävelkulku sisältää useita mutkia, kun ...",24:31,1600,old
7221,arificial_test_fi_7243,mutka,mutkall,mutka_HsIpNwQCuO8,"juoni, temppu, metku","Hän suunnitteli pienen mutkan auton eteen, jot...",59:66,1600,old
7222,arificial_test_fi_7244,mutka,mutkall,mutka_HsIpNwQCuO8,"juoni, temppu, metku","Lapsi veti hauskan mutkan, jotta voisi voittaa...",59:66,1600,old


In [None]:
# Write result_df as a tab-separated file whose name combines the input file
# name, the embedding type and the clustering method.
predictions_path = f'./predictions/{filename}-{EMBEDDING_TYPE}-{CLUSTERING_METHOD}.tsv'
result_df.to_csv(predictions_path, sep='\t', index=False)