# Clustering algorithm

In [118]:
import torch
import pandas as pd
import json
from sklearn.cluster import KMeans, AffinityPropagation
from sklearn.metrics import silhouette_score
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import os

## 1. Load the data and the embeddings

In [119]:
# --- Run configuration ---
# Input TSV; the language code and the embeddings file name are derived from it below.
FILE_TO_READ = './data/augmented/axolotl.test.fi.tsv'
# Sub-folder of ./embeddings to read from: 'examples', 'glosses' or 'concatenated'.
EMBEDDING_TYPE = 'examples'
# When True, the clustering code prints per-word progress and scores.
SHOULD_PRINT = False
# 'KMeans' or 'AffinityPropagation'.
CLUSTERING_METHOD = 'KMeans'
# Criterion used to pick the number of KMeans clusters:
# 'inertia', 'silhouette_score' or 'compactness'.
CLUSTERING_METRIC = 'inertia'
# When True, rows whose usage_id marks them as augmented/artificial are dropped.
REMOVE_AUGMENTATION = False

# Second-to-last dot-separated token of the path (e.g. 'fi' in '...test.fi.tsv').
language = FILE_TO_READ.rsplit('.', 2)[-2]
# Base name of the input file without directories and without its last extension.
filename = FILE_TO_READ.rsplit('/', 1)[-1].rpartition('.')[0]
embeddings_file = f"./embeddings/{EMBEDDING_TYPE}/{filename}.json"
language, embeddings_file

('fi', './embeddings/examples/axolotl.test.fi.json')

In [120]:
# Read the tab-separated usage table and print a summary of its columns.
df = pd.read_csv(FILE_TO_READ, delimiter='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7224 entries, 0 to 7223
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   usage_id              7224 non-null   object
 1   word                  7224 non-null   object
 2   orth                  7224 non-null   object
 3   sense_id              3960 non-null   object
 4   gloss                 7224 non-null   object
 5   example               7224 non-null   object
 6   indices_target_token  7224 non-null   object
 7   date                  7224 non-null   int64 
 8   period                7224 non-null   object
dtypes: int64(1), object(8)
memory usage: 508.1+ KB


In [121]:
# Load the precomputed embeddings from JSON and turn them into a tensor.
with open(embeddings_file, 'r') as json_file:
    embeddings_list = json.load(json_file)

embeddings = torch.tensor(embeddings_list)
# There must be exactly one embedding per row of the usage table.
assert len(embeddings) == len(df), "Embeddings count must be the same as the df length"
embeddings.shape

torch.Size([7224, 768])

In [122]:
# Store one embedding tensor per row, then spot-check that the first row matches.
df['embedding'] = [vector for vector in embeddings]
assert bool((df.loc[0, 'embedding'] == embeddings[0]).all())

In [123]:
# Optionally drop every row whose usage_id contains one of the augmentation markers.
# 'arificial' (sic) is kept as its own marker: usage ids with that spelling occur in the data.
if REMOVE_AUGMENTATION:
    for marker in ("augmented", "artificial", "arificial"):
        df = df[~df.usage_id.str.contains(marker)]
df

Unnamed: 0,usage_id,word,orth,sense_id,gloss,example,indices_target_token,date,period,embedding
0,test_fi_0,palaus,palaus,,"""Palaus"" tässä esimerkissä tarkoittaa paluuta ...",Tobian palaus cotia murhellisten wanhembainsa ...,7:13,1750,new,"[tensor(0.2277), tensor(0.5365), tensor(1.3932..."
1,test_fi_1,palaus,palaus,,Palaus tarkoittaa tässä yhteydessä paluuta tai...,"Teidän Cuning:sen Maj:tinne palaus, Teidän ja ...",28:34,1750,new,"[tensor(0.3357), tensor(0.1365), tensor(0.7251..."
2,test_fi_2,palaus,palauxesta,palaus_CWRkn3_kCjQ,paluu,[Seuraava teksti] On opetuslasten palauxesta J...,34:44,1600,old,"[tensor(0.7777), tensor(-0.5342), tensor(-0.64..."
3,test_fi_3,palaus,Palaus,,"""Palaus"" tarkoittaa tässä esimerkissä siirtymi...","ettei sencallainen Sijrtyminen, Palaus ja Känd...",32:38,1700,new,"[tensor(0.6432), tensor(0.0278), tensor(-0.329..."
4,test_fi_4,palaus,palaus,palaus_ef0RFR9a4Ac,"kääntymys, hengellinen kääntyminen",anna minulle yxi oikea catumus ia synnistä palaus,43:49,1543,old,"[tensor(1.0439), tensor(-0.3224), tensor(0.808..."
...,...,...,...,...,...,...,...,...,...,...
7219,arificial_test_fi_7241,mutka,mutcain,mutka_OOpbXCuf97s,laulun sävelkulusta,Kappaleen sävelkulku kulkee tasaisesti eteenpä...,24:31,1600,old,"[tensor(-0.0438), tensor(0.0402), tensor(-0.57..."
7220,arificial_test_fi_7242,mutka,mutcain,mutka_OOpbXCuf97s,laulun sävelkulusta,"Laulun sävelkulku sisältää useita mutkia, kun ...",24:31,1600,old,"[tensor(-0.2775), tensor(0.5760), tensor(0.126..."
7221,arificial_test_fi_7243,mutka,mutkall,mutka_HsIpNwQCuO8,"juoni, temppu, metku","Hän suunnitteli pienen mutkan auton eteen, jot...",59:66,1600,old,"[tensor(-0.4242), tensor(0.4710), tensor(-1.60..."
7222,arificial_test_fi_7244,mutka,mutkall,mutka_HsIpNwQCuO8,"juoni, temppu, metku","Lapsi veti hauskan mutkan, jotta voisi voittaa...",59:66,1600,old,"[tensor(0.8255), tensor(-0.2037), tensor(-0.62..."


## 2. Clustering algorithm with the embeddings 

In [124]:
def get_silhouette_score(tensors, kmeans):
    """Return the Euclidean silhouette score of a fitted clustering.

    Args:
        tensors: Iterable of embeddings; each item is flattened and converted
            with ``.flatten().numpy()`` (e.g. torch tensors).
        kmeans: Fitted estimator exposing ``n_clusters`` and ``labels_``.

    Returns:
        ``0`` when the model has exactly one cluster (``silhouette_score`` is
        not called in that case), otherwise the result of
        ``silhouette_score`` on the flattened embeddings and the model labels.
    """
    cluster_count, assignments = kmeans.n_clusters, kmeans.labels_
    if cluster_count == 1:
        # Single-cluster solutions get a fixed score of 0.
        return 0
    points = np.array([item.flatten().numpy() for item in tensors])
    return silhouette_score(points, labels=assignments, metric='euclidean')

def get_compactness_score(tensors, kmeans):
    """Average, over clusters, of the mean point-to-centroid Euclidean distance.

    The original author marked this metric as "not the best".

    Args:
        tensors: Iterable of embeddings; each item is flattened with ``.flatten()``.
        kmeans: Fitted estimator exposing ``cluster_centers_`` and ``labels_``.

    Returns:
        The sum of the per-cluster mean distances divided by the number of
        centroids. Clusters without members add nothing to the sum but are
        still counted in the divisor.
    """
    centers = kmeans.cluster_centers_
    assignments = kmeans.labels_
    points = np.array([item.flatten() for item in tensors])

    per_cluster_means = []
    for cluster_id, center in enumerate(centers):
        members = points[assignments == cluster_id]
        if len(members) == 0:
            continue
        distances = np.linalg.norm(members - center, axis=1)
        per_cluster_means.append(np.mean(distances))
    return sum(per_cluster_means) / len(centers)

def elbow_method(inertia_values):
    """Pick a number of clusters from successive inertias using the largest drop.

    ``inertia_values[i]`` is taken to be the inertia obtained with ``i + 1``
    clusters. The function finds the largest decrease between two consecutive
    values and returns the cluster count on the right-hand side of that step,
    so the result is at least 2 whenever two or more values are given.

    Args:
        inertia_values: Sequence of inertias for 1, 2, 3, ... clusters.

    Returns:
        The selected number of clusters. With a single value the only possible
        answer, 1, is returned.

    Raises:
        ValueError: If ``inertia_values`` is empty.
    """
    if len(inertia_values) == 0:
        raise ValueError("elbow_method needs at least one inertia value")
    if len(inertia_values) == 1:
        # Only the 1-cluster model was fitted; there is no drop to compare.
        return 1

    deltas = [
        inertia_values[i - 1] - inertia_values[i]
        for i in range(1, len(inertia_values))
    ]
    largest_drop_index = deltas.index(max(deltas))  # first one wins on ties
    # +1 converts the 0-based index to a cluster count, +1 takes the right-hand point of the step.
    return largest_drop_index + 2

def KMeans_clustering(df,metric):
    """Cluster the rows of ``df`` with KMeans and choose the number of clusters.

    KMeans is fitted on ``df['embedding']`` for every ``n`` from 1 to
    ``len(df) - 1``. The retained ``n`` depends on ``metric``:

    * ``'inertia'``: ``elbow_method`` applied to the collected inertias;
    * ``'silhouette_score'`` / ``'compactness'``: the ``n`` with the highest
      ``compute_metric`` score (the first one wins on ties).

    Groups too small to offer a choice (fewer than three rows) are put in a
    single cluster.

    Args:
        df: DataFrame with an ``embedding`` column holding one vector per row.
            It is not modified.
        metric: ``'inertia'``, ``'silhouette_score'`` or ``'compactness'``.

    Returns:
        A new DataFrame: ``df`` plus a ``cluster`` column with the labels of
        the selected model.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    if metric not in ('inertia', 'silhouette_score', 'compactness'):
        raise ValueError(f"Unsupported metric: {metric!r}")

    points = df['embedding'].tolist()  # loop-invariant, built once
    labels_by_n, inertias = {}, []
    best_score, best_n = -1, 0

    for n in range(1, len(df)):
        kmeans = KMeans(n_clusters=n, random_state=0, n_init='auto')
        kmeans.fit(points)
        # Keep the labels outside the frame: avoids mutating the caller's data
        # and adding one temporary column per candidate n.
        labels_by_n[n] = kmeans.labels_
        if metric == 'inertia':
            inertias.append(kmeans.inertia_)
            continue
        score = compute_metric(df['embedding'], kmeans, metric)
        # NOTE(review): for 'compactness' (a mean distance to the centroid) a
        # larger score means looser clusters, so keeping the maximum looks
        # inverted. Left unchanged pending confirmation of the intent.
        if score > best_score:
            best_score, best_n = score, n
        if SHOULD_PRINT:
            print(f"Score for {n} clusters: {score}")

    if len(labels_by_n) < 2:
        # Zero or one candidate model (groups of at most two rows): nothing to
        # select, so every row goes to cluster 0.
        best_n = 1
        labels = labels_by_n.get(1, np.zeros(len(df), dtype=int))
    else:
        if metric == 'inertia':
            best_n = elbow_method(inertias)
        labels = labels_by_n[best_n]

    if SHOULD_PRINT:
        print("Best number of clusters:", best_n, f"[max {len(df)}]")
    return df.assign(cluster=labels)

def AffinityPropagation_clustering(df):
    """Cluster the rows of ``df`` with affinity propagation.

    Args:
        df: DataFrame with an ``embedding`` column holding one vector per row.
            It is not modified.

    Returns:
        A new DataFrame: ``df`` plus a ``cluster`` column with the fitted labels.
    """
    # Fixed seed so repeated runs give the same clusters, matching the
    # random_state=0 used for KMeans in this notebook.
    ap = AffinityPropagation(random_state=0)
    clusters = ap.fit(df['embedding'].tolist())
    return df.assign(cluster=clusters.labels_)

def compute_metric(tensors, kmeans, method="silhouette_score"):
    """Score a fitted clustering with the requested metric.

    Args:
        tensors: Embeddings passed through to the scoring function.
        kmeans: Fitted clustering model passed through to the scoring function.
        method: ``"silhouette_score"`` selects ``get_silhouette_score``; any
            other value selects ``get_compactness_score``.

    Returns:
        The value returned by the selected scoring function.
    """
    if method == "silhouette_score":
        return get_silhouette_score(tensors, kmeans)
    return get_compactness_score(tensors, kmeans)

def clustering(df, method="AffinityPropagation", metric="silhouette_score"):
    """Cluster one group of usages and turn the clusters into sense ids.

    The rows are clustered with ``KMeans_clustering`` (``method == "KMeans"``)
    or ``AffinityPropagation_clustering`` (any other value). Each cluster that
    contains at least one row with a known ``sense_id`` is then named after
    the sense of its first such row (in row order). Clusters without any
    labelled row keep their integer cluster label.

    Every row of a cluster receives the cluster's name, so a row that already
    had a ``sense_id`` may end up with a different one.

    Args:
        df: DataFrame with ``sense_id`` and ``embedding`` columns.
        method: Clustering algorithm, ``"KMeans"`` or ``"AffinityPropagation"``.
        metric: Model-selection metric forwarded to ``KMeans_clustering``.

    Returns:
        A DataFrame with ``sense_id`` replaced by the cluster names and with
        the ``cluster`` and ``embedding`` columns removed.
    """
    if method == "KMeans":
        df_cl = KMeans_clustering(df,metric)
    else:
        df_cl = AffinityPropagation_clustering(df)

    # First known sense per cluster. A lookup table replaces the per-row
    # in-place writes of strings into the integer 'cluster' column, which
    # relied on pandas silently upcasting the column dtype.
    labelled = df_cl[df_cl['sense_id'].notna()]
    first_sense = {}
    for cluster_label, sense in zip(labelled['cluster'], labelled['sense_id']):
        first_sense.setdefault(cluster_label, sense)

    # Clusters with no labelled row fall back to their own integer label.
    sense_ids = df_cl['cluster'].map(lambda label: first_sense.get(label, label))
    return df_cl.assign(sense_id=sense_ids).drop(columns=['cluster', 'embedding'])

In [125]:
# Cluster the usages of each target word separately. The per-word results are
# collected in a list and concatenated once, instead of growing a DataFrame
# inside the loop.
clustered_groups = []
for word, group in df.groupby('word'):
    if SHOULD_PRINT:
        print(f"{word}: ", end="")
    group_cl = clustering(group, method=CLUSTERING_METHOD, metric=CLUSTERING_METRIC)
    clustered_groups.append(group_cl)
    if len(group) != len(group_cl):
        # Clustering is expected to return one row per input row; report any mismatch.
        print(f"{len(group)} != {len(group_cl)} for word {word}")

# Put the rows back in the order of the input table, keyed by usage_id.
result_df = (
    pd.concat(clustered_groups, ignore_index=True)
    .set_index('usage_id')
    .reindex(df['usage_id'])
    .reset_index()
)

result_df

Unnamed: 0,usage_id,word,orth,sense_id,gloss,example,indices_target_token,date,period
0,test_fi_0,palaus,palaus,palaus_ef0RFR9a4Ac,"""Palaus"" tässä esimerkissä tarkoittaa paluuta ...",Tobian palaus cotia murhellisten wanhembainsa ...,7:13,1750,new
1,test_fi_1,palaus,palaus,palaus_ef0RFR9a4Ac,Palaus tarkoittaa tässä yhteydessä paluuta tai...,"Teidän Cuning:sen Maj:tinne palaus, Teidän ja ...",28:34,1750,new
2,test_fi_2,palaus,palauxesta,palaus_CWRkn3_kCjQ,paluu,[Seuraava teksti] On opetuslasten palauxesta J...,34:44,1600,old
3,test_fi_3,palaus,Palaus,palaus_CWRkn3_kCjQ,"""Palaus"" tarkoittaa tässä esimerkissä siirtymi...","ettei sencallainen Sijrtyminen, Palaus ja Känd...",32:38,1700,new
4,test_fi_4,palaus,palaus,palaus_ef0RFR9a4Ac,"kääntymys, hengellinen kääntyminen",anna minulle yxi oikea catumus ia synnistä palaus,43:49,1543,old
...,...,...,...,...,...,...,...,...,...
7219,arificial_test_fi_7241,mutka,mutcain,mutka_JoEBKgeOiLc,laulun sävelkulusta,Kappaleen sävelkulku kulkee tasaisesti eteenpä...,24:31,1600,old
7220,arificial_test_fi_7242,mutka,mutcain,mutka_JoEBKgeOiLc,laulun sävelkulusta,"Laulun sävelkulku sisältää useita mutkia, kun ...",24:31,1600,old
7221,arificial_test_fi_7243,mutka,mutkall,mutka_JoEBKgeOiLc,"juoni, temppu, metku","Hän suunnitteli pienen mutkan auton eteen, jot...",59:66,1600,old
7222,arificial_test_fi_7244,mutka,mutkall,mutka_JoEBKgeOiLc,"juoni, temppu, metku","Lapsi veti hauskan mutkan, jotta voisi voittaa...",59:66,1600,old


In [126]:
# Predictions are written to ./predictions/<embedding type>-<metric>[-naug]/<input name>.tsv
# NOTE(review): the folder name does not include CLUSTERING_METHOD, so KMeans and
# AffinityPropagation runs with the same settings write to the same file.
outfolder = f'./predictions/{EMBEDDING_TYPE}-{CLUSTERING_METRIC}{"-naug" if REMOVE_AUGMENTATION else ""}'
# exist_ok=True creates the folder if needed without a separate existence check.
os.makedirs(outfolder, exist_ok=True)

outfile = f'{outfolder}/{filename}.tsv'
result_df.to_csv(outfile, sep='\t', index=False)
print(f'Saved to: {outfile}')

Saved to: ./predictions/examples-inertia/axolotl.test.fi.tsv
