In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import re
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import preprocessing
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
model = AutoModel.from_pretrained("bert-base-multilingual-cased")

In [3]:
df_fi_dev = pd.read_csv('../data/finnish/axolotl.dev.fi.tsv', sep='\t')
df_fi_train = pd.read_csv('../data/finnish/axolotl.train.fi.tsv', sep='\t')
df_ru_dev = pd.read_csv('../data/russian/axolotl.dev.ru.tsv', sep='\t')
df_ru_train = pd.read_csv('../data/russian/axolotl.train.ru.tsv', sep='\t')
df_fi_test = pd.read_csv('../data/test/axolotl.test.fi.tsv', sep='\t')
df_ru_test = pd.read_csv('../data/test/axolotl.test.ru.tsv', sep='\t')
df_surprise = pd.read_csv('../data/test/axolotl.test.surprise.tsv', sep='\t')

all_dfs = {"russian": df_ru_train, "finnish":  df_fi_train}

In [4]:
df_ru_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              2126 non-null   object 
 1   word                  2126 non-null   object 
 2   orth                  2126 non-null   object 
 3   sense_id              424 non-null    object 
 4   gloss                 424 non-null    object 
 5   example               1990 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  2126 non-null   object 
 8   period                2126 non-null   object 
dtypes: float64(1), object(8)
memory usage: 149.6+ KB


In [5]:
def fill_example(word, gloss, example):
    if pd.isna(example):
        return f"Определение слова {word}: {gloss}"
    else:
        return example
df_ru_train['example'] = df_ru_train.apply(lambda row: fill_example(row['word'],row['gloss'], row['example']), axis=1)

In [6]:
df = df_ru_train
df['embedding'] = None
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6494 entries, 0 to 6493
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              6494 non-null   object 
 1   word                  6494 non-null   object 
 2   orth                  6494 non-null   object 
 3   sense_id              6494 non-null   object 
 4   gloss                 6494 non-null   object 
 5   example               6494 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  6493 non-null   object 
 8   period                6493 non-null   object 
 9   embedding             0 non-null      object 
dtypes: float64(1), object(9)
memory usage: 507.5+ KB


In [7]:
def print_nice(input_ids, index):
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    tokens[index] = '\033[91m' + tokens[index] + '\033[0m'
    print(' '.join(tokens))

def find_word_containing_target(sentence, target_word):
    index = sentence.find(target_word)
    if index == -1:
        return None
    start_index = sentence.rfind(" ", 0, index) + 1 if index != 0 else 0
    end_index = sentence.find(" ", index + len(target_word)) if sentence.find(" ", index + len(target_word)) != -1 else len(sentence)
    return sentence[start_index:end_index]

In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [9]:
def get_search(sentence, word, orth=None):
        found_search = find_word_containing_target(sentence, word)
        if found_search:
            return found_search
        else:
            if orth:
                found_search = find_word_containing_target(sentence, orth)
                if found_search:
                    return found_search
                else:
                    return word
            else:
                return word

def get_target_index(search_token, tokens, tokens_lower):
    if search_token in tokens:
        return tokens.index(search_token)+1 # +1 for the [CLS] token
    elif f"##{search_token}" in tokens:
        return tokens.index(f"##{search_token}")+1
    elif search_token in tokens_lower:
            return tokens_lower.index(search_token)+1 # +1 for the [CLS] token
    elif f"##{search_token}" in tokens_lower:
            return tokens_lower.index(f"##{search_token}")+1
    else:
        return -1

In [10]:
embeddings = []
orth = ""

word_idx = 0

for index, row in df.iterrows():
    if orth != "" and orth != row['word']:
        print(f"{orth}")
        word_idx += 1
    if word_idx == 5:
        break

    orth = row['word']          # target word
    word = row['orth']          # usage of the target word in the example
    sense_id = row['sense_id']  # sense of the target word in the example
    gloss = row['gloss']        # definition of the target word
    example = row['example']    # usage example of the target word

    tokens = tokenizer.tokenize(example)
    tokens_lower = [i.lower() for i in tokenizer.tokenize(example)]

    search = get_search(example, word, orth)
    search_token = tokenizer.tokenize(search)[0]
    target_index = get_target_index(search_token, tokens, tokens_lower)
    if target_index == -1:
        search = get_search(example, search_token)
        search_token = tokenizer.tokenize(search)[0]
        target_index = get_target_index(search_token, tokens, tokens_lower)
        if target_index == -1:
            print(f"{index}. {search_token} not found, taking [CLS] token... ({tokens})")
            target_index = 0

    inputs = tokenizer(example, return_tensors="pt")
    print_nice(inputs['input_ids'][0], target_index)

    with torch.no_grad():
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
    
    df.at[index, 'embedding'] = outputs.last_hidden_state[0][target_index]/torch.norm(outputs.last_hidden_state[0][target_index])

[CLS] О ##пределение слова [91mмир[0m ##о : Мир ##о , м ##v ##ро с ##р . , б ##лаг ##ово ##нное ма ##сло , па ##ху ##чая ма ##сть или души ##сто ##е ма ##сл ##ян ##ист ##ое веществ ##о . [SEP]


[CLS] Затем к ##люч ##ар ##ь при ##гла ##шает , чтобы женщины вышли из ал ##тар ##я [UNK] Под ##а ##ёт свят ##ое [91mмир[0m ##о . А ##рх ##ие ##рей пом ##азу ##ет крест ##оо ##бра ##зно сначала т ##ра ##пе ##зу в тех местах , где во время лит ##ург ##ии стоит е ##ван ##гел ##ие , диск ##ос и по ##ти ##р . [SEP]
[CLS] Они в ##езде одним м ##v ##ром ма ##зан ##ы . М ##v ##ром по ##крыт ( т . е . пом ##азан ) , с [91mмир[0m ##ом за ##с ##пит . Р ##ого ##ж ##цы в [SEP]
[CLS] М ##о ##щи святого х ##ран ##ятся здесь до сих пор , про ##до ##лж ##ая исто ##чать [91mмир[0m ##о . [SEP]
[CLS] И ##осиф у ##вид ##ел , что по и ##кон ##е Бог ##оро ##ди ##цы тек ##ли стр ##уй ##ки [91mмира[0m , которое и изд ##ава ##ло б ##лаг ##оу ##хан ##ие . [SEP]
миро
[CLS] К ##аза ##лось , перед р ##ево ##лю ##цией у ##де ##сят ##ери ##лось о ##жи ##дание , пред ##чу ##в ##ствие , пред ##вид ##ение золото ##го века , р ##ая на земле ; каз ##алось , вся ду ##ша на ##ции и ##сс ##туп ##лен ##но бр ##еди ##

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6494 entries, 0 to 6493
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              6494 non-null   object 
 1   word                  6494 non-null   object 
 2   orth                  6494 non-null   object 
 3   sense_id              6494 non-null   object 
 4   gloss                 6494 non-null   object 
 5   example               6494 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  6493 non-null   object 
 8   period                6493 non-null   object 
 9   embedding             35 non-null     object 
dtypes: float64(1), object(9)
memory usage: 507.5+ KB


In [12]:
df = df.dropna(subset=['embedding'])

In [15]:
from sklearn.cluster import KMeans, AffinityPropagation

def get_silhouette_score(tensors, labels):
    # Convert the torch tensors to numpy arrays and reshape them
    X = np.array([tensor.flatten().numpy() for tensor in tensors])
    # Calculate the silhouette score
    score = silhouette_score(X, labels=labels, metric='euclidean')
    return score

def KMeans_clustering(df):
    best_score = -1
    best_n = 0
    min_senses = df['sense_id'].nunique()
    max_senses = min_senses + df['sense_id'].isnull().sum()
    for n in range(min_senses,max_senses):
        kmeans = KMeans(n_clusters=n, random_state=0)
        kmeans.fit(df['embedding'].tolist())
        df[f'clusters_{n}'] = None
        df[f'clusters_{n}'] = kmeans.labels_
        silhouette_avg = get_silhouette_score(df['embedding'], df[f'clusters_{n}'])
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_n = n
    print("Best number of clusters:", best_n)
    df['clusters'] = df[f'clusters_{best_n}']
    df = df.drop(columns=[f'clusters_{n}' for n in range(min_senses,max_senses)])
    return df

def AffinityPropagation_clustering(df):
    ap = AffinityPropagation()
    clusters = ap.fit(df['embedding'].tolist())
    df['clusters'] = None
    df['clusters'] = clusters.labels_
    return df

def clustering(df, method="AffinityPropagation"):
    if method == "KMeans":
        return KMeans_clustering(df)
    elif method == "AffinityPropagation":
        return AffinityPropagation_clustering(df)
    else:
        print("Invalid clustering method")
        return None

In [26]:
counter = 0
for label, group in df.groupby('word'):
    print("label:", label)
    result_df = clustering(group, method="KMeans")
    result_df
    counter+=1
    if counter == 5:
        break

label: миро
label: могильный
label: мокрица
label: моль
label: мораль


In [27]:
result_df

Unnamed: 0,usage_id,word,orth,sense_id,gloss,example,indices_target_token,date,period,embedding,clusters
30,train_ru_43,мораль,мораль,moral'_L9xfEoeFqW8,"исч., разг. нравоучение, наставление","Выглянула из двери, повиснув на костылях, пожи...",,new,new,"[tensor(-0.0339), tensor(-0.0134), tensor(0.01...",1
31,train_ru_44,мораль,мораль,moral'_L9xfEoeFqW8,"исч., разг. нравоучение, наставление","Она была их утешительницей, душеприказчицей, к...",,new,new,"[tensor(0.0087), tensor(0.0066), tensor(-0.017...",0
32,train_ru_45,мораль,мораль,moral'_L9xfEoeFqW8,"исч., разг. нравоучение, наставление",Сытая мораль ародн. дурная слава. Про меня мор...,,old,old,"[tensor(-0.0548), tensor(0.0365), tensor(0.048...",1
33,train_ru_46,мораль,мораль,moral'_L9xfEoeFqW8,"исч., разг. нравоучение, наставление",Сегодня в нашем обществе насаждается потребите...,,new,new,"[tensor(-0.0489), tensor(0.0271), tensor(0.033...",1
34,train_ru_47,мораль,мораль,moral'_eKPPBHOtdOI,исч. вывод из чего-нибудь; нравственный урок,Мораль сей басни проста как правда: такое в жи...,,new,new,"[tensor(-0.0203), tensor(0.0253), tensor(-0.00...",2
