# Get the embeddings to work with the clustering algorithm

In [1]:
AVAILABLE_GPU = 2  # physical GPU to expose to this process; pick the one showing 0% usage

import os
# These variables are set here, before the cell that imports torch, so that the
# GPU mask is already in place when the CUDA-using library is loaded.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # number GPUs by PCI bus id
os.environ["CUDA_VISIBLE_DEVICES"] = f"{AVAILABLE_GPU}"
# Only one GPU is visible after masking, and visible devices are renumbered from 0,
# so the framework-level device string must point at index 0, not the physical id.
tf_device = '/gpu:0'

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import re
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

## 1. Select the model and load the data

In [3]:
FILE_TO_READ = './data/russian/axolotl.dev.ru.tsv'
PRINT_EACH_ROW = True

# The language code is the second-to-last dot-separated piece of the path
# ('ru' for the path above).
language = FILE_TO_READ.rsplit('.', 2)[-2]
# The output name is the file's base name with its final extension removed.
filename = FILE_TO_READ.rsplit('/', 1)[-1].rpartition('.')[0]
language, filename

('ru', 'axolotl.dev.ru')

In [None]:
# Hugging Face checkpoint for each supported language code. Any other code
# falls back to the German checkpoint.
checkpoint_by_language = {
    "ru": "DeepPavlov/rubert-base-cased",
    "fi": "TurkuNLP/bert-base-finnish-cased-v1",
}
checkpoint = checkpoint_by_language.get(language, "google-bert/bert-base-german-cased")

# Tokenizer and model must come from the same checkpoint so the vocabularies match.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

model.to(device)

In [5]:
# The dataset is tab-separated; load it whole and show the column summary.
df = pd.read_csv(FILE_TO_READ, delimiter='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 0 to 2025
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              2026 non-null   object 
 1   word                  2026 non-null   object 
 2   orth                  2026 non-null   object 
 3   sense_id              2026 non-null   object 
 4   gloss                 2026 non-null   object 
 5   example               1912 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  2026 non-null   object 
 8   period                2026 non-null   object 
dtypes: float64(1), object(8)
memory usage: 142.6+ KB


## 2. Clean and fix the data

When a row has no value in the "example" column, its "gloss" is used instead, wrapped in a sentence like "Definition of WORD: GLOSS" written in the language of the dataset:

In [6]:
# Template used when a row has no usage example; the two slots are (word, gloss).
# Any language other than "ru" or "fi" gets the German template.
prompt = {
    "ru": "Определение слова {}: {}",
    "fi": "Sanan {} määritelmä: {}",
}.get(language, "Definition von {}: {}")

print(prompt.format("word", "gloss"))

def fill_example(word, gloss, example):
    """Return `example`, or the prompt built from `word` and `gloss` when it is missing.

    "Missing" is whatever `pd.isna` reports as NA. The template is read from the
    module-level `prompt` at call time.
    """
    return prompt.format(word, gloss) if pd.isna(example) else example

Определение слова word: gloss


In [7]:
# Row by row: keep an existing example, otherwise build one from word + gloss.
df['example'] = [
    fill_example(row_word, row_gloss, row_example)
    for row_word, row_gloss, row_example in zip(df['word'], df['gloss'], df['example'])
]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2026 entries, 0 to 2025
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              2026 non-null   object 
 1   word                  2026 non-null   object 
 2   orth                  2026 non-null   object 
 3   sense_id              2026 non-null   object 
 4   gloss                 2026 non-null   object 
 5   example               2026 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  2026 non-null   object 
 8   period                2026 non-null   object 
dtypes: float64(1), object(8)
memory usage: 142.6+ KB


In [46]:
def print_nice(input_ids, index, index_end=None):
    """Print the tokens of `input_ids` on one line with the target span in blue.

    Args:
        input_ids: token ids, converted with the module-level `tokenizer`.
        index: position of the first token to highlight.
        index_end: position of the last token to highlight (inclusive). When
            omitted, only the token at `index` is highlighted.
    """
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    # `is None` rather than `== None`: identity is the correct test for the sentinel.
    last = index if index_end is None else index_end
    # ANSI escape: switch to blue before the first token, reset after the last one.
    tokens[index] = '\033[94m' + tokens[index]
    tokens[last] = tokens[last] + '\033[0m'
    print(' '.join(tokens))

def generate_substrings(word):
    """Return the proper prefixes of `word`, longest first.

    The full word and the one-character prefix are both left out, so words of
    length 2 or less yield an empty list.
    """
    return [word[:length] for length in range(len(word) - 1, 1, -1)]

def find_sub_list(sl,l): # not used because some examples have no exact coincidence
    """Locate the first occurrence of the list `sl` inside the list `l`.

    Returns a `(first, last)` pair of inclusive positions, each shifted by one
    to account for the [CLS] token. Returns None implicitly when `sl` does not
    occur in `l`.
    """
    width = len(sl)
    for start, item in enumerate(l):
        # Cheap first-element test before comparing the whole window
        if item != sl[0]:
            continue
        if l[start:start + width] == sl:
            return start + 1, start + width  # +1 for the [CLS] token

def extract_letters(input_string):
    """Drop every character of `input_string` that is outside the allowed set.

    Kept characters: Cyrillic а-я / А-Я plus ё / Ё, ASCII letters, the
    À-ÿ range, and the hyphen.
    """
    disallowed = re.compile(r'[^а-яА-Яa-zA-ZÀ-ÿёЁ-]')
    return disallowed.sub('', input_string)

def find_word_containing_target(sentence, target_word):
    """Return the space-delimited word of `sentence` containing `target_word`.

    Only the first occurrence of `target_word` is considered. The surrounding
    word is passed through `extract_letters` before being returned. Returns
    None when `target_word` does not occur in `sentence`.
    """
    hit = sentence.find(target_word)
    if hit < 0:
        return None
    # rfind gives -1 when no space precedes the hit, so +1 lands on position 0
    left = sentence.rfind(" ", 0, hit) + 1
    right = sentence.find(" ", hit + len(target_word))
    if right == -1:
        # No space after the hit: the word runs to the end of the sentence
        right = len(sentence)
    return extract_letters(sentence[left:right])

def get_search(example, word, orth=None, tokenizer=None):
    """Find the surface form in `example` that corresponds to the target word.

    Candidate strings are tried in this order: `word`, `orth`, the prefixes of
    `word`, the prefixes of `orth`, and (when a tokenizer is given) the first
    token of `word` and of `orth`. Each candidate is searched case-sensitively
    first and then case-insensitively.

    Args:
        example: sentence to search in.
        word: target word.
        orth: optional alternative spelling of the target word.
        tokenizer: optional object with a `tokenize(str) -> list` method.

    Returns:
        The matching word as found by `find_word_containing_target`, using the
        casing of `example`, or "" when no candidate matches.

    Raises:
        Exception: if a case-insensitive match cannot be located again in the
            lowercased example.
    """
    # append the words to search in the example, in the desired order
    search = [word]
    if orth and orth != word:
        search.append(orth)
    search += generate_substrings(word)
    if orth:
        search.extend([i for i in generate_substrings(orth) if i not in search])
    if tokenizer is not None:
        # `orth` is optional and a tokenizer may return no tokens: only add the
        # first token of the forms that are present and tokenize to something.
        for form in (word, orth):
            if not form:
                continue
            pieces = tokenizer.tokenize(form)
            if pieces and pieces[0] not in search:
                search.append(pieces[0])

    lowered = example.lower()  # loop-invariant, computed once
    for s in search:
        search_word = find_word_containing_target(example, s)
        if search_word:
            break
        search_word = find_word_containing_target(lowered, s.lower())
        if search_word:
            # Map the lowercase hit back onto the original casing of the example
            index = lowered.find(search_word)
            if index != -1:
                search_word = example[index:index + len(search_word)]
            else:
                # this should never happen
                raise Exception(f"Found {search_word} in {example.lower()}, but then not found...")
            break
    else:
        # Loop finished without a break: no candidate matched
        search_word = ""
    return search_word

When `orth` or `word` contains more than one word, keep only the first word:

In [47]:
# Walk over a snapshot of the column so it can be edited while looping.
for index, original_orth in list(df['orth'].items()):
    pieces = original_orth.split()
    if len(pieces) > 1:
        # Multi-word spelling: keep the first whitespace-separated piece only
        df.at[index, 'orth'] = pieces[0]
        print(f"Replaced {original_orth} with {df.at[index, 'orth']}")

In [10]:
# Walk over a snapshot of the column so it can be edited while looping.
for index, original_word in list(df['word'].items()):
    pieces = original_word.split()
    if len(pieces) > 1:
        # Multi-word target: keep the first whitespace-separated piece only
        df.at[index, 'word'] = pieces[0]
        print(f"{original_word} -> {df.at[index, 'word']}")

Check that the regex is working with all the words. The next cell should have no output:

In [48]:
chars_to_replace = [',', '.', "'", ":", ';', '?', '–', ')', ' ', '[', ']']
# Every entry is a single character, so one translation table deletes them all in a single pass
punctuation_remover = str.maketrans(dict.fromkeys(chars_to_replace))

count = 0
for index, raw_word, raw_orth in zip(df.index, df['word'], df['orth']):
    # Strip the known punctuation first, then see whether extract_letters removes anything else
    word = raw_word.translate(punctuation_remover)
    orth = raw_orth.translate(punctuation_remover)

    clean_word = extract_letters(word)
    clean_orth = extract_letters(orth)

    # Any difference means the regex rejects a character that occurs in the data
    if (clean_word, clean_orth) != (word, orth):
        print(f"{index}. {word, clean_word} - {orth, clean_orth}")
        count += 1
assert count == 0, "Must fix the regex to include all the characters of the language"

## 3. Compute the embeddings

In [51]:
embeddings = []  # one vector per dataframe row, in row order
word = ""
word_idx = 0
print(df.loc[0, "word"], df.loc[0, "orth"])

# Built outside the f-string: a backslash inside an f-string expression is a
# SyntaxError before Python 3.12.
group_separator = '\n' if PRINT_EACH_ROW else ''

for index, row in df.iterrows():
    # Print a header line whenever the target word differs from the previous row's
    if word != "" and word != row['word']:
        print(f"{group_separator}{row['word'], row['orth']}")
        word_idx += 1

    should_print = PRINT_EACH_ROW
    word = row['word']          # target word
    orth = row['orth']          # usage of the target word in the example
    example = row['example']    # usage example of the target word

    # 1. Get the target word index in the example tokenized
    search_word = get_search(example, word, orth)
    tokens = tokenizer.tokenize(example)
    if search_word == "":
        if len(example.split()) == 1:
            print(f"{index}. \033[91mNot found\033[0m {word} in '{example}' (taking only word in example)")
            target_index, target_index_end = 1, 1
        else:
            print(f"{index}. \033[91mNot found\033[0m {word} in '{example}' (taking [CLS] token)")
            target_index, target_index_end = 0, 0
    else:
        search_tokens = tokenizer.tokenize(search_word)
        try:
            target_index, target_index_end = find_sub_list(search_tokens, tokens)
        except (TypeError, IndexError) as err:
            # TypeError: find_sub_list returned None (no match), so unpacking failed.
            # IndexError: search_tokens was empty.
            # this should never happen
            raise ValueError(f"Error unpacking {search_tokens} in '{tokens}'") from err

    inputs = tokenizer(example, return_tensors="pt")
    if should_print:
        print_nice(inputs['input_ids'][0], target_index, target_index_end)

    # 2. Compute the embedding of the token
    # This step must run: the following cell asserts one embedding per dataframe row.
    with torch.no_grad():
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        # Only last_hidden_state is read below, so hidden states of every layer are not requested
        outputs = model(input_ids, attention_mask=attention_mask)

    # The vector of the first target sub-token represents the word. Copy it to the CPU
    # so the stored row does not keep the whole hidden-state tensor alive.
    embedding = outputs.last_hidden_state[0][target_index].detach().cpu().clone()
    embeddings.append(embedding)

могильник могильникъ
[CLS] [94mмог ##ил ##ки[0m , кладбище [SEP]
[CLS] О ##пределение слова [94mмог ##иль ##ник[0m : я ##рс . слово из Я ##росла ##вской губернии к ##рупный ко ##чка ##рни ##к , ко ##че ##гу ##рни ##к ; [SEP]
[CLS] О ##пределение слова [94mмог ##иль ##ник[0m : ар ##х . слово из Архангельск ##ой губернии поход ##ный чем ##ода ##нчи ##к с и ##гла ##ми , ши ##лья ##ми , ни ##тка ##ми , др ##ат ##вой и ##пр . для чин ##ки од ##еж ##и и об ##ув ##и . [SEP]
[CLS] О ##рёл из ##об ##ра ##жает реку Х ##ал ##зан , его голова [UNK] с ##кал ##у на Х ##ал ##зан ##е , а ла ##пы [94mмог ##иль ##ника[0m [UNK] мог ##ил ##у на с ##кал ##е . [SEP]
[CLS] О ##рел [94mмог ##иль ##ник[0m [SEP]
[CLS] Это я , я , который в " " Г ##ам ##лет ##е " " [94mмог ##иль ##щика[0m играл ! " " [SEP]
[CLS] О ##пределение слова [94mмог ##иль ##ник[0m : Ж ##ел ##то ##пе ##гий ж ##ук Si ##lp ##ha , за ##рыва ##ющий м ##ел ##кую м ##ерт ##ве ##чину . [SEP]
[CLS] В э ##кс ##позиции представлен ин 

In [15]:
assert len(embeddings) == len(df), "Embeddings and dataframe have different lengths"
# torch.stack expects a sequence of tensors; skipping it when `embeddings` is already
# a stacked tensor lets this cell be re-run without error.
if not torch.is_tensor(embeddings):
    embeddings = torch.stack(embeddings)
embeddings.shape

torch.Size([2126, 768])

In [16]:
embeddings_list = embeddings.tolist()

json_file_path = f'./embeddings/{filename}.json'

# Create the output folder if needed so open() does not fail when it is missing
os.makedirs(os.path.dirname(json_file_path), exist_ok=True)

# Write the embeddings to a JSON file
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(embeddings_list, json_file)
