# Get the embeddings to work with the clustering algorithm

In [1]:
AVAILABLE_GPU = 1

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]= f"{AVAILABLE_GPU}" # ALWAYS look the one with 0% usage
tf_device=f'/gpu:{AVAILABLE_GPU}'

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import re
import json
import unicodedata

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## 1. Select the model and load the data

In [4]:
FILE_TO_READ = './data/russian/axolotl.train.ru.tsv'
PRINT_EACH_ROW = True

language = FILE_TO_READ.split('.')[-2]
filename = FILE_TO_READ.split('/')[-1].split('.')[0:-1]
filename = '.'.join(filename)
language, filename

('ru', 'axolotl.train.ru')

In [5]:
if (language == "ru"):
    tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
    model = AutoModel.from_pretrained("DeepPavlov/rubert-base-cased")
elif (language == "fi"):
    tokenizer = AutoTokenizer.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")
    model = AutoModel.from_pretrained("TurkuNLP/bert-base-finnish-cased-v1")
else:
    tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-german-cased")
    model = AutoModel.from_pretrained("google-bert/bert-base-german-cased")

model.to(device);

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
df = pd.read_csv(FILE_TO_READ, sep='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6494 entries, 0 to 6493
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              6494 non-null   object 
 1   word                  6494 non-null   object 
 2   orth                  6494 non-null   object 
 3   sense_id              6494 non-null   object 
 4   gloss                 6494 non-null   object 
 5   example               6093 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  6493 non-null   object 
 8   period                6493 non-null   object 
dtypes: float64(1), object(8)
memory usage: 456.7+ KB


## 2. Clean and fix the data

When there are no "example" column, the "gloss" will be taken, with a sentence like "Definition of WORD: GLOSS" depending on the language:

In [7]:
if language == "ru":
    prompt = "Определение слова {}: {}"
elif language == "fi":
    prompt = "Sanan {} määritelmä: {}"
else:
    prompt = "Definition von {}: {}"

print(prompt.format("word", "gloss"))

def fill_example(word, gloss, example):
    if pd.isna(example):
        return prompt.format(word, gloss)
    else:
        return example

Определение слова word: gloss


In [8]:
df['example'] = df.apply(lambda row: fill_example(row['word'],row['gloss'],row['example']), axis=1)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6494 entries, 0 to 6493
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              6494 non-null   object 
 1   word                  6494 non-null   object 
 2   orth                  6494 non-null   object 
 3   sense_id              6494 non-null   object 
 4   gloss                 6494 non-null   object 
 5   example               6494 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  6493 non-null   object 
 8   period                6493 non-null   object 
dtypes: float64(1), object(8)
memory usage: 456.7+ KB


In [29]:
def print_nice(input_ids, index, index_end=None, pad_token=None):
    if pad_token is not None:
        input_ids = [token for token in input_ids if token != pad_token]
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    if index_end is None:
        tokens[index] = '\033[94m' + tokens[index] + '\033[0m'
    else:
        tokens[index] = '\033[94m' + tokens[index]
        tokens[index_end] = tokens[index_end] + '\033[0m'
    print(' '.join(tokens))

def generate_substrings(word):
    substrings = []
    for i in range(len(word), 0, -1):
        substrings.append(word[:i])
    return substrings[1:-1]

def find_sub_list(sl,l): # not used because some examples have no exact coincidence
    sll=len(sl)
    for ind in (i for i,e in enumerate(l) if e==sl[0]):
        if l[ind:ind+sll]==sl:
            return ind+1,ind+sll # +1 for the [CLS] token

def remove_accents(word):
    normalized_word = unicodedata.normalize('NFD', word)
    cleaned_word = re.sub(r'[\u0300-\u036f&&[^й]]', '', normalized_word)
    return cleaned_word

def extract_letters(input_string):
    return re.sub(r'[^а-яА-Яa-zA-ZÀ-ÿёЁ\u0300-\u036f-]', '', input_string)

In cases where orth or word have multiple words, replace with the first word only:

In [10]:
for index, row in df.iterrows():
    if len(row['orth'].split())>1:
        first_occurrence = row['orth'].split()[0]
        df.at[index, 'orth'] = first_occurrence
        print(f"Replaced {row['orth']} with {df.at[index, 'orth']}")

Replaced нагнетать нагнести with нагнетать
Replaced нагнетать нагнести with нагнетать
Replaced нагнетать нагнести with нагнетать
Replaced нагнетать нагнести with нагнетать
Replaced облекать облечь  црк  облещи   облачить облачать   оболокать оболочь with облекать
Replaced облекать облечь  црк  облещи   облачить облачать   оболокать оболочь with облекать
Replaced облекать облечь  црк  облещи   облачить облачать   оболокать оболочь with облекать
Replaced облекать облечь  црк  облещи   облачить облачать   оболокать оболочь with облекать
Replaced облекать облечь  црк  облещи   облачить облачать   оболокать оболочь with облекать
Replaced облекать облечь  црк  облещи   облачить облачать   оболокать оболочь with облекать
Replaced облекать облечь  црк  облещи   облачить облачать   оболокать оболочь with облекать
Replaced облекать облечь  црк  облещи   облачить облачать   оболокать оболочь with облекать
Replaced облекать облечь  црк  облещи   облачить облачать   оболокать оболочь with облекать


In [11]:
for index, row in df.iterrows():
    if len(row['word'].split())>1:
        first_occurrence = row['word'].split()[0]
        df.at[index, 'word'] = first_occurrence
        print(f"{row['word']} -> {df.at[index, 'word']}")

Check that the regex is working with all the words. The next cell should have no output:

In [12]:
count = 0
chars_to_replace = [',', '.', "'", ":", ';', '?', '–', ')', ' ', '[', ']']

for index, row in df.iterrows():
    # Apply the extract_letters function to 'word' and 'orth' columns
    word = row['word']
    orth = row['orth']
    
    for char in chars_to_replace:
        word = word.replace(char, '')
    for char in chars_to_replace:
        orth = orth.replace(char, '')
    
    clean_word = extract_letters(word)
    clean_orth = extract_letters(orth)
    
    # Check if the cleaned versions are equal to the original values
    if clean_word != word or clean_orth != orth:
        print(f"{index}. {word, clean_word} - {orth, clean_orth}")
        count += 1
assert count == 0, "Must fix the regex to include all the characters of the language"

## 3. Compute the embeddings

In [22]:
def find_word_containing_target(sentence, target_word):
    index = sentence.find(target_word)
    if index == -1:
        return None
    start_index = sentence.rfind(" ", 0, index) + 1 if index != 0 else 0
    end_index = sentence.find(" ", index + len(target_word)) if sentence.find(" ", index + len(target_word)) != -1 else len(sentence)
    final_char = final_char = " " if target_word.endswith(" ") else ""
    return sentence[start_index:end_index].split()[0] + final_char

def get_search(example, word, orth=None, print_search=False):
    # append the words to search in the example, in the desired ORDER
    # 1 - the word (with an ending character), and the word itself
    search = [f"{word} ", f"{word},", f"{word}.", word]

    # 2 - the orthographic form (with an ending character), and the orthographic form itself
    if orth and orth != word:
        search += [f"{orth} ", f"{orth},", f"{orth}.", orth]

    # 3 - all substrings of the word (i.e. выходить -> ['выходит', 'выходи', 'выход', 'выхо', 'вых', 'вы'])
    search += generate_substrings(word)

    # 4 - all substrings of the orthographic form
    if orth:
        search.extend([i for i in generate_substrings(orth) if i not in search])

    # 5 - the word without accents (with an ending character), and the word without accents itself
    unicoded_word = remove_accents(word)
    if unicoded_word != word:
        search += [f"{unicoded_word} ", f"{unicoded_word},", f"{unicoded_word}.", unicoded_word]
    
    # 6 - the orthographic form without accents (with an ending character), and the orthographic form without accents itself
    if orth and orth != word:
        unicoded_orth = remove_accents(orth)
        if unicoded_orth != orth:
            search += [f"{unicoded_orth} ", f"{unicoded_orth},", f"{unicoded_orth}.", unicoded_orth]

    # 7 - all substrings of the word without accents
    if unicoded_word != word:
        search.extend([i for i in generate_substrings(unicoded_word) if i not in search])

    # 8 - all substrings of the orthographic form without accents
    if orth and orth != word and unicoded_orth != orth:
        search.extend([i for i in generate_substrings(unicoded_orth) if i not in search])

    if print_search:
        print(f"Searching for: {search}")

    # FIND the first search-string that is within the example, if any (in upper or lowercase)
    for s in search:
        search_word = find_word_containing_target(example, s)
        if search_word:
            break
        search_word = find_word_containing_target(example.lower(), s.lower())
        if search_word:
            index = example.lower().find(search_word)
            if index == -1:
                # this should never happen
                raise Exception(f"Found '{search_word}' in '{example.lower()}', but then not found...")
            else:
                search_word = example[index:index + len(search_word)]
            break
    else:
        search_word = ""
    return extract_letters(search_word)

In [31]:
embeddings = []
word = ""
word_idx = 0
print(df.loc[0, "word"], df.loc[0, "orth"])

starter_char = '\n' if PRINT_EACH_ROW else ''
for index, row in df.iterrows():
    if word != "" and word != row['word']:
        print(f"{starter_char}{row['word'], row['orth']}")
        word_idx += 1

    should_print = PRINT_EACH_ROW
    word = row['word']          # target word
    orth = row['orth']          # usage of the target word in the example
    sense_id = row['sense_id']  # sense of the target word in the example
    gloss = row['gloss']        # definition of the target word
    example = row['example']    # usage example of the target word

    # 1. Get the target word index in the example tokenized
    search_word = get_search(example, word, orth)
    tokens = tokenizer.tokenize(example)
    if search_word == "":
        if len(example.split()) == 1:
            print(f"{index}. \033[91mNot found\033[0m {word} in '{example}' (taking only word in example)")
            target_index, target_index_end = 1, 1
        else:
            print(f"{index}. \033[91mNot found\033[0m {word} in '{example}' (taking [CLS] token)")
            target_index, target_index_end = 0, 0
    else:
        search_tokens = tokenizer.tokenize(search_word)
        try:
            target_index, target_index_end = find_sub_list(search_tokens, tokens)
        except:
            # this should never happen
            raise ValueError(f"Error unpacking {search_tokens} in {tokens}")
    
    inputs = tokenizer(example, return_tensors="pt", max_length=512, truncation=True, padding='max_length')
    if should_print:
        print_nice(inputs['input_ids'][0], target_index, target_index_end, pad_token=tokenizer.pad_token_id)

    # 2. Compute the embedding of the token
    with torch.no_grad():
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
    
    embedding = outputs.last_hidden_state[0][target_index]
    embeddings.append(embedding)

миро миро
[CLS] Определение слова миро : [94mМиро[0m , м ##v ##ро ср . , благов ##онное масло , пах ##учая маст ##ь или души ##сто ##е масля ##нист ##ое вещество . [SEP]
[CLS] Затем ключа ##рь приглашает , чтобы женщины вышли из алтаря … Под ##аёт свято ##е [94mмиро[0m . Архиерей пом ##азу ##ет крестообраз ##но сначала трапез ##у в тех местах , где во время литургии стоит евангел ##ие , диско ##с и пот ##ир . [SEP]
[CLS] Они везде одним м ##v ##ром маз ##аны . М ##v ##ром покрыт ( т . е . пом ##азан ) , с [94mмиром[0m зас ##пит . Рогож ##цы в [SEP]
[CLS] Мощ ##и святого хранятся здесь до сих пор , продолжая источ ##ать [94mмиро[0m . [SEP]
[CLS] Иосиф увидел , что по иконе Богородицы тек ##ли струй ##ки [94mмира[0m , которое и издавал ##о благо ##уха ##ние . [SEP]

('могильный', 'могильный')
[CLS] Казалось , перед революцией уд ##есят ##ери ##лось ожидание , пред ##чувств ##ие , предвид ##ение золотого века , рая на земле ; казалось , вся душа нации исс ##ту ##пл ##енно бред #

OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB (GPU 0; 44.38 GiB total capacity; 5.53 GiB already allocated; 8.38 MiB free; 5.57 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
assert len(embeddings) == len(df), "Embeddings and dataframe have different lengths"
embeddings = torch.stack(embeddings)
embeddings.shape

In [None]:
embeddings_list = embeddings.tolist()

json_file_path = f'./embeddings/{filename}.json'

# Write the embeddings to a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(embeddings_list, json_file)
