# Get the embeddings to work with in the algorithm

In [1]:
AVAILABLE_GPU = 2  # physical GPU index to use; pick one that currently shows 0% usage

import os
# Enumerate GPUs by PCI bus id so AVAILABLE_GPU refers to a stable physical card.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# Expose only the selected GPU to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = f"{AVAILABLE_GPU}"
# Once the process is restricted to a single visible GPU, that GPU is renumbered
# as device 0 inside the process, so the physical index must not be reused here.
tf_device = '/gpu:0'

In [2]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import re
import json

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

## 1. Select the model and load the data

In [4]:
FILE_TO_READ = './data/test/axolotl.test.ru.tsv'

# The language code is the dot-separated field just before the file extension.
language = FILE_TO_READ.rsplit('.', 2)[-2]
# The output name is the file's base name with its extension removed.
base_name = FILE_TO_READ.rsplit('/', 1)[-1]
filename = '.'.join(base_name.split('.')[:-1])
language, filename

('ru', 'axolotl.test.ru')

In [6]:
# Checkpoint used for both the tokenizer and the encoder. To switch model,
# change MODEL_NAME to one of the alternatives below:
#   russian opt1:  "cointegrated/rubert-tiny2"
#   russian opt2:  "DeepPavlov/rubert-base-cased"            (currently selected)
#   finnish opt1:  "TurkuNLP/bert-base-finnish-cased-v1"
#   finnish opt2:  "TurkuNLP/bert-base-finnish-uncased-v1"
#   german opt1:   "dbmdz/bert-base-german-cased"
#   german opt2:   "google-bert/bert-base-german-cased"
#   multilingual:  "bert-base-multilingual-cased"
MODEL_NAME = "DeepPavlov/rubert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Move the model to the selected device; as the last expression of the cell,
# this also displays the model architecture.
model.to(device)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(119547, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [7]:
# Load the tab-separated dataset and summarise its columns and null counts.
df = pd.read_csv(FILE_TO_READ, delimiter='\t')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              2126 non-null   object 
 1   word                  2126 non-null   object 
 2   orth                  2126 non-null   object 
 3   sense_id              424 non-null    object 
 4   gloss                 424 non-null    object 
 5   example               1990 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  2126 non-null   object 
 8   period                2126 non-null   object 
dtypes: float64(1), object(8)
memory usage: 149.6+ KB


## 2. Clean and fix the data

When a row has no "example", its "gloss" is used instead, wrapped in a sentence like "Definition of WORD: GLOSS" written in the language of the dataset:

In [8]:
# Definition template per language: the first slot takes the word, the second
# its gloss. Any language other than "ru" or "fi" gets the German template.
prompt_by_language = {
    "ru": "Определение слова {}: {}",
    "fi": "Sanan {} määritelmä: {}",
}
prompt = prompt_by_language.get(language, "Definition von {}: {}")

# Show the selected template with placeholder values.
print(prompt.format("word", "gloss"))

def fill_example(word, gloss, example, template=None):
    """Return a usable example sentence for a row.

    Args:
        word: Target word of the row.
        gloss: Definition of the word; may be missing (NaN/None).
        example: Usage example; may be missing (NaN/None).
        template: Format string with two positional slots (word, gloss).
            Defaults to the notebook-level ``prompt``.

    Returns:
        ``example`` unchanged when it is present. Otherwise the template filled
        with ``word`` and ``gloss``. When the gloss is missing as well, the bare
        ``word`` is returned so that the literal text "nan" never ends up in
        the sentence.
    """
    if not pd.isna(example):
        return example
    if pd.isna(gloss):
        # Nothing to build a definition from: fall back to the word itself.
        return word
    if template is None:
        template = prompt
    return template.format(word, gloss)

Определение слова word: gloss


In [9]:
# Replace every missing example with a definition sentence built from the gloss.
df['example'] = [
    fill_example(word_value, gloss_value, example_value)
    for word_value, gloss_value, example_value in zip(df['word'], df['gloss'], df['example'])
]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   usage_id              2126 non-null   object 
 1   word                  2126 non-null   object 
 2   orth                  2126 non-null   object 
 3   sense_id              424 non-null    object 
 4   gloss                 424 non-null    object 
 5   example               2126 non-null   object 
 6   indices_target_token  0 non-null      float64
 7   date                  2126 non-null   object 
 8   period                2126 non-null   object 
dtypes: float64(1), object(8)
memory usage: 149.6+ KB


In [10]:
def print_nice(input_ids, index):
    """Print the tokens for ``input_ids`` on one line, highlighting one of them.

    The token at position ``index`` is wrapped in ANSI colour escape codes so
    the selected target token stands out in the output.
    """
    highlight_on, highlight_off = '\033[94m', '\033[0m'
    pieces = tokenizer.convert_ids_to_tokens(input_ids)
    pieces[index] = highlight_on + pieces[index] + highlight_off
    print(' '.join(pieces))

def extract_letters(input_string):
    """Strip every character that is not a recognised letter.

    Kept characters are Cyrillic а-я/А-Я plus ё/Ё, ASCII a-z/A-Z and the
    Latin-1 range À-ÿ; everything else (digits, punctuation, whitespace, ...)
    is removed.
    """
    non_letters = re.compile(r'[^а-яА-Яa-zA-ZÀ-ÿёЁ]')
    return non_letters.sub('', input_string)

def find_word_containing_target(sentence, target_word):
    """Return the space-delimited word of ``sentence`` that contains ``target_word``.

    Only the first (case-sensitive) occurrence of ``target_word`` is considered.
    The surrounding word is widened to the nearest space on each side (or to the
    sentence boundaries) and reduced to its letters with ``extract_letters``.
    Returns ``None`` when ``target_word`` does not occur in ``sentence``.
    """
    hit = sentence.find(target_word)
    if hit < 0:
        return None

    # rfind gives -1 when no space precedes the hit, which turns into start 0.
    left = sentence.rfind(" ", 0, hit) + 1
    right = sentence.find(" ", hit + len(target_word))
    if right < 0:
        right = len(sentence)
    return extract_letters(sentence[left:right])

def get_search(sentence, word, orth=None):
    """Pick the string to look for among the tokens of ``sentence``.

    Tries to locate the word of ``sentence`` containing ``word``; if that gives
    nothing and ``orth`` is provided, tries again with ``orth``. When neither
    attempt yields a non-empty result, ``word`` itself is returned.
    """
    match = find_word_containing_target(sentence, word)
    if not match and orth:
        match = find_word_containing_target(sentence, orth)
    return match if match else word

def get_target_index(search_token, tokens, tokens_lower, tokens_lowersentence):
    """Locate ``search_token`` in the tokenized example.

    Three lookups are tried in order, each accepting either the bare token or
    its ``##``-prefixed word-piece form:
      1. ``search_token`` as given, in ``tokens``;
      2. the lower-cased token, in ``tokens_lower``;
      3. the lower-cased token, in ``tokens_lowersentence``.

    Returns:
        ``(position, flag)`` where ``position`` is the list index plus one (to
        account for the leading [CLS] token) and ``flag`` is 1 only when the
        match came from ``tokens_lowersentence``. ``(-1, 0)`` when nothing matched.
    """
    def _position(needle, haystack):
        # Bare form first, then the word-piece continuation form.
        for candidate in (needle, f"##{needle}"):
            if candidate in haystack:
                return haystack.index(candidate) + 1  # +1 for the [CLS] token
        return None

    found = _position(search_token, tokens)
    if found is not None:
        return found, 0

    lowered = search_token.lower()
    found = _position(lowered, tokens_lower)
    if found is not None:
        return found, 0

    found = _position(lowered, tokens_lowersentence)
    if found is not None:
        return found, 1
    return -1, 0

def generate_substrings(word):
    """Return the proper prefixes of ``word``, longest first.

    The full word and the single-character prefix are both left out, so words
    of up to two characters yield an empty list.
    """
    return [word[:length] for length in range(len(word) - 1, 1, -1)]

When `orth` or `word` contains several words, keep only the first one:

In [11]:
# Reduce multi-word `orth` values to their first whitespace-separated word,
# logging every replacement. The items are snapshotted first so the frame is
# not modified while it is being iterated.
for index, original_orth in list(df['orth'].items()):
    pieces = original_orth.split()
    if len(pieces) > 1:
        df.at[index, 'orth'] = pieces[0]
        print(f"Replaced {original_orth} with {df.at[index, 'orth']}")

Replaced зарубить   иглою и топоромъ  что начинать рубить  зарубливать with зарубить
Replaced зарубить   иглою и топоромъ  что начинать рубить  зарубливать with зарубить
Replaced зарубить   иглою и топоромъ  что начинать рубить  зарубливать with зарубить
Replaced зарубить   иглою и топоромъ  что начинать рубить  зарубливать with зарубить
Replaced зарубить   иглою и топоромъ  что начинать рубить  зарубливать with зарубить
Replaced зарубить   иглою и топоромъ  что начинать рубить  зарубливать with зарубить
Replaced зарубить   иглою и топоромъ  что начинать рубить  зарубливать with зарубить
Replaced зарубить   иглою и топоромъ  что начинать рубить  зарубливать with зарубить
Replaced зарубить   иглою и топоромъ  что начинать рубить  зарубливать with зарубить


In [12]:
# Reduce multi-word `word` values to their first whitespace-separated word,
# logging every replacement. The items are snapshotted first so the frame is
# not modified while it is being iterated.
for index, original_word in list(df['word'].items()):
    pieces = original_word.split()
    if len(pieces) > 1:
        df.at[index, 'word'] = pieces[0]
        print(f"{original_word} -> {df.at[index, 'word']}")

Check that the regex is working with all the words. The next cell should have no output:

In [13]:
# Characters that may legitimately appear in `word` / `orth` without being letters.
chars_to_replace = [',', '-', '.', "'", ":", ';', '?', '–', ')', ' ', '[', ']']
# Translation table deleting all of the tolerated characters in a single pass.
strip_tolerated = str.maketrans('', '', ''.join(chars_to_replace))

count = 0
for index, raw_word, raw_orth in zip(df.index, df['word'], df['orth']):
    word = raw_word.translate(strip_tolerated)
    orth = raw_orth.translate(strip_tolerated)

    clean_word = extract_letters(word)
    clean_orth = extract_letters(orth)

    # Anything extract_letters still removed is a character the regex does not cover.
    if (clean_word, clean_orth) != (word, orth):
        print(f"{index}. {word, clean_word} - {orth, clean_orth}")
        count += 1
assert count == 0, "Must fix the regex to include all the characters of the language"

## 3. Compute the embeddings

In [14]:
embeddings = []  # one vector per dataframe row, in row order
word = ""
word_idx = 0
print(df.loc[0, "word"])

# Longest input the encoder's position embeddings can represent; longer
# examples are truncated instead of crashing the forward pass.
max_length = model.config.max_position_embeddings

for index, row in df.iterrows():
    # Log each target word once, when it first differs from the previous row's.
    if word != "" and word != row['word']:
        print(f"{row['word']}")
        word_idx += 1

    # Verbosity switch: as written every row is printed. Set this to False to
    # print only the rows that needed the prefix fallback below.
    should_print = True
    word = row['word']          # target word
    orth = row['orth']          # usage of the target word in the example
    example = row['example']    # usage example of the target word

    tokens = tokenizer.tokenize(example)
    tokens_lower = [token.lower() for token in tokens]  # same positions as `tokens`
    tokens_lowersentence = tokenizer.tokenize(example.lower())

    # 1. Get the target word index in the example tokenized
    search = get_search(example, word, orth)
    search_token = tokenizer.tokenize(search)[0]
    target_index, flag = get_target_index(search_token, tokens, tokens_lower, tokens_lowersentence)
    if target_index == -1:
        should_print = True
        # Retry with progressively shorter prefixes of the target word.
        for sw in generate_substrings(word):
            search = get_search(example, sw)
            search_token = tokenizer.tokenize(search)[0]
            target_index, flag = get_target_index(search_token, tokens, tokens_lower, tokens_lowersentence)
            if target_index != -1:
                break

        if target_index == -1:
            print("\033[91mNot found\033[0m", end=" ")
            print(f"{index}. {search, search_token} not found, taking [CLS] token... ({tokens_lowersentence})")
            target_index, flag = 0, 0
    # flag == 1 means the index refers to the tokenization of the lower-cased example.
    example = example if flag == 0 else example.lower()

    inputs = tokenizer(example, return_tensors="pt", truncation=True, max_length=max_length)
    # Without truncation the target is never past the second-to-last position
    # (the last one is [SEP]); past it, the target token was cut off.
    if target_index > inputs["input_ids"].shape[1] - 2:
        print("\033[91mTruncated\033[0m", end=" ")
        print(f"{index}. target token is beyond the {max_length}-token limit, taking [CLS] token...")
        target_index = 0
    if should_print:
        print_nice(inputs['input_ids'][0], target_index)

    # 2. Compute the embedding of the token
    with torch.no_grad():
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)

    # clone() so the stored vector does not keep the whole hidden-state tensor alive.
    embedding = outputs.last_hidden_state[0][target_index].clone()
    embeddings.append(embedding)

мёрзлый
[91mNot found[0m 0. ('мё', 'м') not found, taking [CLS] token... (['мерз', '##лая', 'земля', 'скажется', ',', 'только', 'руку', 'прилож', '##и', 'от', 'того', 'нельзя', ',', 'что', 'земля', 'мерз', '##ла', '.', 'мерз', '##лой'])
[94m[CLS][0m Мерз ##лая земля скажется , только руку прилож ##и От того нельзя , что земля мерз ##ла . Мерз ##лой [SEP]
[CLS] Пантелей Прокофьев ##ич сун ##ул Григорию [94mм[0m ##ёрз ##лую руку , сел на край лавки , запах ##ивая полу тул ##упа , обход ##я взглядом Акс ##инь ##ю , присты ##вшую у лю ##ль ##ки . [SEP]
[CLS] Между домом и рельс ##ами , за широкой [94mм[0m ##ёрз ##лой луж ##ей , проходила дорога , по которой воз ##или дрова и воду . [SEP]
[CLS] — Да , холодно , должно быть . На полу [94mм[0m ##ёрз ##лые тарак ##аны вал ##яются . И мыши тоже пом ##ёрз ##ли . [SEP]
[CLS] Когда мосты были прорв ##аны , безоруж ##ные солдаты , московские жители , женщины с детьми , бывшие в обоз ##е французов , — всё под влиянием силы инерции не сдава

In [15]:
# Every dataframe row must have produced exactly one embedding.
assert len(df) == len(embeddings), "Embeddings and dataframe have different lengths"
# Combine the per-row vectors into a single tensor with one row per example.
embeddings = torch.stack(embeddings, dim=0)
embeddings.shape

torch.Size([2126, 768])

In [16]:
# Convert the tensor to nested Python lists so it can be serialised as JSON.
embeddings_list = embeddings.tolist()

json_file_path = f'./embeddings/{filename}.json'

# Create the output directory if needed; open() fails when it does not exist.
os.makedirs(os.path.dirname(json_file_path), exist_ok=True)

# Write the embeddings to a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(embeddings_list, json_file)
