# Get the embeddings to work with in the algorithm

In [None]:
import os

# Index of the GPU this notebook should use (original note: ALWAYS pick the
# one with 0% usage).
AVAILABLE_GPU = 2

# Publish the choice through the CUDA environment variables.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": str(AVAILABLE_GPU),
})

# NOTE(review): `tf_device` is not referenced by any cell visible in this
# notebook. Presumably, once CUDA_VISIBLE_DEVICES exposes a single GPU, that
# GPU is enumerated as index 0 inside the process — confirm before using it.
tf_device = '/gpu:' + str(AVAILABLE_GPU)

In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
import re
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

## 1. Select the model and load the data

In [3]:
FILE_TO_READ = '../data/test/axolotl.test.ru.tsv'

# The language code is the second-to-last dot-separated field of the path
# ('ru' for the path above).
language = FILE_TO_READ.rsplit('.', 2)[-2]
# The filename is the last path component without its final extension
# ('axolotl.test.ru' for the path above).
filename = FILE_TO_READ.rpartition('/')[2].rpartition('.')[0]
language, filename

('fi', 'axolotl.dev.fi')

In [4]:
# Checkpoints that were tried; assign one of them to MODEL_NAME to switch.
#   russian opt1:  "cointegrated/rubert-tiny2"
#   russian opt2:  "DeepPavlov/rubert-base-cased"
#   finnish opt1:  "TurkuNLP/bert-base-finnish-cased-v1"
#   finnish opt2:  "TurkuNLP/bert-base-finnish-uncased-v1"
#   german opt1:   "dbmdz/bert-base-german-cased"
#   german opt2:   "google-bert/bert-base-german-cased"
#   multilingual:  "bert-base-multilingual-cased"   (currently selected)
MODEL_NAME = "bert-base-multilingual-cased"

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

# Move the model to the selected device; the trailing ';' hides the cell output.
model.to(device);

In [5]:
# The input file is tab-separated; `delimiter` is the pandas alias for `sep`.
df = pd.read_csv(FILE_TO_READ, delimiter='\t')
# Column overview: names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6554 entries, 0 to 6553
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   usage_id              6554 non-null   object
 1   word                  6554 non-null   object
 2   orth                  6554 non-null   object
 3   sense_id              6554 non-null   object
 4   gloss                 6554 non-null   object
 5   example               6554 non-null   object
 6   indices_target_token  6554 non-null   object
 7   date                  6554 non-null   int64 
 8   period                6554 non-null   object
dtypes: int64(1), object(8)
memory usage: 461.0+ KB


## 2. Clean and fix the data

When a row has no "example", the "gloss" is used instead, wrapped in a sentence like "Definition of WORD: GLOSS" written in the language of the dataset:

In [9]:
# Sentence templates used to turn (word, gloss) into a pseudo-example.
# Any language other than "ru" and "fi" falls back to the German template.
_PROMPT_BY_LANGUAGE = {
    "ru": "Определение слова {}: {}",
    "fi": "Sanan {} määritelmä: {}",
}
prompt = _PROMPT_BY_LANGUAGE.get(language, "Definition von {}: {}")

# Show the selected template with placeholder values.
print(prompt.format("word", "gloss"))

def fill_example(word, gloss, example):
    """Return `example`, or a sentence built from the gloss when it is missing.

    When `pd.isna(example)` is true, the module-level `prompt` template is
    formatted with `word` and `gloss`; otherwise `example` is returned unchanged.
    """
    return prompt.format(word, gloss) if pd.isna(example) else example

Sanan word määritelmä: gloss


In [10]:
# Rebuild the 'example' column row by row: existing examples are kept and
# missing ones are replaced by the gloss-based sentence from fill_example.
df['example'] = [
    fill_example(word_value, gloss_value, example_value)
    for word_value, gloss_value, example_value
    in zip(df['word'], df['gloss'], df['example'])
]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6554 entries, 0 to 6553
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   usage_id              6554 non-null   object
 1   word                  6554 non-null   object
 2   orth                  6554 non-null   object
 3   sense_id              6554 non-null   object
 4   gloss                 6554 non-null   object
 5   example               6554 non-null   object
 6   indices_target_token  6554 non-null   object
 7   date                  6554 non-null   int64 
 8   period                6554 non-null   object
dtypes: int64(1), object(8)
memory usage: 461.0+ KB


In [11]:
def print_nice(input_ids, index):
    """Print the tokens of `input_ids`, highlighting the one at `index`.

    The ids are converted back to tokens with the module-level `tokenizer`;
    the token at position `index` is wrapped in the ANSI escape codes
    '\\033[94m' ... '\\033[0m' and all tokens are printed separated by spaces.
    """
    pieces = tokenizer.convert_ids_to_tokens(input_ids)
    pieces[index] = f'\033[94m{pieces[index]}\033[0m'
    print(*pieces)

# Matches every character that is NOT in one of the kept ranges: Cyrillic
# а-я / А-Я plus ё / Ё, ASCII letters, and the À-ÿ range (which, besides
# accented letters, also contains the '×' and '÷' signs).
_NON_LETTER_PATTERN = re.compile(r'[^а-яА-Яa-zA-ZÀ-ÿёЁ]')

def extract_letters(input_string):
    """Return `input_string` with every character outside the kept ranges removed."""
    return _NON_LETTER_PATTERN.sub('', input_string)

def find_word_containing_target(sentence, target_word):
    """Return the letters of the word in `sentence` that contains `target_word`.

    The first occurrence of `target_word` is located and widened to the
    nearest space on each side (or to the start / end of the sentence).
    That span is passed through `extract_letters`. Returns None when
    `target_word` does not occur in `sentence`.
    """
    hit = sentence.find(target_word)
    if hit < 0:
        return None

    # rfind yields -1 when no space precedes the hit, so "+ 1" gives 0,
    # i.e. the start of the sentence.
    left = sentence.rfind(" ", 0, hit) + 1

    right = sentence.find(" ", hit + len(target_word))
    if right == -1:
        right = len(sentence)

    return extract_letters(sentence[left:right])

def get_search(sentence, word, orth=None):
    """Pick the string to search for among the tokens of `sentence`.

    `word` is tried first and then `orth` (when given and non-empty): the
    first one for which `find_word_containing_target` returns a non-empty
    result wins and that result is returned. If neither matches, `word`
    itself is returned.
    """
    candidates = (word, orth) if orth else (word,)
    for candidate in candidates:
        match = find_word_containing_target(sentence, candidate)
        if match:
            return match
    return word

def get_target_index(search_token, tokens, tokens_lower, tokens_lowersentence):
    """Locate `search_token` in the tokenized example.

    Three token lists are searched in order, each for the bare token and then
    for its '##'-prefixed form:
      1. `tokens`               with `search_token` as given        -> flag 0
      2. `tokens_lower`         with the lower-cased search token   -> flag 0
      3. `tokens_lowersentence` with the lower-cased search token   -> flag 1

    Returns a tuple (position, flag). The position is the list index plus 1
    (to account for the [CLS] token); flag is 1 only when the match came from
    `tokens_lowersentence`. Returns (-1, 0) when nothing matches.
    """
    lowered = search_token.lower()
    attempts = (
        (search_token, tokens, 0),
        (lowered, tokens_lower, 0),
        (lowered, tokens_lowersentence, 1),
    )
    for needle, haystack, flag in attempts:
        for candidate in (needle, f"##{needle}"):
            if candidate in haystack:
                return haystack.index(candidate) + 1, flag  # +1 for the [CLS] token
    return -1, 0

def generate_substrings(word):
    """Return the proper prefixes of `word`, longest first.

    The full word and the single-character prefix are both excluded, so the
    result holds the prefixes of length len(word) - 1 down to 2. Words shorter
    than three characters give an empty list.
    """
    return [word[:length] for length in range(len(word) - 1, 1, -1)]

When `orth` or `word` consists of several words, keep only the first one:

In [12]:
# Reduce multi-word 'orth' values to their first whitespace-separated word,
# reporting every replacement.
for idx, original_orth in df['orth'].items():
    pieces = original_orth.split()
    if len(pieces) > 1:
        df.at[idx, 'orth'] = pieces[0]
        print(f"Replaced {original_orth} with {pieces[0]}")

Replaced Perindö Printzi with Perindö
Replaced Perindö Prinssin with Perindö
Replaced Perindö Prinsi with Perindö
Replaced Perindö Printsein with Perindö
Replaced Perindö Prinsillä with Perindö
Replaced oppi- ... kirjoilda with oppi-
Replaced lämmin- ... Tauti with lämmin-
Replaced edes seisowaisia with edes
Replaced edes toimitetan, with edes
Replaced edes toimita with edes
Replaced edes wetämisesä with edes
Replaced pois torjua with pois
Replaced Pojes wiedä, with Pojes
Replaced pois wiedä with pois
Replaced pojes wiewät with pojes
Replaced pois wiedä with pois
Replaced pois wietä with pois
Replaced pois wiety with pois
Replaced pois wiedän with pois
Replaced pojes wietämän with pojes
Replaced pois wie with pois
Replaced pois wiewät with pois
Replaced pois wiepi with pois
Replaced Casten Lijton with Casten
Replaced Casteen Lijtto with Casteen
Replaced casten lijton with casten
Replaced Casten lijton with Casten
Replaced Kircko kunda, with Kircko
Replaced Jumalata pelkäwäisel with Jum

In [13]:
# Reduce multi-word 'word' values to their first whitespace-separated word,
# reporting every replacement.
for idx, original_word in df['word'].items():
    pieces = original_word.split()
    if len(pieces) > 1:
        df.at[idx, 'word'] = pieces[0]
        print(f"{original_word} -> {pieces[0]}")

Check that the regex in `extract_letters` keeps every character of every `word` and `orth` value (once punctuation has been removed). The next cell should produce no output:

In [26]:
count = 0
chars_to_replace = [',', '-', '.', "'", ":", ';', '?', '–', ')', ' ', '[', ']']


def _strip_chars(text, chars):
    """Return `text` with every occurrence of each string in `chars` removed."""
    for char in chars:
        text = text.replace(char, '')
    return text


for index, row in df.iterrows():
    # Remove the known punctuation first, then see whether extract_letters
    # would drop anything else from what is left.
    word = _strip_chars(row['word'], chars_to_replace)
    orth = _strip_chars(row['orth'], chars_to_replace)

    clean_word, clean_orth = extract_letters(word), extract_letters(orth)

    # Any difference means the regex rejects a character used in the data.
    if (clean_word, clean_orth) != (word, orth):
        print(f"{index}. {word, clean_word} - {orth, clean_orth}")
        count += 1
assert count == 0, "Must fix the regex to include all the characters of the language"

## 3. Compute the embeddings

In [31]:
# Compute one embedding per dataframe row: the last-layer hidden state of the
# token that represents the target word inside the row's example sentence.
embeddings = []
word = ""
word_idx = 0  # number of times the target word changed between consecutive rows
print(df.loc[0, "word"])

for index, row in df.iterrows():
    # Print each new target word once, when it differs from the previous row's.
    if word != "" and word != row['word']:
        print(f"{row['word']}")
        word_idx += 1

    # NOTE(review): this is always True, so every tokenized example is
    # printed; set it to False here to print only the fallback cases.
    should_print = True
    word = row['word']          # target word
    orth = row['orth']          # usage of the target word in the example
    example = row['example']    # usage example of the target word

    # Three views of the example: original tokens, those same tokens
    # lower-cased, and the tokens of the lower-cased sentence.
    tokens = tokenizer.tokenize(example)
    tokens_lower = [token.lower() for token in tokens]
    tokens_lowersentence = tokenizer.tokenize(example.lower())

    # 1. Get the target word index in the example tokenized
    search = get_search(example, word, orth)
    search_token = tokenizer.tokenize(search)[0]
    target_index, flag = get_target_index(search_token, tokens, tokens_lower, tokens_lowersentence)
    if target_index == -1:
        should_print = True
        # Fall back to progressively shorter prefixes of the target word.
        subwords = generate_substrings(word)
        for sw in subwords:
            search = get_search(example, sw)
            search_token = tokenizer.tokenize(search)[0]
            target_index, flag = get_target_index(search_token, tokens, tokens_lower, tokens_lowersentence)
            if target_index != -1:
                break

        if target_index == -1:
            print("\033[91mNot found\033[0m", end=" ")
            print(f"{index}. {search, search_token} not found, taking [CLS] token... ({tokens_lowersentence})")
            target_index, flag = 0, 0
    # flag == 1 means the index refers to the tokens of the lower-cased sentence.
    example = example if flag == 0 else example.lower()

    # Truncate to the tokenizer's maximum length so that an over-long example
    # does not fail inside the model.
    inputs = tokenizer(example, return_tensors="pt", truncation=True)
    sequence_length = inputs["input_ids"].shape[1]
    # Without truncation the largest possible index is sequence_length - 2
    # (the last position holds [SEP]), so this only triggers when the target
    # token was cut off.
    if target_index >= sequence_length - 1:
        print("\033[91mTruncated\033[0m", end=" ")
        print(f"{index}. target token at position {target_index} was truncated, taking [CLS] token...")
        target_index = 0
    if should_print:
        print_nice(inputs['input_ids'][0], target_index)

    # 2. Compute the embedding of the token
    with torch.no_grad():
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        # Only last_hidden_state is read below, so the per-layer hidden
        # states are not requested.
        outputs = model(input_ids, attention_mask=attention_mask)

    embedding = outputs.last_hidden_state[0][target_index]
    embeddings.append(embedding)

ajainen
[CLS] wit ##za ##ux ##en [94maja[0m ##isen , Herr ##a sun p ##ää ##lles heit ##ti [SEP]
[CLS] [94maja[0m ##inen rang ##ais ##tus ni ##jn ai ##wan suur , T ##äs runde ##le juu ##r [SEP]
alentaa
[CLS] Ei tu ##ki suo ##wa [94male[0m ##nna , war ##a we ##nh ##ett ##ä ca ##ada [SEP]
[CLS] ky ##nn ##ön pit ##ä [ tapa ##ht ##ua niin ] [UNK] [UNK] ettei yhtä ##kä ##n lo ##maa eli piel ##lost ##a , [94male[0m ##ttu ##ja ja y ##lett ##y ##jä wa ##co ##ja mat ##ca ##an sat ##eta ##isi [SEP]
[CLS] on myös hy ##öd ##yl ##linen [UNK] [UNK] maa ##ta Ta ##r ##han sis ##ä pu ##ole ##lda kai ##wami ##sella [94male[0m ##nda ##a [SEP]
[CLS] pitää [ tieto ##ja sala ##nnut ] [UNK] [UNK] Under - Up ##sier ##i , wir ##ald ##a pois [94male[0m ##tta ##man [SEP]
[CLS] [94male[0m ##nnet ##tako ##on Up ##sier ##i [UNK] [UNK] kolme ##xi Ku ##uka ##ude ##xi [SEP]
[CLS] n ##öy ##r ##äst it ##zes [94male[0m ##nna [SEP]
[CLS] [ Ju ##mala ] cor ##gott ##a sitä yhtä ja sitä toi ##sta [94male[0m 

KeyboardInterrupt: 

In [67]:
# One embedding per dataframe row is expected (e.g. the loop above was not
# interrupted part-way through).
assert len(embeddings) == len(df), "Embeddings and dataframe have different lengths"
# Stack only while `embeddings` is still the list built by the loop, so that
# re-running this cell after it has already produced a tensor does not fail.
if not isinstance(embeddings, torch.Tensor):
    embeddings = torch.stack(embeddings)
embeddings.shape

torch.Size([2026, 768])

In [82]:
import os

# Convert the embeddings tensor to nested Python lists so that it can be
# serialised as JSON.
embeddings_list = embeddings.tolist()

json_file_path = f'../embeddings/embeddings-{filename}.json'

# Create the output directory if it is missing, so the write cannot fail
# after the long embedding computation.
os.makedirs(os.path.dirname(json_file_path), exist_ok=True)

# Write the embeddings to a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(embeddings_list, json_file)
