In [2]:
import json
import re

In [3]:
import stats as stats_data
import json
import stanza

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
stanza.download('uk') # download Ukrainian model

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 153MB/s]                     
2024-03-06 21:54:29 INFO: Downloaded file to /Users/yurayano/stanza_resources/resources.json
2024-03-06 21:54:29 INFO: Downloading default packages for language: uk (Ukrainian) ...
2024-03-06 21:54:30 INFO: File exists: /Users/yurayano/stanza_resources/uk/default.zip
2024-03-06 21:54:31 INFO: Finished downloading models and saved to /Users/yurayano/stanza_resources


In [5]:
stanza_nlp = stanza.Pipeline('uk', processors='tokenize,mwt,pos,lemma') # initialize Ukrainian neural pipeline

2024-03-06 21:54:31 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json: 373kB [00:00, 11.6MB/s]                    
2024-03-06 21:54:31 INFO: Downloaded file to /Users/yurayano/stanza_resources/resources.json
2024-03-06 21:54:32 INFO: Loading these models for language: uk (Ukrainian):
| Processor | Package     |
---------------------------
| tokenize  | iu          |
| mwt       | iu          |
| pos       | iu_charlm   |
| lemma     | iu_nocharlm |

2024-03-06 21:54:32 INFO: Using device: cpu
2024-03-06 21:54:32 INFO: Loading: tokenize
2024-03-06 21:54:32 INFO: Loading: mwt
2024-03-06 21:54:32 INFO: Loading: pos
2024-03-06 21:54:32 INFO: Loading: lemma
2024-03-06 21:54:32 INFO: Done loading processors!


In [16]:
document = stanza_nlp("будь-коли Нечуй-Левицький, таргетом! чи ні...;")
document

[
  [
    {
      "id": 1,
      "text": "будь-коли",
      "lemma": "будь-коли",
      "upos": "ADV",
      "xpos": "Pi------r",
      "feats": "PronType=Int",
      "start_char": 0,
      "end_char": 9
    },
    {
      "id": 2,
      "text": "Нечуй",
      "lemma": "Нечуй",
      "upos": "PROPN",
      "xpos": "Npmsny",
      "feats": "Animacy=Anim|Case=Nom|Gender=Masc|NameType=Giv|Number=Sing",
      "start_char": 10,
      "end_char": 15,
      "misc": "SpaceAfter=No"
    },
    {
      "id": 3,
      "text": "-",
      "lemma": "-",
      "upos": "PUNCT",
      "xpos": "U",
      "feats": "PunctType=Hyph",
      "start_char": 15,
      "end_char": 16,
      "misc": "SpaceAfter=No"
    },
    {
      "id": 4,
      "text": "Левицький",
      "lemma": "Левицький",
      "upos": "PROPN",
      "xpos": "Npmsny",
      "feats": "Animacy=Anim|Case=Nom|Gender=Masc|NameType=Sur|Number=Sing",
      "start_char": 16,
      "end_char": 25,
      "misc": "SpaceAfter=No"
    },
    {
      "

In [7]:
for token in document.iter_words():
    print(token.id)
    print(token.lemma)
    print(token.start_char)
    print(token.end_char)
    print("===")

1
світ
0
4
===
2
будь-ласка
5
15
===


In [8]:
train_file_path = stats_data.train_file_path
test_file_path = stats_data.test_file_path

In [9]:
with open('../' + train_file_path, 'r', encoding='utf-8') as file:
    train_pairs = [json.loads(line) for line in file]
with open('../' + test_file_path, 'r', encoding='utf-8') as file:
    test_pairs = [json.loads(line) for line in file]

In [10]:
class Lemmatizer:
    def __init__(self):
        # Initialize your lemmatizer, assuming 'uk' is the language code for Ukrainian
        self.lemmantizer = stanza.Pipeline('uk', processors='tokenize,mwt,pos,lemma')
        
    def _get_word_idx(self, lemantized_sent, word):
        lemantized_sent_lemmas = [word_sent["lemma"] for word_sent in lemantized_sent]
        try:
            index = lemantized_sent_lemmas.index(word["lemma"])

            word_idx = lemantized_sent[index].copy()
            word_idx["start_word"] = index      # inclusive
            word_idx["end_word"] = index + 1    # exclusive
            return word_idx
        except ValueError:
            return -1

    def _stem_sentence(self, input_entity, is_arr=False):
        if is_arr:
            sent = ' '.join(input_entity)
        else:
            sent = input_entity            
            
        doc = self.lemmantizer(sent)

        sentence_lemantized = []
        for word in doc.iter_words():
            sentence_lemantized.append({
                "text": word.text,
                "lemma": word.lemma,
                "start_char": word.start_char,
                "end_char": word.end_char,
            })

        return sentence_lemantized

    def get_target_idx(self, sentence: str, word: str, synonyms):
        synonyms_lemma = [synonym["lemma"] for synonym in synonyms]
        lemantized_synonyms = self._stem_sentence(synonyms_lemma, True)
        lemantized_sentence = self._stem_sentence(sentence)
        

        target_idxs = []
        for syn_lemma in lemantized_synonyms:
            word_idx = self._get_word_idx(lemantized_sentence, syn_lemma) 
            if word_idx != -1:
                target_idxs.append(word_idx)

        # filtered_target_idxs = [value for value in target_idxs if value != -1]
        if len(target_idxs) != 1:
            raise ValueError("📌 No target indexes found!")

        return target_idxs


In [None]:
lemmatizer = Lemmatizer()

In [12]:
train_entry = train_pairs[0]
train_entry

{'sentence1': 'Сонце вже звернуло з обіду і хилиться на захід',
 'sentence2': 'Забувала   про Дорку і всі думки звернула на себе',
 'label': 0,
 'lemma': 'звернути',
 'synonyms': [{'lemma': 'звернути', 'accent_positions': [5]},
  {'lemma': 'звертати', 'accent_positions': [5]}]}

In [13]:
target_idx_sent1 = lemmatizer.get_target_idx(train_entry["sentence1"], train_entry["lemma"], train_entry["synonyms"])
target_idx_sent1

[{'text': 'звернуло',
  'lemma': 'звернути',
  'start_char': 10,
  'end_char': 18,
  'start_word': 2,
  'end_word': 3}]

In [14]:
target_idx_sent1 = lemmatizer.get_target_idx(train_entry["sentence2"], train_entry["lemma"], train_entry["synonyms"])
target_idx_sent1

[{'text': 'звернула',
  'lemma': 'звернути',
  'start_char': 33,
  'end_char': 41,
  'start_word': 6,
  'end_word': 7}]

In [15]:
for train_pair in train_pairs:
    sent1_target_idx = lemmatizer.get_target_idx(train_pair["sentence1"], train_pair["lemma"], train_pair["synonyms"])
    sent2_target_idx = lemmatizer.get_target_idx(train_pair["sentence2"], train_pair["lemma"], train_pair["synonyms"])
    train_pair["sent1_target_idx"] = sent1_target_idx
    train_pair["sent2_target_idx"] = sent2_target_idx



ValueError: 📌 No target indexes found!