# Assignment 3 solution
##### Solution 1 by Vladislav Urzhumov (v.urzhumov@innopolis.university)

The current solution uses a straightforward approach: every labelled entity in the train dataset is memorized in a dictionary together with the counts of the labels it was given. At prediction time, each sequence of tokens found in that dictionary is assigned its most common label.



---

Here we install the missing dependencies and import everything we need.

In [2]:
!pip install razdel

Collecting razdel
  Downloading razdel-0.5.0-py3-none-any.whl (21 kB)
Installing collected packages: razdel
Successfully installed razdel-0.5.0


In [3]:
from razdel import sentenize, tokenize
import numpy as np
from collections import defaultdict, Counter
from copy import deepcopy
import json

In [4]:
# Quick look at what razdel's tokenize yields for a sample Russian text.
print(list(tokenize('Я уверен, что Владимир Владимирович Иванов это оценит! Абсолютно точно, город Иннополис на Свияге - олицетворение творческого гения русского народа!')))

[Substring(0, 1, 'Я'), Substring(2, 8, 'уверен'), Substring(8, 9, ','), Substring(10, 13, 'что'), Substring(14, 22, 'Владимир'), Substring(23, 35, 'Владимирович'), Substring(36, 42, 'Иванов'), Substring(43, 46, 'это'), Substring(47, 53, 'оценит'), Substring(53, 54, '!'), Substring(55, 64, 'Абсолютно'), Substring(65, 70, 'точно'), Substring(70, 71, ','), Substring(72, 77, 'город'), Substring(78, 87, 'Иннополис'), Substring(88, 90, 'на'), Substring(91, 97, 'Свияге'), Substring(98, 99, '-'), Substring(100, 113, 'олицетворение'), Substring(114, 125, 'творческого'), Substring(126, 131, 'гения'), Substring(132, 140, 'русского'), Substring(141, 147, 'народа'), Substring(147, 148, '!')]


Actual functions

In [24]:
def read_f(path, buffer=None):
    """
    Read a JSON Lines file and append every parsed record to a buffer.

    Args:
        path: Path to a UTF-8 encoded file holding one JSON document per line.
        buffer: Optional list the records are appended to. When omitted, a
            fresh list is created on every call, so results of separate
            calls never leak into each other.

    Returns:
        The list with the parsed records (``buffer`` itself when it was given).
    """
    # A literal ``[]`` default would be created once and shared by all calls.
    if buffer is None:
        buffer = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if line.strip():
                buffer.append(json.loads(line))
    return buffer


def load_labels(data, buffer=None, cast_to_tuple=True):
    """
    Collect the unique entity labels that occur in the dataset.

    Args:
        data: Iterable of records; each record has a "ners" list whose items
            carry the label at index 2.
        buffer: Optional list the unique labels are appended to. When omitted,
            a fresh list is created on every call.
        cast_to_tuple: When true, return the result as a tuple instead of a list.

    Returns:
        The labels in order of first appearance, appended after any items
        already present in ``buffer``; a tuple if ``cast_to_tuple`` is true,
        otherwise ``buffer`` itself.
    """
    # A literal ``[]`` default would be created once and shared by all calls.
    if buffer is None:
        buffer = []
    # dict.fromkeys de-duplicates while keeping a deterministic (first-seen) order.
    unique_labels = dict.fromkeys(
        ner[2] for el in data for ner in el["ners"]
    )
    buffer.extend(unique_labels)
    if cast_to_tuple:
        return tuple(buffer)
    return buffer


def load_dict(data, dict_buffer=None, senences=False):
    """
    Build a vocabulary mapping each labelled entity text to its label counts.

    Args:
        data: Iterable of records; each record has a "ners" list of items
            ``[start, end, label, ...]`` (``end`` is inclusive) and the text
            the offsets refer to.
        dict_buffer: Optional mapping to update. It must create a counter for
            unseen keys on access (e.g. ``defaultdict(Counter)``). When
            omitted, a fresh ``defaultdict(Counter)`` is created on every call.
        senences: When true, the text is read from the 'senences' key of a
            record instead of the 'sentences' key.

    Returns:
        The mapping ``entity text -> Counter of labels``.
    """
    # A literal default would be created once and keep growing across calls.
    if dict_buffer is None:
        dict_buffer = defaultdict(Counter)
    text_key = 'senences' if senences else 'sentences'
    for datapoint in data:
        ners = datapoint["ners"]
        if not ners:
            # Nothing to memorize; the text key is not required in this case.
            continue
        text = datapoint[text_key]
        for coordt in ners:
            # The end offset is inclusive, hence the + 1 for slicing.
            token = text[coordt[0]:coordt[1] + 1]
            dict_buffer[token][coordt[2]] += 1
    return dict_buffer


def predict_one_token(token_triple, vocab):
    """
    Look up one candidate entity (possibly several words joined into one
    string) in the vocabulary.

    Args:
        token_triple: ``(start, stop, text)`` of the candidate.
        vocab: Mapping from entity text to a Counter of its labels.

    Returns:
        ``[start, stop - 1, label]`` with the most common label of ``text``
        when the text is in ``vocab``; an empty list otherwise.
    """
    start, stop, text = token_triple
    last = stop - 1  # the returned end offset is one less than the given one
    if text not in vocab:
        return []
    best_label, _count = vocab[text].most_common()[0]
    return [start, last, best_label]


def predict_sentence(sentence, vocab, ngram_max=4):
    """
    Label every n-gram of the sentence that is present in the vocabulary.

    N-gram sizes are tried from ``min(ngram_max, len(sentence))`` down to 1,
    and for each size the window slides left to right. A match is dropped
    when any token it covers already carries the same label.

    Args:
        sentence: List of tokens, each a list ``[start, stop, text]``.
        vocab: Mapping from entity text to a Counter of its labels.
        ngram_max: Largest n-gram size (in tokens) to try.

    Returns:
        List of ``[start, end, label]`` entities sorted by ``start``.
    """
    largest = min(ngram_max, len(sentence))
    # Work on copies that carry a fourth slot: the labels already given to the token.
    tokens = [tok + [[]] for tok in sentence]
    total = len(tokens)
    found = []
    for size in range(largest, 0, -1):
        for first in range(total - size + 1):
            span = tokens[first:first + size]
            text = ' '.join(tok[2] for tok in span)
            candidate = predict_one_token([span[0][0], span[-1][1], text], vocab)
            if not candidate:
                continue
            label = candidate[2]
            # Skip when a covered token was already labelled with this label.
            if any(label in tok[3] for tok in span):
                continue
            found.append(candidate)
            for tok in span:
                tok[3].append(label)
    found.sort(key=lambda entity: entity[0])
    return found


def postprocess_nnes(nnes, max_distance=3):
    """
    Merge neighboring entities of the same type that are separated by no
    more than ``max_distance`` symbols.

    Args:
        nnes: List of ``[start, end, label]`` entities; consecutive items are
            compared pairwise. The input is not modified.
        max_distance: Largest allowed difference between the start of the
            next entity and the end of the current one for a merge.

    Returns:
        A new list in which each run of mergeable neighbors is replaced by a
        single entity spanning from the first start to the furthest end.
    """
    merged_nnes = deepcopy(nnes)
    i = 0
    while i < len(merged_nnes) - 1:
        current, following = merged_nnes[i], merged_nnes[i + 1]
        same_label = current[2] == following[2]
        if same_label and current[1] >= following[0] - max_distance:
            # Never shrink the span when the next entity ends earlier.
            current[1] = max(current[1], following[1])
            # Delete by position: list.remove() would drop the first *equal*
            # item, which may be a different entry earlier in the list.
            del merged_nnes[i + 1]
        else:
            i += 1
    return merged_nnes


def predict(data, vocab, ngram_max=4):
    """
    Run the whole prediction pipeline over a dataset.

    For every record the text under the 'senences' key is tokenized, the
    tokens are labelled with ``predict_sentence`` and the resulting entities
    are merged with ``postprocess_nnes``.

    Args:
        data: Iterable of records with 'senences' (the text) and 'id' keys.
        vocab: Mapping from entity text to a Counter of its labels.
        ngram_max: Largest n-gram size passed on to ``predict_sentence``.

    Returns:
        List of ``{'ners': entities, 'id': record id}`` dicts, one per record.
    """
    answer = []
    for record in data:
        # Tokens are converted to plain lists so they can be extended downstream.
        tokens = [list(tok) for tok in tokenize(record['senences'])]
        record_id = record['id']
        raw_entities = predict_sentence(tokens, vocab, ngram_max)
        answer.append({'ners': postprocess_nnes(raw_entities), 'id': record_id})
    return answer


def create_submission(submission):
    """
    Write the predictions to 'submission.jsonl' in the working directory.

    Args:
        submission: Iterable of JSON-serializable objects; each one is written
            as a single line of the file.
    """
    with open('submission.jsonl', 'w') as out:
        for record in submission:
            json.dump(record, out)
            out.write('\n')  # one JSON document per line

In [9]:
# Read the training records, collect the unique labels and build the
# vocabulary that maps each entity text to the counts of its labels.
X = read_f('train.jsonl')
labels = load_labels(X)
vocab = load_dict(X)
print(vocab)

defaultdict(<class 'collections.Counter'>, {'Бостон': Counter({'CITY': 1}), 'Тамерлан': Counter({'PERSON': 1}), 'Царнаевы': Counter({'PERSON': 1}), 'Северного Кавказа': Counter({'LOCATION': 1}), 'спецоперация по поимке': Counter({'EVENT': 1}), '19-летнего': Counter({'AGE': 2}), 'Джохара Царнаева': Counter({'PERSON': 3}), 'Бостонском марафоне': Counter({'EVENT': 3}), '15 апреля': Counter({'DATE': 1}), 'Массачусетского технологического института': Counter({'ORGANIZATION': 3}), '18 апреля': Counter({'DATE': 2}), 'Второй': Counter({'ORDINAL': 10}), '26-летний': Counter({'AGE': 4}), 'Тамерлан Царнаев': Counter({'PERSON': 2}), 'перестрелке': Counter({'EVENT': 3}), 'Уотертауне': Counter({'CITY': 1}), 'Уотертаун': Counter({'CITY': 1}), 'Бостоне': Counter({'CITY': 7}), 'Гарварде': Counter({'ORGANIZATION': 1}), 'Массачусетском технологическом институте': Counter({'ORGANIZATION': 1}), 'Университете Саффолка': Counter({'ORGANIZATION': 1}), 'Бостонском университете': Counter({'ORGANIZATION': 1}), '

In [11]:
# Sanity check: look up the text '4' (offsets 5 and 7) in the vocabulary.
print(predict_one_token([5, 7, '4'], vocab))  # check

[5, 6, 'NUMBER']


In [42]:
# Each file is read into its own, explicitly passed empty list.
X_dev, X_test = read_f('dev.jsonl', []), read_f('test.jsonl', [])

'''
We are interested in test dataset, so we'll predict it.
Dev was used at the start to evaluate
'''

# Predict entities for the test records using the vocabulary built from train.
answer = predict(X_test, vocab)

In [40]:
# Write the predictions to submission.jsonl, one JSON object per line.
create_submission(answer)

In [41]:
!zip submission submission.jsonl

  adding: submission.jsonl (deflated 76%)


This solution achieved an F1 score of 0.40 in the competition.

Thank you for your attention!