# Create KB and dataset

In [2]:
%cd ..

/home/codeholder/code/python-playground/app_noisemon


# 1. Prelude

In [1]:
import json
import os
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict
import numpy as np

import spacy
from spacy.kb import KnowledgeBase
from spacy.tokens import Span, DocBin, Doc
from spacy.vocab import Vocab
from spacy.lang.ru import Russian
from typing import List, Callable, Iterable, Iterator, Optional, Dict, Union
from spacy.language import Language
from spacy.pipeline.trainable_pipe import TrainablePipe
from spacy.pipeline.pipe import deserialize_config
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy.training import Example, validate_examples, validate_get_examples
from spacy import util, Errors
from spacy.util import minibatch
from thinc.api import Model, Config, set_dropout_rate, Optimizer
# from thinc.layers import TransformerListener

from wasabi import Printer
msg = Printer()

from scripts.convert_labelstudio_to_spacy import LabelStudioToSpacyConverter

  return torch._C._cuda_getDeviceCount() > 0


In [3]:

DEFAULT_CONFIG_TRF = """

[transformer]
max_batch_items = 512

[transformer.model]
@architectures = "spacy-transformers.TransformerModel.v1"
name = "cointegrated/rubert-tiny"

[transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 600
stride = 128
#@span_getters = "spacy-transformers.sent_spans.v1"

[transformer.model.tokenizer_config]
use_fast = true
return_tensors='pt'

"""

In [4]:
DEFAULT_CONFIG = Config().from_str(DEFAULT_CONFIG_TRF)

In [5]:
# nlp = Russian()
# nlp.add_pipe("parser", source=spacy.load("ru_core_news_sm"))
# trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG["transformer"])
# trf.model.initialize()

In [6]:
nlp = spacy.load("ru_core_news_sm", disable=["ner", "lemmatizer"])
trf = nlp.add_pipe("transformer", config=DEFAULT_CONFIG["transformer"])
trf.model.initialize()

Some weights of the model checkpoint at cointegrated/rubert-tiny were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<thinc.model.Model at 0x7f5db3af4a40>

In [146]:
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x7f49e6e3eea0>),
 ('morphologizer',
  <spacy.pipeline.morphologizer.Morphologizer at 0x7f4a08b0ab30>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x7f4a08b86dc0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x7f4a08cc3dc0>),
 ('transformer',
  <spacy_transformers.pipeline_component.Transformer at 0x7f4a093e3d10>)]

In [7]:
doc = nlp("Какой-то текст 1,2,3")

In [148]:
type(doc._.trf_data.model_output.last_hidden_state)

numpy.ndarray

In [8]:
emb = doc._.trf_data.model_output.last_hidden_state
emb = emb / np.linalg.norm(emb)
emb = emb.squeeze()
emb.shape

AttributeError: 'TransformerData' object has no attribute 'model_output'

In [9]:
emb[[1,2,3]].shape

NameError: name 'emb' is not defined

In [151]:
emb = np.mean(emb, axis=0)
emb.shape

(312,)

In [152]:

nlp_model_vector_size = 312
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=nlp_model_vector_size)


converter = LabelStudioToSpacyConverter(nlp=nlp)
converter.ls_label_map = {
    "ORG": "ORG"
}

In [153]:
doc._.trf_data.wordpieces.strings[0]

['[CLS]', 'Како', '##й', '-', 'то', 'текст', '1', ',', '2', ',', '3', '[SEP]']

In [154]:
import spacy_alignments as tokenizations
print(doc._.trf_data.wordpieces.strings[0])
print([str(token) for token in doc])
a2b, b2a = tokenizations.get_alignments([str(token) for token in doc], doc._.trf_data.wordpieces.strings[0])
print(a2b)
print(b2a)

['[CLS]', 'Како', '##й', '-', 'то', 'текст', '1', ',', '2', ',', '3', '[SEP]']
['Какой', '-', 'то', 'текст', '1,2,3']
[[1, 2], [3], [4], [5], [6, 7, 8, 9, 10]]
[[], [0], [0], [1], [2], [3], [4], [4], [4], [4], [4], []]


In [12]:
doc.ents = [doc.char_span(0, 5, label="ORG",)]


In [13]:
doc

Какой-то текст 1,2,3

In [23]:
doc.ents[0].kb_id_ = "Q36734"

In [24]:
doc.ents[0].kb_id_

''

In [29]:
from datetime import datetime
datetime.now().isoformat()

'2021-10-23T18:43:50.025091'

# 2. Data import

In [156]:
input_path = Path("./data/05-labeled")
output_folder = Path("./corpus/")
kb_path = Path("./corpus/noisemon_kb")

In [157]:
files = list(input_path.glob("*.json"))
if files:
    input_path = max(files, key=os.path.getctime)
else:
    msg.fail(f"Directory {input_path} is empty")

In [158]:
data = json.loads(input_path.read_text())

In [159]:
data[0]

{'id': 334577,
 'annotations': [{'id': 27,
   'completed_by': {'id': 1,
    'email': 'maksim_ermakov@protonmail.com',
    'first_name': '',
    'last_name': ''},
   'result': [{'value': {'start': 36,
      'end': 41,
      'text': 'Русал',
      'labels': ['ORG']},
     'id': '91483591258789',
     'from_name': 'ner',
     'to_name': 'text',
     'type': 'labels'},
    {'value': {'start': 36, 'end': 41, 'text': ['Q1642605']},
     'id': '91483591258789',
     'from_name': 'entity',
     'to_name': 'text',
     'type': 'textarea'},
    {'value': {'start': 522,
      'end': 553,
      'text': 'Лондонской бирже металлов (LME)',
      'labels': ['ORG']},
     'id': '26347872065953',
     'from_name': 'ner',
     'to_name': 'text',
     'type': 'labels'},
    {'value': {'start': 522, 'end': 553, 'text': ['Q952937']},
     'id': '26347872065953',
     'from_name': 'entity',
     'to_name': 'text',
     'type': 'textarea'}],
   'was_cancelled': False,
   'ground_truth': False,
   'created_at'

# 3. Processing

In [160]:
np.mean([np.mean([[1,2,3], [1,2,4]], axis=0).tolist(), [2,2,3]], axis=0)

array([1.5 , 2.  , 3.25])

In [161]:
np.mean([[1,2,3], [1,2,4], [2,2,3]], axis=0)

array([1.33333333, 2.        , 3.33333333])

In [162]:
qid_to_vector_list = defaultdict(list)

In [163]:
qid_to_alias = defaultdict(set)

In [164]:
output = []
for labelstudio in tqdm(data[::-1]):
    id_to_qid_name_pair = defaultdict(dict)
    doc = converter.create_spacy_doc(labelstudio)
    
    emb = doc._.trf_data.model_output.last_hidden_state
    emb = emb / np.linalg.norm(emb)
    emb = emb.squeeze()
    
    entities = []

    # 1. Matching named entities with QIDs
    for chunk in labelstudio["annotations"][0]["result"]:
        if chunk["from_name"] == "ner":
            id_to_qid_name_pair[chunk["id"]]["text"] = chunk["value"]["text"]
        if chunk["from_name"] == "entity":
            id_to_qid_name_pair[chunk["id"]]["qid"] = chunk["value"]["text"][0]
            
    # 2. Create ents and assign kb_id to ents
    doc_alignment, _ = tokenizations.get_alignments([str(token) for token in doc], doc._.trf_data.wordpieces.strings[0])
    for chunk in labelstudio["annotations"][0]["result"]:
        if chunk["from_name"] == "ner":
            # 1. Try to create an entity
            QID = id_to_qid_name_pair[chunk["id"]].get("qid", None)
            if not QID:
                print(f"{id_to_qid_name_pair[chunk['id']]['text']} has no matching QID")
            try:
                entity = doc.char_span(
                    chunk["value"]["start"], 
                    chunk["value"]["end"], 
                    label=chunk["value"]["labels"][0],
                    kb_id=QID
                )
                assert entity != None, "Entity failed to be created. Probably misaligned markup"
                
                entities.append(entity)
            except:
                msg.fail("REsult:", chunk)
                msg.fail("Doc:", doc)
                msg.fail("----------")
                continue
            # 2. Get entity vector
            ent_idxs = [idx for list_of_indices in doc_alignment[entity.start: entity.end] for idx in list_of_indices]
#             print(entity.start, entity.end, ent_idxs, doc_alignment)
            entity_vector = np.mean(emb[ent_idxs], axis=0)
            qid_to_vector_list[QID].append(entity_vector)
            qid_to_alias[QID].add(entity.text)

    doc.ents = entities
    output.append(doc)

  0%|                                                                                     | 0/79 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (584 > 512). Running this sequence through the model will result in indexing errors
100%|████████████████████████████████████████████████████████████████████████████| 79/79 [00:01<00:00, 57.20it/s]


In [165]:
len(doc._.trf_data.wordpieces.strings[0])

183

In [166]:
emb.shape

(183, 312)

In [167]:
list(doc.sents)

[RUAL отчет
 Чистая прибыль компании «Русал» за январь—сентябрь 2017 года по ​по международным стандартам финансовой отчетности (МСФО) ​составила ​$782 млн, превысив на ​46,4% аналогичный показатель прошлого года.,
 Об этом говорится в отчете о финансовых результатах, опубликованном на сайте компании.,
 
 При этом выручка компании за третий квартал увеличилась на 19,4% и составила $2,46 млрд, а за девять месяцев текущего года — на 21,3%, до $7,224 млрд.,
 
 Росту финансовых показателей способствовало увеличение цены алюминия на Лондонской бирже металлов (LME), говорится в отчете.]

In [168]:
type(entity)

spacy.tokens.span.Span

In [169]:
type(s)

spacy.tokens.span.Span

In [170]:
for s in doc.sents:
    print(entity in s.ents)

False
False
False
True


In [171]:
len(doc._.trf_data.wordpieces.strings)

1

In [172]:
entity.start

94

In [173]:
doc_alignment[entity.start:entity.end]

[[164, 165], [166, 167], [168, 169], [170], [171, 172], [173]]

In [174]:
emb.shape

(183, 312)

In [176]:
# populating KB with vectors
for QID, vectors in qid_to_vector_list.items():
    kb.add_entity(entity=QID, entity_vector=np.mean(vectors, axis=0), freq=265)

In [177]:
# Populating aliases   
for QID, aliases in qid_to_alias.items():
    for alias in aliases:
        kb.add_alias(alias=alias, entities=[QID], probabilities=[1])

### At this point we have populated knowledge base `kb`, list of docs with ner and nel markup `output`

In [178]:
kb.get_alias_candidates("CME Group")[0].entity_

'Q1023876'

In [179]:
output[57].ents[0].kb_id_

'Q205012'

In [180]:
print(f"Entities in the KB: {kb.get_entity_strings()}")
print(f"Aliases in the KB: {kb.get_alias_strings()}")

Entities in the KB: ['Q4218402', 'Q660770', 'Q1141123', 'Q30893504', 'Q193199', 'Q58707', 'Q4161561', 'Q3063197', 'Q4044421', 'Q2369311', 'Q487907', 'Q2632892', 'Q1915579', 'Q1720713', 'Q223799', 'Q1071853', 'Q2380266', 'Q1477012', 'Q841458', 'Q108398998', 'Q1840188', 'Q251546', 'Q1023876', 'Q4047736', 'Q4304175', 'Q108397344', 'Q768773', 'Q171240', 'Q182477', 'Q940518', 'Q2624680', 'Q102673', 'Q173395', 'Q483551', 'Q329347', 'Q294508', 'Q379271', 'Q1368919', 'Q1809133', 'Q1284261', 'Q4059809', 'Q4258608', 'Q1549389', 'Q1461799', 'Q108397243', 'Q1616858', 'Q2304119', 'Q638448', 'Q7907607', 'Q2035424', 'Q2005769', 'Q4038038', 'Q4244736', 'Q1355823', 'Q3656098', 'Q1642605', 'Q108352452', 'Q4513187', 'Q1967957', 'Q4389244', 'Q871308', 'Q1963801', 'Q952937', 'Q130879', 'Q108396966', 'Q727452', 'Q108398486', 'Q2309', 'Q4499024', 'Q4327204', 'Q131723', 'Q1884500', 'Q2116312', 'Q205012', 'Q567050', 'Q4102033', 'Q1781702', 'Q386414', 'Q4400200']
Aliases in the KB: ['РусГидро', 'UBS', 'ПАО "Тру

In [181]:
kb.to_disk(kb_path)

# nlp.to_disk(nlp_dir)

In [182]:
train_docs = DocBin(store_user_data=True)
dev_docs = DocBin(store_user_data=True)
test_docs = DocBin(store_user_data=True)


In [183]:
for i, doc in enumerate(output):
    if i % 10 in (1,3,5):
        dev_docs.add(doc)
    if i % 10 in (4,):
        test_docs.add(doc)
    else:
        train_docs.add(doc)

In [184]:
train_docs.to_disk(output_folder / "train.spacy")
test_docs.to_disk(output_folder / "test.spacy")
dev_docs.to_disk(output_folder / "dev.spacy")

In [20]:
dir(datetime)

['__add__',
 '__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__radd__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__rsub__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 'astimezone',
 'combine',
 'ctime',
 'date',
 'day',
 'dst',
 'fold',
 'fromisocalendar',
 'fromisoformat',
 'fromordinal',
 'fromtimestamp',
 'hour',
 'isocalendar',
 'isoformat',
 'isoweekday',
 'max',
 'microsecond',
 'min',
 'minute',
 'month',
 'now',
 'replace',
 'resolution',
 'second',
 'strftime',
 'strptime',
 'time',
 'timestamp',
 'timetuple',
 'timetz',
 'today',
 'toordinal',
 'tzinfo',
 'tzname',
 'utcfromtimestamp',
 'utcnow',
 'utcoffset',
 'utctimetuple',
 'weekday',
 'year']

In [21]:
datetime.fromisoformat(datetime.now().isoformat())

datetime.datetime(2021, 10, 17, 12, 18, 17, 384835)