In [26]:
import spacy
import json
import typer
from spacy.tokens import Span, DocBin, Doc
from spacy.vocab import Vocab
import random
from wasabi import Printer
from pathlib import Path
import numpy as np
from tqdm import tqdm


from rel_pipeline import get_tokens, calculate_tensor, create_pairs,dict_to_vector


In [27]:
    vocab = Vocab()
    spacy.prefer_gpu()

    nlp = spacy.load("../../ner_component/training/model-best")
    mask_entities = ["CONDITION", "BENEFIT"]
    relations = ["RELATED"]

    dep = "../assets/dependencies.json"
    pos = "../assets/partofspeech.json"
    
    dep_list = None
    pos_list = None

    with open(dep, "r") as f:
        dep_list = json.load(f)

    with open(pos, "r") as f:
        pos_list = json.load(f)

In [28]:
lines = []
with open("../assets/annotations.jsonl","r", encoding="utf8") as jsonfile:
        for line in jsonfile:
            lines.append(line)

In [29]:
docs = []
for line in lines:
    example = json.loads(line)
    if example["answer"] == "accept":
        doc = nlp(example["text"])
        ents_list = []

        for span in example["spans"]:
            ents_list.append(
                Span(
                    doc,
                    span["token_start"],
                    span["token_end"] + 1,
                    span["label"],
                )
            )

        doc.set_ents(ents_list)
        tokens = get_tokens(doc)
        pairs = calculate_tensor(
            create_pairs(tokens),
            mask_entities,
            relations,
            dep_list,
            pos_list,
        )

        for relation in example["relations"]:
            key1 = (
                relation["head_span"]["token_start"],
                relation["head_span"]["token_end"],
                relation["child_span"]["token_start"],
                relation["child_span"]["token_end"],
            )
            key2 = (
                relation["child_span"]["token_start"],
                relation["child_span"]["token_end"],
                relation["head_span"]["token_start"],
                relation["head_span"]["token_end"],
            )

            if key1 in pairs:
                pairs[key1]["relation"][relation["label"]] = 1.0
            elif key2 in pairs:
                pairs[key2]["relation"][relation["label"]] = 1.0

        if not doc.has_extension("rel"):
            doc.set_extension("rel", default={})
        doc._.rel = pairs

        docs.append(doc)

In [36]:
all_tensors = []
duplicates = 0
conflicts = 0
examples = 0

print(len(docs))
for i in tqdm(range(len(docs))):
    doc = docs[i]
    relations = doc._.rel
    examples += len(relations)
    for pair in relations:
        current_tensor = relations[pair]["tensor"]
        current_prediction = dict_to_vector(relations[pair]["relation"])
        is_unique = True
        
        if np.isin(current_tensor,all_tensors).all():
            duplicates+=1
            is_unique = False
        
        #for tensor in all_tensors:
        #    if np.array_equal(current_tensor,tensor[0]):
        #        duplicates+=1
        #        is_unique = False
        #        if np.array_equal(current_prediction,tensor[1]):
        #            conflicts+=1
                    
        if is_unique:
            all_tensors.append(current_tensor)
        
print(len(all_tensors))        
print(duplicates,conflicts,examples)

  0%|                                                                                                                                                                                 | 0/351 [00:00<?, ?it/s]

351


  4%|██████▏                                                                                                                                                                 | 13/351 [00:27<11:50,  2.10s/it]


KeyboardInterrupt: 