<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/ie_pipeline/SpaCy_informationextraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install crosslingual-coreference spacyopentapioca spacy-transformers
!pip install --upgrade google-cloud-storage
!pip install --upgrade transformers
!python -m spacy download en_core_web_sm


Collecting transformers
  Using cached transformers-4.18.0-py3-none-any.whl (4.0 MB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Using cached tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.10.3
    Uninstalling tokenizers-0.10.3:
      Successfully uninstalled tokenizers-0.10.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.4.0
    Uninstalling transformers-4.4.0:
      Successfully uninstalled transformers-4.4.0
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spacy-transformers 1.1.5 requires transformers<4.18.0,>=3.4.0, but you have transformers 4.18.0 which is incompatible.
Successfully installed tokenizers-0.12.1 transformers-4.18.0


Collecting en-core-web-sm==3.2.0


In [1]:
import spacy
import crosslingual_coreference

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
# Add rebel component https://github.com/Babelscape/rebel/blob/main/spacy_component.py
from spacy import Language
from typing import List

from spacy.tokens import Doc, Span

import re

from transformers import pipeline

def extract_triplets(text):
    """
    Parse the linearised text generated by the model into relation triplets.

    The text is read as ``<triplet> head <subj> tail <obj> relation``: tokens
    following ``<triplet>`` build the head, tokens following ``<subj>`` build
    the tail and tokens following ``<obj>`` build the relation type. Several
    ``<subj> ... <obj> ...`` groups may follow one ``<triplet>`` and then share
    its head. The special tokens ``<s>``, ``<pad>`` and ``</s>`` are removed
    before parsing.

    Returns a list of dicts with the keys ``head``, ``type`` and ``tail``.
    """
    # One text buffer per triplet field, in the key order of the emitted dicts.
    slots = {'head': '', 'type': '', 'tail': ''}
    active = None  # field currently receiving tokens; None until the first tag
    results = []

    def emit():
        # Snapshot the current buffers as one finished triplet.
        results.append({key: value.strip() for key, value in slots.items()})

    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")

    for piece in cleaned.split():
        if piece == "<triplet>":
            active = 'head'
            # A pending relation means the previous triplet is complete.
            if slots['type'] != '':
                emit()
                slots['type'] = ''
            slots['head'] = ''
        elif piece == "<subj>":
            active = 'tail'
            # Same head, new tail: flush the triplet collected so far.
            if slots['type'] != '':
                emit()
            slots['tail'] = ''
        elif piece == "<obj>":
            active = 'type'
            slots['type'] = ''
        elif active is not None:
            slots[active] += ' ' + piece

    # Flush the trailing triplet, but only when every field was filled.
    if all(value != '' for value in slots.values()):
        emit()

    return results


@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    """
    spaCy pipeline component that runs a text2text-generation model on every
    sentence of a Doc and stores the extracted relations in ``doc._.rel``.

    ``doc._.rel`` is a dict keyed by ``(head_span.start, tail_span.start)``;
    each value is a dict with the keys ``relation``, ``head_span`` and
    ``tail_span``. Only the first relation seen for a given key is kept.
    """

    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        """
        Args:
            nlp: the pipeline object passed in by the spaCy factory (unused here).
            name: the component name passed in by the spaCy factory (unused here).
            model_name: model identifier, used for both model and tokenizer.
            device: device index forwarded to the transformers pipeline.
        """
        assert model_name is not None, "model_name must not be None"
        self.triplet_extractor = pipeline("text2text-generation", model=model_name, tokenizer=model_name, device=device)
        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):
            Doc.set_extension("rel", default={})

    def _generate_triplets(self, sent: Span) -> List[dict]:
        """Run the model on one sentence and parse its output into triplets."""
        # NOTE(review): the nested indexing below mirrors the output structure
        # of the transformers release this notebook was run with — confirm it
        # still holds before changing the installed version.
        output_ids = self.triplet_extractor(sent.text, return_tensors=True, return_text=False)[0]["generated_token_ids"]["output_ids"]
        extracted_text = self.triplet_extractor.tokenizer.batch_decode(output_ids[0])
        extracted_triplets = extract_triplets(extracted_text[0])
        return extracted_triplets

    @staticmethod
    def _locate_span(doc: Doc, text: str):
        """
        Return the first token-aligned Span of ``doc`` whose text equals
        ``text``, or None when there is no such span.
        """
        if not text:
            return None
        # The text comes from a generative model, so match it literally:
        # unescaped, characters such as "(" or "+" would be read as regex syntax.
        for match in re.finditer(re.escape(text), doc.text):
            span = doc.char_span(match.start(), match.end())
            # char_span yields None when the match does not fall on token
            # boundaries (e.g. it sits inside a longer word); try the next one.
            if span is not None:
                return span
        return None

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        """
        Ground each triplet in ``doc`` and record it in ``doc._.rel``.

        Triplets whose head or tail cannot be mapped to a Span of the document
        are skipped: without a Span there is no start offset to key them by.
        """
        for triplet in triplets:
            head_span = self._locate_span(doc, triplet["head"])
            tail_span = self._locate_span(doc, triplet["tail"])
            if head_span is None or tail_span is None:
                continue
            offset = (head_span.start, tail_span.start)
            if offset not in doc._.rel:
                doc._.rel[offset] = {"relation": triplet["type"], "head_span": head_span, "tail_span": tail_span}

    def __call__(self, doc: Doc) -> Doc:
        """Extract relations sentence by sentence and return the annotated Doc."""
        for sent in doc.sents:
            sentence_triplets = self._generate_triplets(sent)
            self.set_annotations(doc, sentence_triplets)
        return doc

In [4]:
# Base pipeline: the small English spaCy model
nlp = spacy.load('en_core_web_sm')

# Coreference resolution
coref_config = {"chunk_size": 2500, "chunk_overlap": 2, "device": -1}
nlp.add_pipe("xx_coref", config=coref_config)

# OpenTapioca entity linking
nlp.add_pipe('opentapioca')

# REBEL relationship extraction
rebel_config = {
    'device': -1,  # Number of the GPU, -1 if want to use CPU
    'model_name': 'Babelscape/rebel-large',  # Model used, will default to 'Babelscape/rebel-large' if not given
}
nlp.add_pipe("rebel", after="senter", config=rebel_config)

doc = nlp("Christian Drosten works in Germany. He likes to work for Google.")

# Coreference-resolved version of the input text
print(doc._.resolved_text)

# Linked entities: text, knowledge-base id, label, description, score
for ent in doc.ents:
    ent_summary = (ent.text, ent.kb_id_, ent.label_, ent._.description, ent._.score)
    print(ent_summary)

# Extracted relations, keyed by the start offsets of head and tail
relations = doc._.rel
for value, rel_dict in relations.items():
    print(f"{value}: {rel_dict}")

Christian Drosten works in Germany. Christian Drosten likes to work for Google.
('Christian Drosten', 'Q1079331', 'PERSON', 'German virologist and university teacher', 1.2489936851203574)
('Germany', 'Q183', 'LOC', 'sovereign state in Central Europe', 1.4372806742675128)
('Google', 'Q95', 'ORG', 'American multinational Internet and technology corporation', 0.9512844491952855)
(0, 4): {'relation': 'country of citizenship', 'head_span': Christian Drosten, 'tail_span': Germany}
(11, 11): {'relation': 'subsidiary', 'head_span': Google, 'tail_span': Google}


In [5]:
# Same pipeline, this time with the pronoun already written out in the input
doc = nlp("Christian Drosten works in Germany. Christian Drosten likes to work for Google.")

print(doc._.resolved_text)

# Linked entities: text, knowledge-base id, label, description, score
for ent in doc.ents:
    ent_summary = (ent.text, ent.kb_id_, ent.label_, ent._.description, ent._.score)
    print(ent_summary)

# Extracted relations, keyed by the start offsets of head and tail
relations = doc._.rel
for value, rel_dict in relations.items():
    print(f"{value}: {rel_dict}")

Christian Drosten works in Germany. Christian Drosten likes to work for Google.
('Christian Drosten', 'Q1079331', 'PERSON', 'German virologist and university teacher', 1.8970209111714604)
('Germany', 'Q183', 'LOC', 'sovereign state in Central Europe', 2.0062482394392687)
('Christian Drosten', 'Q1079331', 'PERSON', 'German virologist and university teacher', 2.041460252110812)
('Google', 'Q95', 'ORG', 'American multinational Internet and technology corporation', 0.4212893030607042)
(0, 4): {'relation': 'country of citizenship', 'head_span': Christian Drosten, 'tail_span': Germany}
(0, 12): {'relation': 'employer', 'head_span': Christian Drosten, 'tail_span': Google}
