In [1]:
!pip install spacy==3.4.4 coreferee==1.3.1 transformers==4.25.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
# Download the large English spaCy model that later cells load by name.
# NOTE(review): `!python` runs whichever interpreter is first on PATH; confirm
# it is the same environment as the kernel when running outside Colab.
!python -m spacy download en_core_web_lg
# Install the coreferee model files for the "en" language code.
!python -m coreferee install en

2023-01-23 21:23:07.923610: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-lg==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
2023-01-23 21:23:38.533952: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting https://github.com/explosion/coreferee/raw/master/model

In [3]:
import spacy, coreferee



In [4]:
# Sample input used by both the coreference and the relation-extraction cells.
# Source: https://en.wikipedia.org/wiki/Apple_Inc.
# The literal starts and ends with a newline, with one sentence per line.
text = """
Apple was founded on April 1, 1976, by Steve Jobs, Steve Wozniak, and Ronald Wayne as a partnership.
The company's first product was the Apple I, a computer designed and hand-built entirely by Wozniak.
To finance the company's creation, Jobs sold his Volkswagen Bus, and Wozniak sold his HP-65 calculator.
Wozniak debuted the first prototype Apple I at the Homebrew Computer Club in July 1976.
The Apple I was sold as a motherboard with CPU, RAM, and basic textual-video chips—a base kit concept which would not yet be marketed as a complete personal computer.
It went on sale soon after debut for US$666.66 (equivalent to $3,175 in 2021).
Wozniak later said he was unaware of the coincidental mark of the beast in the number 666, and that he came up with the price because he liked "repeating digits".
"""

In [5]:
# Coreference resolution
# Load the large English pipeline and add the coreferee component to it; the
# next cell reads the resulting `doc._.coref_chains` attribute.
coref_nlp = spacy.load('en_core_web_lg')
# Last expression of the cell, so the component returned by add_pipe is echoed.
coref_nlp.add_pipe('coreferee')

<coreferee.manager.CorefereeBroker at 0x7f050949b460>

In [6]:
# Run the coreference pipeline over the sample text.
coref_doc = coref_nlp(text)

# Walk the document token by token. When coreferee resolves a token, use the
# text of the token(s) it refers to (joined with " and "); otherwise keep the
# token's own text.
pieces = []
for token in coref_doc:
    repres = coref_doc._.coref_chains.resolve(token)
    pieces.append(" and ".join(t.text for t in repres) if repres else token.text)

# Every piece, including the first, is prefixed with a single space.
resolved_text = "".join(" " + piece for piece in pieces)

print(resolved_text)

 
 Apple was founded on April 1 , 1976 , by Steve Jobs , Steve Wozniak , and Ronald Wayne as a partnership . 
 The Apple 's first product was the Apple I , a computer designed and hand - built entirely by Wozniak . 
 To finance the Apple 's creation , Jobs sold Wayne Volkswagen Bus , and Wozniak sold Wayne HP-65 calculator . 
 Wozniak debuted the first prototype Apple I at the Homebrew Computer Club in July 1976 . 
 The Apple I was sold as a motherboard with CPU , RAM , and basic textual - video chips — a base kit concept which would not yet be marketed as a complete personal computer . 
 Apple went on sale soon after debut for US$ 666.66 ( equivalent to $ 3,175 in 2021 ) . 
 Wozniak later said Wozniak was unaware of the coincidental mark of the beast in the number 666 , and that Wozniak came up with the price because Wozniak liked " repeating digits " . 



In [7]:
# Copied from https://github.com/Babelscape/rebel/blob/main/spacy_component.py

from spacy import Language, util
from spacy.tokens import Doc, Span
from transformers import pipeline
from typing import List


def extract_triplets(text: str) -> List[dict]:
    """
    Parse a decoded REBEL output string into relation triplets.

    The special tokens ``<s>``, ``<pad>`` and ``</s>`` are removed and the
    remainder is split on whitespace. The marker tokens then switch which
    field the following words are appended to:

    * ``<triplet>`` - following words form the head (subject),
    * ``<subj>``    - following words form the tail (object),
    * ``<obj>``     - following words form the relation type.

    A triplet is emitted whenever a new ``<triplet>`` or ``<subj>`` marker is
    seen while a relation has been collected, and once more at the end of the
    input if head, relation and tail are all non-empty. Words that appear
    before the first marker are ignored.

    :param text: str - the decoded model output to be parsed
    :type text: str
    :return: A list of dictionaries with the keys "head", "type" and "tail".
    """

    triplets = []
    subject, relation, object_ = "", "", ""
    text = text.strip()
    # "x" means "no marker seen yet"; words are dropped while in this state.
    current = "x"

    for token in (
        text.replace("<s>", "").replace("<pad>", "").replace("</s>", "").split()
    ):

        if token == "<triplet>":

            current = "t"

            # A new head starts: flush the triplet collected so far, if any.
            if relation != "":

                triplets.append(
                    {
                        "head": subject.strip(),
                        "type": relation.strip(),
                        "tail": object_.strip(),
                    }
                )
                relation = ""

            subject = ""

        elif token == "<subj>":

            current = "s"

            # Another tail for the same head: flush the previous triplet but
            # keep the head. The relation is reset by the next "<obj>".
            if relation != "":

                triplets.append(
                    {
                        "head": subject.strip(),
                        "type": relation.strip(),
                        "tail": object_.strip(),
                    }
                )

            object_ = ""

        elif token == "<obj>":

            current = "o"
            relation = ""

        else:

            if current == "t":

                subject += " " + token

            elif current == "s":

                object_ += " " + token

            elif current == "o":

                relation += " " + token

    # Flush the last triplet, which has no following marker to trigger it.
    if (subject != "") and (relation != "") and (object_ != ""):

        triplets.append(
            {"head": subject.strip(), "type": relation.strip(), "tail": object_.strip()}
        )

    return triplets


@Language.factory(
    "rebel",
    requires=["doc.sents"],
    assigns=["doc._.rel"],
    default_config={
        "model_name": "Babelscape/rebel-large",
        "device": 0,
    },
)
class RebelComponent:
    """
    spaCy pipeline component that extracts relation triplets from every
    sentence of a Doc with a text2text-generation model and stores them in the
    ``doc._.rel`` extension.
    """

    def __init__(
        self,
        nlp,
        name,
        model_name: str,
        device: int,
    ):
        """
        Build the Hugging Face pipeline and register the ``rel`` Doc extension.
        :param nlp: the pipeline object passed in by the spaCy factory (unused here)
        :param name: the component name passed in by the spaCy factory (unused here)
        :param model_name: model identifier, used for both the model and its tokenizer
        :type model_name: str
        :param device: device index forwarded to ``transformers.pipeline``
        :type device: int
        """

        assert model_name is not None, "model_name must not be None"

        self.triplet_extractor = pipeline(
            "text2text-generation",
            model=model_name,
            tokenizer=model_name,
            device=device,
        )

        # Register custom extension on the Doc
        if not Doc.has_extension("rel"):

            Doc.set_extension("rel", default={})

    def _generate_triplets(self, sents: List[Span]) -> List[List[dict]]:
        """
        Run the model over the given sentences and parse its output.
        1. The text of every sentence is passed to the triplet extractor in one call.
        2. The generated token ids of every output are decoded back into text.
        3. Each decoded text is parsed with ``extract_triplets``.
        The result keeps one inner list per input sentence, in input order, so
        callers can map triplets back to the sentence (and Doc) they came from.
        :param sents: the sentences to process
        :type sents: List[Span]
        :return: A list with one list of triplet dicts per sentence.
        """

        sentence_texts = [sent.text for sent in sents]

        # Nothing to extract; skip the model call entirely.
        if not sentence_texts:

            return []

        output_ids = self.triplet_extractor(
            sentence_texts, return_tensors=True, return_text=False
        )
        extracted_texts = self.triplet_extractor.tokenizer.batch_decode(
            [out["generated_token_ids"] for out in output_ids]
        )

        return [extract_triplets(text) for text in extracted_texts]

    def set_annotations(self, doc: Doc, triplets: List[dict]):
        """
        Store the given triplets on the Doc as entries of ``doc._.rel``.
        For each triplet, the first case-insensitive occurrence of the head and
        of the tail text is located in ``doc.text`` and turned into a Span.
        Triplets whose head equals their tail, or whose head or tail cannot be
        found or aligned to tokens, are skipped. Each relation is stored under
        the key ``(head_span.start, tail_span.start)``; an existing entry for
        the same key is kept.
        :param doc: the spacy Doc object
        :type doc: Doc
        :param triplets: flat list of dicts with the keys "head", "type" and "tail"
        :type triplets: List[dict]
        """

        # NOTE(review): str.lower() can change the length of some non-ASCII
        # text, in which case the offsets below would drift - confirm inputs.
        text = doc.text.lower()

        for triplet in triplets:

            if triplet["head"] == triplet["tail"]:

                continue

            head_index = text.find(triplet["head"].lower())
            tail_index = text.find(triplet["tail"].lower())

            if (head_index == -1) or (tail_index == -1):

                continue

            head_span = doc.char_span(
                head_index, head_index + len(triplet["head"]), alignment_mode="expand"
            )
            tail_span = doc.char_span(
                tail_index, tail_index + len(triplet["tail"]), alignment_mode="expand"
            )

            # char_span returns None when no span can be built for the offsets.
            if (head_span is None) or (tail_span is None):

                continue

            offset = (head_span.start, tail_span.start)

            if offset not in doc._.rel:

                doc._.rel[offset] = {
                    "relation": triplet["type"],
                    "head_span": head_span,
                    "tail_span": tail_span,
                }

    def __call__(self, doc: Doc) -> Doc:
        """
        Extract the triplets of every sentence in the Doc and annotate the Doc.
        :param doc: Doc
        :type doc: Doc
        :return: The same Doc object with the extracted relations in ``doc._.rel``.
        """

        sentence_triplets = self._generate_triplets(list(doc.sents))
        # Flatten the per-sentence lists; set_annotations works on the whole Doc.
        doc_triplets = [
            triplet for per_sentence in sentence_triplets for triplet in per_sentence
        ]
        self.set_annotations(doc, doc_triplets)

        return doc

    def pipe(self, stream, batch_size=128):
        """
        Process a stream of documents in batches.
        The sentences of all documents in a batch are sent to the model in one
        call; the per-sentence results are then split back by the number of
        sentences of each document, so every Doc only receives its own triplets.
        :param stream: a generator of Doc objects
        :param batch_size: The number of documents to process at a time, defaults to 128 (optional)
        """

        for docs in util.minibatch(stream, size=batch_size):

            sents_per_doc = [list(doc.sents) for doc in docs]
            batch_sents = [sent for doc_sents in sents_per_doc for sent in doc_sents]

            sentence_triplets = self._generate_triplets(batch_sents)
            index = 0

            for doc, doc_sents in zip(docs, sents_per_doc):

                n_sent = len(doc_sents)
                doc_triplets = [
                    triplet
                    for per_sentence in sentence_triplets[index : index + n_sent]
                    for triplet in per_sentence
                ]
                self.set_annotations(doc, doc_triplets)
                index += n_sent

                yield doc

In [8]:
# Second pipeline, used for relation extraction: the same spaCy model plus the
# "rebel" component registered by the factory defined in the previous cell.
nlp = spacy.load("en_core_web_lg")

nlp.add_pipe(
    "rebel",
    after="senter",
    config={
        "device": -1,  # GPU index to run on; -1 selects the CPU
        "model_name": "Babelscape/rebel-large",  # same value as the factory default
    },
)

<__main__.RebelComponent at 0x7f0425226670>

In [9]:
# Run the relation-extraction pipeline on the coreference-resolved text.
doc = nlp(resolved_text)

# Show every stored relation record; the (head, tail) offset keys are not needed.
for rel_dict in doc._.rel.values():
    print(rel_dict)

{'relation': 'founded by', 'head_span': Apple, 'tail_span': Steve Jobs}
{'relation': 'founded by', 'head_span': Apple, 'tail_span': Steve Wozniak}
{'relation': 'founded by', 'head_span': Apple, 'tail_span': Ronald Wayne}
{'relation': 'employer', 'head_span': Steve Jobs, 'tail_span': Apple}
{'relation': 'employer', 'head_span': Steve Wozniak, 'tail_span': Apple}
{'relation': 'employer', 'head_span': Ronald Wayne, 'tail_span': Apple}
{'relation': 'manufacturer', 'head_span': Apple I, 'tail_span': Apple}
{'relation': 'subclass of', 'head_span': Wayne HP-65, 'tail_span': calculator}
{'relation': 'inception', 'head_span': Apple I, 'tail_span': July 1976}
{'relation': 'has part', 'head_span': motherboard, 'tail_span': CPU}
{'relation': 'has part', 'head_span': motherboard, 'tail_span': RAM}
{'relation': 'part of', 'head_span': CPU, 'tail_span': motherboard}
{'relation': 'part of', 'head_span': RAM, 'tail_span': motherboard}
{'relation': 'said to be the same as', 'head_span': mark of the beas

In [12]:
import requests

def retrieve_wiki_id(item, timeout=10):
    """
    Look up ``item`` with the Wikidata ``wbsearchentities`` API and return the
    first search hit.

    :param item: the search term; it is converted with ``str()``, so a spaCy
        Span can be passed directly
    :param timeout: seconds to wait for the HTTP request before giving up
    :return: a dict with the keys "id" (the hit's URL) and "description", or
        None when the request fails, there is no hit, or the hit lacks one of
        the expected fields
    """
    url = "https://www.wikidata.org/w/api.php"
    # Let requests build the query string so spaces, "&", "#" and other
    # reserved characters in the search term are URL-encoded. The explicit
    # str() matters: an iterable value would be expanded into several
    # "search" parameters.
    params = {
        "action": "wbsearchentities",
        "search": str(item),
        "language": "en",
        "format": "json",
    }
    try:
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        top_hit = response.json()["search"][0]
        return {
            "id": top_hit["url"],
            "description": top_hit["display"]["description"]["value"],
        }
    except (requests.RequestException, KeyError, IndexError, TypeError, ValueError):
        # Best-effort lookup: network errors, bad responses and missing
        # fields all mean "no usable match".
        return None

In [13]:
# Collect the distinct head and tail spans of all extracted relations.
entities = set()
for rel_dict in doc._.rel.values():
    entities.add(rel_dict["head_span"])
    entities.add(rel_dict["tail_span"])

# Query Wikidata once per distinct entity and show the best match next to it.
for entity in entities:
    wiki_data = retrieve_wiki_id(entity)
    print(entity, wiki_data)

Apple {'id': '//www.wikidata.org/wiki/Q89', 'description': 'fruit of the apple tree'}
Steve Wozniak {'id': '//www.wikidata.org/wiki/Q483382', 'description': 'American computer pioneer, inventor, computer engineer and programmer; co-founder of Apple Inc.'}
Ronald Wayne {'id': '//www.wikidata.org/wiki/Q332591', 'description': 'co-founder of Apple Inc.'}
July 1976 {'id': '//www.wikidata.org/wiki/Q13066036', 'description': 'month of 1976'}
mark of the beast {'id': '//www.wikidata.org/wiki/Q6770514', 'description': 'album by Manilla Road'}
motherboard {'id': '//www.wikidata.org/wiki/Q4321', 'description': 'main printed circuit board (PCB) for a computing device'}
Apple I {'id': '//www.wikidata.org/wiki/Q18981', 'description': 'computer built by the Apple Computer Company'}
CPU {'id': '//www.wikidata.org/wiki/Q5300', 'description': 'electronic circuitry within a computer that carries out the instructions of a computer program by performing the basic arithmetic, logical, control and input/out