In [1]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install networkx
# %pip install textacy

In [2]:
import spacy
import textacy
import matplotlib.pylab as plt
import networkx as nx

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the small English spaCy pipeline.
nlp = spacy.load("en_core_web_sm")
# Sample sentence used to demonstrate SVO extraction.
text = "I am going to extract SVO"
# Run the pipeline over the sample sentence.
doc = nlp(text)

[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jovyan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Collect every subject-verb-object triple textacy yields for the parsed doc.
triples = [*textacy.extract.subject_verb_object_triples(doc)]
print(triples)
# Expected output:
# [SVOTriple(subject=[I], verb=[am, going], object=[to, extract, SVO])]

[SVOTriple(subject=[I], verb=[am, going], object=[to, extract, SVO])]


In [5]:
from crosslingual_coreference import Predictor

In [6]:
# Passage containing pronouns and demonstratives for the coreference demo.
text = (
    "Do not forget about Momofuku Ando! He created instant noodles in Osaka. At"
    " that location, Nissin was founded. Many students survived by eating these"
    " noodles, but they don't even know him."
)

# Model choice: "minilm" favours speed/memory, "info_xlm" favours accuracy.
predictor = Predictor(
    model_name="minilm",
    device=-1,
    language="en_core_web_sm",
)

# Print the resolved text twice: first via the single-text call, then via
# the batch call on a one-element list.
print(predictor.predict(text)["resolved_text"])
print(predictor.pipe([text])[0]["resolved_text"])

Some weights of the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large were not used when initializing XLMRobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaModel were not initialized from the model checkpoint at nreimers/mMiniLMv2-L12-H384-distilled-from-XLMR-Large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-st

Do not forget about Momofuku Ando! Momofuku Ando created instant noodles in Osaka. At Osaka, Nissin was founded. Many students survived by eating instant noodles, but Many students don't even know Nissin.
Do not forget about Momofuku Ando! Momofuku Ando created instant noodles in Osaka. At Osaka, Nissin was founded. Many students survived by eating instant noodles, but Many students don't even know Nissin.


In [7]:
# Complete Example

In [8]:
import spacy

# Same passage as in the standalone Predictor cell earlier in the notebook.
text = (
    "Do not forget about Momofuku Ando! He created instant noodles in Osaka. At"
    " that location, Nissin was founded. Many students survived by eating these"
    " noodles, but they don't even know him."
)

# Fresh spaCy pipeline with the "xx_coref" component appended.
# NOTE(review): "device" is 1 here but -1 in the earlier Predictor cell --
# confirm which device is intended.
nlp = spacy.load("en_core_web_sm")
# Kept as the cell's last expression so the added component is displayed.
nlp.add_pipe(
    "xx_coref",
    config=dict(chunk_size=2500, chunk_overlap=2, device=1),
)



<crosslingual_coreference.CrossLingualPredictorSpacy.CrossLingualPredictorSpacy at 0x7f1f981bc7c0>

In [9]:
# Run the pipeline on the passage and read the resolved text from the
# `resolved_text` extension attribute on the Doc.
doc = nlp(text)
update_text = doc._.resolved_text
# Show the value's type, then the value itself, one per line.
print(type(update_text), update_text, sep="\n")


<class 'str'>
Do not forget about Momofuku Ando! Momofuku Ando created instant noodles in Osaka. At Osaka, Nissin was founded. Many students survived by eating instant noodles, but Many students don't even know Nissin.


In [10]:
# Parse the coreference-resolved text, then collect the SVO triples
# textacy finds in it.
doc1 = nlp(update_text)
triples = [*textacy.extract.subject_verb_object_triples(doc1)]
print(triples)

[SVOTriple(subject=[Momofuku, Ando], verb=[created], object=[noodles]), SVOTriple(subject=[students, students], verb=[do, n't, know], object=[Nissin])]


In [13]:
# Display the entities attached to the parsed, coreference-resolved doc.
doc1.ents

(Momofuku Ando, Momofuku Ando, Osaka, Osaka, Nissin, Nissin)

In [18]:
def find_verbs(doc):
    """Return the tokens of ``doc`` whose ``pos_`` attribute equals ``'VERB'``.

    ``doc`` is any iterable of token-like objects exposing ``pos_`` (the
    notebook passes spaCy sentence spans). The result is a new list that
    keeps the tokens in iteration order; it is empty when nothing matches.
    """
    return [token for token in doc if token.pos_ == 'VERB']

In [30]:
# Per-sentence report over the coreference-resolved document: SVO triples,
# verbs, named entities and a token-level breakdown.
for sent in doc1.sents:
    print(f"\n>> Reviewing sentence: {sent}")

    # One description line per token with its main linguistic attributes.
    pos = [
        f"text={token.text}, lemma={token.lemma_}, pos={token.pos_}, "
        f"tag={token.tag_}, dep={token.dep_}, shape={token.shape_}, "
        f"alpha={token.is_alpha}, stop={token.is_stop}"
        for token in sent
    ]
    postxt = '\n'.join(pos)

    # One description line per named entity found inside this sentence.
    ents = [f"text={ent.text}, label={ent.label_}" for ent in sent.ents]
    entstxt = '\n'.join(ents)

    # SVO triples are extracted per sentence, so a sentence may print none.
    triples = list(textacy.extract.subject_verb_object_triples(sent))
    for t in triples:
        print(f" - {t}")

    print(f" - Verbs in sentence: {find_verbs(sent)}")
    # Label spelling fixed (was "Enities").
    print(f" - Entities in sentence: \n{entstxt}")
    print(f" - Breakdown sentence: \n{postxt}")


>> Reviewing sentence: Do not forget about Momofuku Ando!
 - Verbs in sentence: [forget]
 - Enities in sentence: 
text=Momofuku Ando, label=PERSON
 - Breakdown sentence: 
text=Do, lemma=do, pos=AUX, tag=VB, dep=aux, shape=Xx, alpha=True, stop=True
text=not, lemma=not, pos=PART, tag=RB, dep=neg, shape=xxx, alpha=True, stop=True
text=forget, lemma=forget, pos=VERB, tag=VB, dep=ROOT, shape=xxxx, alpha=True, stop=False
text=about, lemma=about, pos=ADP, tag=IN, dep=prep, shape=xxxx, alpha=True, stop=True
text=Momofuku, lemma=Momofuku, pos=PROPN, tag=NNP, dep=compound, shape=Xxxxx, alpha=True, stop=False
text=Ando, lemma=Ando, pos=PROPN, tag=NNP, dep=pobj, shape=Xxxx, alpha=True, stop=False
text=!, lemma=!, pos=PUNCT, tag=., dep=punct, shape=!, alpha=False, stop=False

>> Reviewing sentence: Momofuku Ando created instant noodles in Osaka.
 - SVOTriple(subject=[Momofuku, Ando], verb=[created], object=[noodles])
 - Verbs in sentence: [created]
 - Enities in sentence: 
text=Momofuku Ando, labe