In [1]:
import spacy
from spacy.tokens import Token
from spacy import displacy
from spacy.matcher import DependencyMatcher

In [2]:
spacy.require_gpu()
nlp = spacy.load("en_core_web_trf")

In [3]:
def add_attr(doc):
    attr = "has_subj"
    subj_deps = {"nsubj", "nsubjpass", "csubj", "csubjpass", "expl"}
    Token.set_extension(attr, default=False, force=True)
    verbs = {subj.head for subj in doc if subj.dep_ in subj_deps}
    for v in verbs:
        v._.set(attr, True)
    return doc

In [4]:
text = "He is a teacher, and was a doctor, and she is a bartender, and was a dancer."
doc = nlp(text)
doc = add_attr(doc)
displacy.render(doc, style="dep")
for token in doc:
    print(token, token._.has_subj)

He False
is True
a False
teacher False
, False
and False
was False
a False
doctor False
, False
and False
she False
is True
a False
bartender False
, False
and False
was False
a False
dancer False
. False


In [5]:
pattern = [
    {
        "RIGHT_ID": "PREDICATE",
        "RIGHT_ATTRS":{
            "POS": {
                "IN": ["AUX", "VERB"]
            },
            "DEP": "conj",
            "_": {
                "has_subj": False
            }
        }
    },
    {
        "LEFT_ID": "PREDICATE",
        "REL_OP": "<",
        "RIGHT_ID": "OMIT_1",
        "RIGHT_ATTRS":{
            "POS": {
                "IN": ["AUX", "VERB"]
            }
        }
    },
    {
        "LEFT_ID": "OMIT_1",
        "REL_OP": ">",
        "RIGHT_ID": "OMIT_2",
        "RIGHT_ATTRS":{
            "POS": {
                "IN": ["AUX", "VERB"]
            },
            "DEP": "conj"
        }
    },
    {
        "LEFT_ID": "OMIT_1",
        "REL_OP": ">",
        "RIGHT_ID": "SUBJ",
        "RIGHT_ATTRS": {
            "DEP": {
                "IN": ["nsubj", "nsubjpass", "csubj", "csubjpass", "expl"]
            }
        }
    },
    {
        "LEFT_ID": "PREDICATE",
        "REL_OP": ">",
        "RIGHT_ID": "OBJ",
        "RIGHT_ATTRS": {
            "DEP": "attr"        
        }
    }
]

In [6]:
matcher = DependencyMatcher(nlp.vocab)
matcher.add("is_1_conj", [pattern])
    
def extract(sent):
    # is-a pattern
    matches = matcher(sent)
    res = []
    for match_id, token_ids in matches:
        rel_type = nlp.vocab.strings[match_id]
        rel = {"REL": rel_type}
        for p, token_id in zip(pattern, token_ids):
            attr = p["RIGHT_ID"]
            rel[attr] = sent[token_id]
        res.append(rel)
    return res

for sent in doc.sents:
    knowledges = extract(sent)
    for knowledge in knowledges:
        rel_type = knowledge.pop("REL")
        print("")
        print("Relation :", rel_type)
        for key, val in knowledge.items():
            print("->", key, ":", val, "(pos: %s)" % val.pos_, "(dep: %s)" % val.dep_, "(has_subj: %s)" % val._.has_subj)


Relation : is_1_conj
-> PREDICATE : was (pos: VERB) (dep: conj) (has_subj: False)
-> OMIT_1 : is (pos: AUX) (dep: ROOT) (has_subj: True)
-> OMIT_2 : was (pos: VERB) (dep: conj) (has_subj: False)
-> SUBJ : He (pos: PRON) (dep: nsubj) (has_subj: False)
-> OBJ : doctor (pos: NOUN) (dep: attr) (has_subj: False)

Relation : is_1_conj
-> PREDICATE : was (pos: VERB) (dep: conj) (has_subj: False)
-> OMIT_1 : is (pos: AUX) (dep: ROOT) (has_subj: True)
-> OMIT_2 : is (pos: VERB) (dep: conj) (has_subj: True)
-> SUBJ : He (pos: PRON) (dep: nsubj) (has_subj: False)
-> OBJ : doctor (pos: NOUN) (dep: attr) (has_subj: False)

Relation : is_1_conj
-> PREDICATE : is (pos: VERB) (dep: conj) (has_subj: True)
-> OMIT_1 : is (pos: AUX) (dep: ROOT) (has_subj: True)
-> OMIT_2 : was (pos: VERB) (dep: conj) (has_subj: False)
-> SUBJ : He (pos: PRON) (dep: nsubj) (has_subj: False)
-> OBJ : bartender (pos: NOUN) (dep: attr) (has_subj: False)

Relation : is_1_conj
-> PREDICATE : is (pos: VERB) (dep: conj) (has_su