In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_lg')

## Expanding named entities

In [37]:
doc = nlp('Dr. Jay Jannel completed is PHD this summer from Oregon State University.')

In [38]:
doc

Dr. Jay Jannel completed is PHD this summer from Oregon State University.

In [39]:
print([(ent.text, ent.label_) for ent in doc.ents] )

[('Dr. Jay Jannel', 'PERSON')]


In [35]:
### PROBLEM: It detected Jay Jannel as person but it opted Dr.
def add_title(doc):
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == 'PERSON' and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_token.text in ('Dr', 'Dr.', 'Mr', 'Mr.'):
                new_ent = Span(doc, ent.start - 1, ent.end, label = ent.label_)
                new_ents.append(new_ent)
            else:
                    new_ents.append(ent)
    doc.ents = new_ents
    return doc

In [36]:
nlp.add_pipe(add_title, after='ner')

### User of Past of Speech (POS) and Dependency Parsing

In [40]:
nlp = spacy.load('en_core_web_lg')

In [44]:
doc = nlp('Jay Jannel is working at Oregon State University.')

In [45]:
doc

Jay Jannel is working at Oregon State University.

In [46]:
displacy.render(doc, style='dep', options = {'compact': True, 'distance': 100})

In [79]:
# Goal: Get the past and present work place.
def get_person_org(doc):
    # get all person entities
    person_entities = [ent for ent in doc.ents if ent.label_ == 'PERSON']
    print(person_entities)
    for ent in person_entities:
        head = ent.root.head
        print(head.lemma)
        if head.lemma_ == 'work':
            # get preposition which has dependcy of preposition of head
            preps = [token for token in head.children if token.dep_ == 'prep']
            print(preps)
            for prep in preps:
                # get org followed by preposition
                orgs = [token for token in prep.children if token.ent_type_ == 'ORG']
                print({'person': ent, 'orgs': orgs, 'past': head.tag_ == 'VBD'})
    return doc

In [48]:
from spacy.pipeline import merge_entities

In [98]:
nlp = spacy.load('en_core_web_lg')

In [81]:
#nlp.add_pipe(merge_entities)
nlp.add_pipe(get_person_org)

In [80]:
nlp.remove_pipe('get_person_org')

('get_person_org', <function __main__.get_person_org(doc)>)

In [84]:
doc = nlp('Jay Jannel worked at Oregon State University.')

[Jay Jannel]
10038440415813069799
[at]
{'person': Jay Jannel, 'orgs': [Oregon State University], 'past': True}


### Modify Model

In [86]:
# Goal: Get the past and present work place.
def get_person_org_modified(doc):
    # get all person entities
    person_entities = [ent for ent in doc.ents if ent.label_ == 'PERSON']
    print(person_entities)
    for ent in person_entities:
        head = ent.root.head
        print(head.lemma)
        if head.lemma_ == 'work':
            # get preposition which has dependcy of preposition of head
            preps = [token for token in head.children if token.dep_ == 'prep']
            print(preps)
            for prep in preps:
                # get org followed by preposition
                print(head.children)
                orgs = [token for token in prep.children if token.ent_type_ == 'ORG']
                
                # check for auxulary verb
                aux = [token for token in head.children if token.dep_ == 'aux']
                past_aux = any(t.tag_ == 'VBD' for t in aux)
                past = head.tag_ == 'VBD' or head.tag_ == 'VBG' and past_aux
                print({'person': ent, 'orgs': orgs, 'past': head.tag_ == 'VBD'})
    return doc

In [99]:
nlp.add_pipe(merge_entities)
nlp.add_pipe(get_person_org_modified)

In [100]:
doc = nlp('Jay Baleno was working at Nissan.')

[Jay Baleno]
10038440415813069799
[at]
<generator object at 0x0000024D48CA2A48>
{'person': Jay Baleno, 'orgs': [Nissan], 'past': False}


In [101]:
nlp.pipe_factories

{'tagger': 'tagger',
 'parser': 'parser',
 'ner': 'ner',
 'merge_entities': 'merge_entities',
 'get_person_org_modified': 'get_person_org_modified'}