In [2]:
import spacy

## EntityRuler

In [3]:
# Sample sentence analysed by the EntityRuler cells that follow.
text = "West Chestertenfieldville was referenced in Mr. Deeds. "

# Load the `en_core_web_sm` pipeline; later cells call it as `nlp`.
nlp = spacy.load("en_core_web_sm")


In [4]:
# Attach an EntityRuler component to the pipeline and keep a handle to it so
# patterns can be registered on it in a later cell. No position argument is
# passed, so the component is placed wherever `add_pipe` puts it by default
# (NOTE(review): spaCy documents that as the end of the pipeline, i.e. after
# the statistical `ner` component — confirm for the installed version).
ruler = nlp.add_pipe("entity_ruler")

In [5]:
# Run the pipeline on the sample sentence, then list every entity span
# together with the label assigned to it.
doc = nlp(text)
for ent in doc.ents:
    print(f"{ent.text} {ent.label_}")
    

West Chestertenfieldville GPE
Deeds PERSON




In [6]:
# A single rule: the string pattern "West Chestertenfieldville" mapped to
# the label GPE.
patterns = [{"label": "GPE", "pattern": "West Chestertenfieldville"}]

# Register the rule on the EntityRuler that was added to `nlp` earlier.
ruler.add_patterns(patterns)

In [7]:
# Re-process the same sentence now that the ruler holds a pattern, and list
# the entities again for comparison with the earlier listing.
doc2 = nlp(text)

for ent in doc2.ents:
    print(f"{ent.text} {ent.label_}")
    

West Chestertenfieldville GPE
Deeds PERSON


In [8]:
# Load the small English pipeline a second time under the name `nlp2`.
# NOTE(review): `nlp2` is not used by any cell visible here — confirm whether
# a follow-up step (e.g. adding a ruler to it) is missing.
nlp2 = spacy.load("en_core_web_sm")
# Display the existing `ruler` object (the one added to `nlp`, not `nlp2`).
ruler

<spacy.pipeline.entityruler.EntityRuler at 0x14485b050>

### Matcher vs EntityRuler

The `Matcher` and `EntityRuler` both find sequences of tokens in a spaCy `Doc`. The `Matcher` is the lower-level API: it operates on `Token` objects, only lets you match on `Token` attributes such as the text, lemma, part-of-speech tag or entity type, and returns the matches as `(match_id, start, end)` tuples without modifying the `Doc`. The `EntityRuler` is a higher-level pipeline component: it accepts token patterns as well as exact phrase patterns and adds the matched spans to `Doc.ents` under the label you assign, by default leaving entities that are already set untouched. Merging entities into single tokens is done by a separate pipeline component (`merge_entities`), not by the `EntityRuler` itself.

In [9]:
from spacy.matcher import Matcher

# Load the pipeline again. This rebinds `nlp`, so the pipeline object that
# had the EntityRuler attached is no longer reachable through this name.
nlp = spacy.load("en_core_web_sm")
# Create a Matcher built on the pipeline's vocabulary (`nlp.vocab`).
matcher = Matcher(nlp.vocab)

In [10]:
# Token pattern with one entry: a single token whose LIKE_EMAIL attribute is
# true.
pattern = [{"LIKE_EMAIL": True}]
# Register it under the rule name "EMAIL_ADDRESS"; `add` takes a list of
# patterns, hence the extra list around `pattern`.
matcher.add("EMAIL_ADDRESS", [pattern])

In [11]:
# Process a sentence that ends with an e-mail address, then run the matcher
# over the resulting Doc to collect the matches.
# The address uses the reserved `example.com` domain so the notebook does not
# publish what looks like a real personal mailbox; it is still a single
# e-mail-like token in the same position, so the match is unchanged.
doc = nlp("This is my email address: user@example.com")
matches = matcher(doc)

In [12]:
# The matcher result is a list of 3-tuples; later cells treat the first item
# as the rule's ID and the other two as token slice bounds on `doc`.
print(matches)

[(16571425990740197027, 6, 7)]


In [13]:
# The first item of a match tuple is an integer ID; looking it up in the
# vocabulary recovers the rule name the pattern was registered under.
first_match_id = matches[0][0]
print(nlp.vocab[first_match_id].text)

EMAIL_ADDRESS


In [14]:
# Read the whole sample article into memory. The encoding is passed
# explicitly because open() otherwise uses a platform/locale-dependent
# default, which can mis-decode or fail on non-ASCII characters
# (assumes the file is stored as UTF-8 — confirm).
# Note: this rebinds `text`, replacing the sample sentence defined earlier.
with open("data/wiki_us.txt", "r", encoding="utf-8") as f:
    text = f.read()


In [27]:
# Parse the article once so the fresh matcher below has a Doc to scan.
doc = nlp(text)

# Rule "PROPER_NOUNS": one or more proper-noun tokens followed by a verb.
pattern = [
    {"POS": "PROPN", "OP": "+"},
    {"POS": "VERB"},
]
matcher = Matcher(nlp.vocab)
matcher.add("PROPER_NOUNS", [pattern])

# Every span that satisfies the rule is returned, so a multi-word proper
# noun yields overlapping matches (one for each trailing sub-run).
matches = matcher(doc)
print(matches)

[(3232560085755078826, 87, 89), (3232560085755078826, 86, 89), (3232560085755078826, 159, 161), (3232560085755078826, 185, 187), (3232560085755078826, 184, 187), (3232560085755078826, 206, 208), (3232560085755078826, 205, 208), (3232560085755078826, 228, 230), (3232560085755078826, 259, 261), (3232560085755078826, 258, 261), (3232560085755078826, 284, 286), (3232560085755078826, 283, 286), (3232560085755078826, 282, 286), (3232560085755078826, 326, 328), (3232560085755078826, 325, 328)]


In [28]:
# Show the matched text by slicing the Doc with each match's token bounds.
for match_id, start, end in matches:
    print(doc[start:end])

States shares
United States shares
Indians migrated
States emerged
United States emerged
Britain led
Great Britain led
U.S. began
States spanned
United States spanned
War led
Civil War led
American Civil War led
States fought
United States fought
