# Rule-Based Matching Tutorial

In [3]:
import spacy
# Load the "en_core_web_sm" pipeline; the resulting `nlp` object is called on
# raw text in the next cell to produce a Doc.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Token pattern this tutorial builds towards, shown for reference only
# (it is registered with a Matcher in a later cell):
#   [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
#
# Inspect the tokenization first: each dict in a pattern describes exactly one
# token, and the punctuation run "!?,!" below is split into four tokens.
doc = nlp("Hello, world!?,! This is a test. Hello world.")
print([token.text for token in doc])

['Hello', ',', 'world', '!', '?', ',', '!', 'This', 'is', 'a', 'test', '.', 'Hello', 'world', '.']


In [16]:
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")

# Build the matcher on the pipeline's vocab
matcher = Matcher(nlp.vocab)

# Two alternative token patterns, both registered under the same match ID.
# Each dict describes one token via token attributes (LOWER, IS_PUNCT).
patterns = [
    # "hello", then one punctuation token, then "world"
    [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
    # "hello" immediately followed by "world"
    [{"LOWER": "hello"}, {"LOWER": "world"}],
]

# add(key, patterns): the second argument is the list of patterns; no callback is passed
matcher.add("HelloWorld", patterns)

doc = nlp("Hello hello? world! Hello world!")

# Calling the matcher yields (match_id, start_token_index, end_token_index) tuples
matches = matcher(doc)

for match_id, start, end in matches:
    # vocab.strings turns the integer match ID back into its string name;
    # doc[start:end] is the matched span.
    print(match_id, nlp.vocab.strings[match_id], start, end, doc[start:end].text)


15578876784678163569 HelloWorld 1 4 hello? world
15578876784678163569 HelloWorld 5 7 Hello world


In [17]:
import spacy
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token

# We're using a component factory because the component needs to be
# initialized with the shared vocab via the nlp object
@Language.factory("html_merger")
def create_bad_html_merger(nlp, name):
    """Factory registered as "html_merger": build a BadHTMLMerger on ``nlp.vocab``.

    ``name`` is accepted but not used by this factory.
    """
    shared_vocab = nlp.vocab
    return BadHTMLMerger(shared_vocab)

class BadHTMLMerger:
    """Pipeline component that merges ``<br>`` / ``<br/>`` tags into single tokens.

    Every merged token has the custom attribute ``token._.bad_html`` set to
    True; all other tokens keep the default value False.
    """

    def __init__(self, vocab):
        """Set up the tag matcher and the ``bad_html`` token extension.

        Args:
            vocab: The vocab the Matcher is built on.
        """
        # Each tag is matched as a three-token sequence: "<", the tag name, ">"
        patterns = [
            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
        ]
        # Register the extension only once. set_extension raises if the name is
        # already registered, which would otherwise break re-running this cell
        # or creating a second pipeline that uses this component.
        if not Token.has_extension("bad_html"):
            Token.set_extension("bad_html", default=False)
        self.matcher = Matcher(vocab)
        self.matcher.add("BAD_HTML", patterns)

    def __call__(self, doc):
        """Merge every matched tag in ``doc`` and flag the merged token.

        Args:
            doc: The Doc being processed by the pipeline.

        Returns:
            The same Doc, retokenized in place.
        """
        # Collect the spans before retokenizing; the match ID is not needed
        spans = [doc[start:end] for _, start, end in self.matcher(doc)]
        with doc.retokenize() as retokenizer:
            for span in spans:
                # Set the flag on the merged token itself rather than on the
                # sub-tokens that are about to disappear.
                retokenizer.merge(span, attrs={"_": {"bad_html": True}})
        return doc

nlp = spacy.load("en_core_web_sm")
# Append the custom component at the end of the pipeline
nlp.add_pipe("html_merger", last=True)

doc = nlp("Hello<br>world! <br/> This is a test.")
for token in doc:
    # One line per token: its text and whether it was flagged as bad HTML
    print(f"{token.text} {token._.bad_html}")


Hello False
<br> True
world False
! False
<br/> True
This False
is False
a False
test False
. False


In [18]:
from spacy import displacy
from IPython.display import HTML, display

nlp = spacy.load("en_core_web_sm")
doc = nlp("Smith founded a healthcare company")

# Render the dependency parse; the return value is kept as markup
# (jupyter=False) so it can be displayed explicitly below.
html = displacy.render(
    doc,
    style="dep",
    options={"distance": 90},
    jupyter=False,
)

# Show the markup in the notebook
display(HTML(html))

In [20]:
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
matcher = DependencyMatcher(nlp.vocab)

# Each dict describes one token of the pattern. The first dict is the anchor;
# every following dict attaches a new token (RIGHT_ID) to an already-declared
# one (LEFT_ID) through the relation operator REL_OP.
pattern = [
    {
        # Anchor: the verb "founded"
        "RIGHT_ID": "anchor_founded",
        "RIGHT_ATTRS": {"ORTH": "founded"}
    },
    {
        # founded -> its subject
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "founded_subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"},
    },
    {
        # founded -> its direct object
        "LEFT_ID": "anchor_founded",
        "REL_OP": ">",
        "RIGHT_ID": "founded_object",
        "RIGHT_ATTRS": {"DEP": "dobj"},
    },
    {
        # object -> an adjectival or compound modifier
        "LEFT_ID": "founded_object",
        "REL_OP": ">",
        "RIGHT_ID": "founded_object_modifier",
        "RIGHT_ATTRS": {"DEP": {"IN": ["amod", "compound"]}},
    }
]

matcher.add("FOUNDED", [pattern])
doc = nlp("Lee, an experienced CEO, has founded two AI startups.")
matches = matcher(doc)

print(matches) # [(4851363122962674176, [7, 0, 10, 9])]
# Each match is (match_id, token_ids); token_ids[i] is the doc index of the
# token matched by pattern[i]. Looping over `matches` (instead of indexing
# matches[0]) keeps the cell from raising IndexError when nothing matches.
for match_id, token_ids in matches:
    for node, token_id in zip(pattern, token_ids):
        print(node["RIGHT_ID"] + ":", doc[token_id].text)


[(4851363122962674176, [7, 0, 10, 9])]
anchor_founded: founded
founded_subject: Lee
founded_object: startups
founded_object_modifier: AI


In [21]:
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")

# Two entries: one with a plain string pattern, one with a list of
# per-token attribute dicts.
patterns = [
    {"label": "ORG", "pattern": "Apple"},
    {
        "label": "GPE",
        "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}],
    },
]
ruler.add_patterns(patterns)

doc = nlp("Apple is opening its first big office in San Francisco.")
# Show each entity's text together with its label
print([(entity.text, entity.label_) for entity in doc.ents])


[('Apple', 'ORG'), ('San Francisco', 'GPE')]
