<a href="https://colab.research.google.com/github/kullawattana/thesis_2020_spacy_colab/blob/master/28_matching_rule_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy
from spacy.lang.en import English
from spacy.matcher import Matcher
from spacy.tokens import Token
from spacy.tokens import Span
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler
from spacy.pipeline import merge_entities
from spacy import displacy
import re

# Statistical English pipeline; the matcher below shares its vocab.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# One token pattern: "hello", any punctuation token, then "world"
# (the LOWER attribute makes the word checks case-insensitive).
pattern = [
    {"LOWER": "hello"},
    {"IS_PUNCT": True},
    {"LOWER": "world"},
]
# Register the pattern under the key "HelloWorld"; None = no on-match callback.
matcher.add("HelloWorld", None, pattern)

doc = nlp("Hello, world! Hello world!")
matches = matcher(doc)
for key_hash, first, last in matches:
    # Matches carry the hashed key; the string store turns it back into text.
    print(key_hash, nlp.vocab.strings[key_hash], first, last, doc[first:last].text)

15578876784678163569 HelloWorld 0 3 Hello, world


In [None]:
doc = nlp("The United States of America (USA) are commonly known as the United States (U.S. or US) or America.")

# Run a plain regex over the raw text, then project every hit back onto tokens.
expression = r"[Uu](nited|\.?) ?[Ss](tates|\.?)"
for match in re.finditer(expression, doc.text):
    # char_span yields None when the character offsets do not line up with
    # whole tokens, so those hits are skipped.
    span = doc.char_span(*match.span())
    if span is None:
        continue
    print("Found match:", span.text)

Found match: United States
Found match: United States
Found match: U.S.
Found match: US


In [None]:
# Build a matcher with pattern validation switched on. Reuses the `nlp` object
# created in an earlier cell.
matcher = Matcher(nlp.vocab, validate=True)
# NOTE(review): "CASEINSENSITIVE" does not appear as a token attribute anywhere
# else in this notebook (the other patterns use "LOWER" for case-insensitive
# matching) -- presumably an intentionally invalid key to exercise validation.
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
# Left disabled; presumably because adding the pattern above to a validating
# matcher raises an error -- TODO confirm against the spaCy version in use.
#matcher.add("HelloWorld", None, pattern)

In [None]:
# Tokenizer-only pipeline: the pattern below needs exact token texts only.
nlp = English()
matcher = Matcher(nlp.vocab)


def add_event_ent(matcher, doc, i, matches):
    """On-match callback: turn the i-th match into an EVENT entity on `doc`.

    The matched token range is wrapped in a Span labelled "EVENT", appended to
    the doc's existing entities, and its text is printed.
    """
    _key, first, last = matches[i]
    event = Span(doc, first, last, label="EVENT")
    # Extend the existing tuple instead of replacing it, so entities that are
    # already on the doc are kept.
    doc.ents = doc.ents + (event,)
    print(event.text)


# "Google I/O" is split into four tokens, hence four exact-text token specs.
pattern = [{"ORTH": text} for text in ("Google", "I", "/", "O")]
matcher.add("GoogleIO", add_event_ent, pattern)
doc = nlp("This is a text about Google I/O")
matches = matcher(doc)

Google I/O


In [None]:
#================================================
# We're using a class because the component needs to be initialised with
# the shared vocab via the nlp object
class BadHTMLMerger(object):
    """Pipeline component that merges ``<br>`` / ``<br/>`` token runs and flags them.

    Each matched run of tokens is merged into a single token, and the custom
    attribute ``token._.bad_html`` is set to True on the matched tokens.
    """

    def __init__(self, nlp):
        """Register the ``bad_html`` token extension and build the matcher.

        Args:
            nlp: the language object whose vocab the matcher shares.
        """
        # force=True keeps a second instantiation (e.g. re-running the cell
        # in the same kernel) from raising because "bad_html" already exists.
        Token.set_extension("bad_html", default=False, force=True)
        self.matcher = Matcher(nlp.vocab)
        self.matcher.add(
            "BAD_HTML",
            None,
            [{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
            [{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
        )

    def __call__(self, doc):
        """Merge every bad-HTML match in `doc`, flag its tokens, return `doc`."""
        # Collect the spans first; merging happens afterwards in one
        # retokenize block.
        spans = [doc[start:end] for _match_id, start, end in self.matcher(doc)]
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.bad_html = True  # Mark token as bad HTML
        return doc

# Fresh pipeline with the HTML merger appended as the final component.
nlp = spacy.load("en_core_web_sm")
html_merger = BadHTMLMerger(nlp)
nlp.add_pipe(html_merger, last=True)

doc = nlp("Hello<br>world! <br/> This is a test.")
# One line per token: its text and whether the merger flagged it.
for tok in doc:
    print(tok.text, tok._.bad_html)

Hello False
<br> True
world False
! False
<br/> True
This False
is False
a False
test False
. False


In [None]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
matched_sents = []  # Collect data of matched sentences to be visualized


def collect_sents(matcher, doc, i, matches):
    """On-match callback: record the sentence around the i-th match.

    Appends a displaCy "manual" entry to the module-level `matched_sents`
    list: the sentence text plus one mock entity labelled "MATCH" that covers
    the matched span.
    """
    match_id, start, end = matches[i]
    span = doc[start:end]  # Matched span
    sent = span.sent  # Sentence containing matched span
    # Append mock entity for match in displaCy style to matched_sents.
    # The entity offsets are relative to the sentence, so subtract the
    # sentence's start offset from the span's offsets in the doc.
    match_ents = [{
        "start": span.start_char - sent.start_char,
        "end": span.end_char - sent.start_char,
        "label": "MATCH",
    }]
    matched_sents.append({"text": sent.text, "ents": match_ents})


pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
           {"POS": "ADJ"}]
matcher.add("FacebookIs", collect_sents, pattern)  # add pattern
doc = nlp("I'd say that Facebook is evil. – Facebook is pretty cool, right?")
matches = matcher(doc)

# Render the sentences containing a match with displaCy.
# manual=True makes displaCy render straight from a dictionary.
# jupyter=True forces inline rendering: without it this cell returned the
# raw markup as a string instead of displaying it.
# (If you're not running the code within a Jupyter environment, you can
# use displacy.serve instead.)
displacy.render(matched_sents, style="ent", manual=True, jupyter=True)

'<div class="entities" style="line-height: 2.5; direction: ltr">I\'d say that \n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Facebook is evil\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">MATCH</span>\n</mark>\n.</div>\n\n<div class="entities" style="line-height: 2.5; direction: ltr">– \n<mark class="entity" style="background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;">\n    Facebook is pretty cool\n    <span style="font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; text-transform: uppercase; vertical-align: middle; margin-left: 0.5rem">MATCH</span>\n</mark>\n, right?</div>'

In [None]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# "(ddd) ddd ddd": a bracketed three-digit group followed by two more
# three-digit groups, with an optional hyphen token between the last two.
pattern = [
    {"ORTH": "("},
    {"SHAPE": "ddd"},
    {"ORTH": ")"},
    {"SHAPE": "ddd"},
    {"ORTH": "-", "OP": "?"},
    {"SHAPE": "ddd"},
]
matcher.add("PHONE_NUMBER", None, pattern)

doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
# Show the tokenization first so the pattern can be compared against it.
print([tok.text for tok in doc])
matches = matcher(doc)
for _key, first, last in matches:
    print(doc[first:last].text)

['Call', 'me', 'at', '(', '123', ')', '456', '789', 'or', '(', '123', ')', '456', '789', '!']
(123) 456 789
(123) 456 789


In [None]:
# Tokenizer-only pipeline is enough here; no statistical model is loaded.
nlp = English()
matcher = Matcher(nlp.vocab)

# Emoji treated as positive and negative sentiment signals.
pos_emoji = ["😀", "😃", "😂", "🤣", "😊", "😍"]
neg_emoji = ["😞", "😠", "😩", "😢", "😭", "😒"]

# One single-token pattern per emoji, matching its exact text.
pos_patterns = [[{"ORTH": symbol}] for symbol in pos_emoji]
neg_patterns = [[{"ORTH": symbol}] for symbol in neg_emoji]

# Function to label the sentiment
def label_sentiment(matcher, doc, i, matches):
    """On-match callback: nudge `doc.sentiment` according to the match key.

    A "HAPPY" match adds 0.1, a "SAD" match subtracts 0.1; any other key
    leaves the sentiment untouched.
    """
    key_hash, _first, _last = matches[i]
    # The match carries a hash; resolve it to its string before comparing.
    label = doc.vocab.strings[key_hash]
    delta = {"HAPPY": 0.1, "SAD": -0.1}.get(label)
    if delta is not None:
        doc.sentiment += delta

# Both emoji groups share the same callback; the match key tells them apart.
matcher.add("HAPPY", label_sentiment, *pos_patterns)
matcher.add("SAD", label_sentiment, *neg_patterns)

# Hashtag = a '#' token followed by any ASCII token; no callback needed.
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])

doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc)
for key_hash, first, last in matches:
    # Resolve the hashed key to its string and show the matched text.
    print(doc.vocab.strings[key_hash], doc[first:last].text)

HAPPY 😀
HASHTAG #MondayMotivation


In [None]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Add pattern for valid hashtag, i.e. '#' plus any ASCII token
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}])

# Register token extension. force=True lets this cell be re-run in the same
# kernel; without it the second registration raises because the extension
# already exists.
Token.set_extension("is_hashtag", default=False, force=True)

doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc)
# Keep only the spans matched under the "HASHTAG" key.
hashtags = [
    doc[start:end]
    for match_id, start, end in matches
    if doc.vocab.strings[match_id] == "HASHTAG"
]
# Merge each hashtag into a single token and flag it.
with doc.retokenize() as retokenizer:
    for span in hashtags:
        retokenizer.merge(span)
        for token in span:
            token._.is_hashtag = True

for token in doc:
    print(token.text, token._.is_hashtag)

Hello False
world False
😀 False
#MondayMotivation True


In [None]:
nlp = spacy.load('en_core_web_sm')
matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Terms only need tokenizing, so nlp.make_doc is used instead of running
# the full pipeline on each of them.
patterns = list(map(nlp.make_doc, terms))
matcher.add("TerminologyList", None, *patterns)

doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
          "converse in the Oval Office inside the White House in Washington, D.C.")
matches = matcher(doc)
for _key, first, last in matches:
    print(doc[first:last].text)

Angela Merkel
Barack Obama
Washington, D.C.


In [None]:
nlp = English()
# attr="LOWER": phrases are compared on lower-cased token text.
matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(person) for person in ("Angela Merkel", "Barack Obama")]
matcher.add("Names", None, *patterns)

doc = nlp("angela merkel and us president barack Obama")
for _key, first, last in matcher(doc):
    print("Matched based on lowercase token text:", doc[first:last])

Matched based on lowercase token text: angela merkel
Matched based on lowercase token text: barack Obama


In [None]:
nlp = English()
# attr="SHAPE": phrases are compared on token shape rather than exact text.
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
# Two example addresses supply the shapes to look for.
ip_examples = [nlp(address) for address in ("127.0.0.1", "127.127.0.0")]
matcher.add("IP", None, *ip_examples)

doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
for _key, first, last in matcher(doc):
    print("Matched based on token shape:", doc[first:last])

Matched based on token shape: 192.168.1.1
Matched based on token shape: 192.168.2.1


In [None]:
# Shape-based phrase matching of IP-like tokens.
nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
# The example addresses only contribute their token shapes as patterns.
matcher.add(
    "IP",
    None,
    nlp("127.0.0.1"),
    nlp("127.127.0.0"),
)

doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
for _match_key, tok_start, tok_end in matcher(doc):
    print("Matched based on token shape:", doc[tok_start:tok_end])

Matched based on token shape: 192.168.1.1
Matched based on token shape: 192.168.2.1


In [None]:
nlp = English()
ruler = EntityRuler(nlp)
# One phrase pattern (exact string) and one token pattern (case-insensitive).
patterns = [
    {"label": "ORG", "pattern": "Apple"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}]},
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

doc = nlp("Apple is opening its first big office in San Francisco.")
# (text, label) for every entity the ruler produced.
print([(entity.text, entity.label_) for entity in doc.ents])

[('Apple', 'ORG'), ('San Francisco', 'GPE')]


In [None]:
# Entity ruler added on top of the statistical pipeline.
nlp = spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp)
patterns = [
    {"label": "ORG", "pattern": "MyCorp Inc."},
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

doc = nlp("MyCorp Inc. is a company in the U.S.")
print([(entity.text, entity.label_) for entity in doc.ents])

[('MyCorp Inc.', 'ORG'), ('U.S.', 'GPE')]


In [None]:
nlp = English()
ruler = EntityRuler(nlp)
# Patterns carry an "id"; both spellings of the city share the same id.
patterns = [
    {"label": "ORG", "pattern": "Apple", "id": "apple"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "fran"}], "id": "san-francisco"},
]
ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

doc1 = nlp("Apple is opening its first big office in San Francisco.")
doc2 = nlp("Apple is opening its first big office in San Fran.")
# Print (text, label, id) for the entities of each document in turn.
for parsed in (doc1, doc2):
    print([(entity.text, entity.label_, entity.ent_id_) for entity in parsed.ents])

[('Apple', 'ORG', 'apple'), ('San Francisco', 'GPE', 'san-francisco')]
[('Apple', 'ORG', 'apple'), ('San Fran', 'GPE', 'san-francisco')]


In [None]:
# Load the pipeline anew for this cell; the component defined below is added
# to this object after its 'ner' step.
nlp = spacy.load("en_core_web_sm")

def expand_person_entities(doc):
    """Pipeline component: extend PERSON entities to include a preceding title.

    For every PERSON entity that does not start at token 0, the token right
    before it is checked; if it is one of "Dr", "Dr.", "Mr", "Mr.", "Ms",
    "Ms.", the entity is replaced by a span that starts one token earlier.
    All other entities -- including PERSON entities without a title in front
    of them -- are kept unchanged.

    Args:
        doc: the document whose `ents` are rewritten in place.

    Returns:
        The same `doc`, with `doc.ents` reassigned.
    """
    titles = ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms.")
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == "PERSON" and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_token.text in titles:
                # Grow the span one token to the left to take in the title.
                ent = Span(doc, ent.start - 1, ent.end, label=ent.label)
        # Always keep the entity, expanded or not. Previously a PERSON whose
        # preceding token was not a title fell through both branches and was
        # dropped from doc.ents.
        new_ents.append(ent)
    doc.ents = new_ents
    return doc

# The component rewrites doc.ents, so it is placed after the 'ner' step.
nlp.add_pipe(expand_person_entities, after='ner')

doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.")
# (text, label) for every entity after expansion.
print([(entity.text, entity.label_) for entity in doc.ents])

[('Dr Alex Smith', 'PERSON'), ('Acme Corp Inc.', 'ORG')]


In [None]:
# Load the pipeline anew for this cell; no custom component is added to it
# here -- the title lookup below is exposed as a Span extension instead.
nlp = spacy.load("en_core_web_sm")

def get_person_title(span):
    """Return the title token text directly before a PERSON span, if any.

    Gives back the preceding token's text when `span` is labelled "PERSON",
    does not start at token 0, and that token is one of "Dr", "Dr.", "Mr",
    "Mr.", "Ms", "Ms."; otherwise returns None.
    """
    if span.label_ != "PERSON" or span.start == 0:
        return None
    preceding = span.doc[span.start - 1]
    is_title = preceding.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms.")
    return preceding.text if is_title else None

# Register the Span extension as 'person_title'. force=True lets this cell be
# re-run in the same kernel (the second registration would otherwise raise
# because the extension already exists) and makes sure the getter defined
# above is the one in use.
Span.set_extension("person_title", getter=get_person_title, force=True)

doc = nlp("Dr Alex Smith chaired first board meeting of Acme Corp Inc.")
# (text, label, title) per entity; the title is computed by the getter.
print([(ent.text, ent.label_, ent._.person_title) for ent in doc.ents])

[('Alex Smith', 'PERSON', 'Dr'), ('Acme Corp Inc.', 'ORG', None)]


In [None]:
nlp = spacy.load("en_core_web_sm")


def extract_person_orgs(doc):
    """Pipeline component: print person/organisation pairs linked by "work".

    For each PERSON entity whose syntactic head has the lemma "work", every
    prepositional child of that head is inspected and a dict is printed with
    the person, the ORG tokens under the preposition, and whether the head is
    tagged VBD (simple past).

    Returns:
        The unmodified `doc`, so the component can sit in a pipeline.
    """
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ == "work":
            preps = [token for token in head.children if token.dep_ == "prep"]
            for prep in preps:
                orgs = [token for token in prep.children if token.ent_type_ == "ORG"]
                print({'person': ent, 'orgs': orgs, 'past': head.tag_ == "VBD"})
    return doc


# To make the entities easier to work with, we'll merge them into single tokens
nlp.add_pipe(merge_entities)
nlp.add_pipe(extract_person_orgs)

doc = nlp("Alex Smith worked at Acme Corp Inc.")
# jupyter=True forces inline rendering: without it this cell returned the raw
# SVG markup as a string instead of displaying it.
# If you're not in a Jupyter / IPython environment, use displacy.serve
displacy.render(doc, jupyter=True, options={'fine_grained': True})

{'person': Alex Smith, 'orgs': [Acme Corp Inc.], 'past': True}


'<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="en" id="5f012be5fd1d4760ba5b4d32c4fdc186-0" class="displacy" width="750" height="224.5" direction="ltr" style="max-width: none; height: 224.5px; color: #000000; background: #ffffff; font-family: Arial; direction: ltr">\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="134.5">\n    <tspan class="displacy-word" fill="currentColor" x="50">Alex Smith</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="50">NNP</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="134.5">\n    <tspan class="displacy-word" fill="currentColor" x="225">worked</tspan>\n    <tspan class="displacy-tag" dy="2em" fill="currentColor" x="225">VBD</tspan>\n</text>\n\n<text class="displacy-token" fill="currentColor" text-anchor="middle" y="134.5">\n    <tspan class="displacy-word" fill="currentColor" x="400">at</tspan>\n    <tspan class="displac

In [None]:
def extract_person_orgs(doc):
    """Pipeline component: print person/organisation pairs linked by "work".

    For each PERSON entity whose syntactic head has the lemma "work", one dict
    is printed per prepositional child of that head, holding the person, the
    ORG tokens under the preposition, and a past-tense flag. The flag is True
    when the head is tagged VBD, or when it is tagged VBG and has an auxiliary
    child tagged VBD.

    Returns:
        The unmodified `doc`.
    """
    for person in [e for e in doc.ents if e.label_ == "PERSON"]:
        verb = person.root.head
        if verb.lemma_ != "work":
            continue
        prepositions = [child for child in verb.children if child.dep_ == "prep"]
        for prep in prepositions:
            employers = [tok for tok in prep.children if tok.ent_type_ == "ORG"]
            auxiliaries = [child for child in verb.children if child.dep_ == "aux"]
            has_past_aux = any(a.tag_ == "VBD" for a in auxiliaries)
            # `and` binds tighter than `or`; the parentheses only spell that out.
            is_past = verb.tag_ == "VBD" or (verb.tag_ == "VBG" and has_past_aux)
            print({'person': person, 'orgs': employers, 'past': is_past})
    return doc

In [None]:
# Load a model and create the nlp object; the matcher shares its vocab.
nlp = spacy.load('en_core_web_sm')
matcher = Matcher(nlp.vocab)

# Two-token pattern matching the exact texts "iPhone" and "X".
pattern = [{'TEXT': word} for word in ('iPhone', 'X')]
matcher.add('IPHONE_PATTERN', None, pattern)

# Process the text, then run the matcher over the resulting doc.
doc = nlp("New iPhone X release date leaked")
matches = matcher(doc)

# Each match is (key hash, start token, end token); print the matched text.
for _key, first, last in matches:
    print(doc[first:last].text)

iPhone X


In [None]:
from spacy.pipeline import merge_entities

# Load the pipeline anew for this cell; the components are added further down.
nlp = spacy.load("en_core_web_sm")

### {highlight="9-11"}
# NOTE(review): the triple-quoted block below is a bare string expression, not
# a docstring. It holds a disabled variant of extract_person_orgs (the one that
# also inspects auxiliaries to decide on past tense) and defines nothing; the
# active definition is the plain `def` that follows it in this cell.
'''
def extract_person_orgs(doc):
    person_entities = [ent for ent in doc.ents if ent.label_ == "PERSON"]
    for ent in person_entities:
        head = ent.root.head
        if head.lemma_ == "work":
            preps = [token for token in head.children if token.dep_ == "prep"]
            for prep in preps:
                orgs = [t for t in prep.children if t.ent_type_ == "ORG"]
                aux = [token for token in head.children if token.dep_ == "aux"]
                past_aux = any(t.tag_ == "VBD" for t in aux)
                past = head.tag_ == "VBD" or head.tag_ == "VBG" and past_aux
                print({'person': ent, 'orgs': orgs, 'past': past})
    return doc
'''

def extract_person_orgs(doc):
    """Pipeline component: print person/organisation pairs linked by "work".

    For each PERSON entity whose syntactic head has the lemma "work", one dict
    is printed per prepositional child of that head: the person, the ORG
    tokens under the preposition, and whether the head is tagged VBD.

    Returns:
        The unmodified `doc`.
    """
    for person in [e for e in doc.ents if e.label_ == "PERSON"]:
        verb = person.root.head
        if verb.lemma_ != "work":
            continue
        prepositions = [child for child in verb.children if child.dep_ == "prep"]
        for prep in prepositions:
            employers = [tok for tok in prep.children if tok.ent_type_ == "ORG"]
            print({'person': person, 'orgs': employers, 'past': verb.tag_ == "VBD"})
    return doc

# Merge entities into single tokens first, then run the extraction component
# on the merged tokens.
for component in (merge_entities, extract_person_orgs):
    nlp.add_pipe(component)

doc = nlp("Alex Smith worked at Acme Corp Inc.")
# Render the dependency parse inline (outside Jupyter / IPython, use
# displacy.serve instead).
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})

# Example token patterns, kept as bare expressions for reference only.
# Exact token texts:
[{'TEXT': 'iPhone'}, {'TEXT': 'X'}]
# Lexical attributes:
[{'LOWER': 'iphone'}, {'LOWER': 'x'}]
# Any token attributes:
[{'LEMMA': 'buy'}, {'POS': 'NOUN'}]

# Blank English pipeline: the pattern below only uses lexical attributes.
nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)


def add_event_ent(matcher, doc, i, matches):
    """On-match callback: turn the i-th match into an EVENT entity on `doc`.

    The matched token range is wrapped in a Span labelled "EVENT", appended to
    the doc's existing entities, and its text is printed.
    """
    _key, first, last = matches[i]
    event = Span(doc, first, last, label="EVENT")
    doc.ents = doc.ents + (event,)
    print(event.text)


# Lexical-attribute pattern for "<digits> fifa world cup <punctuation>".
pattern = [{'IS_DIGIT': True}]                                    # 2018
pattern += [{'LOWER': word} for word in ('fifa', 'world', 'cup')]  # fifa world cup
pattern += [{'IS_PUNCT': True}]                                   # :

matcher.add("fifa", add_event_ent, pattern)
doc = nlp("2018 FIFA World Cup: France won!")
matches = matcher(doc)

# The callback has already printed each match once; print the matched text
# again from the returned (key, start, end) triples.
for _key, first, last in matches:
    matched_span = doc[first:last]
    print(matched_span.text)

{'person': Alex Smith, 'orgs': [Acme Corp Inc.], 'past': True}


2018 FIFA World Cup:
2018 FIFA World Cup:


In [None]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# "computer" used as a noun, followed by any token that is not a verb.
pattern = [
    {'LOWER': 'computer', 'POS': 'NOUN'},
    {'POS': {'NOT_IN': ['VERB']}},
]

matcher.add("Matching", None, pattern)
doc = nlp("Computer programming is the process of writing instructions that get executed by computers. The instructions, also known as code, are written in a programming language which the computer can understand and use to perform a task or solve a problem. Basic computer programming involves the analysis of a problem and development of a logical sequence of instructions to solve it. There can be numerous paths to a solution and the computer programmer seeks to design and code that which is most efficient. Among the programmer’s tasks are understanding requirements, determining the right programming language to use, designing or architecting the solution, coding, testing, debugging and writing documentation so that the solution can be easily understood by other programmers.Computer programming is at the heart of computer science. It is the implementation portion of software development, application development and software engineering efforts, transforming ideas and theories into actual, working solutions.")
matches = matcher(doc)

for key_hash, first, last in matches:
    # Hashed key, its string form, the token offsets and the matched text.
    print(key_hash, nlp.vocab.strings[key_hash], first, last, doc[first:last].text)

6895354335150655416 Matching 0 2 Computer programming
6895354335150655416 Matching 45 47 computer programming
6895354335150655416 Matching 75 77 computer programmer
6895354335150655416 Matching 131 133 Computer programming
6895354335150655416 Matching 138 140 computer science


In [None]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Any verb form with the lemma "love", directly followed by a noun.
pattern = [{'LEMMA': 'love', 'POS': 'VERB'}, {'POS': 'NOUN'}]

matcher.add("Matching", None, pattern)
doc = nlp("I loved dogs but now I love cats more.")
matches = matcher(doc)

for key_hash, first, last in matches:
    # Hashed key, its string form, the token offsets and the matched text.
    print(key_hash, nlp.vocab.strings[key_hash], first, last, doc[first:last].text)

6895354335150655416 Matching 1 3 loved dogs
6895354335150655416 Matching 6 8 love cats


In [None]:
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# Any form of "buy", an optional determiner ('?' = zero or one), then a noun.
pattern = [{'LEMMA': 'buy'}, {'POS': 'DET', 'OP': '?'}, {'POS': 'NOUN'}]

matcher.add("Matching", None, pattern)
doc = nlp("I bought a smartphone. Now I'm buying apps.")
matches = matcher(doc)

for key_hash, first, last in matches:
    # Hashed key, its string form, the token offsets and the matched text.
    print(key_hash, nlp.vocab.strings[key_hash], first, last, doc[first:last].text)
# Source of these examples:
#https://github.com/ines/spacy-course/tree/master/slides

6895354335150655416 Matching 1 4 bought a smartphone
6895354335150655416 Matching 8 10 buying apps
