<a href="https://colab.research.google.com/github/yahyasungur/nlp_dl_ml_projects/blob/master/Vocabulary_and_Matching.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Rule-Based Matching

In [None]:
import spacy
# Load the 'en_core_web_sm' pipeline; its vocab is handed to the matcher below.
nlp = spacy.load('en_core_web_sm')

In [None]:
from spacy.matcher import Matcher
# Build the rule-based matcher on top of the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

## Creating Patterns

In [None]:
# A pattern is a list of dictionaries; each dictionary describes one token.
#
# "Hi Yahya" can appear in the following ways:
# 1) Hi Yahya / hi yahya / Hi YAHYA   (any casing)
# 2) Hi-Yahya                         (punctuation between the two words)

# Two consecutive tokens whose lowercase forms are 'hi' and 'yahya'.
pattern_1 = [
    {'LOWER': 'hi'},
    {'LOWER': 'yahya'},
]

# The same two tokens with one punctuation token between them.
pattern_2 = [
    {'LOWER': 'hi'},
    {'IS_PUNCT': True},
    {'LOWER': 'yahya'},
]

# 'LOWER' and 'IS_PUNCT' are token attribute names;
# they must be spelled exactly like this.

In [None]:
# Register the patterns with the matcher.
#
# A match rule consists of:
# 1) an ID key (a string naming the rule),
# 2) a list of one or more patterns,
# 3) optionally, an on_match callback passed as a keyword argument.
#
# The patterns are passed as a single list: the older positional form
# matcher.add(key, None, *patterns) is rejected by spaCy v3.
matcher.add('Hi Yahya', [pattern_1, pattern_2])

In [None]:
# Run the sample text through the pipeline to get the document to search.
doc = nlp(
    "Hi Yahya, how are you ? I'm dying to see you these days. "
    "I hope to see you soon and I can say hi! to you."
)

In [None]:
# Display the document; it is shown as its text.
doc

Hi Yahya, how are you ? I'm dying to see you these days. I hope to see you soon and I can say hi! to you.

## Finding the matches

In [None]:
find_matches = matcher(doc) # pass the doc to the matcher object and store the result
print(find_matches)

# The result is a list of tuples, one per match:
# (match ID, start token index, end token index)
# The match ID is an integer; nlp.vocab.strings maps it back to the rule key.

[(2536870178538414864, 0, 2)]


In [None]:
# Walk over the matches and print, for each one, the numeric match ID,
# the rule key it stands for, the token offsets and the matched text.
for match_id, start, end in find_matches:
    # Look the integer ID up in the vocab's string store to recover the key.
    string_id = nlp.vocab.strings[match_id]
    # Slice the document by token offsets to obtain the matched span.
    span = doc[start:end]
    print(f"{match_id} {string_id} {start} {end} {span.text}")

2536870178538414864 Hi Yahya 0 2 Hi Yahya


In [None]:
# Remove the rule registered earlier. The key must be the one that was passed
# to matcher.add ('Hi Yahya'); with any other key the old rule stays active
# and keeps matching alongside whatever rule is added next.
matcher.remove('Hi Yahya')

## Setting pattern options and quantifiers

In [None]:
# Redefine the patterns, this time using a quantifier.

# Plain two-token form: 'hi' directly followed by 'yahya'.
pattern_3 = [
    {'LOWER': 'hi'},
    {'LOWER': 'yahya'},
]

# 'OP': '*' lets the punctuation token match zero or more times,
# so any amount of punctuation may sit between the two words.
pattern_4 = [
    {'LOWER': 'hi'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'yahya'},
]

# Register the new set of patterns under the 'Hello World' key.
# The patterns are passed as one list: the older positional form
# matcher.add(key, None, *patterns) is rejected by spaCy v3.
matcher.add('Hello World', [pattern_3, pattern_4])

In [None]:
# Sample text containing the phrase in three forms: two casings and a hyphenated one.
doc_2 = nlp("You can print Hi Yahya or hi yahya or Hi-Yahya")

In [None]:
# Run the matcher over the second document and print the raw match tuples.
find_matches = matcher(doc_2)
print(find_matches)

[(2536870178538414864, 3, 5), (8585552006568828647, 3, 5), (2536870178538414864, 6, 8), (8585552006568828647, 6, 8), (2536870178538414864, 9, 12), (8585552006568828647, 9, 12)]


# Phrase Matching

In [None]:
import spacy
# Load the 'en_core_web_sm' pipeline again for the phrase-matching section.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Import the PhraseMatcher class
from spacy.matcher import PhraseMatcher
# NOTE: this rebinds `matcher`; from here on it is a PhraseMatcher, not the Matcher used above.
matcher = PhraseMatcher(nlp.vocab)

In [None]:
# Phrases to search for, written as plain strings.
phrase_list = [
    "Barack Obama",
    "Angela Merkel",
    "Washington, D.C.",
]

In [None]:
# Convert each phrase to a Doc object with a list comprehension.
# nlp.make_doc only tokenizes the text, which is enough for a PhraseMatcher
# created with its default settings; nlp(text) would also run every other
# pipeline component on each phrase without affecting the matches.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [None]:
phrase_patterns # the patterns are Doc objects, not strings

[Barack Obama, Angela Merkel, Washington, D.C.]

In [None]:
type(phrase_patterns[1])
# Each element is a spaCy Doc,
# which is why the list above is displayed without quotes.

spacy.tokens.doc.Doc

In [None]:
# Register every phrase Doc under a single rule key.
# The Docs are passed as one list: the older positional form
# matcher.add(key, None, *docs) is rejected by spaCy v3.
matcher.add("TerminologyList", phrase_patterns)

In [None]:
# Text to search; it mentions all three phrases from phrase_list.
doc_3 = nlp(
    "German Chancellor Angela Merkel and US President Barack Obama "
    "converse in the Oval Office inside the White House in Washington, D.C."
)

In [None]:
find_matches = matcher(doc_3) # pass the doc to the matcher object and store the result
print(find_matches)

[(3766102292120407359, 2, 4), (3766102292120407359, 7, 9), (3766102292120407359, 19, 22)]


In [None]:
# Walk over the phrase matches and print, for each one, the numeric match ID,
# the rule key it stands for, the token offsets and the matched text.
for match_id, start, end in find_matches:
    # Look the integer ID up in the vocab's string store to recover the key.
    string_id = nlp.vocab.strings[match_id]
    # Slice the document by token offsets to obtain the matched span.
    span = doc_3[start:end]
    print(f"{match_id} {string_id} {start} {end} {span.text}")

3766102292120407359 TerminologyList 2 4 Angela Merkel
3766102292120407359 TerminologyList 7 9 Barack Obama
3766102292120407359 TerminologyList 19 22 Washington, D.C.
