## Vocabulary Matching with Spacy:

In [35]:
import spacy

In [36]:
nlp = spacy.load('en_core_web_sm') # loading the English library we download earlier

In [37]:
# now we are gonna do the following things:

# rule based matching
# create library of token patterns
# match those token patterns against a doc object to return a list of found matches
# a similar type of idea to regular expression 

from spacy.matcher import Matcher

In [38]:
matcher = Matcher(nlp.vocab)

In [39]:
# now lets go ahead and detech the patterns
# SolarPower
# solar-power
# solar power

pattern1 = [{'LOWER': 'solarpower'}] # convert the word into all lower letters and check if it matches with
# some pattern 'solarpower'

# Is there any pattern with all lower case --> 'solar'
# Is it punctuated --> True / False
# Is there any pattern with all lower case --> 'power'
pattern2 = [{'LOWER':'solar'},{'IS_PUNCT': True}, {'LOWER':'power'}]

pattern3 = [{'LOWER':'solar'},{},{'LOWER':'power'}]

In [40]:
matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

In [41]:
doc = nlp(u"The solarpower industy continues to grow as the Solar power increases. Solar-power is amazing")

In [42]:
found_matches = matcher(doc)

In [43]:
print(found_matches)

[(8656102463236116519, 1, 2), (8656102463236116519, 12, 15), (8656102463236116519, 12, 15)]


In [44]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] # get string representation
    span = doc[start:end] # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 2 solarpower
8656102463236116519 SolarPower 12 15 Solar-power
8656102463236116519 SolarPower 12 15 Solar-power


In [45]:
matcher.remove('SolarPower')

In [46]:
print(found_matches)

[(8656102463236116519, 1, 2), (8656102463236116519, 12, 15), (8656102463236116519, 12, 15)]


In [47]:
# allows the pattern to matche zero or more times

pattern1_new = [{'LOWER': 'solarpower'}] 

pattern2_new = [{'LOWER':'solar'},{'IS_PUNCT': True, 'OP':'*'}, {'LOWER':'power'}]

In [48]:
matcher.add('SolarPower', None, pattern1_new, pattern2_new)

In [49]:
doc2 = nlp(u"solar power is solar-power yey!")

In [50]:
found_matches1 = matcher(doc2)

In [51]:
found_matches

[(8656102463236116519, 1, 2),
 (8656102463236116519, 12, 15),
 (8656102463236116519, 12, 15)]

In [52]:
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] # get string representation
    span = doc[start:end] # get the matched span
    print(match_id, string_id, start, end, span.text)

8656102463236116519 SolarPower 1 2 solarpower
8656102463236116519 SolarPower 12 15 Solar-power
8656102463236116519 SolarPower 12 15 Solar-power


## Phrase Matching with Spacy:

In [53]:
from spacy.matcher import PhraseMatcher    

In [54]:
matcher1 = PhraseMatcher(nlp.vocab)

In [None]:
with open('https://en.wikipedia.org/wiki/Reaganomics') as file:
    doc3 = nlp(file.read())
    
# let's go ahead and declare a list of phrases. These are the phrases we'll be searching
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# lets now associated each Phrase to a document object. # This reports back the bunch of docs in Phrase patterns
phrase_patterns = [nlp(text) for text in phrase_list]

# lets now pass each 'doc' object into the matcher. And pass each token (phrase_pattern) as a keyword arguments
matcher.add('EconMatcher', None, *phrase_pattern) # grabs each phrase_pattern token and passes it individually to matcher.add() method

found_matches = matcher(doc3)

print(found_matches)

for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] # get string representation
    span = doc3[start:end] # get the matched span
    print(match_id, string_id, start, end, span.text)


# That's how we can create a list of Phrases to match on

# What if we want to grab the context of the document
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id] # get string representation
    span = doc3[start - 5 :end + 5] # Just start -5 tokens back and end +5 token ahead
    print(match_id, string_id, start, end, span.text)

# And now it's gonna essentially report back all the sentences which lie in context!