In [None]:
# Phrase matching
# ---------------
#  You can think of it as a powerful version of Regex where we actually
#  take parts of speech into account for our pattern search.

#  Rule-based matching: with spaCy you can create a Matcher, which lets you
#  build a library of token patterns and then match those patterns against a
#  Doc object to return a list of the matches found.

# Resources:
# - https://spacy.io/api/matcher

In [1]:
import spacy

from spacy.matcher import Matcher


# Load the small English pipeline (the 'en_core_web_sm' package must already be
# installed); the patterns below rely on the token attributes it produces.
nlp = spacy.load('en_core_web_sm')
# The Matcher is built on the pipeline's vocabulary so that pattern strings and
# the tokens of documents produced by `nlp` resolve to the same entries.
matcher = Matcher(nlp.vocab)

In [2]:
# Find every spelling variant of the term in the text:
# 1. solarpower
# 2. solar-power
# 3. solar power

RULE_NAME = 'SolarPower'

pattern1 = [{'LOWER': 'solarpower'}]  #1 single token, any casing
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]  #2 joined by one punctuation token
pattern3 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]  #3 two adjacent tokens
# Extra demo pattern: matches ANY single token tagged as a noun. This is why the
# results also contain tokens such as "industry" and repeat spans that are
# already covered by patterns 1-3 (e.g. "solarpower", "power").
pattern4 = [{'POS': 'NOUN'}]
patterns = [pattern1, pattern2, pattern3, pattern4]

# Patterns added under an existing key are appended to it, so drop the rule
# first: re-running this cell must not register the same patterns twice.
if RULE_NAME in matcher:
    matcher.remove(RULE_NAME)

# Pass the patterns as one list. The older positional form
# `matcher.add(key, None, *patterns)` is rejected by spaCy v3; the list form is
# the v3 signature and is also accepted by late spaCy v2 releases.
matcher.add(RULE_NAME, patterns)

text = "The solar power industry continues to grow as solarpower increases. solar-power is amazing"
doc = nlp(text)

# Each match is a (match_id, start, end) tuple; start/end are token indices
# into `doc` (end exclusive).
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 2, 3), (8656102463236116519, 3, 4), (8656102463236116519, 8, 9), (8656102463236116519, 8, 9), (8656102463236116519, 9, 10), (8656102463236116519, 11, 14), (8656102463236116519, 13, 14)]


In [6]:
# Show the text and lemma of every span reported by the matcher.
for found in found_matches:
    # Each entry is a (match_id, start, end) triple; start/end are token
    # offsets into `doc`, with `end` exclusive.
    match_id, start, end = found
    span = doc[start:end]
    print(f'Text: {span.text} - Lemma: {span.lemma_}')

Text: solar power - Lemma: solar power
Text: power - Lemma: power
Text: industry - Lemma: industry
Text: solarpower - Lemma: solarpower
Text: solarpower - Lemma: solarpower
Text: increases - Lemma: increase
Text: solar-power - Lemma: solar - power
Text: power - Lemma: power
