In [2]:
import spacy
# Load the 'en_core_web_sm' pipeline once; every later cell reuses `nlp`
# (and its shared vocab) to build documents and matchers.
nlp = spacy.load('en_core_web_sm')

In [3]:
from spacy.matcher import Matcher
# Token-pattern matcher built on the pipeline's vocab (`nlp.vocab`).
matcher = Matcher(nlp.vocab)

In [4]:
# Token patterns for the spellings we want to catch. Each dict describes one
# token; 'LOWER' is compared against the lowercased token text.
pattern1 = [{'LOWER': 'solarpower'}]  # one token
pattern2 = [  # two consecutive tokens
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]
pattern3 = [  # 'solar', one punctuation token, 'power'
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]

# Register all three under a single rule name.
solar_power_patterns = [pattern1, pattern2, pattern3]
matcher.add('SolarPower', solar_power_patterns)

In [5]:
# Sample text containing all three spellings. Adjacent string literals are
# joined by the parser, so the text is the same single-line sentence pair.
doc = nlp(
    'The Solar Power industry continues to grow as demand '
    'for solarpower increases. Solar-power cars are gaining popularity.'
)

In [6]:
# Run the matcher over `doc`. The result is a list of (match_id, start, end)
# tuples; start/end are used below as token offsets to slice `doc`.
found_matches = matcher(doc)
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


In [7]:
# Resolve each match hash back to its rule name and show the matched text.
for match_id, start, end in found_matches:
    rule_name = nlp.vocab.strings[match_id]  # hash -> 'SolarPower'
    matched_text = doc[start:end].text
    print(match_id, rule_name, start, end, matched_text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 10 11 solarpower
8656102463236116519 SolarPower 13 16 Solar-power


In [8]:
# Redefine the rule with two patterns. 'OP': '*' lets the punctuation token
# occur zero or more times between 'solar' and 'power'.
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

# Remove the previously registered patterns before adding the rule again.
matcher.remove('SolarPower')
matcher.add('SolarPower', [pattern1, pattern2])

In [9]:
# Second sample: one hyphen and a run of three hyphens between the words.
doc2 = nlp('Solar-power energy runs solar---power cars.')

In [10]:
def show_lemmas(text):
    """Print one aligned row per token: text, coarse POS tag, lemma.

    Args:
        text: An iterable of token-like objects exposing ``text``, ``pos_``
            and ``lemma_`` attributes (called below with a parsed document).

    Returns:
        None. Rows are written to stdout; the text column is padded to 12
        characters and the POS column to 6.
    """
    for tok in text:
        surface, pos, lemma = tok.text, tok.pos_, tok.lemma_
        print(f'{surface:12} {pos:6} {lemma}')
        
# Print the text / POS / lemma table for doc2 to see how it was split into tokens.
show_lemmas(doc2)

Solar        ADJ    solar
-            PUNCT  -
power        NOUN   power
energy       NOUN   energy
runs         VERB   run
solar        ADJ    solar
---          PUNCT  ---
power        NOUN   power
cars         NOUN   car
.            PUNCT  .


In [11]:
# Apply the redefined 'SolarPower' rule to doc2 and show the raw
# (match_id, start, end) tuples.
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


In [12]:
from spacy.matcher import PhraseMatcher
# NOTE: this rebinds `matcher`; from here on it is a PhraseMatcher, not the
# token-pattern Matcher created earlier in the notebook.
matcher = PhraseMatcher(nlp.vocab)

In [15]:
# Read the whole article (decoded as cp1252), then parse it once the file
# handle has been closed.
with open('../TextFiles/reaganomics.txt', encoding='cp1252') as reagan_file:
    reagan_text = reagan_file.read()
doc3 = nlp(reagan_text)

In [18]:
# Phrases to search for. PhraseMatcher patterns are Doc objects rather than
# lists of token dicts.
phrase_list = [
    'voodoo economics',
    'supply-side economics',
    'trickle-down economics',
    'free-market economics',
]

# Matching on the default (verbatim token text) attribute only needs
# tokenization, so nlp.make_doc() skips the rest of the pipeline for each phrase.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

# Pass the patterns as a single list -- the documented add(key, docs) signature
# and the same call shape used for the token Matcher earlier in this notebook --
# instead of the legacy add(key, None, *docs) form.
matcher.add('VoodooEconomics', phrase_patterns)

# Each match is a (match_id, start, end) tuple; start/end are token offsets
# into doc3.
matches = matcher(doc3)

In [19]:
# Display the raw (match_id, start, end) tuples found in doc3.
matches

[(3473369816841043438, 41, 45),
 (3473369816841043438, 49, 53),
 (3473369816841043438, 54, 56),
 (3473369816841043438, 61, 65),
 (3473369816841043438, 673, 677),
 (3473369816841043438, 2987, 2991)]

In [20]:
# Show every phrase hit: hash, rule name, token offsets and the matched text.
for match_id, start, end in matches:
    rule_name = nlp.vocab.strings[match_id]  # hash -> 'VoodooEconomics'
    print(match_id, rule_name, start, end, doc3[start:end].text)

3473369816841043438 VoodooEconomics 41 45 supply-side economics
3473369816841043438 VoodooEconomics 49 53 trickle-down economics
3473369816841043438 VoodooEconomics 54 56 voodoo economics
3473369816841043438 VoodooEconomics 61 65 free-market economics
3473369816841043438 VoodooEconomics 673 677 supply-side economics
3473369816841043438 VoodooEconomics 2987 2991 trickle-down economics
