### Lingustic Annotation

In [1]:
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span
from spacy import displacy

In [2]:
nlp = spacy.load('en_core_web_lg')

In [3]:
matcher = Matcher(nlp.vocab)

In [4]:
pattern = [{'LOWER':'facebook'}, {'LEMMA':'be'}, {'POS':'ADV', 'OP':'*'}, {'POS':'ADJ'}]

In [21]:
matched_sentences = []

In [22]:
def callback_fb(matcher, doc, i, matches):
    matched_id, start, end = matches[i]
    span = doc[start:end]
    sent = span.sent
    
    match_ent = [{
        'start':span.start_char - sent.start_char,
        'end':span.end_char - sent.start_char,
        'label': 'MATCH'
    }]
    
    matched_sentences.append({'text': sent.text, 'ents': match_ent})

In [23]:
matcher.add('fb', callback_fb, pattern)

In [24]:
doc = nlp('Facebook is best place to find friends. Facebook is good for mental health. Facebook has placed a community into a global village.')

In [25]:
matches = matcher(doc)

In [26]:
matches

[(8017838677478259815, 0, 3), (8017838677478259815, 8, 11)]

In [27]:
matched_sentences

[{'text': 'Facebook is best place to find friends.',
  'ents': [{'start': 0, 'end': 16, 'label': 'MATCH'}]},
 {'text': 'Facebook is good for mental health.',
  'ents': [{'start': 0, 'end': 16, 'label': 'MATCH'}]}]

In [28]:
displacy.render(matched_sentences, style='ent', manual=True)

#### Extract Phone Number

In [67]:
### (123) 456 7890 OR (123) 456-8901
pattern_phone = [{'ORTH': "("}, {'SHAPE': 'ddd'}, {'ORTH': ")"}, {'SHAPE': 'ddd'}, {'ORTH': '-', 'OP': '?'}, {'SHAPE': 'dddd'}]

In [66]:
matcher = Matcher(nlp.vocab)

In [68]:
matcher.add('phone_number', None, pattern_phone)

In [69]:
doc = nlp('My phone number is (123) 456-7890')

In [70]:
mathces = matcher(doc)

In [71]:
print([t.text for t in doc])

['My', 'phone', 'number', 'is', '(', '123', ')', '456', '-', '7890']


In [72]:
matches

[(8017838677478259815, 0, 3), (8017838677478259815, 8, 11)]

In [73]:
for match_id, start, end in matches:
    print(start, end)
    span = doc[start:end]
    print(span.text)

0 3
My phone number
8 11
-7890


#### Email Address matching

In [74]:
pattern_email = [{'TEXT': {'REGEX': '[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+'}}]

In [75]:
matcher = Matcher(nlp.vocab)
matcher.add('email', None, pattern_email)

In [76]:
text = 'Email me at jack_bisho@gmail.com.'
doc = nlp(text)

In [77]:
matches = matcher(doc)

In [78]:
matches

[(7320900731437023467, 3, 4)]

In [79]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

jack_bisho@gmail.com


#### Efficient Phrase Matching

In [80]:
from spacy.matcher import PhraseMatcher

In [81]:
matcher = PhraseMatcher(nlp.vocab)

In [83]:
terms = ['BARAC OBAMA', 'ANGELA MARKEL', 'NEW YORK']

In [84]:
pattern = [nlp.make_doc(text) for text in terms]

In [85]:
pattern

[BARAC OBAMA, ANGELA MARKEL, NEW YORK]

In [88]:
matcher.add('pm', None, *pattern)

In [93]:
doc = nlp('U.S president BARAC OBAMA and German chancelor ANGELA MARKEL have a private meeting during U.N. meeting in NEW YORK')

In [94]:
doc

U.S president BARAC OBAMA and German chancelor ANGELA MARKEL have a private meeting during U.N. meeting in NEW YORK

In [95]:
matches = matcher(doc)

In [96]:
matches

[(10701989183306053849, 2, 4),
 (10701989183306053849, 7, 9),
 (10701989183306053849, 17, 19)]

In [98]:
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

BARAC OBAMA
ANGELA MARKEL
NEW YORK


#### Custom Rules Based Entity Recognition

In [99]:
from spacy.pipeline import EntityRuler

In [100]:
ruler = EntityRuler(nlp)

In [101]:
pattern = [{'label': 'ORG', 'pattern': 'theBisho'}, {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]

In [102]:
pattern

[{'label': 'ORG', 'pattern': 'theBisho'},
 {'label': 'GPE', 'pattern': [{'LOWER': 'san'}, {'LOWER': 'francisco'}]}]

In [103]:
ruler.add_patterns(pattern)
nlp.add_pipe(ruler)

In [104]:
doc = nlp('My new organization is theBisho.')

In [105]:
for ent in doc.ents:
    print(ent.text, ent.label_)

theBisho ORG
