<a href="https://colab.research.google.com/github/kullawattana/thesis_2020_spacy_colab/blob/master/51_Dependency_Chunker_Matcher_Lemma_Pattern.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#-----------------------check dependency-------------------------------
import spacy

# Load the small English pipeline and parse a short process description.
nlp = spacy.load("en_core_web_sm")
text = """In the context of a claim handling process, 
it is sometimes necessary to send a questionnaire to the claimant to gather additional information. 
The claimant is expected to return the questionnaire within five days. 
If no response is received after five days, a reminder is sent to the claimant. 
If after another five days there is still no response, another reminder is sent 
and so on until the completed questionnaire is received."""

doc = nlp(text)
# One line per token: text, dependency label, head text, head POS, children.
# Tokens labelled 'xcomp' are additionally echoed on their own line.
for token in doc:
    children = list(token.children)
    print(token.text, token.dep_, token.head.text, token.head.pos_, children)
    if token.dep_ != 'xcomp':
        continue
    print(token.text)

In prep is AUX [context]
the det context NOUN []
context pobj In ADP [the, of]
of prep context NOUN [process]
a det process NOUN []
claim compound process NOUN []
handling compound process NOUN []
process pobj of ADP [a, claim, handling]
, punct is AUX [
]

  , PUNCT []
it nsubj is AUX []
is ROOT is AUX [In, ,, it, sometimes, necessary, send, .]
sometimes advmod is AUX []
necessary acomp is AUX []
to aux send VERB []
send xcomp is AUX [to, questionnaire, to, gather]
send
a det questionnaire NOUN []
questionnaire dobj send VERB [a]
to prep send VERB [claimant]
the det claimant NOUN []
claimant pobj to ADP [the]
to aux gather VERB []
gather advcl send VERB [to, information]
additional amod information NOUN []
information dobj gather VERB [additional]
. punct is AUX [
]

  . PUNCT []
The det claimant NOUN []
claimant nsubjpass expected VERB [The]
is auxpass expected VERB []
expected ROOT expected VERB [claimant, is, return, .]
to aux return VERB []
return xcomp expected VERB [to, question

In [None]:
#----------------------NLTK Chunker------------------------------------
# The chunking / CFG helpers used below (RegexpParser, pos_tag, word_tokenize,
# CFG, ChartParser) are NLTK top-level names. The previous
# `import spacy.utils.nltk` referred to a module that spaCy does not provide.
# NOTE: word_tokenize and pos_tag need NLTK's tokenizer and tagger data to be
# installed (see nltk.download) before this cell can run.
import nltk

sentence = "the little yellow dog barked at the cat"
text = """In the context of a claim handling process, 
it is sometimes necessary to send a questionnaire to the claimant to gather additional information. 
The claimant is expected to return the questionnaire within five days. 
If no response is received after five days, a reminder is sent to the claimant. 
If after another five days there is still no response, another reminder is sent 
and so on until the completed questionnaire is received."""

# Chunk grammar: an NP is an optional determiner, any number of adjectives,
# then a singular noun.
grammar = ('''
    NP: {<DT>?<JJ>*<NN>} # NP
    ''')

chunkParser = nltk.RegexpParser(grammar)
tagged = nltk.pos_tag(nltk.word_tokenize(text))
print(tagged)

# Print the chunk tree and every subtree it contains.
tree = chunkParser.parse(tagged)
for subtree in tree.subtrees():
    print(subtree)

'''
(NP the/DT context/NN)
(NP a/DT claim/NN)
(NP process/NN)
(NP a/DT questionnaire/NN)
(NP the/DT claimant/NN)
(NP additional/JJ information/NN)
(NP The/DT claimant/NN)
(NP the/DT questionnaire/NN)
(NP no/DT response/NN)
(NP a/DT reminder/NN)
(NP the/DT claimant/NN)
(NP no/DT response/NN)
(NP another/DT reminder/NN)
(NP questionnaire/NN)
'''

# NOTE(review): Tree.draw() opens a graphical window, so it presumably needs a
# GUI display and may fail on hosted runtimes such as Colab — confirm.
tree.draw()

#-------------------Group Grammar-----------------------
# Toy context-free grammar for the ambiguous "elephant in my pajamas" sentence;
# the chart parser yields one tree per reading.
groucho_grammar = nltk.CFG.fromstring("""
  S -> NP VP
  PP -> P NP
  NP -> Det N | Det N PP | 'I'
  VP -> V NP | VP PP
  Det -> 'an' | 'my'
  N -> 'elephant' | 'pajamas'
  V -> 'shot'
  P -> 'in'
""")

sent = ['I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas']
parser = nltk.ChartParser(groucho_grammar)
for tree in parser.parse(sent):
    print(tree)

#--------------------Check with TAG sets----------------
# True when every tag in chk_set occurs among the fine-grained tags of the
# sentence. `nlp` is the spaCy pipeline loaded in an earlier cell.
chk_set = {'PRP', 'MD', 'NN'}
chk_set.issubset(t.tag_ for t in nlp("I will go to the mall"))

In [None]:
#--------------------Set Custom Boundary----------------
import spacy

text = "I want to add a text... field having name as new data"

# Baseline: sentence boundaries produced by the stock pipeline.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)
# No per-token print here: this cell never defines `token`, so printing it
# would only echo whatever an earlier cell happened to leave behind.
print("Before:", [sent.text for sent in doc.sents])

def set_custom_boundaries(doc):
    """Mark the token following each "..." token as a sentence start.

    The last token is never inspected, so there is always a following token
    to flag. The same ``doc`` object is modified in place and returned.
    """
    ellipsis_positions = (tok.i for tok in doc[:-1] if tok.text == "...")
    for position in ellipsis_positions:
        doc[position + 1].is_sent_start = True
    return doc

# Insert the boundary hook ahead of the parser, then re-run the pipeline on the
# same text and show the resulting sentence split.
nlp.add_pipe(set_custom_boundaries, before="parser")
doc = nlp(text)
sentences_after = [sent.text for sent in doc.sents]
print("After:", sentences_after)

.
Before: ['I want to add a text...', 'field having name as new data']
After: ['I want to add a text...', 'field having name as new data']


In [None]:
#--------------------Join fine-grained POS tags----------
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
# Collect each token's `tag_` value and show them as one space-separated string.
tags = [token.tag_ for token in doc]
" ".join(tags)

'NNP VBZ VBG IN VBG NNP NN IN $ CD CD'

In [None]:
#--------------------Custom Chunk------------------------
# The tokenizer, tagger and RegexpParser used here are NLTK functions; the
# previous `spacy.utils.nltk.*` references pointed at a module that spaCy does
# not provide. Imported here so the cell does not depend on another cell.
# NOTE: word_tokenize and pos_tag need NLTK's tokenizer and tagger data to be
# installed (see nltk.download) before this cell can run.
import nltk

text = """In the context of a claim handling process, 
it is sometimes necessary to send a questionnaire to the claimant to gather additional information. 
"""

data = 'The little yellow dog will then walk to the Starbucks, where he will introduce them to Michael.'
data_tok = nltk.word_tokenize(data)       # tokenisation
data_pos = nltk.pos_tag(data_tok)         # POS tagging
# Chunk rule: a base-form verb, then as few tokens as possible, up to a proper noun.
cfg_1 = "CUSTOMCHUNK: {<VB><.*>*?<NNP>}"  #should return `walk to the Starbucks`, etc.
chunker = nltk.RegexpParser(cfg_1)
data_chunked = chunker.parse(data_pos)
print(data_chunked)

In [None]:
#--------------------Noun chunk---------------------------
import spacy
nlp = spacy.load('en_core_web_sm')

# Sample passages; several contain hard line breaks ("\n") inside sentences.
text0 = "American company listed on NASDAQ in which the Group holds a 23.51% interest as of December 31, 2016."
text1 = "Including equity share of refineries in which the Group has a stake."
text2 = "Prices for oil and natural gas may fluctuate widely due to many\nfactors over which TOTAL has no control."
text3 = "This\nscope, which is different from the “operated domain” mentioned\nabove, includes all the assets in which the Group has a financial\ninterest or rights to production.\n "
text4 = "GHG emissions are also published on an equity interest basis, i.e.,\nby consolidating the Group share of the emissions of all assets in\nwhich the Group has a financial interest or rights to production.\n "
text5 = "From this profit, minus prior losses, if any, the following items are\ndeducted in the order indicated:\n 1) 5% to constitute the legal reserve fund, until said fund reaches\n10% of the share capital;\n 2) the amounts set by the Shareholders’ Meeting to fund reserves\nfor which it determines the allocation or use; and\n 3) the amounts that the Shareholders’ Meeting decides to retain.\n "

texts = [text0, text1, text2, text3, text4, text5]

# For every passage, print a header line followed by one noun chunk per line.
for index, passage in enumerate(texts):
    print('# Noun chunks in text {}:'.format(index))
    doc = nlp(passage)
    for chunk in doc.noun_chunks:
        print(chunk)

# Noun chunks in text 0:
American company
NASDAQ
the Group
a 23.51% interest
December
# Noun chunks in text 1:
equity share
refineries
the Group
a stake
# Noun chunks in text 2:
Prices
oil
natural gas
many
factors
TOTAL
no control
# Noun chunks in text 3:
This
scope
the “operated domain
all the assets
the Group
a financial
interest
rights
production
# Noun chunks in text 4:
GHG emissions
an equity interest basis
the Group share
the emissions
all assets
the Group
a financial interest
rights
production
# Noun chunks in text 5:
this profit
the following items
the order
the legal reserve fund
fund
10%
the share capital
the amounts
reserves
it
the allocation
use
the amounts
the Shareholders’ Meeting


In [None]:
#--------------------Match String and Number---------------
import copy
import spacy
from spacy.matcher import Matcher

doc = nlp("I ran 3km yesterday.")

# Pattern: a digit-only token immediately followed by the token "km".
number_then_km = [{'IS_DIGIT': True}, {'LOWER': 'km'}]
matcher = Matcher(nlp.vocab)
matcher.add('num_km', None, number_then_km)

# Gather the matched spans first, then merge each one into a single token.
matched_spans = [doc[start:end] for match_id, start, end in matcher(doc)]
with doc.retokenize() as retokenizer:
    for matched in matched_spans:
        retokenizer.merge(matched, attrs={})

[token.lemma_ for token in doc]

['-PRON-', 'run', '3km', 'yesterday', '.']

In [None]:
#------------------Match String with slice------------------
from spacy.attrs import LEMMA

doc = nlp("I ran 3km yesterday.")
span = doc[2:4]
span_start = span.start
with doc.retokenize() as retokenizer:
    # Use `strings.add` so the span text is stored in the StringStore before
    # it becomes the merged token's lemma; a plain `strings[...]` lookup only
    # yields the hash and would leave `lemma_` unresolvable if the text had
    # not been stored already.
    retokenizer.merge(span, attrs={LEMMA: doc.vocab.strings.add(span.text)})

# Re-slice the merged token from the doc instead of reading the old Span,
# whose token offsets were computed before retokenization.
span = doc[span_start:span_start + 1]
lemma_text = ''.join([token.lemma_ + token.whitespace_ for token in span]).strip()
print(lemma_text)


3km


In [None]:
#------------------Get Lemma--------------------------------
from spacy.tokens import Token

def get_lemmas(token):
    """Return the lemma list for ``token``.

    If the private ``_lemmas`` extension attribute holds a value (anything
    other than ``None``), that value is returned as is; otherwise the result
    is a one-element list containing the token's regular ``lemma_``.
    """
    custom = token._._lemmas
    if custom is None:
        return [token.lemma_]  # regular token lemma
    return custom  # custom lemmas stored on the dummy attribute

# `_lemmas` stores an optional list of custom lemmas; `lemmas` is the public
# accessor computed by get_lemmas. `force=True` lets this cell be re-run in
# the same kernel: without it, registering a name that already exists errors.
Token.set_extension('_lemmas', default=None, force=True)
Token.set_extension('lemmas', getter=get_lemmas, force=True)

doc = nlp("I ran 3km yesterday.")
span = doc[2:4]
span_lemmas = [token.lemma_ for token in span]  # get list of lemmas
merged_index = span.start
# Merge through the retokenizer, as the other cells in this notebook do,
# rather than the deprecated Span.merge().
with doc.retokenize() as retokenizer:
    retokenizer.merge(span)
# write the original lemmas to the dummy attribute of the merged token
doc[merged_index]._._lemmas = span_lemmas

[token._.lemmas for token in doc]

[['-PRON-'], ['run'], ['3', 'km'], ['yesterday'], ['.']]

In [None]:
#-------------------Pattern Group---------------------------
import copy
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en_core_web_sm')

# Phrase patterns only need tokenized text, so build them with `make_doc`
# (tokenizer only) instead of running the full pipeline on every term.
color_patterns = [nlp.make_doc(text) for text in ('red', 'green', 'yellow')]
product_patterns = [nlp.make_doc(text) for text in ('boots', 'coats', 'bag')]
material_patterns = [nlp.make_doc(text) for text in ('silk', 'yellow fabric')]

# One rule per label; each rule matches any of its phrases.
matcher = PhraseMatcher(nlp.vocab)
matcher.add('COLOR', None, *color_patterns)
matcher.add('PRODUCT', None, *product_patterns)
matcher.add('MATERIAL', None, *material_patterns)

doc1 = nlp('yellow fabric')
doc2 = nlp('red lipstick and big black boots')

# Match each doc exactly once. The earlier version streamed the docs through
# `matcher.pipe(..., n_threads=4)` and then called `matcher(doc)` again, so
# every doc was matched twice and the first set of results was thrown away.
for doc in (doc1, doc2):
    for match_id, start, end in matcher(doc):
        rule_id = nlp.vocab.strings[match_id]  # hash -> label string
        span = doc[start:end]
        print(rule_id, span.text)

COLOR yellow
MATERIAL yellow fabric
COLOR red
PRODUCT boots
