In [1]:
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 15 00:19:41 2017
Workshop: IE - Named Entity Recognition
@author: issfz
"""

'\nCreated on Fri Sep 15 00:19:41 2017\nWorkshop: IE - Named Entity Recognition\n@author: issfz\n'

In [3]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

In [4]:
# ===== POS Tagging and NER using NLTK =====

sent = '''There are three companies, Alpha Pte Ltd, Beta Co. and Gamma & Sons, involved in
the merger. The resulting company will be named GreekAlphabet Pte Ltd (GAPL) and
will be privately held. Prof Aleph, the current CEO of Alpha, will become the chairman
of GreekAlphabet while Dr Naught, the MD of Beta will be the CEO of the new
company. Dr Tom Gamma and his two sons, Dick Gamma and Harry Gamma, have
announced that they will be retiring once Gamma & Sons is acquired. They wish the
future management of GAPL the very best..
'''

In [5]:
# The input for POS tagger needs to be tokenized first.
sent_pos = pos_tag(word_tokenize(sent))
sent_pos

[('There', 'EX'),
 ('are', 'VBP'),
 ('three', 'CD'),
 ('companies', 'NNS'),
 (',', ','),
 ('Alpha', 'NNP'),
 ('Pte', 'NNP'),
 ('Ltd', 'NNP'),
 (',', ','),
 ('Beta', 'NNP'),
 ('Co.', 'NNP'),
 ('and', 'CC'),
 ('Gamma', 'NNP'),
 ('&', 'CC'),
 ('Sons', 'NNP'),
 (',', ','),
 ('involved', 'VBN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('merger', 'NN'),
 ('.', '.'),
 ('The', 'DT'),
 ('resulting', 'VBG'),
 ('company', 'NN'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('named', 'VBN'),
 ('GreekAlphabet', 'NNP'),
 ('Pte', 'NNP'),
 ('Ltd', 'NNP'),
 ('(', '('),
 ('GAPL', 'NNP'),
 (')', ')'),
 ('and', 'CC'),
 ('will', 'MD'),
 ('be', 'VB'),
 ('privately', 'RB'),
 ('held', 'VBN'),
 ('.', '.'),
 ('Prof', 'NNP'),
 ('Aleph', 'NNP'),
 (',', ','),
 ('the', 'DT'),
 ('current', 'JJ'),
 ('CEO', 'NNP'),
 ('of', 'IN'),
 ('Alpha', 'NNP'),
 (',', ','),
 ('will', 'MD'),
 ('become', 'VB'),
 ('the', 'DT'),
 ('chairman', 'NN'),
 ('of', 'IN'),
 ('GreekAlphabet', 'NNP'),
 ('while', 'IN'),
 ('Dr', 'NNP'),
 ('Naught', 'NNP'),
 (',', ','

In [6]:
# ===== NER using NLTK =====
# The input for the NE chunker needs to have POS tags.
sent_chunk = ne_chunk(sent_pos)
print(sent_chunk)

(S
  There/EX
  are/VBP
  three/CD
  companies/NNS
  ,/,
  (PERSON Alpha/NNP Pte/NNP Ltd/NNP)
  ,/,
  (PERSON Beta/NNP)
  Co./NNP
  and/CC
  Gamma/NNP
  &/CC
  (PERSON Sons/NNP)
  ,/,
  involved/VBN
  in/IN
  the/DT
  merger/NN
  ./.
  The/DT
  resulting/VBG
  company/NN
  will/MD
  be/VB
  named/VBN
  (ORGANIZATION GreekAlphabet/NNP Pte/NNP Ltd/NNP)
  (/(
  (ORGANIZATION GAPL/NNP)
  )/)
  and/CC
  will/MD
  be/VB
  privately/RB
  held/VBN
  ./.
  (PERSON Prof/NNP Aleph/NNP)
  ,/,
  the/DT
  current/JJ
  CEO/NNP
  of/IN
  (GPE Alpha/NNP)
  ,/,
  will/MD
  become/VB
  the/DT
  chairman/NN
  of/IN
  (ORGANIZATION GreekAlphabet/NNP)
  while/IN
  Dr/NNP
  Naught/NNP
  ,/,
  the/DT
  (ORGANIZATION MD/NNP)
  of/IN
  (GPE Beta/NNP)
  will/MD
  be/VB
  the/DT
  (ORGANIZATION CEO/NNP)
  of/IN
  the/DT
  new/JJ
  company/NN
  ./.
  Dr/NNP
  (PERSON Tom/NNP Gamma/NNP)
  and/CC
  his/PRP$
  two/CD
  sons/NNS
  ,/,
  (PERSON Dick/NNP Gamma/NNP)
  and/CC
  (PERSON Harry/NNP Gamma/NNP)
  ,/,
  have/V

In [18]:
# ===== Now try creating your own named entity and noun phrase chunker ====
# We need to define the tag patterns to capture the target phrases and use 
# RegexParser to chunk the input with those patterns.
# Some minimal tag patterns are given here. 

grammar = r"""
  NE: {<NNP>+<NNP>+<NNP>}      # chunk sequences of proper nouns 
  NP: {<NNP>+<NNP>      }        
  NG: {<NNP><CC><NNP>} 
   {<DT><NN>}     
"""


In [19]:
cp = nltk.RegexpParser(grammar)
print(cp.parse(sent_pos))


(S
  There/EX
  are/VBP
  three/CD
  companies/NNS
  ,/,
  (NE Alpha/NNP Pte/NNP Ltd/NNP)
  ,/,
  (NP Beta/NNP Co./NNP)
  and/CC
  (NG Gamma/NNP &/CC Sons/NNP)
  ,/,
  involved/VBN
  in/IN
  (NG the/DT merger/NN)
  ./.
  The/DT
  resulting/VBG
  company/NN
  will/MD
  be/VB
  named/VBN
  (NE GreekAlphabet/NNP Pte/NNP Ltd/NNP)
  (/(
  GAPL/NNP
  )/)
  and/CC
  will/MD
  be/VB
  privately/RB
  held/VBN
  ./.
  (NP Prof/NNP Aleph/NNP)
  ,/,
  the/DT
  current/JJ
  CEO/NNP
  of/IN
  Alpha/NNP
  ,/,
  will/MD
  become/VB
  (NG the/DT chairman/NN)
  of/IN
  GreekAlphabet/NNP
  while/IN
  (NP Dr/NNP Naught/NNP)
  ,/,
  the/DT
  MD/NNP
  of/IN
  Beta/NNP
  will/MD
  be/VB
  the/DT
  CEO/NNP
  of/IN
  the/DT
  new/JJ
  company/NN
  ./.
  (NE Dr/NNP Tom/NNP Gamma/NNP)
  and/CC
  his/PRP$
  two/CD
  sons/NNS
  ,/,
  (NP Dick/NNP Gamma/NNP)
  and/CC
  (NP Harry/NNP Gamma/NNP)
  ,/,
  have/VBP
  announced/VBN
  that/IN
  they/PRP
  will/MD
  be/VB
  retiring/VBG
  once/RB
  (NG Gamma/NNP &/CC Sons/

In [None]:

#------------------------------------------------------------------------
# Exercise: modify the above tag patterns to capture the NEs and NPs in the 
# example sentence. 
#-------------------------------------------------------------------------

In [27]:
# ===== Now try creating your own named entity and noun phrase chunker ====
# We need to define the tag patterns to capture the target phrases and use 
# RegexParser to chunk the input with those patterns.
# Some minimal tag patterns are given here. 

Ngrammar = r"""
  NE: {<NNP>+}      # chunk sequences of proper nouns 
  NP:                
      {<DT><NN>}     
"""


In [2]:
# ===== Now try creating your own named entity and noun phrase chunker ====
# We need to define the tag patterns to capture the target phrases and use 
# RegexParser to chunk the input with those patterns.
# Some minimal tag patterns are given here. 

Negrammar = r"""
  NE: {<NNP>+<IN><NNP><TO><NNP>}      # chunk sequences of proper nouns 
  NP:                
      {<DT><NN>}
  NK: {<NNP>+}
  DT: {<NMP><CD><CD>}
"""


In [43]:
cp = nltk.RegexpParser(Negrammar)
print(cp.parse(sent_pos))

(S
  (NK Professor/NNP Tan/NNP Eng/NNP Chye/NNP)
  ,/,
  (NK NUS/NNP Deputy/NNP President/NNP)
  and/CC
  (NK Provost/NNP)
  ,/,
  and/CC
  (NK Professor/NNP Menahem/NNP Ben-Sasson/NNP)
  ,/,
  (NK President/NNP)
  of/IN
  (NK HUJ/NNP)
  signed/VBD
  the/DT
  joint/JJ
  degree/NN
  agreement/NN
  at/IN
  (NK NUS/NNP)
  ,/,
  in/IN
  (NP the/DT presence/NN)
  of/IN
  (NE Ambassador/NNP of/IN Israel/NNP to/TO Singapore/NNP)
  (NK Her/NNP Excellency/NNP Amira/NNP Arnon/NNP)
  and/CC
  about/IN
  30/CD
  invited/JJ
  guests/NNS
  ,/,
  on/IN
  (NK July/NNP)
  03/CD
  ,/,
  2013/CD
  ./.)


In [None]:
# month may not work .
