In [1]:
import nltk
from nltk import word_tokenize, pos_tag

In [2]:
sent = '''DETROIT  General Motors Co is recalling more than 1 million pickup trucks and sport utility 
vehicles in the United States due to issues with a temporary loss of power steering, 
the National Highway Traffic Safety Administration said. The recall is for 2015 models. 
They are: the Chevy Silverado 1500, Suburban and Tahoe, GMC Sierra 1500, Yukon and Yukon XL and Cadillac Escalade. 
The problem may cause difficulty steering the vehicle, especially at low speeds, increasing the risk of a crash, the auto safety regulator said in a document dated Sept. 12. The document did not highlight any reports of accidents and injuries, because of the power steering issue. GM dealers will update the power steering module software, free of charge for owners of the affected vehicles. In 2014, the No.1 U.S. automaker had recalled nearly 800,000 pickup trucks worldwide because of the same problem. 
GM did not immediately respond to a request for comment.
'''

In [3]:
# The input for POS tagger needs to be tokenized first.
sent_pos = pos_tag(word_tokenize(sent))
sent_pos

[('DETROIT', 'NNP'),
 ('\x14', 'NNP'),
 ('General', 'NNP'),
 ('Motors', 'NNPS'),
 ('Co', 'NNP'),
 ('is', 'VBZ'),
 ('recalling', 'VBG'),
 ('more', 'JJR'),
 ('than', 'IN'),
 ('1', 'CD'),
 ('million', 'CD'),
 ('pickup', 'NN'),
 ('trucks', 'NNS'),
 ('and', 'CC'),
 ('sport', 'JJ'),
 ('utility', 'NN'),
 ('vehicles', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('due', 'JJ'),
 ('to', 'TO'),
 ('issues', 'NNS'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('temporary', 'JJ'),
 ('loss', 'NN'),
 ('of', 'IN'),
 ('power', 'NN'),
 ('steering', 'NN'),
 (',', ','),
 ('the', 'DT'),
 ('National', 'NNP'),
 ('Highway', 'NNP'),
 ('Traffic', 'NNP'),
 ('Safety', 'NNP'),
 ('Administration', 'NNP'),
 ('said', 'VBD'),
 ('.', '.'),
 ('The', 'DT'),
 ('recall', 'NN'),
 ('is', 'VBZ'),
 ('for', 'IN'),
 ('2015', 'CD'),
 ('models', 'NNS'),
 ('.', '.'),
 ('They', 'PRP'),
 ('are', 'VBP'),
 (':', ':'),
 ('the', 'DT'),
 ('Chevy', 'NNP'),
 ('Silverado', 'NNP'),
 ('1500', 'CD'),
 (',', ','),
 ('Subur

In [4]:
# A more simplified tagset - universal
sent_pos2 = pos_tag(word_tokenize(sent), tagset='universal')
sent_pos2

[('DETROIT', 'NOUN'),
 ('\x14', 'NOUN'),
 ('General', 'NOUN'),
 ('Motors', 'NOUN'),
 ('Co', 'NOUN'),
 ('is', 'VERB'),
 ('recalling', 'VERB'),
 ('more', 'ADJ'),
 ('than', 'ADP'),
 ('1', 'NUM'),
 ('million', 'NUM'),
 ('pickup', 'NOUN'),
 ('trucks', 'NOUN'),
 ('and', 'CONJ'),
 ('sport', 'ADJ'),
 ('utility', 'NOUN'),
 ('vehicles', 'NOUN'),
 ('in', 'ADP'),
 ('the', 'DET'),
 ('United', 'NOUN'),
 ('States', 'NOUN'),
 ('due', 'ADJ'),
 ('to', 'PRT'),
 ('issues', 'NOUN'),
 ('with', 'ADP'),
 ('a', 'DET'),
 ('temporary', 'ADJ'),
 ('loss', 'NOUN'),
 ('of', 'ADP'),
 ('power', 'NOUN'),
 ('steering', 'NOUN'),
 (',', '.'),
 ('the', 'DET'),
 ('National', 'NOUN'),
 ('Highway', 'NOUN'),
 ('Traffic', 'NOUN'),
 ('Safety', 'NOUN'),
 ('Administration', 'NOUN'),
 ('said', 'VERB'),
 ('.', '.'),
 ('The', 'DET'),
 ('recall', 'NOUN'),
 ('is', 'VERB'),
 ('for', 'ADP'),
 ('2015', 'NUM'),
 ('models', 'NOUN'),
 ('.', '.'),
 ('They', 'PRON'),
 ('are', 'VERB'),
 (':', '.'),
 ('the', 'DET'),
 ('Chevy', 'NOUN'),
 ('Silver

In [6]:
# The wordnet lemmatizer works properly with the pos given
wnl = nltk.WordNetLemmatizer()
wnl.lemmatize('said', pos = 'v')

'say'

In [7]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk

In [8]:
# The input for POS tagger needs to be tokenized first.
sent_pos = pos_tag(word_tokenize(sent))
sent_pos

[('DETROIT', 'NNP'),
 ('\x14', 'NNP'),
 ('General', 'NNP'),
 ('Motors', 'NNPS'),
 ('Co', 'NNP'),
 ('is', 'VBZ'),
 ('recalling', 'VBG'),
 ('more', 'JJR'),
 ('than', 'IN'),
 ('1', 'CD'),
 ('million', 'CD'),
 ('pickup', 'NN'),
 ('trucks', 'NNS'),
 ('and', 'CC'),
 ('sport', 'JJ'),
 ('utility', 'NN'),
 ('vehicles', 'NNS'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('due', 'JJ'),
 ('to', 'TO'),
 ('issues', 'NNS'),
 ('with', 'IN'),
 ('a', 'DT'),
 ('temporary', 'JJ'),
 ('loss', 'NN'),
 ('of', 'IN'),
 ('power', 'NN'),
 ('steering', 'NN'),
 (',', ','),
 ('the', 'DT'),
 ('National', 'NNP'),
 ('Highway', 'NNP'),
 ('Traffic', 'NNP'),
 ('Safety', 'NNP'),
 ('Administration', 'NNP'),
 ('said', 'VBD'),
 ('.', '.'),
 ('The', 'DT'),
 ('recall', 'NN'),
 ('is', 'VBZ'),
 ('for', 'IN'),
 ('2015', 'CD'),
 ('models', 'NNS'),
 ('.', '.'),
 ('They', 'PRP'),
 ('are', 'VBP'),
 (':', ':'),
 ('the', 'DT'),
 ('Chevy', 'NNP'),
 ('Silverado', 'NNP'),
 ('1500', 'CD'),
 (',', ','),
 ('Subur

In [9]:
# ===== NER using NLTK =====
# The input for the NE chunker needs to have POS tags.
sent_chunk = ne_chunk(sent_pos)
print(sent_chunk)

(S
  (ORGANIZATION DETROIT/NNP)
  /NNP
  General/NNP
  Motors/NNPS
  Co/NNP
  is/VBZ
  recalling/VBG
  more/JJR
  than/IN
  1/CD
  million/CD
  pickup/NN
  trucks/NNS
  and/CC
  sport/JJ
  utility/NN
  vehicles/NNS
  in/IN
  the/DT
  (GPE United/NNP States/NNPS)
  due/JJ
  to/TO
  issues/NNS
  with/IN
  a/DT
  temporary/JJ
  loss/NN
  of/IN
  power/NN
  steering/NN
  ,/,
  the/DT
  (ORGANIZATION
    National/NNP
    Highway/NNP
    Traffic/NNP
    Safety/NNP
    Administration/NNP)
  said/VBD
  ./.
  The/DT
  recall/NN
  is/VBZ
  for/IN
  2015/CD
  models/NNS
  ./.
  They/PRP
  are/VBP
  :/:
  the/DT
  (ORGANIZATION Chevy/NNP)
  Silverado/NNP
  1500/CD
  ,/,
  (PERSON Suburban/NNP)
  and/CC
  (PERSON Tahoe/NNP)
  ,/,
  (ORGANIZATION GMC/NNP)
  Sierra/NNP
  1500/CD
  ,/,
  (PERSON Yukon/NNP)
  and/CC
  (PERSON Yukon/NNP XL/NNP)
  and/CC
  (PERSON Cadillac/NNP Escalade/NNP)
  ./.
  The/DT
  problem/NN
  may/MD
  cause/VB
  difficulty/NN
  steering/VBG
  the/DT
  vehicle/NN
  ,/,
  espec

In [11]:
# ===== Now try creating your own named entity and noun phrase chunker ====
# We need to define the tag patterns to capture the target phrases and use 
# RegexParser to chunk the input with those patterns.
# Some minimal tag patterns are given here. 

grammar = r"""
  NE: {<NNP>}      # chunk sequences of proper nouns 
  NP:                
      {<DT><NN>}     
"""


In [12]:
cp = nltk.RegexpParser(grammar)
print(cp.parse(sent_pos))


(S
  (NE DETROIT/NNP)
  (NE /NNP)
  (NE General/NNP)
  Motors/NNPS
  (NE Co/NNP)
  is/VBZ
  recalling/VBG
  more/JJR
  than/IN
  1/CD
  million/CD
  pickup/NN
  trucks/NNS
  and/CC
  sport/JJ
  utility/NN
  vehicles/NNS
  in/IN
  the/DT
  (NE United/NNP)
  States/NNPS
  due/JJ
  to/TO
  issues/NNS
  with/IN
  a/DT
  temporary/JJ
  loss/NN
  of/IN
  power/NN
  steering/NN
  ,/,
  the/DT
  (NE National/NNP)
  (NE Highway/NNP)
  (NE Traffic/NNP)
  (NE Safety/NNP)
  (NE Administration/NNP)
  said/VBD
  ./.
  (NP The/DT recall/NN)
  is/VBZ
  for/IN
  2015/CD
  models/NNS
  ./.
  They/PRP
  are/VBP
  :/:
  the/DT
  (NE Chevy/NNP)
  (NE Silverado/NNP)
  1500/CD
  ,/,
  (NE Suburban/NNP)
  and/CC
  (NE Tahoe/NNP)
  ,/,
  (NE GMC/NNP)
  (NE Sierra/NNP)
  1500/CD
  ,/,
  (NE Yukon/NNP)
  and/CC
  (NE Yukon/NNP)
  (NE XL/NNP)
  and/CC
  (NE Cadillac/NNP)
  (NE Escalade/NNP)
  ./.
  (NP The/DT problem/NN)
  may/MD
  cause/VB
  difficulty/NN
  steering/VBG
  (NP the/DT vehicle/NN)
  ,/,
  especial

In [13]:
# ===== Now try creating your own named entity and noun phrase chunker ====
# We need to define the tag patterns to capture the target phrases and use 
# RegexParser to chunk the input with those patterns.
# Some minimal tag patterns are given here. 

Ngrammar = r"""
  NE: {<NNP>+}      # chunk sequences of proper nouns 
  NP:                
      {<DT><NN>}     
"""


In [14]:
cp = nltk.RegexpParser(Negrammar)
print(cp.parse(sent_pos))

NameError: name 'Negrammar' is not defined