In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
# Sample news sentence fed to both the NLTK and the spaCy examples in this notebook.
example = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

## Word tokenization and part-of-speech tagging

In [3]:
def preprocess(sent):
    """Split a raw sentence into word tokens and tag each with its part of speech.

    Args:
        sent: The raw sentence text.

    Returns:
        The result of ``nltk.pos_tag`` applied to the word tokens of ``sent``;
        in this notebook it displays as a list of ``(token, tag)`` tuples.
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [4]:
# Tokenize and POS-tag the example sentence; the bare name on the last line
# makes the notebook display the tagged result.
sent = preprocess(example)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

## Chunking
_Our chunk pattern consists of one rule, that a noun phrase, NP, should be formed whenever the chunker finds an optional determiner, DT, followed by any number of adjectives, JJ, and then a noun, NN._

In [5]:
# Chunk grammar with a single NP rule: optional determiner, any number of
# adjectives, then one singular noun (NN).
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [6]:
# Build a regular-expression chunk parser from the grammar above.
cp = nltk.RegexpParser(pattern)
# Apply it to the POS-tagged sentence; the result groups matching tokens under NP.
cs = cp.parse(sent)
# Printing shows the bracketed text form of the chunk tree.
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [7]:
# Draw the chunk tree `cs` as a graphical tree structure.
cs.draw()

In [8]:
# Print one (token, POS tag, IOB chunk tag) triple for each token of the chunk tree.

from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# tree2conlltags flattens the tree; in the output below B-NP / I-NP mark the
# tokens inside an NP chunk and O marks tokens outside any chunk.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [9]:
# With the function nltk.ne_chunk(), we can recognize named entities using a classifier;
# the classifier adds category labels such as PERSON, ORGANIZATION, and GPE (geopolitical entity).

from nltk import ne_chunk
# Tokenize and POS-tag the example sentence again, then chunk it into named entities.
ne_tree = ne_chunk(pos_tag(word_tokenize(example)))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


- Google is classified as PERSON above
- NLTK doesn’t have a proper English corpus for NER. It has the __CoNLL 2002 Named Entity__ but it’s only for Spanish and Dutch.
- CoNLL is a language-independent NER task
- NLTK provides good corpus readers, so a better way is to go with the __Groningen Meaning Bank (GMB)__. GMB is a fairly large corpus with a lot of annotations. The corpus was created using existing annotators and then corrected by humans where needed
- The other way is to use __spaCy__, which is trained on the __OntoNotes 5__ corpus and supports many entity types


<img src="files/spacy_ner.png">

In [14]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the small English spaCy model; `nlp` is the pipeline object used by every spaCy cell below.
nlp = en_core_web_sm.load()

One of the nice things about spaCy is that we only need to apply `nlp` once; the entire background pipeline runs and returns the processed objects.

In [16]:
# Run the spaCy pipeline on the example sentence.
doc = nlp(example)
# Show each recognized entity as (entity text, entity label).
pprint([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [18]:
# Token-level view: (token, IOB flag, entity type). In the output below the
# entity type is an empty string for tokens flagged 'O'.
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


Extracting named entities from a New York Times article: “F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired.”

In [25]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one line.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server, forwarded to ``requests.get``
            so a stalled connection cannot hang the notebook indefinitely.

    Returns:
        The text of the page with ``<script>``, ``<style>`` and ``<aside>``
        elements removed and every run of newlines/tabs replaced by a single
        space.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly instead of silently extracting "entities" from an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose text should not be part of the extracted article.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [27]:
# Fetch the New York Times article and reduce it to plain text.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
# Run the spaCy pipeline over the whole article text.
article = nlp(ny_bb)

In [31]:
# Show the first 100 items of the parsed article. The slice is taken on the
# object returned by nlp(...), not on a string, so the output below is much
# longer than 100 characters (presumably 100 tokens; confirm).
article[:100]

     F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired - The New York Times                                                                            SectionsSEARCHSkip to contentSkip to site indexPoliticsSubscribeLog InLog InToday’s PaperPolitics|F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredAdvertisementSupported byF.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredImagePeter Strzok, a top F.B.I. counterintelligence agent who was taken off the special counsel investigation after his disparaging texts about President Trump were uncovered, was fired.CreditCreditT.J. Kirkpatrick for The New York TimesBy Adam Goldman and Michael S. SchmidtAug. 13, 2018WASHINGTON

In [34]:
# All named entities spaCy found in the article, in document order.
article.ents

(Peter Strzok,
 Criticized Trump,
 Texts,
 InLog InToday’s,
 Peter Strzok,
 Criticized Trump,
 Texts,
 Peter Strzok,
 Criticized Trump,
 Texts,
 Trump,
 Kirkpatrick,
 Adam Goldman,
 Michael S. SchmidtAug,
 13,
 F.B.I.,
 Trump,
 Hillary Clinton,
 Russia,
 Strzok,
 Monday,
 Trump,
 2016,
 F.B.I.,
 Lisa Page,
 Russia,
 Strzok,
 20 years,
 F.B.I.,
 the early months,
 Strzok,
 F.B.I.,
 Trump,
 Strzok,
 last summer,
 Robert S. Mueller III,
 Strzok,
 Twitter,
 Monday,
 Trump,
 June,
 Strzok,
 F.B.I.,
 Hillary Clinton’s,
 2016,
 Strzok,
 ’s Office of Professional Responsibility,
 Strzok,
 60 days,
 Strzok,
 House,
 July,
 Strzok,
 F.B.I.,
 David Bowdich,
 the Office of Professional Responsibility,
 Strzok,
 F.B.I.,
 Strzok,
 Strzok,
 Trump,
 F.B.I.,
 Bowdich,
 F.B.I.,
 Christopher A. Wray,
 Aitan Goelman,
 Strzok,
 Wray’s,
 Congress,
 Goelman,
 Americans,
 Goelman,
 Page,
 Trump,
 Page,
 Trump,
 Strzok,
 Michael E. Horowitz,
 Strzok,
 Strzok,
 Clinton,
 just weeks,
 2016,
 Horowitz,
 Hundreds,

In [36]:
# Total number of named entities found in the article (174 in this run).
len(article.ents)

174

In [38]:
# Number of entities per entity label (PERSON, ORG, DATE, ...).
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'PERSON': 82,
         'GPE': 16,
         'CARDINAL': 5,
         'ORG': 39,
         'DATE': 24,
         'NORP': 2,
         'ORDINAL': 1,
         'FAC': 1,
         'PRODUCT': 2,
         'LOC': 1,
         'TIME': 1})

In [40]:
# Three most frequent entity texts in the article.
items = [x.text for x in article.ents]
Counter(items).most_common(3)

[('Strzok', 28), ('F.B.I.', 13), ('Trump', 12)]

In [54]:
# Collect the article's sentences into a list and print the one at index 14
# (an arbitrarily chosen example sentence).
sentences = list(article.sents)
print(sentences[14])

Trump’s victory traces back to June, when Mr. Strzok’s conduct was laid out in a wide-ranging inspector general’s report on how the F.B.I. handled the investigation of Hillary Clinton’s emails in the run-up to the 2016 election.


In [53]:
# Highlight the named entities (style='ent') of sentence 14 with displacy;
# the sentence text is re-parsed with nlp before rendering.
displacy.render(nlp(str(sentences[14])), jupyter=True, style='ent')

In [57]:
# Draw the dependency parse (style='dep') of sentence 14; the 'distance'
# value is handed to displacy through its options dict.
displacy.render(nlp(str(sentences[14])), style='dep', jupyter = True, options = {'distance': 120})

In [61]:
# (text, part of speech, lemma) for every token of sentence 14 that is
# neither a stop word nor punctuation.
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[14]))
 if not (tok.is_stop or tok.pos_ == 'PUNCT')]

[('Trump', 'PROPN', 'Trump'),
 ('’s', 'PROPN', '’s'),
 ('victory', 'NOUN', 'victory'),
 ('traces', 'NOUN', 'trace'),
 ('June', 'PROPN', 'June'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('’s', 'PROPN', '’s'),
 ('conduct', 'NOUN', 'conduct'),
 ('laid', 'VERB', 'lay'),
 ('wide', 'ADV', 'wide'),
 ('ranging', 'VERB', 'range'),
 ('inspector', 'NOUN', 'inspector'),
 ('general', 'ADJ', 'general'),
 ('’s', 'PROPN', '’s'),
 ('report', 'NOUN', 'report'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('handled', 'VERB', 'handle'),
 ('investigation', 'NOUN', 'investigation'),
 ('Hillary', 'PROPN', 'Hillary'),
 ('Clinton', 'PROPN', 'Clinton'),
 ('’s', 'PROPN', '’s'),
 ('emails', 'NOUN', 'email'),
 ('run', 'NOUN', 'run'),
 ('2016', 'NUM', '2016'),
 ('election', 'NOUN', 'election')]

In [67]:
# Map each entity's text to its label for the re-parsed sentence 14.
# Trump is wrongly categorized as ORG.
{str(ent): ent.label_ for ent in nlp(str(sentences[14])).ents}

{'Trump': 'ORG',
 'June': 'DATE',
 'Strzok': 'PERSON',
 'F.B.I.': 'ORG',
 'Hillary Clinton’s': 'PERSON',
 '2016': 'DATE'}

In [68]:
# Token-level (token, IOB flag, entity type) triples for sentence 16.
print([(x, x.ent_iob_, x.ent_type_) for x in sentences[16]])

[(Mr., 'O', ''), (Strzok, 'B', 'PERSON'), (had, 'O', ''), (testified, 'O', ''), (before, 'O', ''), (the, 'O', ''), (House, 'B', 'ORG'), (in, 'O', ''), (July, 'B', 'DATE'), (about, 'O', ''), (how, 'O', ''), (he, 'O', ''), (had, 'O', ''), (not, 'O', ''), (allowed, 'O', ''), (his, 'O', ''), (political, 'O', ''), (views, 'O', ''), (to, 'O', ''), (interfere, 'O', ''), (with, 'O', ''), (the, 'O', ''), (investigations, 'O', ''), (he, 'O', ''), (was, 'O', ''), (overseeing, 'O', ''), (., 'O', '')]


In [71]:
# displacy.render on the entire article. So cool!
# `article` is already the parsed document (article = nlp(ny_bb)), so it is
# rendered directly instead of re-running the whole pipeline on its text.
displacy.render(article, jupyter=True, style='ent')