In [14]:
# Fetch the NLTK data used below (tokenizer, POS tagger, NE chunker + word list).
# The import lives in this cell so it also works on a fresh kernel, and the
# resources are named explicitly because a bare nltk.download() opens the
# interactive downloader and stalls "Restart & Run All".
# NOTE(review): recent NLTK releases ship some of these under different names
# (e.g. 'punkt_tab') -- confirm against the installed version.
import nltk

for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(resource)

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [2]:
# Example sentence shared by the tagging, chunking and NER cells.
ex = (
    'European authorities fined Google a record $5.1 billion on Wednesday '
    'for abusing its power in the mobile phone market '
    'and ordered the company to alter its practices'
)

In [3]:
def preprocess(sent):
    """Tokenize a sentence and tag every token with its part of speech.

    Args:
        sent: Raw text handed to ``nltk.word_tokenize``.

    Returns:
        Whatever ``nltk.pos_tag`` produces for those tokens (displayed in
        this notebook as a list of ``(token, tag)`` pairs).
    """
    return nltk.pos_tag(nltk.word_tokenize(sent))

In [6]:
# Tokenize and POS-tag the example sentence; the bare `sent` on the last line
# makes the notebook display the tagged result.
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [7]:
# Chunk grammar: an NP is an optional determiner (<DT>?), any number of
# adjectives (<JJ>*), then exactly one token tagged NN. Tokens tagged NNS or
# NNP are not covered by this rule.
pattern = 'NP: {<DT>?<JJ>*<NN>}'


In [8]:
# Build a chunker from the grammar in `pattern` and apply it to the tagged
# sentence; printing the result shows the chunk tree in bracketed form.
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [9]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [15]:
# Tokenize and POS-tag the example sentence again, then run NLTK's
# named-entity chunker on the tagged tokens and print the resulting tree.
ne_tree = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


## spaCy

In [18]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm pipeline once; `nlp` is reused by the cells below.
nlp = en_core_web_sm.load()

In [19]:
# Run the spaCy pipeline on the same example sentence the NLTK cells use
# (`ex`) instead of repeating the literal, so both sections always analyse
# identical text. Then list each entity span with its label.
doc = nlp(ex)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [20]:
# Per-token view: the token, its IOB entity marker and its entity type.
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])


[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


## Extracting named entities from an article


In [22]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url, timeout=10):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook. Defaults to 10.

    Returns:
        The page text with ``<script>``, ``<style>`` and ``<aside>`` elements
        removed and every run of newlines/tabs replaced by a single space.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not respond within ``timeout``.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an error response rather than parsing the error page as
    # if it were the article.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')
    # Drop elements whose text is not part of the article body.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
# Fetch the article text, run it through the spaCy pipeline, and show how
# many entity spans were found.
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

153

In [23]:
# Count how many entities the article has per label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'ORG': 38,
         'PERSON': 76,
         'DATE': 23,
         'GPE': 9,
         'NORP': 2,
         'CARDINAL': 3,
         'LOC': 1,
         'ORDINAL': 1})

In [24]:
# The three entity strings that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Strzok', 29), ('F.B.I.', 19), ('Trump', 13)]

In [36]:
# Materialise the sentence spans so they can be indexed, then show one.
sentences = list(article.sents)
print(sentences[21])

“The decision to fire Special Agent Strzok is not only a departure from typical bureau practice, but also contradicts Director Wray’s testimony to Congress and his assurances that the F.B.I. intended to follow its regular process in this and all personnel matters,” Mr. Goelman said.


In [40]:
# Draw the dependency parse of the sample sentence inline in the notebook.
displacy.render(
    nlp(str(sentences[21])),
    style='dep',
    jupyter=True,
    options={'distance': 120},
)

In [41]:
# (surface form, POS, lemma) for every token of the sample sentence that is
# neither a stop word nor punctuation.
[(tok.orth_, tok.pos_, tok.lemma_)
 for tok in nlp(str(sentences[21]))
 if not (tok.is_stop or tok.pos_ == 'PUNCT')]

[('decision', 'NOUN', 'decision'),
 ('fire', 'VERB', 'fire'),
 ('Special', 'PROPN', 'Special'),
 ('Agent', 'PROPN', 'Agent'),
 ('Strzok', 'PROPN', 'Strzok'),
 ('departure', 'NOUN', 'departure'),
 ('typical', 'ADJ', 'typical'),
 ('bureau', 'NOUN', 'bureau'),
 ('practice', 'NOUN', 'practice'),
 ('contradicts', 'VERB', 'contradict'),
 ('Director', 'PROPN', 'Director'),
 ('Wray', 'PROPN', 'Wray'),
 ('testimony', 'NOUN', 'testimony'),
 ('Congress', 'PROPN', 'Congress'),
 ('assurances', 'NOUN', 'assurance'),
 ('F.B.I.', 'PROPN', 'F.B.I.'),
 ('intended', 'VERB', 'intend'),
 ('follow', 'VERB', 'follow'),
 ('regular', 'ADJ', 'regular'),
 ('process', 'NOUN', 'process'),
 ('personnel', 'NOUN', 'personnel'),
 ('matters', 'NOUN', 'matter'),
 ('Mr.', 'PROPN', 'Mr.'),
 ('Goelman', 'PROPN', 'Goelman'),
 ('said', 'VERB', 'say')]

In [42]:
# Map each entity string found in the sample sentence to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[21])).ents}


{'Wray': 'PERSON', 'Congress': 'ORG', 'F.B.I.': 'ORG', 'Goelman': 'PERSON'}

In [43]:
# Token-level IOB marker and entity type for the sample sentence.
print([(tok, tok.ent_iob_, tok.ent_type_) for tok in sentences[21]])


[(“, 'O', ''), (The, 'O', ''), (decision, 'O', ''), (to, 'O', ''), (fire, 'O', ''), (Special, 'O', ''), (Agent, 'O', ''), (Strzok, 'O', ''), (is, 'O', ''), (not, 'O', ''), (only, 'O', ''), (a, 'O', ''), (departure, 'O', ''), (from, 'O', ''), (typical, 'O', ''), (bureau, 'O', ''), (practice, 'O', ''), (,, 'O', ''), (but, 'O', ''), (also, 'O', ''), (contradicts, 'O', ''), (Director, 'O', ''), (Wray, 'B', 'PERSON'), (’s, 'O', ''), (testimony, 'O', ''), (to, 'O', ''), (Congress, 'B', 'ORG'), (and, 'O', ''), (his, 'O', ''), (assurances, 'O', ''), (that, 'O', ''), (the, 'O', ''), (F.B.I., 'B', 'ORG'), (intended, 'O', ''), (to, 'O', ''), (follow, 'O', ''), (its, 'O', ''), (regular, 'O', ''), (process, 'O', ''), (in, 'O', ''), (this, 'O', ''), (and, 'O', ''), (all, 'O', ''), (personnel, 'O', ''), (matters, 'O', ''), (,, 'O', ''), (”, 'O', ''), (Mr., 'O', ''), (Goelman, 'B', 'PERSON'), (said, 'O', ''), (., 'O', '')]


In [48]:
# Highlight every entity in the article. Render the already-parsed `article`
# Doc directly: str(sentence) omits the whitespace after each sentence, so
# joining the pieces with '' glued the last word of one sentence to the first
# word of the next before the text was pushed through the pipeline again.
displacy.render(article, jupyter=True, style='ent')