In [1]:
# https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [3]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 11001] getaddrinfo failed>


False

In [4]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market \
and ordered the company to alter its practices'

In [5]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [6]:
sent = preprocess(ex)
sent

[('European', 'JJ'),
 ('authorities', 'NNS'),
 ('fined', 'VBD'),
 ('Google', 'NNP'),
 ('a', 'DT'),
 ('record', 'NN'),
 ('$', '$'),
 ('5.1', 'CD'),
 ('billion', 'CD'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('for', 'IN'),
 ('abusing', 'VBG'),
 ('its', 'PRP$'),
 ('power', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('mobile', 'JJ'),
 ('phone', 'NN'),
 ('market', 'NN'),
 ('and', 'CC'),
 ('ordered', 'VBD'),
 ('the', 'DT'),
 ('company', 'NN'),
 ('to', 'TO'),
 ('alter', 'VB'),
 ('its', 'PRP$'),
 ('practices', 'NNS')]

In [7]:
# Now we’ll implement noun phrase chunking to identify named entities using a regular expression consisting of rules that 
# indicate how sentences should be chunked
# Our chunk pattern consists of one rule, that a noun phrase, NP, should be formed whenever the chunker finds an 
# optional determiner, DT, followed by any number of adjectives, JJ, and then a noun, NN.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [8]:
# Using this pattern, we create a chunk parser and test it on our sentence
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [9]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('European', 'JJ', 'O'),
 ('authorities', 'NNS', 'O'),
 ('fined', 'VBD', 'O'),
 ('Google', 'NNP', 'O'),
 ('a', 'DT', 'B-NP'),
 ('record', 'NN', 'I-NP'),
 ('$', '$', 'O'),
 ('5.1', 'CD', 'O'),
 ('billion', 'CD', 'O'),
 ('on', 'IN', 'O'),
 ('Wednesday', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('abusing', 'VBG', 'O'),
 ('its', 'PRP$', 'O'),
 ('power', 'NN', 'B-NP'),
 ('in', 'IN', 'O'),
 ('the', 'DT', 'B-NP'),
 ('mobile', 'JJ', 'I-NP'),
 ('phone', 'NN', 'I-NP'),
 ('market', 'NN', 'B-NP'),
 ('and', 'CC', 'O'),
 ('ordered', 'VBD', 'O'),
 ('the', 'DT', 'B-NP'),
 ('company', 'NN', 'I-NP'),
 ('to', 'TO', 'O'),
 ('alter', 'VB', 'O'),
 ('its', 'PRP$', 'O'),
 ('practices', 'NNS', 'O')]


In [13]:
import nltk
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.


True

In [15]:
import nltk
nltk.download('words')

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [16]:
from nltk.chunk import ne_chunk
ne_tree = ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)

(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD
  the/DT
  company/NN
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [17]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()



In [18]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone \
          market and ordered the company to alter its practices')
pprint([(X.text, X.label_) for X in doc.ents])

[('European', 'NORP'),
 ('Google', 'ORG'),
 ('$5.1 billion', 'MONEY'),
 ('Wednesday', 'DATE')]


In [22]:
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(European, 'B', 'NORP'),
 (authorities, 'O', ''),
 (fined, 'O', ''),
 (Google, 'B', 'ORG'),
 (a, 'O', ''),
 (record, 'O', ''),
 ($, 'B', 'MONEY'),
 (5.1, 'I', 'MONEY'),
 (billion, 'I', 'MONEY'),
 (on, 'O', ''),
 (Wednesday, 'B', 'DATE'),
 (for, 'O', ''),
 (abusing, 'O', ''),
 (its, 'O', ''),
 (power, 'O', ''),
 (in, 'O', ''),
 (the, 'O', ''),
 (mobile, 'O', ''),
 (phone, 'O', ''),
 (          , 'O', ''),
 (market, 'O', ''),
 (and, 'O', ''),
 (ordered, 'O', ''),
 (the, 'O', ''),
 (company, 'O', ''),
 (to, 'O', ''),
 (alter, 'O', ''),
 (its, 'O', ''),
 (practices, 'O', '')]


In [35]:
from bs4 import BeautifulSoup
import requests
import re
def url_to_string(url):
    res = requests.get(url)
    html = res.text
#     soup = BeautifulSoup(html, 'html5lib')
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
print("Text: ", ny_bb)
article = nlp(ny_bb)

len(article.ents)

URL:  F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is Fired - The New York Times SectionsSEARCHSkip to contentSkip to site indexPoliticsSubscribeLog InLog InToday’s PaperPolitics|F.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredAdvertisementSupported byF.B.I. Agent Peter Strzok, Who Criticized Trump in Texts, Is FiredImagePeter Strzok, a top F.B.I. counterintelligence agent who was taken off the special counsel investigation after his disparaging texts about President Trump were uncovered, was fired.CreditCreditT.J. Kirkpatrick for The New York TimesBy Adam Goldman and Michael S. SchmidtAug 13, 2018WASHINGTON — Peter Strzok, the F.B.I. senior counterintelligence agent who disparaged President Trump in inflammatory text messages and helped oversee the Hillary Clinton email and Russia investigations, has been fired for violating bureau policies, Mr. Strzok’s lawyer said Monday.Mr. Trump and his allies seized on the texts — exchanged during the 2016 campaign 




169

In [26]:
pprint([(x.text, x.label_) for x in article.ents])

[('Peter Strzok', 'PERSON'),
 ('Criticized Trump', 'PERSON'),
 ('Texts', 'GPE'),
 ('InLog InToday’s', 'PERSON'),
 ('Peter Strzok', 'PERSON'),
 ('Criticized Trump', 'PERSON'),
 ('Texts', 'GPE'),
 ('Peter Strzok', 'PERSON'),
 ('Criticized Trump', 'PERSON'),
 ('Texts', 'GPE'),
 ('Trump', 'PERSON'),
 ('Kirkpatrick', 'PERSON'),
 ('Adam Goldman', 'PERSON'),
 ('Michael S. SchmidtAug', 'PERSON'),
 ('F.B.I.', 'ORG'),
 ('Trump', 'PERSON'),
 ('Hillary Clinton', 'PERSON'),
 ('Russia', 'GPE'),
 ('Strzok', 'PERSON'),
 ('Monday', 'DATE'),
 ('Trump', 'ORG'),
 ('2016', 'DATE'),
 ('F.B.I.', 'ORG'),
 ('Lisa Page', 'PERSON'),
 ('Russia', 'GPE'),
 ('Strzok', 'PERSON'),
 ('20 years', 'DATE'),
 ('F.B.I.', 'ORG'),
 ('the early months', 'DATE'),
 ('Strzok', 'PERSON'),
 ('F.B.I.', 'ORG'),
 ('Trump', 'PERSON'),
 ('Strzok', 'PERSON'),
 ('last summer', 'DATE'),
 ('Robert S. Mueller III', 'PERSON'),
 ('Strzok', 'PERSON'),
 ('Twitter', 'GPE'),
 ('Monday', 'DATE'),
 ('Trump', 'ORG'),
 ('June', 'DATE'),
 ('Strzok', 'P

In [36]:
labels = [x.label_ for x in article.ents]
Counter(labels)

Counter({'PERSON': 82,
         'GPE': 16,
         'ORG': 37,
         'DATE': 23,
         'NORP': 2,
         'CARDINAL': 5,
         'ORDINAL': 1,
         'LAW': 1,
         'FAC': 1,
         'PRODUCT': 1})

In [37]:
items = [x.text for x in article.ents]
Counter(items).most_common(3)

[('Strzok', 28), ('F.B.I.', 13), ('Trump', 12)]

In [43]:
sentences = [x for x in article.sents]
print(sentences[25])

In one, Ms. Page asks: Trump is “not ever going to become president, right?


In [45]:
displacy.render(nlp(str(sentences[25])), jupyter=True, style='ent')

In [46]:
displacy.render(nlp(str(sentences[25])), style='dep', jupyter = True, options = {'distance': 120})

In [47]:
[(x.orth_,x.pos_, x.lemma_) for x in [y 
                                      for y
                                      in nlp(str(sentences[25])) 
                                      if not y.is_stop and y.pos_ != 'PUNCT']]

[('Ms.', 'PROPN', 'Ms.'),
 ('Page', 'PROPN', 'Page'),
 ('asks', 'VERB', 'ask'),
 ('Trump', 'NOUN', 'trump'),
 ('going', 'VERB', 'go'),
 ('president', 'NOUN', 'president'),
 ('right', 'ADJ', 'right')]

In [48]:
dict([(str(x), x.label_) for x in nlp(str(sentences[25])).ents])

{'one': 'CARDINAL', 'Page': 'PERSON', 'Trump': 'ORG'}

In [50]:
print([(x, x.ent_iob_, x.ent_type_) for x in sentences[25]])

[(In, 'O', ''), (one, 'O', ''), (,, 'O', ''), (Ms., 'O', ''), (Page, 'B', 'PERSON'), (asks, 'O', ''), (:, 'O', ''), (Trump, 'B', 'ORG'), (is, 'O', ''), (“, 'O', ''), (not, 'O', ''), (ever, 'O', ''), (going, 'O', ''), (to, 'O', ''), (become, 'O', ''), (president, 'O', ''), (,, 'O', ''), (right, 'O', ''), (?, 'O', '')]
