In [15]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [16]:
ex = 'European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices'

In [17]:
def preprocess(sent):
    sent = nltk.word_tokenize(sent)
    sent = nltk.pos_tag(sent)
    return sent

In [18]:
sent = preprocess(ex)
sent

[(u'European', 'JJ'),
 (u'authorities', 'NNS'),
 (u'fined', 'VBD'),
 (u'Google', 'NNP'),
 (u'a', 'DT'),
 (u'record', 'NN'),
 (u'$', '$'),
 (u'5.1', 'CD'),
 (u'billion', 'CD'),
 (u'on', 'IN'),
 (u'Wednesday', 'NNP'),
 (u'for', 'IN'),
 (u'abusing', 'VBG'),
 (u'its', 'PRP$'),
 (u'power', 'NN'),
 (u'in', 'IN'),
 (u'the', 'DT'),
 (u'mobile', 'JJ'),
 (u'phone', 'NN'),
 (u'market', 'NN'),
 (u'and', 'CC'),
 (u'ordered', 'VBD'),
 (u'the', 'DT'),
 (u'company', 'NN'),
 (u'to', 'TO'),
 (u'alter', 'VB'),
 (u'its', 'PRP$'),
 (u'practices', 'NNS')]

In [19]:
pattern = 'NP: {<DT>?<JJ>*<NN>}'

In [20]:
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  European/JJ
  authorities/NNS
  fined/VBD
  Google/NNP
  (NP a/DT record/NN)
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  (NP power/NN)
  in/IN
  (NP the/DT mobile/JJ phone/NN)
  (NP market/NN)
  and/CC
  ordered/VBD
  (NP the/DT company/NN)
  to/TO
  alter/VB
  its/PRP$
  practices/NNS)


In [21]:
import nltk

from nltk.corpus import conll2000
from nltk.chunk import conlltags2tree, tree2conlltags
from nltk.chunk import ne_chunk
from nltk import pos_tag

from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

iob_tagged = tree2conlltags(cs)

pprint(iob_tagged)
ne_tree = ne_chunk(pos_tag(word_tokenize(ex)))
print(ne_tree)

[(u'European', 'JJ', u'O'),
 (u'authorities', 'NNS', u'O'),
 (u'fined', 'VBD', u'O'),
 (u'Google', 'NNP', u'O'),
 (u'a', 'DT', u'B-NP'),
 (u'record', 'NN', u'I-NP'),
 (u'$', '$', u'O'),
 (u'5.1', 'CD', u'O'),
 (u'billion', 'CD', u'O'),
 (u'on', 'IN', u'O'),
 (u'Wednesday', 'NNP', u'O'),
 (u'for', 'IN', u'O'),
 (u'abusing', 'VBG', u'O'),
 (u'its', 'PRP$', u'O'),
 (u'power', 'NN', u'B-NP'),
 (u'in', 'IN', u'O'),
 (u'the', 'DT', u'B-NP'),
 (u'mobile', 'JJ', u'I-NP'),
 (u'phone', 'NN', u'I-NP'),
 (u'market', 'NN', u'B-NP'),
 (u'and', 'CC', u'O'),
 (u'ordered', 'VBD', u'O'),
 (u'the', 'DT', u'B-NP'),
 (u'company', 'NN', u'I-NP'),
 (u'to', 'TO', u'O'),
 (u'alter', 'VB', u'O'),
 (u'its', 'PRP$', u'O'),
 (u'practices', 'NNS', u'O')]
(S
  (GPE European/JJ)
  authorities/NNS
  fined/VBD
  (PERSON Google/NNP)
  a/DT
  record/NN
  $/$
  5.1/CD
  billion/CD
  on/IN
  Wednesday/NNP
  for/IN
  abusing/VBG
  its/PRP$
  power/NN
  in/IN
  the/DT
  mobile/JJ
  phone/NN
  market/NN
  and/CC
  ordered/VBD

In [22]:
import spacy
from spacy import displacy
from __future__ import unicode_literals
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [31]:
doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
pprint([(X.text, X.label_) for X in doc.ents])

[(u'European', u'NORP'),
 (u'Google', u'ORG'),
 (u'$5.1 billion', u'MONEY'),
 (u'Wednesday', u'DATE')]


In [32]:
pprint([(X, X.ent_iob_, X.ent_type_) for X in doc])

[(European, u'B', u'NORP'),
 (authorities, u'O', u''),
 (fined, u'O', u''),
 (Google, u'B', u'ORG'),
 (a, u'O', u''),
 (record, u'O', u''),
 ($, u'B', u'MONEY'),
 (5.1, u'I', u'MONEY'),
 (billion, u'I', u'MONEY'),
 (on, u'O', u''),
 (Wednesday, u'B', u'DATE'),
 (for, u'O', u''),
 (abusing, u'O', u''),
 (its, u'O', u''),
 (power, u'O', u''),
 (in, u'O', u''),
 (the, u'O', u''),
 (mobile, u'O', u''),
 (phone, u'O', u''),
 (market, u'O', u''),
 (and, u'O', u''),
 (ordered, u'O', u''),
 (the, u'O', u''),
 (company, u'O', u''),
 (to, u'O', u''),
 (alter, u'O', u''),
 (its, u'O', u''),
 (practices, u'O', u'')]


In [33]:
from bs4 import BeautifulSoup
import requests
import re
     
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html5lib')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))
ny_bb = url_to_string('https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news')
article = nlp(ny_bb)
len(article.ents)

174

In [26]:
labels = [x.label_ for x in article.ents]
Counter(labels)


Counter({u'CARDINAL': 4,
         u'DATE': 31,
         u'EVENT': 1,
         u'GPE': 29,
         u'NORP': 5,
         u'ORDINAL': 1,
         u'ORG': 22,
         u'PERSON': 81})

In [27]:
items = [x.text for x in article.ents]
Counter(items).most_common(3)

[(u'Strzok', 32), (u'F.B.I.', 17), (u'Trump', 10)]

In [28]:
sentences = [x for x in article.sents]
print(sentences[20])

Firing Mr. Strzok, however, removes a favorite target of Mr. Trump from the ranks of the F.B.I. and gives Mr. Bowdich and the F.B.I. director, Christopher A. Wray, a chance to move beyond the president’s ire.


In [29]:
from __future__ import unicode_literals
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

TypeError: Argument 'string' has incorrect type (expected unicode, got str)