### This project shows how to build a named entity recognizer (NER) with NLTK and spaCy to identify the names of things, such as persons, organizations, or locations, in raw text.

## 1. NLTK

In [1]:
# Setup for the NLTK section: download tagger data and import the helpers used below.
import nltk
nltk.download('averaged_perceptron_tagger')   # fetches the 'averaged_perceptron_tagger' package into nltk_data
# NOTE(review): word_tokenize may also need the 'punkt' tokenizer data, which this
# cell does not download -- confirm it is already installed on a fresh machine.
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/seafish/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Example sentence that the NLTK cells below tokenize, tag and chunk.
sentence = 'The U.S. has reported 30% of adults are fully vaccinated and nearly 50% of the U.S. adult population has received at least one vaccine dose, according to data from the U.S. Centers for Disease Control and Prevention.'

In [3]:
# Split the sentence into word tokens, then attach a part-of-speech tag to each token.
sent1 = word_tokenize(sentence)
sent2 = pos_tag(sent1)

# Show the raw tokens first, then the (token, tag) pairs.
print(sent1, '\n')
print(sent2)

['The', 'U.S.', 'has', 'reported', '30', '%', 'of', 'adults', 'are', 'fully', 'vaccinated', 'and', 'nearly', '50', '%', 'of', 'the', 'U.S.', 'adult', 'population', 'has', 'received', 'at', 'least', 'one', 'vaccine', 'dose', ',', 'according', 'to', 'data', 'from', 'the', 'U.S.', 'Centers', 'for', 'Disease', 'Control', 'and', 'Prevention', '.'] 

[('The', 'DT'), ('U.S.', 'NNP'), ('has', 'VBZ'), ('reported', 'VBN'), ('30', 'CD'), ('%', 'NN'), ('of', 'IN'), ('adults', 'NNS'), ('are', 'VBP'), ('fully', 'RB'), ('vaccinated', 'VBN'), ('and', 'CC'), ('nearly', 'RB'), ('50', 'CD'), ('%', 'NN'), ('of', 'IN'), ('the', 'DT'), ('U.S.', 'NNP'), ('adult', 'NN'), ('population', 'NN'), ('has', 'VBZ'), ('received', 'VBN'), ('at', 'IN'), ('least', 'JJS'), ('one', 'CD'), ('vaccine', 'NN'), ('dose', 'NN'), (',', ','), ('according', 'VBG'), ('to', 'TO'), ('data', 'NNS'), ('from', 'IN'), ('the', 'DT'), ('U.S.', 'NNP'), ('Centers', 'NNPS'), ('for', 'IN'), ('Disease', 'NNP'), ('Control', 'NNP'), ('and', 'CC'),

### Chunking 
Chunking is the process of extracting phrases from unstructured text: a sentence is analyzed to identify its constituents (noun groups, verbs, verb groups, etc.).

In [4]:
# Chunk pattern: a single rule named "Chunk", written as a sequence of POS-tag patterns.
#   <RB.?>*  zero or more tags matching RB plus one optional character (e.g. RB, RBR, RBS)
#   <VB.?>*  zero or more tags matching VB plus one optional character (e.g. VB, VBD, VBZ)
#   <NNP>+   one or more NNP tags
#   <NN>?    an optional trailing NN tag
# <NNP> does not match NNPS, so a token tagged NNPS (such as 'Centers') stays outside the chunks.
pattern = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
cp = nltk.RegexpParser(pattern)   # build the chunk parser from the rule above
cs = cp.parse(sent2)              # apply it to the POS-tagged tokens
print(cs)                         # printed as a tree with (Chunk ...) subtrees

(S
  The/DT
  (Chunk U.S./NNP)
  has/VBZ
  reported/VBN
  30/CD
  %/NN
  of/IN
  adults/NNS
  are/VBP
  fully/RB
  vaccinated/VBN
  and/CC
  nearly/RB
  50/CD
  %/NN
  of/IN
  the/DT
  (Chunk U.S./NNP adult/NN)
  population/NN
  has/VBZ
  received/VBN
  at/IN
  least/JJS
  one/CD
  vaccine/NN
  dose/NN
  ,/,
  according/VBG
  to/TO
  data/NNS
  from/IN
  the/DT
  (Chunk U.S./NNP)
  Centers/NNPS
  for/IN
  (Chunk Disease/NNP Control/NNP)
  and/CC
  (Chunk Prevention/NNP)
  ./.)


In [5]:
# IOB tags: a flat, per-token representation of the chunk structure.
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
iob_tagged = tree2conlltags(cs)       # list of (word, POS tag, IOB chunk tag) triples, one per token
pprint(iob_tagged)                    # one triple per line: token, part-of-speech tag, chunk tag

[('The', 'DT', 'O'),
 ('U.S.', 'NNP', 'B-Chunk'),
 ('has', 'VBZ', 'O'),
 ('reported', 'VBN', 'O'),
 ('30', 'CD', 'O'),
 ('%', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('adults', 'NNS', 'O'),
 ('are', 'VBP', 'O'),
 ('fully', 'RB', 'O'),
 ('vaccinated', 'VBN', 'O'),
 ('and', 'CC', 'O'),
 ('nearly', 'RB', 'O'),
 ('50', 'CD', 'O'),
 ('%', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('U.S.', 'NNP', 'B-Chunk'),
 ('adult', 'NN', 'I-Chunk'),
 ('population', 'NN', 'O'),
 ('has', 'VBZ', 'O'),
 ('received', 'VBN', 'O'),
 ('at', 'IN', 'O'),
 ('least', 'JJS', 'O'),
 ('one', 'CD', 'O'),
 ('vaccine', 'NN', 'O'),
 ('dose', 'NN', 'O'),
 (',', ',', 'O'),
 ('according', 'VBG', 'O'),
 ('to', 'TO', 'O'),
 ('data', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('the', 'DT', 'O'),
 ('U.S.', 'NNP', 'B-Chunk'),
 ('Centers', 'NNPS', 'O'),
 ('for', 'IN', 'O'),
 ('Disease', 'NNP', 'B-Chunk'),
 ('Control', 'NNP', 'I-Chunk'),
 ('and', 'CC', 'O'),
 ('Prevention', 'NNP', 'B-Chunk'),
 ('.', '.', 'O')]


In [6]:
# Recognize named entities using a classifier (NLTK's ne_chunk) instead of a hand-written pattern.
nltk.download('maxent_ne_chunker')    # data packages downloaded before ne_chunk is called
nltk.download('words')
from nltk.chunk import ne_chunk
# Tokenize -> POS-tag -> NE-chunk; the printed tree wraps entities in labelled subtrees (e.g. GPE, PERSON).
ne_tree = ne_chunk(pos_tag(word_tokenize(sentence)))
print(ne_tree)

(S
  The/DT
  (GPE U.S./NNP)
  has/VBZ
  reported/VBN
  30/CD
  %/NN
  of/IN
  adults/NNS
  are/VBP
  fully/RB
  vaccinated/VBN
  and/CC
  nearly/RB
  50/CD
  %/NN
  of/IN
  the/DT
  (GPE U.S./NNP)
  adult/NN
  population/NN
  has/VBZ
  received/VBN
  at/IN
  least/JJS
  one/CD
  vaccine/NN
  dose/NN
  ,/,
  according/VBG
  to/TO
  data/NNS
  from/IN
  the/DT
  (GPE U.S./NNP)
  Centers/NNPS
  for/IN
  (PERSON Disease/NNP Control/NNP)
  and/CC
  Prevention/NNP
  ./.)


[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/seafish/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/seafish/nltk_data...
[nltk_data]   Package words is already up-to-date!


## 2. SpaCy

In [7]:
# Setup for the spaCy section.
import spacy
from spacy import displacy 
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()   # pipeline object; the cells below call nlp(text) to get a parsed document

In [8]:
# Alternative example text (kept for experimentation):
#doc = nlp('European authorities fined Google a record $5.1 billion on Wednesday for abusing its power in the mobile phone market and ordered the company to alter its practices')
# Run the pipeline on the example sentence, then list each entity's text and label.
doc = nlp('The U.S. has reported 30% of adults are fully vaccinated and nearly 50% of the U.S. adult population has received at least one vaccine dose, according to data from the U.S. Centers for Disease Control and Prevention.')
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('U.S.', 'GPE'),
 ('30%', 'PERCENT'),
 ('nearly 50%', 'PERCENT'),
 ('U.S.', 'GPE'),
 ('at least one', 'CARDINAL'),
 ('the U.S. Centers for Disease Control and Prevention', 'ORG')]


- "B" means the token begins an entity,
- "I" means it is inside an entity,
- "O" means it is outside an entity, 
- "" means no entity tag is set.

In [10]:
# Per-token view: the token, its IOB flag and its entity type ('' when no type is set).
print([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(The, 'O', ''), (U.S., 'B', 'GPE'), (has, 'O', ''), (reported, 'O', ''), (30, 'B', 'PERCENT'), (%, 'I', 'PERCENT'), (of, 'O', ''), (adults, 'O', ''), (are, 'O', ''), (fully, 'O', ''), (vaccinated, 'O', ''), (and, 'O', ''), (nearly, 'B', 'PERCENT'), (50, 'I', 'PERCENT'), (%, 'I', 'PERCENT'), (of, 'O', ''), (the, 'O', ''), (U.S., 'B', 'GPE'), (adult, 'O', ''), (population, 'O', ''), (has, 'O', ''), (received, 'O', ''), (at, 'B', 'CARDINAL'), (least, 'I', 'CARDINAL'), (one, 'I', 'CARDINAL'), (vaccine, 'O', ''), (dose, 'O', ''), (,, 'O', ''), (according, 'O', ''), (to, 'O', ''), (data, 'O', ''), (from, 'O', ''), (the, 'B', 'ORG'), (U.S., 'I', 'ORG'), (Centers, 'I', 'ORG'), (for, 'I', 'ORG'), (Disease, 'I', 'ORG'), (Control, 'I', 'ORG'), (and, 'I', 'ORG'), (Prevention, 'I', 'ORG'), (., 'O', '')]


In [11]:
# Extracting named entity 
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=10):
    """Download a web page and return its text content as one string.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the call indefinitely.

    Returns:
        The text of the page with ``<script>``, ``<style>`` and ``<aside>``
        elements removed, and with every run of newlines/tabs replaced by a
        single space.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with a 4xx/5xx status.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of returning the text of an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose text should not end up in the extracted string.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()

    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [12]:
# Fetch the text of the USA TODAY article and run the spaCy pipeline over it.
ny_bb = url_to_string('https://www.usatoday.com/story/news/health/2021/04/17/covid-updates-us-reports-30-adults-fully-vaccinated/7267691002/')
article = nlp(ny_bb)
len(article.ents)   # number of entities found in the article

101

In [13]:
# Tally how many of the article's entities carry each label.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'GPE': 17,
         'PERCENT': 4,
         'WORK_OF_ART': 1,
         'PERSON': 15,
         'ORG': 25,
         'CARDINAL': 17,
         'DATE': 15,
         'NORP': 2,
         'MONEY': 4,
         'LOC': 1})

In [14]:
# The five entity strings that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(5)

[('Friday', 5), ('U.S.', 4), ('3 million', 3), ('US', 2), ('30%', 2)]

In [15]:
# Collect the article's sentences in a list and look at one of them (index 30).
sentences = list(article.sents)
print(sentences[30])

The funding, which comes from President Joe Biden's nearly $2 trillion COVID-19 relief package, will be allocated through the Centers for Disease Control and Prevention to help states and other jurisdictions monitor potentially more contagious COVID-19 strains, including the variants driving another surge in Michigan.


In [16]:
# Use displacy.render with style='ent' to highlight the named entities of sentence 30
# in the notebook (jupyter=True). The sentence text is passed through nlp again,
# so it is parsed on its own here rather than as part of the article.
from spacy import displacy
displacy.render(nlp(str(sentences[30])), jupyter=True, style='ent')

In [17]:
# Use spaCy's built-in displaCy visualizer with style='dep' to draw the dependency
# parse of sentence 20; the 'distance' option (150) is passed through to the renderer.
displacy.render(nlp(str(sentences[20])), style='dep', jupyter=True, options = {'distance':150})

In [18]:
# For sentence 30, list (surface form, part of speech, lemma) for every token
# that is neither a stop word nor tagged as punctuation.
[
    (tok.orth_, tok.pos_, tok.lemma_)
    for tok in nlp(str(sentences[30]))
    if not tok.is_stop and tok.pos_ != 'PUNCT'
]

[('funding', 'NOUN', 'funding'),
 ('comes', 'VERB', 'come'),
 ('President', 'PROPN', 'President'),
 ('Joe', 'PROPN', 'Joe'),
 ('Biden', 'PROPN', 'Biden'),
 ('\xa0', 'SPACE', '\xa0'),
 ('nearly', 'ADV', 'nearly'),
 ('$', 'SYM', '$'),
 ('2', 'NUM', '2'),
 ('trillion', 'NUM', 'trillion'),
 ('COVID-19', 'NOUN', 'covid-19'),
 ('relief', 'NOUN', 'relief'),
 ('package', 'NOUN', 'package'),
 ('allocated', 'VERB', 'allocate'),
 ('Centers', 'PROPN', 'Centers'),
 ('Disease', 'PROPN', 'Disease'),
 ('Control', 'PROPN', 'Control'),
 ('Prevention', 'PROPN', 'Prevention'),
 ('help', 'VERB', 'help'),
 ('states', 'NOUN', 'state'),
 ('jurisdictions', 'NOUN', 'jurisdiction'),
 ('monitor', 'VERB', 'monitor'),
 ('potentially', 'ADV', 'potentially'),
 ('contagious', 'ADJ', 'contagious'),
 ('COVID-19', 'NOUN', 'covid-19'),
 ('strains', 'NOUN', 'strain'),
 ('including', 'VERB', 'include'),
 ('variants', 'NOUN', 'variant'),
 ('driving', 'VERB', 'drive'),
 ('\xa0', 'SPACE', '\xa0'),
 ('surge', 'NOUN', 'surge'),


In [19]:
# Map each entity string found in sentence 40 to its label.
{str(ent): ent.label_ for ent in nlp(str(sentences[40])).ents}

{'Lindy Washburn': 'PERSON',
 'Mike Stucka': 'PERSON',
 'The Associated PressAbout Us': 'ORG',
 'Principles Corrections Press Releases Accessibility Sitemap': 'ORG',
 'Info/Cookie PolicyContact': 'ORG'}

In [20]:
# IOB flag and entity type for every token of sentence 40.
print([(tok, tok.ent_iob_, tok.ent_type_) for tok in sentences[40]])

[(Later, 'O', ''), (,, 'O', ''), (diagnostic, 'O', ''), (scans, 'O', ''), (showed, 'O', ''), (that, 'O', ''), (she, 'O', ''), (had, 'O', ''), ( , 'O', ''), (several, 'O', ''), (blood, 'O', ''), (clots, 'O', ''), (in, 'O', ''), (major, 'O', ''), (blood, 'O', ''), (vessels, 'O', ''), (in, 'O', ''), (her, 'O', ''), (brain, 'O', ''), (,, 'O', ''), (abdomen, 'O', ''), (and, 'O', ''), (lungs, 'O', ''), (,, 'O', ''), (he, 'O', ''), (said.-, 'O', ''), ( , 'O', ''), (Lindy, 'B', 'PERSON'), (Washburn, 'I', 'PERSON'), (,, 'O', ''), (NorthJersey.comContributing, 'O', ''), (:, 'O', ''), (Mike, 'B', 'PERSON'), (Stucka, 'I', 'PERSON'), (,, 'O', ''), ( , 'O', ''), (USA, 'O', ''), (TODAY, 'O', ''), (;, 'O', ''), (The, 'B', 'ORG'), (Associated, 'I', 'ORG'), (PressAbout, 'I', 'ORG'), (Us, 'I', 'ORG'), (Newsroom, 'O', ''), (Staff, 'O', ''), (Ethical, 'O', ''), (Principles, 'B', 'ORG'), (Corrections, 'I', 'ORG'), (Press, 'I', 'ORG'), (Releases, 'I', 'ORG'), (Accessibility, 'I', 'ORG'), (Sitemap, 'I', 'ORG'

In [21]:
# Visualize the entities of the entire article.
# Render the already-parsed `article` directly. str(sentences) is the string form
# of a Python list (surrounding brackets and ", " separators included), so
# re-parsing it would feed altered text to the pipeline and run the model again.
displacy.render(article, jupyter=True, style='ent')