<a href="https://colab.research.google.com/github/wendywtchang/course-project/blob/master/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Show the current working directory before navigating into the mounted Drive.
!pwd

/content


In [3]:
%cd ./drive/MyDrive/NER/

/content/drive/MyDrive/NER


In [10]:
# Load the token-level NER dataset from the current working directory,
# explicitly requesting pandas' Python parsing engine.
# NOTE(review): the file's saved index comes back as an "Unnamed: 0" column
# (see the display below); index_col=0 would avoid that -- confirm before changing.
import pandas as pd
df = pd.read_csv('ner_dataset.csv', engine='python')

In [12]:
# Display the frame; the output below is abbreviated to the first and last rows.
df

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O
...,...,...,...,...
1048570,,they,PRP,O
1048571,,responded,VBD,O
1048572,,to,TO,O
1048573,,the,DT,O


# NER with NLTK

In [28]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
# Fetch the NLTK data packages for this section (tokenizer, POS tagger and
# NE chunker imported above). The log below reports packages that are already
# up to date.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [18]:
# First 24 rows of the Word column; the output shows they run up to the
# sentence-final period.
df['Word'][:24]

0         Thousands
1                of
2     demonstrators
3              have
4           marched
5           through
6            London
7                to
8           protest
9               the
10              war
11               in
12             Iraq
13              and
14           demand
15              the
16       withdrawal
17               of
18          British
19           troops
20             from
21             that
22          country
23                .
Name: Word, dtype: object

In [20]:
# Rebuild the sample sentence as plain text: each of the first 24 words is
# followed by one space, so the string keeps a trailing space.
sample_sent = ''.join(word + ' ' for word in df['Word'][:24])
sample_sent


'Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country . '

In [31]:
# Tokenize and POS-tag the sample sentence, then print the named-entity chunk
# tree; recognized entities appear as labelled subtrees (e.g. GPE) in the output.
ne_tree = ne_chunk(pos_tag(word_tokenize(sample_sent)))
print(ne_tree)

(S
  Thousands/NNS
  of/IN
  demonstrators/NNS
  have/VBP
  marched/VBN
  through/IN
  (GPE London/NNP)
  to/TO
  protest/VB
  the/DT
  war/NN
  in/IN
  (GPE Iraq/NNP)
  and/CC
  demand/VB
  the/DT
  withdrawal/NN
  of/IN
  (GPE British/JJ)
  troops/NNS
  from/IN
  that/DT
  country/NN
  ./.)


In [32]:
def preprocess(sent):
    """Tokenize a sentence string and POS-tag its tokens with NLTK.

    Parameters
    ----------
    sent : str
        Raw sentence text.

    Returns
    -------
    The value returned by ``nltk.pos_tag`` for the word tokens of ``sent``
    (a list of ``(token, tag)`` pairs).
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [33]:
# Tokenize and POS-tag the sample sentence with preprocess().
sent = preprocess(sample_sent)

In [34]:
# Inspect the resulting (token, POS tag) pairs.
sent

[('Thousands', 'NNS'),
 ('of', 'IN'),
 ('demonstrators', 'NNS'),
 ('have', 'VBP'),
 ('marched', 'VBN'),
 ('through', 'IN'),
 ('London', 'NNP'),
 ('to', 'TO'),
 ('protest', 'VB'),
 ('the', 'DT'),
 ('war', 'NN'),
 ('in', 'IN'),
 ('Iraq', 'NNP'),
 ('and', 'CC'),
 ('demand', 'VB'),
 ('the', 'DT'),
 ('withdrawal', 'NN'),
 ('of', 'IN'),
 ('British', 'JJ'),
 ('troops', 'NNS'),
 ('from', 'IN'),
 ('that', 'DT'),
 ('country', 'NN'),
 ('.', '.')]

In [35]:
# Chunk noun phrases with a regular-expression grammar over POS tags:
# an optional determiner, any number of adjectives, then one NN-tagged noun.
# The pattern names the tag NN only, so in the output below "British troops"
# (JJ NNS) and "London" (NNP) stay outside any NP chunk.
pattern = 'NP: {<DT>?<JJ>*<NN>}'
cp = nltk.RegexpParser(pattern)
cs = cp.parse(sent)
print(cs)

(S
  Thousands/NNS
  of/IN
  demonstrators/NNS
  have/VBP
  marched/VBN
  through/IN
  London/NNP
  to/TO
  protest/VB
  (NP the/DT war/NN)
  in/IN
  Iraq/NNP
  and/CC
  demand/VB
  (NP the/DT withdrawal/NN)
  of/IN
  British/JJ
  troops/NNS
  from/IN
  (NP that/DT country/NN)
  ./.)


In [39]:
# Disabled: repeats the RegexpParser construction and parse of the previous
# cell under different variable names.
#NPChunker = nltk.RegexpParser(pattern) 
#result = NPChunker.parse(sent)
#result

In [40]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples. In the
# output, B-NP opens a noun phrase, I-NP continues it, and O marks tokens
# outside any chunk.
iob_tagged = tree2conlltags(cs)
pprint(iob_tagged)

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'O'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'B-NP'),
 ('war', 'NN', 'I-NP'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'B-NP'),
 ('withdrawal', 'NN', 'I-NP'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'O'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'B-NP'),
 ('country', 'NN', 'I-NP'),
 ('.', '.', 'O')]


# spaCy
- spaCy's named entity recognizer was trained on the OntoNotes 5 corpus.

In [41]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm pipeline from its installed package; `nlp` is the
# callable used to process text in the cells that follow.
nlp = en_core_web_sm.load()

In [48]:
# Run the spaCy pipeline on the sample sentence and list every detected
# entity as a (text, label) pair.
doc = nlp(sample_sent)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('Thousands', 'CARDINAL'),
 ('London', 'GPE'),
 ('Iraq', 'GPE'),
 ('British', 'NORP')]


In [47]:
# Per-token view: the token itself, its IOB entity flag and its entity type
# (empty string when the token is outside any entity).
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(Thousands, 'B', 'CARDINAL'),
 (of, 'O', ''),
 (demonstrators, 'O', ''),
 (have, 'O', ''),
 (marched, 'O', ''),
 (through, 'O', ''),
 (London, 'B', 'GPE'),
 (to, 'O', ''),
 (protest, 'O', ''),
 (the, 'O', ''),
 (war, 'O', ''),
 (in, 'O', ''),
 (Iraq, 'B', 'GPE'),
 (and, 'O', ''),
 (demand, 'O', ''),
 (the, 'O', ''),
 (withdrawal, 'O', ''),
 (of, 'O', ''),
 (British, 'B', 'NORP'),
 (troops, 'O', ''),
 (from, 'O', ''),
 (that, 'O', ''),
 (country, 'O', ''),
 (., 'O', '')]


# Extracting named entities from an article
- Beautiful Soup
- SpaCy

In [49]:
from bs4 import BeautifulSoup
import requests
import re

In [50]:
def url_to_string(url, timeout=10):
    """Fetch a web page and return its text content as a single string.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).
        Without a timeout ``requests.get`` may block indefinitely.

    Returns
    -------
    str
        Text of the page with ``<script>``, ``<style>`` and ``<aside>``
        elements removed and every run of newlines/tabs replaced by one space.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of returning the text of an error page.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Remove elements whose content should not end up in the extracted text.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return re.sub(r'[\n\t]+', ' ', soup.get_text())

In [51]:
# Download the article text, run the spaCy pipeline over it, and report how
# many entities were found.
ny_bb = url_to_string('https://www.nytimes.com/2022/01/05/us/politics/jan-6-capitol-riot-investigation.html')
article = nlp(ny_bb)
len(article.ents)

241

In [52]:
# Tally how often each entity label occurs in the article.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'CARDINAL': 22,
         'DATE': 52,
         'FAC': 15,
         'GPE': 8,
         'LAW': 2,
         'LOC': 2,
         'NORP': 1,
         'ORDINAL': 4,
         'ORG': 49,
         'PERCENT': 1,
         'PERSON': 80,
         'PRODUCT': 1,
         'TIME': 3,
         'WORK_OF_ART': 1})

In [53]:
# The three entity strings that occur most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Trump', 19), ('Jan. 6', 16), ('Capitol', 16)]

In [54]:
# Materialize the article's sentence spans so one can be picked by position.
sentences = list(article.sents)
print(sentences[20])

The government estimates that as many as 2,500 people who took part in the events of Jan. 6 could be charged with federal crimes.


In [55]:
# Re-parse the sentence at index 20 on its own and render its entities inline.
displacy.render(nlp(str(sentences[20])), jupyter=True, style='ent')

In [56]:
# Render the dependency parse of the sentence at index 20; 'distance' is
# passed through as a displaCy rendering option.
displacy.render(nlp(str(sentences[20])), style='dep', jupyter = True, options = {'distance': 120})

In [57]:
# (surface form, coarse POS, lemma) for every token of the sentence at
# index 20 that is neither a stop word nor punctuation.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[20]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('government', 'NOUN', 'government'),
 ('estimates', 'VERB', 'estimate'),
 ('2,500', 'NUM', '2,500'),
 ('people', 'NOUN', 'people'),
 ('took', 'VERB', 'take'),
 ('events', 'NOUN', 'event'),
 ('Jan.', 'PROPN', 'January'),
 ('6', 'NUM', '6'),
 ('charged', 'VERB', 'charge'),
 ('federal', 'ADJ', 'federal'),
 ('crimes', 'NOUN', 'crime')]

In [58]:
# Map each entity's text to its label for the sentence at index 20.
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

{'Jan. 6': 'DATE', 'as many as 2,500': 'CARDINAL'}

In [59]:
# Token-level IOB flags and entity types for the sentence at index 20, read
# directly from the already-parsed article span.
print([(token, token.ent_iob_, token.ent_type_) for token in sentences[20]])

[(The, 'O', ''), (government, 'O', ''), (estimates, 'O', ''), (that, 'O', ''), (as, 'B', 'CARDINAL'), (many, 'I', 'CARDINAL'), (as, 'I', 'CARDINAL'), (2,500, 'I', 'CARDINAL'), (people, 'O', ''), (who, 'O', ''), (took, 'O', ''), (part, 'O', ''), (in, 'O', ''), (the, 'O', ''), (events, 'O', ''), (of, 'O', ''), (Jan., 'B', 'DATE'), (6, 'I', 'DATE'), (could, 'O', ''), (be, 'O', ''), (charged, 'O', ''), (with, 'O', ''), (federal, 'O', ''), (crimes, 'O', ''), (., 'O', '')]


In [60]:
# Render the entity annotations over the full article text.
displacy.render(article, jupyter=True, style='ent')