## NER - Named Entity Recognition

In [1]:
#import nltk
#nltk.download('popular', halt_on_error=False)
#nltk.download('all', halt_on_error=False)

In [2]:
import os
import requests
import nltk as nltk
import nltk.corpus  
from nltk.text import Text
import pandas as pd
import re
import sys

In [3]:
print(sys.version)

3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:53) 
[GCC 9.4.0]


#### Copy files to local FS from GCP bucket

In [4]:
def get_gcs_data (bucket_name, folder_name, file_name, path_local):
    url = 'https://storage.googleapis.com/' + bucket_name + '/' + folder_name + '/' + file_name
    r = requests.get(url)
    open(path_local + '/' + file_name , 'wb').write(r.content)

In [5]:
path_news = '/home/jupyter/data/news'

os.makedirs(path_news, exist_ok=True)

In [6]:
bucket_name = 'msca-bdp-data-open'
folder_name = 'news'
file_name = ['News_1.txt']
path_local = path_news

os.makedirs(path_local, exist_ok=True)

for file in file_name:
    get_gcs_data (bucket_name = bucket_name,
                 folder_name = folder_name,
                 file_name = file,
                 path_local = path_local)
    print('Downloaded: ' + file)

Downloaded: News_1.txt


## NLTK-based for NER

In [7]:
text = '''Surging Chinese demand and an improving U.S. economy have lifted sales of Caterpillar's signature yellow mining and construction machines. Now, with the pace of growth quickening in Latin America and Europe, the company is projecting higher earnings for 2018 than analysts estimated.  The outlook from Caterpillar, considered an economic bellwether, comes as industries from manufacturing to services report increased sales and orders that have fueled record equity prices and buoyed investor expectations for this year. This week, the International Monetary Fund raised its estimate for 2018 global growth to the fastest in seven years.  Caterpillar's results showed strength across the board in nearly every industry for the first time, which indicated coordinated and synchronized macroeconomic growth, Larry De Maria, an analyst at William Blair & Co., said in an interview. It's a good harbinger for overall economic activity.'''

In [8]:
text

"Surging Chinese demand and an improving U.S. economy have lifted sales of Caterpillar's signature yellow mining and construction machines. Now, with the pace of growth quickening in Latin America and Europe, the company is projecting higher earnings for 2018 than analysts estimated.  The outlook from Caterpillar, considered an economic bellwether, comes as industries from manufacturing to services report increased sales and orders that have fueled record equity prices and buoyed investor expectations for this year. This week, the International Monetary Fund raised its estimate for 2018 global growth to the fastest in seven years.  Caterpillar's results showed strength across the board in nearly every industry for the first time, which indicated coordinated and synchronized macroeconomic growth, Larry De Maria, an analyst at William Blair & Co., said in an interview. It's a good harbinger for overall economic activity."

### Basic NER: tagging words (tokens) as "NE"

In [9]:
# NLTK chunked_sentences is a tree structure, or list of lists.  We have to traverse it to get the values

entities = []
labels = []
for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)), binary = True):
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
        labels.append(chunk.label())

#entities_labels = list(zip(entities, labels))
entities_labels = list(set(zip(entities, labels))) #unique entities

#Binary=True means just tag entities as NE 
#Binary=False give us PERSON, ORGANIZATION, and GPE (Geo-political Entity) 

In [10]:
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,Latin America,NE
1,Larry De Maria,NE
2,Chinese,NE
3,U.S.,NE
4,International Monetary Fund,NE
5,Caterpillar,NE
6,William Blair,NE
7,Europe,NE


Chinese
U.S.
Caterpillar's
Latin America
Europe
Caterpillar
International Monetary Fund
Caterpillar's
Larry De Maria
William Blair & Co.

### Basic NER: tagging words (tokens) as PERSON, ORGANIZATION, and GPE

In [11]:
entities = []
labels = []
for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)), binary = False):
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
        labels.append(chunk.label())

#entities_labels = list(zip(entities, labels))
entities_labels = list(set(zip(entities, labels))) #unique entities

In [12]:
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,Caterpillar,GPE
1,Caterpillar,PERSON
2,Larry De Maria,PERSON
3,William Blair,PERSON
4,Europe,GPE
5,Chinese,GPE
6,Latin America,GPE
7,International Monetary Fund,ORGANIZATION
8,U.S.,GPE


### Alternative NER, separating by sentenses first, then by tokens

In [13]:
entities = []
labels = []

for sent in nltk.sent_tokenize(text):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = False):
        if hasattr(chunk, 'label'):
            entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
            labels.append(chunk.label())

#entities_labels = list(zip(entities, labels))
entities_labels = list(set(zip(entities, labels))) #unique entities

In [14]:
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,Caterpillar,GPE
1,Larry De Maria,PERSON
2,William Blair,PERSON
3,Europe,GPE
4,Chinese,GPE
5,Latin America,GPE
6,International Monetary Fund,ORGANIZATION
7,U.S.,GPE


## Compare NER Results between Sentense vs. Word Tokenization

In [15]:
!head '/home/jupyter/data/news/News_1.txt'

Rising economic tide reaches all shores for buoyant Caterpillar
Caterpillar earnings

This Tuesday, July 25, 2017, photo shows Caterpillar machinery at a dealership in Murrysville, Pa. Caterpillar, Inc. reports earnings, Thursday, Jan. 25, 2018. (Gene J. Puskar / AP)
Joe DeauxBloomberg News

If you want more evidence of a broadening expansion in the global economy, look no further than Caterpillar.

Surging Chinese demand and an improving U.S. economy have lifted sales of Caterpillar's signature yellow mining and construction machines. Now, with the pace of growth quickening in Latin America and Europe, the company is projecting higher earnings for 2018 than analysts estimated.



In [16]:
f = open(os.path.join(path_news, 'News_1.txt'), encoding="utf8")

text = f.read()

### Tagging word tokens
Shallow parsing (also chunking or light parsing) is an analysis of a sentence which first identifies constituent parts of sentences (nouns, verbs, adjectives, etc.) and then links them to higher order units that have discrete grammatical meanings (noun groups or phrases, verb groups, etc.). Wikipedia 

In [17]:
entities = []
labels = []
for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)), binary = False):
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
        labels.append(chunk.label())

#entities_labels = list(zip(entities, labels))
entities_labels = list(set(zip(entities, labels))) #unique entities

In [18]:
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df.head(20)

Unnamed: 0,Entities,Labels
0,Murrysville,GPE
1,Dow Jones Industrial Average,ORGANIZATION
2,Bloomberg,PERSON
3,Joe,PERSON
4,Caterpillar Caterpillar,ORGANIZATION
5,Latin America,GPE
6,Puskar,PERSON
7,Larry De Maria,PERSON
8,Chinese,GPE
9,Dow Jones Industrial,ORGANIZATION


### Sentense split, then tagging word tokens

In [19]:
entities = []
labels = []

for sent in nltk.sent_tokenize(text):
    for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent)), binary = False):
        if hasattr(chunk, 'label'):
            entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
            labels.append(chunk.label())

#entities_labels = list(zip(entities, labels))
entities_labels = list(set(zip(entities, labels))) #unique entities

In [20]:
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df.head(20)

Unnamed: 0,Entities,Labels
0,Murrysville,GPE
1,Dow Jones Industrial Average,ORGANIZATION
2,Bloomberg,PERSON
3,Joe,PERSON
4,Caterpillar Caterpillar,ORGANIZATION
5,Latin America,GPE
6,Puskar,PERSON
7,Larry De Maria,PERSON
8,Chinese,GPE
9,Dow Jones Industrial,ORGANIZATION


### NLTK - NER

In [21]:
text = '''Sara's work efforts destroyed Apple Corporation's annual sales single handedly'''

In [22]:
entities = []
labels = []
for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)), binary = False):
    if hasattr(chunk, 'label'):
        entities.append(' '.join(c[0] for c in chunk)) #Add space as between multi-token entities
        labels.append(chunk.label())

#entities_labels = list(zip(entities, labels))
entities_labels = list(set(zip(entities, labels))) #unique entities

In [23]:
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ["Entities", "Labels"]
entities_df

Unnamed: 0,Entities,Labels
0,Sara,PERSON
1,Apple Corporation,PERSON


In [24]:
import datetime
import pytz

datetime.datetime.now(pytz.timezone('US/Central')).strftime("%a, %d %B %Y %H:%M:%S")

'Wed, 26 October 2022 11:19:34'