# 1. NLTK method

In [1]:
import pandas as pd
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('tagsets')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

In [2]:
text = "Apple acquired Zoom in China on Wednesday 6th May 2020.\
This news has made Apple and Google stock jump by 5% on Dow Jones Index in the \
United States of America"

In [3]:
## tokenize to words
words = nltk.word_tokenize(text)
words

['Apple',
 'acquired',
 'Zoom',
 'in',
 'China',
 'on',
 'Wednesday',
 '6th',
 'May',
 '2020.This',
 'news',
 'has',
 'made',
 'Apple',
 'and',
 'Google',
 'stock',
 'jump',
 'by',
 '5',
 '%',
 'on',
 'Dow',
 'Jones',
 'Index',
 'in',
 'the',
 'United',
 'States',
 'of',
 'America']

In [4]:
## Part of Speech tagging
pos_tags = nltk.pos_tag(words)
pos_tags

[('Apple', 'NNP'),
 ('acquired', 'VBD'),
 ('Zoom', 'NNP'),
 ('in', 'IN'),
 ('China', 'NNP'),
 ('on', 'IN'),
 ('Wednesday', 'NNP'),
 ('6th', 'CD'),
 ('May', 'NNP'),
 ('2020.This', 'CD'),
 ('news', 'NN'),
 ('has', 'VBZ'),
 ('made', 'VBN'),
 ('Apple', 'NNP'),
 ('and', 'CC'),
 ('Google', 'NNP'),
 ('stock', 'NN'),
 ('jump', 'NN'),
 ('by', 'IN'),
 ('5', 'CD'),
 ('%', 'NN'),
 ('on', 'IN'),
 ('Dow', 'NNP'),
 ('Jones', 'NNP'),
 ('Index', 'NNP'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('of', 'IN'),
 ('America', 'NNP')]

In [5]:
nltk.help.upenn_tagset('NNP')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...


In [6]:
chunks =nltk.ne_chunk(pos_tags,binary=True) ## NE or not a NE

for chunk in chunks:
    print(chunk)

(NE Apple/NNP)
('acquired', 'VBD')
('Zoom', 'NNP')
('in', 'IN')
(NE China/NNP)
('on', 'IN')
('Wednesday', 'NNP')
('6th', 'CD')
('May', 'NNP')
('2020.This', 'CD')
('news', 'NN')
('has', 'VBZ')
('made', 'VBN')
(NE Apple/NNP)
('and', 'CC')
(NE Google/NNP)
('stock', 'NN')
('jump', 'NN')
('by', 'IN')
('5', 'CD')
('%', 'NN')
('on', 'IN')
('Dow', 'NNP')
('Jones', 'NNP')
('Index', 'NNP')
('in', 'IN')
('the', 'DT')
(NE United/NNP States/NNPS)
('of', 'IN')
(NE America/NNP)


In [7]:
entities =[]
labels=[]
for chunk in chunks:
    if hasattr(chunk,'label'):
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())
entities_labels = list(set(zip(entities,labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ['Entities','labels']
entities_df

Unnamed: 0,Entities,labels
0,America,NE
1,Apple,NE
2,Google,NE
3,China,NE
4,United States,NE


In [8]:
chunks =nltk.ne_chunk(pos_tags,binary=False) ## NE or not a NE
for chunk in chunks:
    print(chunk)

entities =[]
labels=[]
for chunk in chunks:
    if hasattr(chunk,'label'):
        entities.append(' '.join(c[0] for c in chunk))
        labels.append(chunk.label())
entities_labels = list(set(zip(entities,labels)))
entities_df = pd.DataFrame(entities_labels)
entities_df.columns = ['Entities','labels']
entities_df

(PERSON Apple/NNP)
('acquired', 'VBD')
(PERSON Zoom/NNP)
('in', 'IN')
(GPE China/NNP)
('on', 'IN')
('Wednesday', 'NNP')
('6th', 'CD')
('May', 'NNP')
('2020.This', 'CD')
('news', 'NN')
('has', 'VBZ')
('made', 'VBN')
(PERSON Apple/NNP)
('and', 'CC')
(ORGANIZATION Google/NNP)
('stock', 'NN')
('jump', 'NN')
('by', 'IN')
('5', 'CD')
('%', 'NN')
('on', 'IN')
(PERSON Dow/NNP Jones/NNP Index/NNP)
('in', 'IN')
('the', 'DT')
(GPE United/NNP States/NNPS)
('of', 'IN')
(GPE America/NNP)


Unnamed: 0,Entities,labels
0,China,GPE
1,America,GPE
2,Apple,PERSON
3,Dow Jones Index,PERSON
4,Zoom,PERSON
5,United States,GPE
6,Google,ORGANIZATION


# 2. SPACY method

In [9]:
## Spacy
# https://spacy.io/

In [9]:
import spacy
from spacy import displacy
# spacy.__version__

3 spacy models as follows:

1. en_core_web_sm = small size
2. en_core_web_md = medium size
3. en_core_web_lg = large size

In [8]:
## Download spacy models 
# !python -m spacy download en_core_web_md

In [10]:
text = "Apple acquired Zoom in China on Wednesday 6th May 2020.\
This news has made Apple and Google stock jump by 5% on Dow Jones Index in the \
United States of America"

In [11]:
## Load Spacy model
nlp = spacy.load('en_core_web_md')

In [12]:
doc = nlp(text)

entities =[]
labels =[]
position_start = []
position_end = []
pos = []
for ent in doc.ents:
    entities.append(ent)
    labels.append(ent.label_)
    position_start.append(ent.start_char)
    position_end.append(ent.end_char)
for token in doc:
    pos.append(token.pos_)
df = pd.DataFrame({'Entities':entities,'Labels':labels,
                   'Position_Start':position_start,'position_end':position_end})
df

Unnamed: 0,Entities,Labels,Position_Start,position_end
0,(Apple),ORG,0,5
1,(Zoom),ORG,15,19
2,(China),GPE,23,28
3,"(Wednesday, 6th)",DATE,32,45
4,(Apple),ORG,74,79
5,(Google),ORG,84,90
6,"(5, %)",PERCENT,105,107
7,"(Dow, Jones)",ORG,111,120
8,"(the, United, States, of, America)",GPE,130,158


In [13]:
pos

['PROPN',
 'VERB',
 'PROPN',
 'ADP',
 'PROPN',
 'ADP',
 'PROPN',
 'NOUN',
 'PROPN',
 'NUM',
 'NOUN',
 'AUX',
 'VERB',
 'PROPN',
 'CCONJ',
 'PROPN',
 'NOUN',
 'NOUN',
 'ADP',
 'NUM',
 'NOUN',
 'ADP',
 'PROPN',
 'PROPN',
 'PROPN',
 'ADP',
 'DET',
 'PROPN',
 'PROPN',
 'ADP',
 'PROPN']

In [14]:
spacy.explain('GPE')

'Countries, cities, states'

In [17]:
import spacy
from spacy import displacy

text = "When Sebastian Thrun started working on self-driving cars at Google in 2007, few people outside of the company took him seriously."
text2 = "This news has made Apple and Google stock jump by 5% on Dow Jones Index in the United States of America"

nlp = spacy.load("en_core_web_md")
# doc = nlp(text)
doc = nlp(text2)
displacy.serve(doc, style="ent")


Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.
