# Approachs
1. NLTK [ word segmentation + sentence segmentation]
2. Spacy

# NLTK

## 1. Word Tokenization
## 2. POS Tagging
## 3. ne_chunk

In [3]:
import pandas as pd
import nltk

In [4]:
text = ''' Apple is aiming to buy a India's startup Xolo INC for $6 million USD,
which increased the stock rate  of AAPL by 15% in United States of America'''

# Word Tokenization

In [6]:
words = nltk.word_tokenize(text)
words

['Apple',
 'is',
 'aiming',
 'to',
 'buy',
 'a',
 'India',
 "'s",
 'startup',
 'Xolo',
 'INC',
 'for',
 '$',
 '6',
 'million',
 'USD',
 ',',
 'which',
 'increased',
 'the',
 'stock',
 'rate',
 'of',
 'AAPL',
 'by',
 '15',
 '%',
 'in',
 'United',
 'States',
 'of',
 'America']

# POS Tagging

In [7]:
pos_tags = nltk.pos_tag(words)
pos_tags

[('Apple', 'NNP'),
 ('is', 'VBZ'),
 ('aiming', 'VBG'),
 ('to', 'TO'),
 ('buy', 'VB'),
 ('a', 'DT'),
 ('India', 'NNP'),
 ("'s", 'POS'),
 ('startup', 'NN'),
 ('Xolo', 'NNP'),
 ('INC', 'NNP'),
 ('for', 'IN'),
 ('$', '$'),
 ('6', 'CD'),
 ('million', 'CD'),
 ('USD', 'NNP'),
 (',', ','),
 ('which', 'WDT'),
 ('increased', 'VBD'),
 ('the', 'DT'),
 ('stock', 'NN'),
 ('rate', 'NN'),
 ('of', 'IN'),
 ('AAPL', 'NNP'),
 ('by', 'IN'),
 ('15', 'CD'),
 ('%', 'NN'),
 ('in', 'IN'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('of', 'IN'),
 ('America', 'NNP')]

In [8]:
# refrerring for tags
nltk.help.upenn_tagset("VBZ")

VBZ: verb, present tense, 3rd person singular
    bases reconstructs marks mixes displeases seals carps weaves snatches
    slumps stretches authorizes smolders pictures emerges stockpiles
    seduces fizzes uses bolsters slaps speaks pleads ...


# ne_chunk

In [9]:
chunks = nltk.ne_chunk(pos_tags)
for i in chunks:
    print(i)

(GPE Apple/NNP)
('is', 'VBZ')
('aiming', 'VBG')
('to', 'TO')
('buy', 'VB')
('a', 'DT')
(GPE India/NNP)
("'s", 'POS')
('startup', 'NN')
(PERSON Xolo/NNP INC/NNP)
('for', 'IN')
('$', '$')
('6', 'CD')
('million', 'CD')
('USD', 'NNP')
(',', ',')
('which', 'WDT')
('increased', 'VBD')
('the', 'DT')
('stock', 'NN')
('rate', 'NN')
('of', 'IN')
(ORGANIZATION AAPL/NNP)
('by', 'IN')
('15', 'CD')
('%', 'NN')
('in', 'IN')
(GPE United/NNP States/NNPS)
('of', 'IN')
(GPE America/NNP)


In [10]:
chunks_ne = nltk.ne_chunk(pos_tags,binary=True)
for i in chunks_ne:
    print(i)

(NE Apple/NNP)
('is', 'VBZ')
('aiming', 'VBG')
('to', 'TO')
('buy', 'VB')
('a', 'DT')
(NE India/NNP)
("'s", 'POS')
('startup', 'NN')
(NE Xolo/NNP INC/NNP)
('for', 'IN')
('$', '$')
('6', 'CD')
('million', 'CD')
('USD', 'NNP')
(',', ',')
('which', 'WDT')
('increased', 'VBD')
('the', 'DT')
('stock', 'NN')
('rate', 'NN')
('of', 'IN')
(NE AAPL/NNP)
('by', 'IN')
('15', 'CD')
('%', 'NN')
('in', 'IN')
(NE United/NNP States/NNPS)
('of', 'IN')
(NE America/NNP)


# binary = False

In [11]:
entities = []
labels = []

for chunk in chunks:
    if hasattr(chunk,'label'):
        entities.append(' '.join(word[0] for word in chunk))
        labels.append(chunk.label())
        
entities_labels = list(set(zip(entities,labels)))
df = pd.DataFrame(entities_labels)
df.columns = ['Entities','Labels']
df

Unnamed: 0,Entities,Labels
0,United States,GPE
1,AAPL,ORGANIZATION
2,Xolo INC,PERSON
3,Apple,GPE
4,America,GPE
5,India,GPE


# binary = True

In [31]:
entities_ne = []
labels_ne = []

for chunk in chunks_ne:
    if hasattr(chunk,'label'):
        entities_ne.append(' '.join(word[0] for word in chunk))
        labels_ne.append(chunk.label())
        
entities_labels_ne = list(set(zip(entities_ne,labels_ne)))
df_ne = pd.DataFrame(entities_labels_ne)
df_ne.columns = ['Entities','Labels']
df_ne

Unnamed: 0,Entities,Labels
0,Xolo INC,NE
1,AAPL,NE
2,Apple,NE
3,America,NE
4,United States,NE
5,India,NE


# Spacy

In [13]:
import spacy,nltk
nlp = spacy.load("en_core_web_sm")

In [14]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [30]:
nlp.pipe_labels['ner']

['CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART']

In [15]:
doc = nlp(text)

In [18]:
for ents in doc.ents:
    print(ents,"-->",ents.label_,"-->",spacy.explain(ents.label_))

Apple --> ORG --> Companies, agencies, institutions, etc.
India --> GPE --> Countries, cities, states
Xolo INC --> ORG --> Companies, agencies, institutions, etc.
$6 million --> MONEY --> Monetary values, including unit
15% --> PERCENT --> Percentage, including "%"
United States of America --> GPE --> Countries, cities, states


In [None]:
entities_labels_ne = list(set(zip(entities_ne,labels_ne)))

In [25]:
entities_doc = []
labels_doc = []
description = []
start_position = []
end_position = []

for ents in doc.ents:
    entities_doc.append(ents)
    labels_doc.append(ents.label_)
    description.append(spacy.explain(ents.label_))
    start_position.append(ents.start_char)
    end_position.append(ents.end_char)
    
ner_spacy = list(zip(entities_doc,labels_doc,description,start_position,end_position))
df_spacy = pd.DataFrame(ner_spacy)
df_spacy.columns = ['Entities','Labels','Description','Starting Position','Ending Position']
df_spacy

Unnamed: 0,Entities,Labels,Description,Starting Position,Ending Position
0,(Apple),ORG,"Companies, agencies, institutions, etc.",1,6
1,(India),GPE,"Countries, cities, states",26,31
2,"(Xolo, INC)",ORG,"Companies, agencies, institutions, etc.",42,50
3,"($, 6, million)",MONEY,"Monetary values, including unit",55,65
4,"(15, %)",PERCENT,"Percentage, including ""%""",114,117
5,"(United, States, of, America)",GPE,"Countries, cities, states",121,145


In [26]:
from spacy import displacy

In [28]:
displacy.render(doc,style='ent')