# Named Entity Recognition in NLTK



In [1]:
from nltk import ne_chunk
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk.chunk import tree2conlltags

# Sample sentence used as input by the example cells below
text = "Zuzanna Kocur earned 42 dollars selling 2 shares of Google in 2018"


In [2]:
def print_prediction(text):
    """Print every stage of the NLTK named-entity pipeline for ``text``.

    Three sections are written to stdout, each followed by a blank line:
    the token list, the part-of-speech pairs, and the chunk tree produced
    by ``ne_chunk``. The function returns nothing.
    """
    # Stage 1: split the raw string into tokens
    words = word_tokenize(text)
    print("Tokens:", words, "", sep="\n")

    # Stage 2: attach a part-of-speech tag to every token
    tagged_words = pos_tag(words)
    print("Part of speech tagging:", tagged_words, "", sep="\n")

    # Stage 3: group tagged tokens into named-entity chunks
    # (ne_chunk is called with its default settings)
    chunk_tree = ne_chunk(tagged_words)
    print("Entities and parts of speech:", chunk_tree, "", sep="\n")

In [3]:
# Show tokens, POS tags and the entity tree for the sample sentence
print_prediction(text)


Tokens:
['Zuzanna', 'Kocur', 'earned', '42', 'dollars', 'selling', '2', 'shares', 'of', 'Google', 'in', '2018']

Part of speech tagging:
[('Zuzanna', 'NNP'), ('Kocur', 'NNP'), ('earned', 'VBD'), ('42', 'CD'), ('dollars', 'NNS'), ('selling', 'VBG'), ('2', 'CD'), ('shares', 'NNS'), ('of', 'IN'), ('Google', 'NNP'), ('in', 'IN'), ('2018', 'CD')]

Entities and parts of speech:
(S
  (PERSON Zuzanna/NNP)
  (PERSON Kocur/NNP)
  earned/VBD
  42/CD
  dollars/NNS
  selling/VBG
  2/CD
  shares/NNS
  of/IN
  (GPE Google/NNP)
  in/IN
  2018/CD)



In [None]:
# "(PERSON Zuzanna/NNP)" is an NLTK Tree object; its label (here PERSON) is the entity type.
# Anything that was not marked as an entity stays a plain (token, tag) tuple, printed as e.g. "earned/VBD".

In [4]:
from nltk.tree import Tree

def anonymize_names(text):
    """Replace every PERSON entity in ``text`` with ``%SECRET%``.

    The text is tokenized, POS-tagged and chunked with ``ne_chunk``. Each
    chunk labelled ``PERSON`` is replaced by a single ``%SECRET%``
    placeholder, no matter how many tokens it spans. All other tokens,
    including every token of a multi-word non-PERSON entity, are kept.

    Args:
        text: The sentence to anonymize.

    Returns:
        A string in which every emitted item is followed by one space, so
        a non-empty result ends with a trailing space. The spacing of the
        original text is not restored.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    chunks = ne_chunk(tagged_tokens)

    pieces = []
    for chunk in chunks:
        if isinstance(chunk, Tree):
            # a Tree means the chunker marked this span as an entity
            if chunk.label() == 'PERSON':
                pieces.append("%SECRET%")
            else:
                # leaves are (token, tag) pairs; keep *all* tokens of the
                # entity, not only the first one
                pieces.extend(token for token, _tag in chunk.leaves())
        else:
            # a plain (token, tag) tuple: not an entity, keep the token
            pieces.append(chunk[0])

    # same layout as before: each item followed by a single space
    return "".join(piece + " " for piece in pieces)


In [5]:
# Mask the person names in the sample sentence and show the result
anonymized = anonymize_names(text)
print(anonymized)

%SECRET% %SECRET% earned 42 dollars selling 2 shares of Google in 2018 


In [6]:
# Here the two-word name comes back as a single %SECRET% (see the output below)
print(anonymize_names("There was Barack Obama singing"))

There was %SECRET% singing 
