# **NER Using NLTK**

In [1]:
example_document = '''Google began in January 1996 as a research project by Larry Page and Sergey Brin when they were both PhD students at Stanford University in Stanford, California.'''

In [2]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [3]:
def nltk_ner(document):
  return {(' '.join(c[0] for c in chunk), chunk.label() ) for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(document))) if hasattr(chunk, 'label') }

In [4]:
nltk_ner(example_document)

{('California', 'GPE'),
 ('Google', 'PERSON'),
 ('Larry Page', 'PERSON'),
 ('PhD', 'ORGANIZATION'),
 ('Sergey Brin', 'PERSON'),
 ('Stanford', 'GPE'),
 ('Stanford University', 'ORGANIZATION')}

# **NER Using Spacy**

In [5]:
!python3 -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9MB)
[K     |████████████████████████████████| 827.9MB 1.2MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-cp36-none-any.whl size=829180944 sha256=f1387185a3c0165e642ea01fe6d6aed9882914f0e1f578899bf4970d34197f91
  Stored in directory: /tmp/pip-ephem-wheel-cache-oo2x31wk/wheels/2a/c1/a6/fc7a877b1efca9bc6a089d6f506f16d3868408f9ff89f8dbfc
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [6]:
import spacy
sp_lg = spacy.load("en_core_web_sm")

In [7]:
def spacy_large_ner(document):
  return {(ent.text.strip(), ent.label_) for ent in sp_lg(document).ents}

In [8]:
spacy_large_ner(example_document)

{('California', 'GPE'),
 ('Google', 'ORG'),
 ('January 1996', 'DATE'),
 ('Larry Page', 'PERSON'),
 ('PhD', 'WORK_OF_ART'),
 ('Sergey Brin', 'PERSON'),
 ('Stanford', 'GPE'),
 ('Stanford University', 'ORG')}

# **NER with Standford Core NLP**

In [9]:
!pip3 install nltk==3.2.4

Collecting nltk==3.2.4
[?25l  Downloading https://files.pythonhosted.org/packages/cd/c2/858e0708b497116ae45cf5c6b1f66984ac60729c61e49df6c1c0b808d1e4/nltk-3.2.4.tar.gz (1.2MB)
[K     |████████████████████████████████| 1.2MB 2.8MB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.2.4-cp36-none-any.whl size=1367704 sha256=7beaf305778e1cb56e049374b1392f29a2863bd9eff3a4ce5101339b5c937747
  Stored in directory: /root/.cache/pip/wheels/36/f1/5c/f667347d86a3a534ba4c0127eed4389f929916e3ec88bb461a
Successfully built nltk
Installing collected packages: nltk
  Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.2.4


In [10]:
!wget http://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip
!unzip stanford-ner-2015-04-20.zip 

--2020-10-16 06:26:53--  http://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip [following]
--2020-10-16 06:26:54--  https://nlp.stanford.edu/software/stanford-ner-2015-04-20.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 176961718 (169M) [application/zip]
Saving to: ‘stanford-ner-2015-04-20.zip’


2020-10-16 06:27:17 (7.32 MB/s) - ‘stanford-ner-2015-04-20.zip’ saved [176961718/176961718]

Archive:  stanford-ner-2015-04-20.zip
   creating: stanford-ner-2015-04-20/
  inflating: stanford-ner-2015-04-20/README.txt  
  inflating: stanford-ner-2015-04-20/ner-gui.bat  
  inflating: stanford-ner-2015-04-20/build.xml  
  inflating: 

In [11]:
from nltk.tag.stanford import StanfordNERTagger
jar = "stanford-ner-2015-04-20/stanford-ner-3.5.2.jar"
model = "stanford-ner-2015-04-20/classifiers/" 
st_3class = StanfordNERTagger(model + "english.all.3class.distsim.crf.ser.gz", jar, encoding='utf8') 
st_4class = StanfordNERTagger(model + "english.conll.4class.distsim.crf.ser.gz", jar, encoding='utf8') 
st_7class = StanfordNERTagger(model + "english.muc.7class.distsim.crf.ser.gz", jar, encoding='utf8') 

The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  '-tokenizerFactory',
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  '-tokenizerFactory',
The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.tag.corenlp.CoreNLPPOSTagger[0m or [91mnltk.tag.corenlp.CoreNLPNERTagger[0m instead.
  '-tokenizerFactory',


In [12]:
def stanford_ner(document,model):
  if model == 1:
    return [(entity,tag) for entity,tag in st_3class.tag(document.split()) if tag != "O"]
  elif model == 2:
    return [(entity,tag) for entity,tag in st_4class.tag(document.split()) if tag != "O"]
  elif model == 3:
    return [(entity,tag) for entity,tag in st_7class.tag(document.split()) if tag != "O"]

In [13]:
stanford_ner(example_document,model=1)

[('Google', 'ORGANIZATION'),
 ('Larry', 'PERSON'),
 ('Page', 'PERSON'),
 ('Sergey', 'PERSON'),
 ('Brin', 'PERSON'),
 ('Stanford', 'ORGANIZATION'),
 ('University', 'ORGANIZATION'),
 ('Stanford,', 'LOCATION'),
 ('California.', 'LOCATION')]

In [14]:
stanford_ner(example_document,model=2)

[('Google', 'ORGANIZATION'),
 ('Larry', 'PERSON'),
 ('Page', 'PERSON'),
 ('Sergey', 'PERSON'),
 ('Brin', 'PERSON'),
 ('Stanford', 'ORGANIZATION'),
 ('University', 'ORGANIZATION')]

In [15]:
stanford_ner(example_document,model=3)

[('January', 'DATE'),
 ('1996', 'DATE'),
 ('Larry', 'PERSON'),
 ('Page', 'PERSON'),
 ('Sergey', 'PERSON'),
 ('Brin', 'PERSON'),
 ('Stanford', 'ORGANIZATION'),
 ('University', 'ORGANIZATION'),
 ('Stanford,', 'LOCATION'),
 ('California.', 'LOCATION')]