[Reference](https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da)

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Information extraction

In [33]:
# Read the sample comment text. The context manager closes the file handle
# as soon as the read finishes (or fails); the bare open() call left it open.
# errors='ignore' makes decoding skip bytes that are not valid UTF-8.
with open("commentsample.txt", encoding="utf8", errors='ignore') as f:
    # Remove every '*' character and every newline so the text is one line.
    ex = f.read().replace("*","").replace("\n","")
ex

'/  UNIX password, and DES, encryption.  By Tom Truscott, trt@rti.rti.org,  from algorithms by Robert W. Baldwin and James Gillogly.   References:  "Mathematical Cryptology for Computer Scientists and Mathematicians,"  by Wayne Patterson, 1987, ISBN 0-8476-7438-X.   "Password Security: A Case History," R. Morris and Ken Thompson,  Communications of the ACM, vol. 22, pp. 594-597, Nov. 1979.   "DES will be Totally Insecure within Ten Years," M.E. Hellman,  IEEE Spectrum, vol. 16, pp. 32-39, July 1979. /'

In [34]:
def preprocess(sent):
    """Tokenize raw text and attach a part-of-speech tag to every token.

    Args:
        sent: The text to process, as a single string.

    Returns:
        The result of ``nltk.pos_tag`` applied to the ``nltk.word_tokenize``
        tokens of ``sent`` (a list of ``(token, tag)`` pairs).
    """
    tokens = nltk.word_tokenize(sent)
    return nltk.pos_tag(tokens)

In [35]:
# Tokenize and POS-tag the cleaned sample text; the bare name on the last
# line makes the notebook display the resulting (token, tag) list.
sent = preprocess(ex)
sent

[('/', 'JJ'),
 ('UNIX', 'NNP'),
 ('password', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('DES', 'NNP'),
 (',', ','),
 ('encryption', 'NN'),
 ('.', '.'),
 ('By', 'IN'),
 ('Tom', 'NNP'),
 ('Truscott', 'NNP'),
 (',', ','),
 ('trt', 'NN'),
 ('@', 'NNP'),
 ('rti.rti.org', 'NN'),
 (',', ','),
 ('from', 'IN'),
 ('algorithms', 'VBN'),
 ('by', 'IN'),
 ('Robert', 'NNP'),
 ('W.', 'NNP'),
 ('Baldwin', 'NNP'),
 ('and', 'CC'),
 ('James', 'NNP'),
 ('Gillogly', 'NNP'),
 ('.', '.'),
 ('References', 'NNS'),
 (':', ':'),
 ('``', '``'),
 ('Mathematical', 'NNP'),
 ('Cryptology', 'NNP'),
 ('for', 'IN'),
 ('Computer', 'NNP'),
 ('Scientists', 'NNPS'),
 ('and', 'CC'),
 ('Mathematicians', 'NNPS'),
 (',', ','),
 ("''", "''"),
 ('by', 'IN'),
 ('Wayne', 'NNP'),
 ('Patterson', 'NNP'),
 (',', ','),
 ('1987', 'CD'),
 (',', ','),
 ('ISBN', 'NNP'),
 ('0-8476-7438-X', 'NNP'),
 ('.', '.'),
 ('``', '``'),
 ('Password', 'NNP'),
 ('Security', 'NNP'),
 (':', ':'),
 ('A', 'DT'),
 ('Case', 'NNP'),
 ('History', 'NNP'),
 (',', ','),


In [36]:
# Chunk grammar for nltk.RegexpParser: an NP chunk is an optional determiner
# (DT), then any number of adjectives (JJ), then one noun tagged NN.
# Proper nouns (NNP/NNPS) are not covered by this rule.
pattern = 'NP: {<DT>?<JJ>*<NN>}'

# Chunking

In [38]:
# Build a regular-expression chunker from the NP grammar in `pattern`.
cp = nltk.RegexpParser(pattern)
# Apply it to the POS-tagged tokens; `cs` is the resulting chunk tree, in
# which tokens matched by the grammar are grouped under NP nodes.
cs = cp.parse(sent)
print (cs)

(S
  //JJ
  UNIX/NNP
  (NP password/NN)
  ,/,
  and/CC
  DES/NNP
  ,/,
  (NP encryption/NN)
  ./.
  By/IN
  Tom/NNP
  Truscott/NNP
  ,/,
  (NP trt/NN)
  @/NNP
  (NP rti.rti.org/NN)
  ,/,
  from/IN
  algorithms/VBN
  by/IN
  Robert/NNP
  W./NNP
  Baldwin/NNP
  and/CC
  James/NNP
  Gillogly/NNP
  ./.
  References/NNS
  :/:
  ``/``
  Mathematical/NNP
  Cryptology/NNP
  for/IN
  Computer/NNP
  Scientists/NNPS
  and/CC
  Mathematicians/NNPS
  ,/,
  ''/''
  by/IN
  Wayne/NNP
  Patterson/NNP
  ,/,
  1987/CD
  ,/,
  ISBN/NNP
  0-8476-7438-X/NNP
  ./.
  ``/``
  Password/NNP
  Security/NNP
  :/:
  A/DT
  Case/NNP
  History/NNP
  ,/,
  ''/''
  R./NNP
  Morris/NNP
  and/CC
  Ken/NNP
  Thompson/NNP
  ,/,
  Communications/NNP
  of/IN
  the/DT
  ACM/NNP
  ,/,
  (NP vol/NN)
  ./.
  22/CD
  ,/,
  (NP pp/NN)
  ./.
  594-597/JJ
  ,/,
  Nov./NNP
  1979/CD
  ./.
  ``/``
  DES/NNP
  will/MD
  be/VB
  Totally/RB
  Insecure/NNP
  within/IN
  Ten/NNP
  Years/NNP
  ,/,
  ''/''
  M.E/NNP
  ./.
  Hellman/NNP
  ,/

In [39]:
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples:
# 'B-NP' marks the first token of an NP chunk, 'O' a token outside any chunk.
iob_tagged = tree2conlltags(cs)
pprint (iob_tagged)

[('/', 'JJ', 'O'),
 ('UNIX', 'NNP', 'O'),
 ('password', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('and', 'CC', 'O'),
 ('DES', 'NNP', 'O'),
 (',', ',', 'O'),
 ('encryption', 'NN', 'B-NP'),
 ('.', '.', 'O'),
 ('By', 'IN', 'O'),
 ('Tom', 'NNP', 'O'),
 ('Truscott', 'NNP', 'O'),
 (',', ',', 'O'),
 ('trt', 'NN', 'B-NP'),
 ('@', 'NNP', 'O'),
 ('rti.rti.org', 'NN', 'B-NP'),
 (',', ',', 'O'),
 ('from', 'IN', 'O'),
 ('algorithms', 'VBN', 'O'),
 ('by', 'IN', 'O'),
 ('Robert', 'NNP', 'O'),
 ('W.', 'NNP', 'O'),
 ('Baldwin', 'NNP', 'O'),
 ('and', 'CC', 'O'),
 ('James', 'NNP', 'O'),
 ('Gillogly', 'NNP', 'O'),
 ('.', '.', 'O'),
 ('References', 'NNS', 'O'),
 (':', ':', 'O'),
 ('``', '``', 'O'),
 ('Mathematical', 'NNP', 'O'),
 ('Cryptology', 'NNP', 'O'),
 ('for', 'IN', 'O'),
 ('Computer', 'NNP', 'O'),
 ('Scientists', 'NNPS', 'O'),
 ('and', 'CC', 'O'),
 ('Mathematicians', 'NNPS', 'O'),
 (',', ',', 'O'),
 ("''", "''", 'O'),
 ('by', 'IN', 'O'),
 ('Wayne', 'NNP', 'O'),
 ('Patterson', 'NNP', 'O'),
 (',', ',', 'O'),

In [41]:
# Run NLTK's named-entity chunker over the POS-tagged tokens. The notebook's
# preprocess() helper already performs the tokenize -> pos_tag pipeline, so
# call it instead of repeating those two steps inline.
ne_tree = nltk.ne_chunk(preprocess(ex))
print(ne_tree)

(S
  //JJ
  (ORGANIZATION UNIX/NNP)
  password/NN
  ,/,
  and/CC
  (ORGANIZATION DES/NNP)
  ,/,
  encryption/NN
  ./.
  By/IN
  (PERSON Tom/NNP Truscott/NNP)
  ,/,
  trt/NN
  @/NNP
  rti.rti.org/NN
  ,/,
  from/IN
  algorithms/VBN
  by/IN
  (PERSON Robert/NNP W./NNP Baldwin/NNP)
  and/CC
  (PERSON James/NNP Gillogly/NNP)
  ./.
  References/NNS
  :/:
  ``/``
  Mathematical/NNP
  Cryptology/NNP
  for/IN
  (ORGANIZATION Computer/NNP Scientists/NNPS)
  and/CC
  Mathematicians/NNPS
  ,/,
  ''/''
  by/IN
  (PERSON Wayne/NNP Patterson/NNP)
  ,/,
  1987/CD
  ,/,
  (ORGANIZATION ISBN/NNP)
  0-8476-7438-X/NNP
  ./.
  ``/``
  (PERSON Password/NNP Security/NNP)
  :/:
  A/DT
  Case/NNP
  History/NNP
  ,/,
  ''/''
  R./NNP
  Morris/NNP
  and/CC
  (PERSON Ken/NNP Thompson/NNP)
  ,/,
  (ORGANIZATION Communications/NNP)
  of/IN
  the/DT
  (ORGANIZATION ACM/NNP)
  ,/,
  vol/NN
  ./.
  22/CD
  ,/,
  pp/NN
  ./.
  594-597/JJ
  ,/,
  Nov./NNP
  1979/CD
  ./.
  ``/``
  DES/NNP
  will/MD
  be/VB
  Totally/RB

# Named entity recognition with spaCy

In [47]:
# Show the text analysed in this section. `doc` is only created further down
# (by `doc = nlp(ex)`), so referencing it here raises NameError when the
# notebook is run top to bottom on a fresh kernel. Printing the source string
# `ex` displays the same text without depending on a later cell.
print(ex)

/  UNIX password, and DES, encryption.  By Tom Truscott, trt@rti.rti.org,  from algorithms by Robert W. Baldwin and James Gillogly.   References:  "Mathematical Cryptology for Computer Scientists and Mathematicians,"  by Wayne Patterson, 1987, ISBN 0-8476-7438-X.   "Password Security: A Case History," R. Morris and Ken Thompson,  Communications of the ACM, vol. 22, pp. 594-597, Nov. 1979.   "DES will be Totally Insecure within Ten Years," M.E. Hellman,  IEEE Spectrum, vol. 16, pp. 32-39, July 1979. /

In [43]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
# Load the en_core_web_sm spaCy pipeline from its installed package; `nlp`
# is the callable used below to turn text into a processed Doc.
nlp = en_core_web_sm.load()

In [45]:
# Run the spaCy pipeline over the sample text, then list every detected
# entity span as a (text, label) pair.
doc = nlp(ex)
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('Tom Truscott', 'PERSON'),
 ('Robert W. Baldwin', 'PERSON'),
 ('James Gillogly', 'PERSON'),
 (' "Mathematical Cryptology for Computer Scientists', 'ORG'),
 ('Mathematicians', 'NORP'),
 ('Wayne Patterson', 'PERSON'),
 ('1987', 'DATE'),
 ('  "Password Security', 'WORK_OF_ART'),
 ('Case History', 'ORG'),
 ('R. Morris', 'PERSON'),
 ('Ken Thompson', 'PERSON'),
 (' Communications', 'ORG'),
 ('ACM', 'ORG'),
 ('22', 'CARDINAL'),
 ('594', 'CARDINAL'),
 ('Nov. 1979', 'DATE'),
 ('Ten Years', 'DATE'),
 ('M.E. Hellman', 'ORG'),
 (' IEEE Spectrum', 'ORG'),
 ('16', 'CARDINAL'),
 ('32', 'CARDINAL'),
 ('July 1979', 'DATE')]


In [46]:
# Token-level view: each token with its IOB code ('B' begins an entity,
# 'I' continues it, 'O' is outside any entity) and its entity type, which
# is an empty string for tokens outside an entity.
pprint([(token, token.ent_iob_, token.ent_type_) for token in doc])

[(/, 'O', ''),
 ( , 'O', ''),
 (UNIX, 'O', ''),
 (password, 'O', ''),
 (,, 'O', ''),
 (and, 'O', ''),
 (DES, 'O', ''),
 (,, 'O', ''),
 (encryption, 'O', ''),
 (., 'O', ''),
 ( , 'O', ''),
 (By, 'O', ''),
 (Tom, 'B', 'PERSON'),
 (Truscott, 'I', 'PERSON'),
 (,, 'O', ''),
 (trt@rti.rti.org, 'O', ''),
 (,, 'O', ''),
 ( , 'O', ''),
 (from, 'O', ''),
 (algorithms, 'O', ''),
 (by, 'O', ''),
 (Robert, 'B', 'PERSON'),
 (W., 'I', 'PERSON'),
 (Baldwin, 'I', 'PERSON'),
 (and, 'O', ''),
 (James, 'B', 'PERSON'),
 (Gillogly, 'I', 'PERSON'),
 (., 'O', ''),
 (  , 'O', ''),
 (References, 'O', ''),
 (:, 'O', ''),
 ( , 'B', 'ORG'),
 (", 'I', 'ORG'),
 (Mathematical, 'I', 'ORG'),
 (Cryptology, 'I', 'ORG'),
 (for, 'I', 'ORG'),
 (Computer, 'I', 'ORG'),
 (Scientists, 'I', 'ORG'),
 (and, 'O', ''),
 (Mathematicians, 'B', 'NORP'),
 (,, 'O', ''),
 (", 'O', ''),
 ( , 'O', ''),
 (by, 'O', ''),
 (Wayne, 'B', 'PERSON'),
 (Patterson, 'I', 'PERSON'),
 (,, 'O', ''),
 (1987, 'B', 'DATE'),
 (,, 'O', ''),
 (ISBN, 'O', ''),


In [48]:
# Render the recognised entities inline in the notebook. `doc` is already a
# processed spaCy Doc, so pass it directly instead of converting it back to
# a string and running the whole pipeline a second time.
displacy.render(doc, jupyter=True, style='ent')