# Extract Named Entities
Using [spaCy](https://spacy.io), identify and extract a list of named entities. Create a dataset with one binary attribute per entity, indicating whether each of the top 500 most common named entities is present in a comment.

In [132]:
# One-time setup, run from a shell: pip install spacy && python -m spacy download en
import spacy
# Optional check of the installed model, left disabled:
#spacy.info('en')
# Load the English pipeline once; the `nlp` callable is reused by the cells below.
# NOTE(review): newer spaCy releases replace the 'en' shortcut with full package
# names such as 'en_core_web_sm' — confirm against the installed spaCy version.
nlp = spacy.load('en')


In [57]:
# Load the labelled comments: one dict per CSV row, keyed by the header names.
import csv

# newline='' hands line-ending handling to the csv module, so line breaks
# embedded inside quoted comment text are parsed correctly.
# NOTE(review): no encoding is given, so the platform default is used — confirm
# the file's encoding and pass it explicitly if it is known.
with open('data/comments-labelled.csv', 'r', newline='') as f:
    reader = csv.DictReader(f)
    comments = list(reader)
len(comments)

146363

In [133]:
# Count the lemmatized named entities mentioned in the comments
from collections import Counter

# Number of leading comments to scan. Set to None to scan every comment
# (each one goes through the full spaCy pipeline, so that run is slow).
# NOTE(review): the "top 500" selection further down is built from these
# counts, so a small sample here limits which entities can be selected.
MAX_COMMENTS = 100

c = Counter()  # (lemma, label) -> number of mentions in the scanned comments
for comment in comments[:MAX_COMMENTS]:
    doc = nlp(comment['comment'])
    spans = list(doc.ents)  # walk the recognised entities once, reuse below
    # Attach the raw (span, label, lemma) triples to the comment record.
    comment['ents'] = [(ent, ent.label_, ent.lemma_) for ent in spans]
    # Every mention counts, so an entity repeated within a comment adds up.
    c.update((ent.lemma_, ent.label_) for ent in spans)
c.most_common(25)

[(('utah', 'GPE'), 40),
 (('bears ears national monument', 'ORG'), 19),
 (('america', 'GPE'), 17),
 (('national monument', 'ORG'), 16),
 (('one', 'CARDINAL'), 15),
 (('american', 'NORP'), 14),
 (('national monuments', 'ORG'), 13),
 (('bears ears', 'ORG'), 13),
 (('trump', 'PERSON'), 12),
 (('monument', 'ORG'), 11),
 (('the antiquities act', 'LAW'), 10),
 (('americans', 'NORP'), 10),
 (('native american', 'NORP'), 9),
 (('future generation', 'DATE'), 8),
 (('the united states', 'GPE'), 7),
 (('the national monument', 'ORG'), 7),
 (('national parks', 'ORG'), 6),
 (('zinke', 'PERSON'), 6),
 (('first', 'ORDINAL'), 6),
 (('two', 'CARDINAL'), 5),
 (('1996', 'DATE'), 4),
 (('san juan county', 'GPE'), 4),
 (('blm', 'ORG'), 4),
 (('generation', 'DATE'), 4),
 (('the bears ears', 'ORG'), 4)]

In [120]:
# Extract the 500 most frequent named entities of the relevant types.
# Only these entity labels become features; other labels seen in the counts
# (CARDINAL, DATE, ORDINAL, ...) are skipped.
types = ['PERSON', 'LAW', 'ORG', 'NORP', 'GPE', 'WORK_OF_ART']

# Filter by label *before* truncating, so up to 500 entities survive instead
# of "whatever is left of the top 500 after filtering". Each lemma is kept once
# even when it was counted under more than one label.
top_500 = []
seen_lemmas = set()
for (lemma, label), _count in c.most_common():
    if label not in types or lemma in seen_lemmas:
        continue
    seen_lemmas.add(lemma)
    top_500.append(lemma)
    if len(top_500) == 500:
        break


In [135]:
# Build a one-row-per-comment indicator table: a column holds True when the
# comment mentions that top entity and False otherwise.

import pandas as pd

top_500_set = set(top_500)
named_ents = []

for comment in comments:
    # Distinct entity lemmas recognised in this comment.
    lemmas = {ent.lemma_ for ent in nlp(comment['comment']).ents}
    # Keep only the lemmas that belong to the selected top entities.
    row = dict.fromkeys(lemmas & top_500_set, True)
    row['document_id'] = comment['document_id']
    named_ents.append(row)

# Entities a comment does not mention are missing from its row; mark them False.
ne_df = pd.DataFrame(named_ents).fillna(value=False)

In [139]:
# Convert the True/False indicators to 1/0. Only the entity columns are cast,
# so document_id is left exactly as loaded.
indicator_cols = ne_df.columns.drop('document_id')
ne_df2 = ne_df.astype({col: int for col in indicator_cols})
# Preview goes last: only a cell's final expression is displayed.
ne_df2.head()

['document_id',
 '',
 '-PRON- national monuments',
 'a national monument',
 'act',
 'administration',
 'african',
 'alaska',
 'albuquerque',
 'all national monuments',
 'ama',
 'america',
 'american',
 'americans',
 'americas',
 'antiquities act',
 'any national monument',
 'arches',
 'arizona',
 'arlington',
 'atlanta',
 'atlantic puffins',
 'atv',
 'austin',
 'az',
 'backcountry hunters & anglers',
 'basin and range',
 'basin and range ( nv',
 'bear',
 'bear ear',
 'bear ears',
 'bear ears national monument',
 'bears',
 'bears ear',
 'bears ears',
 'bears ears (',
 'bears ears and',
 'bears ears monument',
 'bears ears national',
 'bears ears national monument',
 "bears ears national monument 's",
 'bears ears national monuments',
 'bears ears nm',
 'benm',
 'berkeley',
 'berryessa snow',
 'berryessa snow mountain',
 'bishop',
 'blanding',
 'blm',
 'bluff',
 'boise',
 'boulder',
 'bozeman',
 'brooklyn',
 'browns canyon',
 'browns canyon national monument',
 'butte',
 'ca',
 'californ

In [140]:
# Move document_id to the front of the column order, then export to CSV
cols = list(ne_df.columns)
cols.remove('document_id')
cols.insert(0, 'document_id')
ne_df2 = ne_df2.loc[:, cols]
ne_df2.to_csv('data/named-entities.csv')

['document_id',
 '',
 '-PRON- national monuments',
 'a national monument',
 'act',
 'administration',
 'african',
 'alaska',
 'albuquerque',
 'all national monuments',
 'ama',
 'america',
 'american',
 'americans',
 'americas',
 'antiquities act',
 'any national monument',
 'arches',
 'arizona',
 'arlington',
 'atlanta',
 'atlantic puffins',
 'atv',
 'austin',
 'az',
 'backcountry hunters & anglers',
 'basin and range',
 'basin and range ( nv',
 'bear',
 'bear ear',
 'bear ears',
 'bear ears national monument',
 'bears',
 'bears ear',
 'bears ears',
 'bears ears (',
 'bears ears and',
 'bears ears monument',
 'bears ears national',
 'bears ears national monument',
 "bears ears national monument 's",
 'bears ears national monuments',
 'bears ears nm',
 'benm',
 'berkeley',
 'berryessa snow',
 'berryessa snow mountain',
 'bishop',
 'blanding',
 'blm',
 'bluff',
 'boise',
 'boulder',
 'bozeman',
 'brooklyn',
 'browns canyon',
 'browns canyon national monument',
 'butte',
 'ca',
 'californ

In [108]:
# template for matching the monument as a named entity (disabled; kept for reference)
# from spacy.attrs import IS_PUNCT, LOWER, ORTH
# nlp.matcher.add(
#     'BearsEars',
#     'GPE',
#     {},
#     [
#         [
#             {ORTH: 'Bears'},
#             {ORTH: 'Ears'},
#             {ORTH: 'National'},
#             {ORTH: 'Monument'}
#         ],
#         [
#             {LOWER: 'bears'},
#             {LOWER: 'ears'},
#             {LOWER: 'national', 'OP': '?'},
#             {LOWER: 'monument', 'OP': '?'}
#         ]
#     ]    
# )

# Grouping
After exporting, the ~500 named entities were manually grouped down to ~250 named entities, maintaining the binary coding for each.