# Named Entity Extraction with Textacy

## Setup
For most uses of textacy, language-specific model data for spacy must first be downloaded. Follow the directions [here](https://spacy.io/docs/usage/models).

```bash
$ pip install textacy
```

Experiments with semi-supervised and unsupervised learning for entity clustering were unimpressive and not successful.

In [1]:
import textacy
import spacy
from pprint import pprint
import random
nlp = spacy.load('en')

In [2]:
# import the dataset
import csv
with open('data/comments.csv', 'r') as f:
    reader = csv.DictReader(f)
    comments = [comment for comment in reader]
len(comments)

438871

In [3]:
# to improve performance for development, take a random sample of 25k comments
# comments = random.sample(comments, 25000)

[OrderedDict([('documentId', 'DOI-2017-0002-0992'),
              ('postedDate', '2017-05-12T00:00:00-04:00'),
              ('attachmentCount', '0'),
              ('commentText',
               'It is unconscionable to revert National Monuments back to land that can be mined and desecrated. This land was put aside for the PEOPLE OF THE USA, not for corporations to make a profit from. These lands are imperative to preserve wildlife as well as historic artifacts. \n\nBesides the fact that there is no grounds to do this, it would not only be illegal but a horrible idea! \n\nPROTECT THE LAND NOT YOUR WALLET!!!')]),
 OrderedDict([('documentId', 'DOI-2017-0002-0993'),
              ('postedDate', '2017-05-12T00:00:00-04:00'),
              ('attachmentCount', '0'),
              ('commentText',
               'Please do not remove protection from any of these monuments. It is more vital than ever to make a real effort to preserve fragile life and ecosystems. There is no reason to not keep 

# Train SpaCy
Define rules for the monument names.

In [5]:
nlp = spacy.load('en')
from spacy.attrs import IS_PUNCT, LOWER, ORTH, IS_SPACE
from spacy.matcher import Matcher

def merge_phrase(matcher, doc, i, matches):
    '''
    Merge a phrase. We have to be careful here because we'll change the token indices.
    To avoid problems, merge all the phrases once we're called on the last match.
    '''
    # print(matches[i])
    if i != len(matches)-1: # if not the last match, keep going
        return None

    # Get Span objects
    spans = [(ent_id, label, doc[start : end]) for ent_id, label, start, end in matches]

    for ent_id, label, span in spans:

        span.merge(label=label, tag='NNP' if label else span.root.tag_, ent_id = ent_id)

    
#         for eid in set([ent_id for ent_id, label, start, end in matches]):
#         # for each entity id, find the 
#     x = 0
#     for x, span in enumerate(spans):
#         if x < len(spans)-1 and spans[x+1][0] != span[0]: # next entity is not the same
#             span[2].merge(ent_id=span[0], label=span[1])
#         else:
#             continue

nlp.matcher.add_entity('BasinandRange', on_match=merge_phrase)
nlp.matcher.add_pattern(
'BasinandRange',
[{66: 'basin'},
 {66: 'and', 'OP': '?'},
 {66: 'range'}],
label='GPE'
)

nlp.matcher.add_entity('BearsEars', on_match=merge_phrase)
nlp.matcher.add_pattern(
'BearsEars',
[{66: 'bears'},
 {66: 'ears'}],
label='GPE'
)
nlp.matcher.add_pattern(
'BearsEars',
[{66: 'bears'},
 {66: 'ears'},
 {66: 'national', 'OP': '?'},
 {66: 'monument'}],
label='GPE'
)

nlp.matcher.add_entity('BerryessaSnowMountain', on_match=merge_phrase)
nlp.matcher.add_pattern(
'BerryessaSnowMountain',
[{66: 'berryessa'},
 {66: 'snow'},
 {66: 'mountain'}],
label='GPE'
)
nlp.matcher.add_pattern(
'BerryessaSnowMountain',
[{66: 'berryessa'}],
label='GPE'
)
nlp.matcher.add_pattern(
'BerryessaSnowMountain',
[{66: 'snow'},
 {66: 'mountain'}],
label='GPE'
)

nlp.matcher.add_entity('CanyonsoftheAncients', on_match=merge_phrase)
nlp.matcher.add_pattern(
'CanyonsoftheAncients',
[{66: 'canyons'},
 {66: 'of', 'OP': '?'},
 {66: 'the', 'OP': '?'},
 {66: 'ancients'}],
label='GPE'
)
nlp.matcher.add_pattern(
'CanyonsoftheAncients',
[{66: 'canyon'},
 {66: 'of', 'OP': '?'},
 {66: 'the', 'OP': '?'},
 {66: 'ancients'}],
label='GPE'
)
nlp.matcher.add_entity('CarrizoPlain', on_match=merge_phrase)
nlp.matcher.add_pattern(
'CarrizoPlain',
[{66: 'carrizo'},
 {66: 'plain'}],
label='GPE'
)

nlp.matcher.add_entity('CascadeSiskiyou', on_match=merge_phrase)
nlp.matcher.add_pattern(
'CascadeSiskiyou',
[{66: 'cascade', 'OP': '?'},
 {66: 'siskiyou'}],
label='GPE'
)

nlp.matcher.add_entity('CratersoftheMoon', on_match=merge_phrase)
nlp.matcher.add_pattern(
'CratersoftheMoon',
[{66: 'craters'},
 {66: 'of'},
 {66: 'the'},
 {66: 'moon'},
 {66: 'national', 'OP': '?'},
 {66: 'monument'}],
label='GPE'
)
nlp.matcher.add_pattern(
'CratersoftheMoon',
[{66: 'craters', 'OP': '?'},
 {66: 'of', 'OP': '?'},
 {66: 'the', 'OP': '?'},
 {66: 'moon'},
 {66: 'national', 'OP': '?'},
 {66: 'monument'}],
label='GPE'
)


nlp.matcher.add_entity('GiantSequoia', on_match=merge_phrase)
nlp.matcher.add_pattern(
'GiantSequoia',
[{66: 'giant', 'OP': '?'},
 {66: 'sequoia'}],
label='GPE'
)

nlp.matcher.add_entity('GoldButte', on_match=merge_phrase)
nlp.matcher.add_pattern(
'GoldButte',
[{66: 'gold'},
 {66: 'butte'},
 {66: 'national', 'OP': '?'},
 {66: 'monument', 'OP': '?'}],
label='GPE'
)

nlp.matcher.add_entity('GrandCanyonParashant', on_match=merge_phrase)
nlp.matcher.add_pattern(
'GrandCanyonParashant',
[{66: 'grand', 'OP': '?'},
 {66: 'canyon', 'OP': '?'},
  {IS_SPACE: True, 'OP': '*'},
 {IS_PUNCT: True, 'OP': '*'},
  {IS_SPACE: True, 'OP': '*'},
 {66: 'parashant',}],
label='GPE'
)

nlp.matcher.add_entity('GrandStaircaseEscalante', on_match=merge_phrase)
nlp.matcher.add_pattern(
'GrandStaircaseEscalante',
[{66: 'grand', 'OP': '?'},
 {66: 'staircase', 'OP': '?'},
 {IS_SPACE: True, 'OP': '*'},
 {IS_PUNCT: True},
  {IS_SPACE: True, 'OP': '*'},
 {66: 'escalante'}],
label='GPE'
)
nlp.matcher.add_pattern(
'GrandStaircaseEscalante',
[{66: 'grand'},
 {66: 'staircase'},
 {66: 'escalante', 'OP': '?'}],
label='GPE'
)
nlp.matcher.add_pattern(
'GrandStaircaseEscalante',
[{66: 'escalante'},
 {66: 'national', 'OP': '?'},
 {66: 'monument'}],
label='GPE'
)
nlp.matcher.add_pattern(
'GrandStaircaseEscalante',
[{66: 'staircase'},
 {66: 'national', 'OP': '?'},
 {66: 'monument'}],
label='GPE'
)

nlp.matcher.add_entity('HanfordReach', on_match=merge_phrase)
nlp.matcher.add_pattern(
'HanfordReach',
[{66: 'hanford'},
 {66: 'reach'}],
label='GPE'
)

nlp.matcher.add_entity('IronwoodForest', on_match=merge_phrase)
nlp.matcher.add_pattern(
'IronwoodForest',
[{66: 'ironwood'},
 {66: 'forest'}],
label='GPE'
)

nlp.matcher.add_entity('MojaveTrails', on_match=merge_phrase)
nlp.matcher.add_pattern(
'MojaveTrails',
[{66: 'mojave'},
 {66: 'trails'}],
label='GPE'
)

nlp.matcher.add_entity('OrganMountainsDesertPeaks', on_match=merge_phrase)
nlp.matcher.add_pattern(
'OrganMountainsDesertPeaks',
[{66: 'organ', 'OP': '?'},
 {66: 'mountains', 'OP': '?'},
  {IS_SPACE: True, 'OP': '*'},
  {IS_PUNCT: True},
  {IS_SPACE: True, 'OP': '*'},
 {66: 'desert', 'OP': '?'},
 {66: 'peaks'}],
label='GPE'
)
nlp.matcher.add_pattern(
'OrganMountainsDesertPeaks',
[{66: 'organ'},
 {66: 'mountains', 'OP': '?'},
 {66: 'desert', 'OP': '?'},
 {66: 'peaks', 'OP': '?'}],
label='GPE'
)

nlp.matcher.add_entity('RioGrandedelNorte', on_match=merge_phrase)
nlp.matcher.add_pattern(
'RioGrandedelNorte',
[{66: 'rio', 'OP': '?'},
 {66: 'grande', 'OP': '?'},
 {66: 'del', 'OP': '?'},
 {66: 'norte'}],
label='GPE'
)
nlp.matcher.add_pattern(
'RioGrandedelNorte',
[{66: 'rio'},
 {66: 'grande'},
 {66: 'del', 'OP': '?'},
 {66: 'norte', 'OP': '?'}],
label='GPE'
)

nlp.matcher.add_entity('SandtoSnow', on_match=merge_phrase)
nlp.matcher.add_pattern(
'SandtoSnow',
[{66: 'sand', 'OP': '?'},
 {66: 'to', 'OP': '?'},
 {66: 'snow'}],
label='GPE'
)

nlp.matcher.add_entity('SanGabrielMountains', on_match=merge_phrase)
nlp.matcher.add_pattern(
'SanGabrielMountains',
[{66: 'san', 'OP': '?'},
 {66: 'gabriel'},
 {66: 'mountains', 'OP': '?'}],
label='GPE'
)

nlp.matcher.add_entity('SonoranDesert', on_match=merge_phrase)
nlp.matcher.add_pattern(
'SonoranDesert',
[{66: 'sonoran'},
 {66: 'desert', 'OP': '?'}],
label='GPE'
)

nlp.matcher.add_entity('UpperMissouriRiverBreaks', on_match=merge_phrase)
nlp.matcher.add_pattern(
'UpperMissouriRiverBreaks',
[{66: 'upper', 'OP': '?'},
 {66: 'missouri'},
 {66: 'river', 'OP': '?'},
 {66: 'breaks', 'OP': '?'}],
label='GPE'
)

nlp.matcher.add_entity('VermilionCliffs', on_match=merge_phrase)
nlp.matcher.add_pattern(
'VermilionCliffs',
[{66: 'vermilion'},
 {66: 'cliffs'}],
label='GPE'
)

nlp.matcher.add_entity('KatahdinWoodsandWaters', on_match=merge_phrase)
nlp.matcher.add_pattern(
'KatahdinWoodsandWaters',
[{66: 'katahdin'},
 {66: 'woods', 'OP': '?'},
 {66: 'and', 'OP': '?'},
 {66: 'waters', 'OP': '?'}],
label='GPE'
)
nlp.matcher.add_pattern(
'KatahdinWoodsandWaters',
[{66: 'katahdin'},
 {66: 'national', 'OP': '?'},
 {66: 'monument'}],
label='GPE'
)

nlp.matcher.add_entity('MarianasTrenchMarine', on_match=merge_phrase)
nlp.matcher.add_pattern(
'MarianasTrenchMarine',
[{66: 'marianas'},
 {66: 'trench', 'OP': '?'},
 {66: 'marine', 'OP': '?'}],
label='GPE'
)

nlp.matcher.add_entity('Seamounts', on_match=merge_phrase)
nlp.matcher.add_pattern(
'Seamounts',
[{66: 'northeast'},
 {66: 'canyons'}],
label='GPE'
)
nlp.matcher.add_pattern(
'Seamounts',
[{66: 'seamounts'},
  {66: 'marine', 'OP': '?'},
 {66: 'national', 'OP': '?'},
 {66: 'monument', 'OP': '?'}],
label='GPE'
)

nlp.matcher.add_entity('PacificRemoteIslandsMarine', on_match=merge_phrase)
nlp.matcher.add_pattern(
'PacificRemoteIslandsMarine',
[{66: 'pacific'},
 {66: 'remote', 'OP': '?'},
 {66: 'islands'},
 {66: 'marine', 'OP': '?'},
{66: 'national', 'OP': '?'},
{66: 'monument', 'OP': '?'}],
label='GPE'
)

nlp.matcher.add_entity('PapahanaumokuakeaMarine', on_match=merge_phrase)
nlp.matcher.add_pattern(
'PapahanaumokuakeaMarine',
[{66: 'papahanaumokuakea'},
 {66: 'marine', 'OP': '?'}],
label='GPE'
)

nlp.matcher.add_entity('RoseAtollMarine', on_match=merge_phrase)
nlp.matcher.add_pattern(
'RoseAtollMarine',
[{66: 'rose'},
 {66: 'atoll', 'OP': '?'},
 {66: 'marine', 'OP': '?'}],
label='GPE'
)
nlp.pipeline = [nlp.tagger, nlp.entity, nlp.matcher, nlp.parser]

## some testing
The cell below is just used for testing...

In [None]:
mons = """Basin and Range National Monument
Bears Ears National Monument
Berryessa Snow Mountain
Canyons of the Ancients
Carrizo Plain National Monument
Cascade Siskiyou
Craters of the Moon National Monument
craters of the moon monument
Giant Sequoia
Gold Butte
Grand Canyon-Parashant
Grand Canyon / Parashant National Monument
Grand Staircase-Escalante
Hanford Reach
Ironwood Forest
Mojave Trails
Organ Mountains-Desert Peaks
Rio Grande del Norte
Sand to Snow
San Gabriel Mountains
Sonoran Desert
Upper Missouri River Breaks
Vermilion Cliffs
Katahdin Woods and Waters California
Marianas Trench Marine
Pacific Remote Islands Marine National Monument
Papahanaumokuakea Marine
Rose Atoll Marine"""
for mon in mons.split('\n'):
#     entity_name = ((mon.replace(' ', '')).replace('-', '')).replace(r'\n', '')
#     print("matcher.add_entity('%s', on_match=merge_phrases)" % entity_name)
#     pattern = [{LOWER: word} for word in mon.split()]
#     pattern.append({LOWER: 'national', 'OP': '?'})
#     pattern.append({LOWER: 'monument', 'OP': '?'})
#     print('matcher.add_pattern(')
#     print("'%s'," % entity_name)
#     pprint(pattern)
#     print(",label='GPE'\n)\n")
    doc = nlp('blah blah ' + mon + ' blah')
    #matches = matcher(doc)
#    print([w.text for w in doc])
    print([ent.orth_ for ent in doc.ents])

## the slow, more accurate way to build a corpus

In [6]:
# this is pretty inefficient--it would be better to use 
# SpaCy's Pipe function, but I kept running into vocab issues

# Only run this to update--takes 30 - 45 mins. Otherwise, just read in the corpus we've saved

if 1 == 1:
# Split records’ content (text) field from associated metadata fields, 
# but keep them paired together for convenient loading into a textacy.Corpus
    #docs = [textacy.doc.Doc(doc) for doc in nlp.pipe(text_stream, batch_size=1000, n_threads=4)]
    #docs = [nlp(doc) for doc in text_stream]
    
    #corpus = textacy.Corpus('en', texts=text_stream, metadatas=metadata_stream, itemwise = False)
    corpus = textacy.Corpus(nlp)
    #corpus.spacy_vocab = nlp.vocab
    for comment in comments:
        corpus.add_doc(nlp(comment['commentText']), metadata=comment )
    #corpus.save('data', name='textacy-corpus', compression='gzip')
else:
# or, read in the corpus we've already parsed
    corpus = textacy.Corpus.load('data', name='textacy-corpus', compression='gzip')

print(corpus)

Corpus(438871 docs; 81960552 tokens)


## the faster, less accurate way to build a corpus

In [None]:
# fast, but without monument matching
text_stream, metadata_stream = textacy.fileio.split_record_fields(comments, 'comment')
corpus = textacy.Corpus('en', texts=text_stream, metadatas=metadata_stream)

print(corpus)

In [None]:
# some cool stuff you can do with textacy

# counts =
corpus.word_freqs(normalize=True, weighting='count')
# idf = 
corpus.word_doc_freqs(normalize=True, weighting='idf')
list(textacy.extract.ngrams(doc, 3, filter_stops=True, filter_punct=True, filter_nums=False))[:15]

doc = corpus[0]
ts = textacy.text_stats.TextStats(doc)
print(ts.n_unique_words)
print(ts.basic_counts)
print(ts.readability_stats)

doc.to_bag_of_terms(named_entities=True, normalize='lemma', filter_stops=True, filter_punct=True)

doc.metadata

corpus.word_doc_freqs(lemmatize=True, lowercase=True, weighting='count', as_strings=True)

In [7]:
# get a count of named entities

# Extract the named entities from each comment
from collections import Counter
c = Counter()
for doc in corpus:
    ents = textacy.extract.named_entities(doc, include_types=('PERSON', 'LAW', 'ORG', 'NORP', 'GPE', 'WORK_OF_ART'), drop_determiners=True)
    for ent in ents:
        e = (ent.orth_, ent.lemma_, ent.label_)
        c[e] += 1

c.most_common(10) 
# should I drop determiners? Probably not..

[(('', '', ''), 280446),
 (('Bears Ears', 'bears', 'GPE'), 241090),
 (('American', 'american', 'NORP'), 213070),
 (('Bears Ears National Monument', 'bears', 'GPE'), 163538),
 (('Utah', 'utah', 'GPE'), 120303),
 (('Ryan Zinke', 'ryan zinke', 'PERSON'), 105209),
 (('Bears Ears Inter-Tribal Coalition',
   'bears inter - tribal coalition',
   'GPE'),
  94993),
 (('Interior', 'interior', 'ORG'), 94417),
 (('National', 'national', 'GPE'), 93091),
 (('Hopi', 'hopi', 'NORP'), 92580)]

In [8]:
ents = list()
for ent in c.most_common(2000):
    ents.append({'value': ent[0][0],
                'lemma': ent[0][1],
                'pos': ent[0][2],
                'frequency': ent[1]})

for i, row in enumerate(ents):
    ents[i]["Id"] = i+1

entities = set([ent['value'] for ent in ents])
ents[-5:]

[{'Id': 1996,
  'frequency': 25,
  'lemma': 'escalante -',
  'pos': 'ORG',
  'value': 'Escalante-'},
 {'Id': 1997,
  'frequency': 25,
  'lemma': 'chimney rock national monument',
  'pos': 'ORG',
  'value': 'Chimney Rock National Monument'},
 {'Id': 1998,
  'frequency': 25,
  'lemma': 'ca 93001',
  'pos': 'ORG',
  'value': 'CA 93001'},
 {'Id': 1999,
  'frequency': 25,
  'lemma': 'more',
  'pos': 'PERSON',
  'value': 'MORE'},
 {'Id': 2000,
  'frequency': 25,
  'lemma': 'french',
  'pos': 'NORP',
  'value': 'French'}]

In [9]:
# group things together, just to make the lookup table building a little easier
textacy_variants = textacy.keyterms.aggregate_term_variants(entities, fuzzy_dedupe=True)
import pprint
pprint.pprint(textacy_variants[:10])

[{'"Review of Certain National Monuments Established Since 1996'},
 {'Review of Certain National Monuments',
  'Review of Certain National Monuments Established'},
 {'Upper Missouri River Breaks National Monument',
  'Upper Missouri River Breaks National Monument in'},
 {'Pacific Remote Islands Marine National Monument'},
 {'Organ Mountain Desert Peaks National Monument',
  'Organ Mountains Desert Peaks National Monument',
  'Organ Mountains-Desert Peaks National Monument'},
 {'Grand Staircase - Escalante National Monument',
  'Grand Staircase Escalante National Monuments'},
 {'Grand Staircase Escalante National Monument',
  'Grand Staircase-Escalante National',
  'Grand Staircase-Escalante National Monuments'},
 {'Giant Sequoia National Monument',
  'Giant Sequoia National Monument Designation'},
 {'Papah&#257;naumoku&#257;kea Marine Monument'},
 {'Grand Staircase-Escalante National Monument',
  'Grand Staircase/Escalante National Monument'}]


In [None]:
# sort the varients. But, don't actually do this!
# varients = [list(var) for var in textacy_variants]
# def getKey(item):
#     return item[0]
# varients = sorted(varients, key=getKey)
# varients[:20]

In [None]:
# write a dataset with the named entities and frequencies
# but dont actually do this
# with open('ents.csv', 'w') as f:
#     writer = csv.DictWriter(f, ['Id', 'value', 'lemma', 'pos', 'frequency'])
#     writer.writeheader()
#     writer.writerows(ents)

In [10]:
# write the dataset used to create the named entity resolution stuff
# this is the part where I painstakenly go through the resulting
# spreadsheet and manually map instances to their canonical value
with open('ents.csv', 'w') as f:
    writer = csv.writer(f)
    # flatten the grouped entities
    # to do: include a count of each
    for item in [item for sublist in textacy_variants for item in sublist]:
        writer.writerow([item])
# save the resulting file as data/entity-resolution.csv

# Tag docs with named entities
1. read in 'data/entity-resolution.csv'
2. tag each document with the named entities it contains
3. export as to csv (data/named-entities.csv)

In [16]:
# read entities into a list of tuples of form (instance, entity)
with open('data/entity-resolution.csv') as f:
    reader = csv.DictReader(f)
    er = [ent for ent in reader]
er = [(item['instance'], item['canonical']) for item in er if item['canonical'] != 'ignore']
lookup = {e[0]: e[1] for e in er}

entities = set(lookup.values())
lookup.update({e: e for e in entities})
instances = set(lookup.keys()).union(entities)

print('total named instances', str(len(instances)))
print('unique named entities', str(len(entities)))

total named instances 1486
unique named entities 664


In [17]:
# create an empty dataframe with
# rows: document_ids
# columns: canonical named entities
import pandas as pd
import numpy as np
df = pd.DataFrame(False, index=[doc.metadata.get('documentId') for doc in corpus], columns=entities, dtype=bool)

In [18]:
# for each doc, get list of named entities, and update df
# setting the appropriate canonical name columns to true for matches
from spacy.parts_of_speech import DET # to remove determiners
for doc in corpus:
    row_idx = doc.metadata.get('documentId')
    nes = [ne if ne[0].pos != DET else ne[1:] for ne in doc.spacy_doc.ents]
    nes = set([ne.orth_ for ne in nes])
    df.loc[row_idx, [lookup[v] for v in instances.intersection(nes)]] = True

In [19]:
# convert bool to 0/1 and write to file
(df*1).to_csv('data/named-entities.csv', index_label='documentId',) # mult. by 1 to convert boolean to 0/1

In [15]:
df.loc['DOI-2017-0002-0005']

State Parks                                       False
Kirkland                                          False
Democrat                                          False
Spanish                                           False
Veterans                                          False
Lake Powell                                       False
White House                                       False
Burbank                                           False
American College Of Environmental Lawyers         False
Ashland                                           False
Yellowstone National Park                         False
Santa Maria                                       False
Europeans                                         False
Gig Harbor                                        False
Supreme Court                                     False
Bureau of Economic Analysis                       False
Maryland                                          False
Las Vegas                                       