This notebook explores using the [Stanford Named Entity Recognizer](https://nlp.stanford.edu/software/CRF-NER.html) and the [Natural Language Tool Kit](http://www.nltk.org/). To run the code you will first need to download the StanfordNER toolkit from [here](https://nlp.stanford.edu/software/CRF-NER.html#Download) and unzip it in the same directory as this notebook file.

In [2]:
import os
import re
import datetime 
import csv
from itertools import groupby
from nltk.corpus import PlaintextCorpusReader
from nltk.tag import StanfordNERTagger

You may need to adjust the path `stanford-ner-2018-10-16` if you have downloaded a different version.

In [7]:
tagger = StanfordNERTagger('./stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                           './stanford-ner-2018-10-16/stanford-ner.jar')

In [8]:
ARTISTS = ['The Roots',
'Eve',
'DJ Jazzy Jeff & The Fresh Prince',
'Ludacris',
'T.I.',
'Kanye West',
'Chance the Rapper',
'Common',
'Gucci Mane',
'Migos',
'OutKast',
'Twista',
'Crucial Conflict',
'Lupe Fiasco',
'Digital Underground',
'2pac',
'Trouble Funk']

In [10]:
LYRICS_DIR = './lyrics/'

In [11]:
paths = [os.path.join(LYRICS_DIR, d) for d in os.listdir(LYRICS_DIR)]

In [13]:
def get_tagged_entities(dirpath):
    """
    Given a directory path,
    create a PlaintextCorpus object and then
    return a list of dictionaries representing tagged entities
    in the corpus
    """
    entities = []
    corp = PlaintextCorpusReader(dirpath, '.*\.txt')
    
    print('{0}: Starting work on {1}'.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), dirpath))
    for fid in corp.fileids():
        
        s_id = fid.split('-', 1)[0]
        s_title = re.sub('-', ' ', fid.split('-', 1)[1].rsplit('.')[0])
        
        text = corp.words(fid)
        tagged = tagger.tag(text)
        
        # Based on: http://stackoverflow.com/a/30665014/1232820
        for tag, chunk in groupby(tagged, lambda x: x[1]):
            if tag != 'O':
                result = {
                    'song_id': s_id,
                    'song_title': s_title,
                    'entity': " ".join(word for word, tg in chunk),
                    'entity_type': tag
                }
                entities.append(result)
            else:
                # flush chunk
                pass
    print('{0}: Finished processing {1}'.format(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), dirpath))
    return entities

In [14]:
all_entities = [get_tagged_entities(p) for p in paths]

2019-07-17 15:32:50: Starting work on ./lyrics/Twista
2019-07-17 15:36:11: Finished processing ./lyrics/Twista
2019-07-17 15:36:11: Starting work on ./lyrics/T.I.
2019-07-17 15:44:01: Finished processing ./lyrics/T.I.
2019-07-17 15:44:01: Starting work on ./lyrics/The Roots
2019-07-17 15:50:05: Finished processing ./lyrics/The Roots
2019-07-17 15:50:05: Starting work on ./lyrics/2Pac
2019-07-17 16:04:08: Finished processing ./lyrics/2Pac
2019-07-17 16:04:08: Starting work on ./lyrics/Digital Underground
2019-07-17 16:05:06: Finished processing ./lyrics/Digital Underground
2019-07-17 16:05:06: Starting work on ./lyrics/DJ Jazzy Jeff & The Fresh Prince
2019-07-17 16:06:28: Finished processing ./lyrics/DJ Jazzy Jeff & The Fresh Prince
2019-07-17 16:06:28: Starting work on ./lyrics/Crucial Conflict
2019-07-17 16:06:59: Finished processing ./lyrics/Crucial Conflict
2019-07-17 16:06:59: Starting work on ./lyrics/Common
2019-07-17 16:13:46: Finished processing ./lyrics/Common
2019-07-17 16:13

In [15]:
merge = zip(sorted(ARTISTS), all_entities)

In [16]:
results = []
for m in merge:
    update = {'artist': m[0]}
    new_item = [{**item, **update} for item in m[1]]
    if len(m[1]) == len(new_item):
        results.extend(new_item)

In [17]:
len(results)

15819

In [18]:
with open('./entities.csv', 'w') as csvfile:
    field_names = ['entity', 'entity_type', 'song_id', 'song_title', 'artist']
    writer = csv.DictWriter(csvfile, fieldnames=field_names)
    
    writer.writeheader()
    for row in results:
        writer.writerow(row)