In [22]:
import os
import re
import datetime 
import csv
from itertools import groupby
from nltk.corpus import PlaintextCorpusReader
from nltk.tag import StanfordNERTagger

In [2]:
# Stanford NER tagger backed by the 3-class English CRF model. Both paths are
# relative to the directory the notebook is run from.
tagger = StanfordNERTagger(
    model_filename='./stanford-ner-2016-10-31/classifiers/english.all.3class.distsim.crf.ser.gz',
    path_to_jar='./stanford-ner-2016-10-31/stanford-ner-3.7.0.jar',
)

In [3]:
# Display names of the artists whose lyrics are analysed, one per line.
ARTISTS = [
    'The Roots',
    'Eve',
    'DJ Jazzy Jeff & The Fresh Prince',
    'Ludacris',
    'T.I.',
    'Kanye West',
    'Chance the Rapper',
    'Common',
    'Gucci Mane',
    'Migos',
    'OutKast',
    'Twista',
    'Crucial Conflict',
    'Lupe Fiasco',
    'Digital Underground',
    '2pac',
    'Trouble Funk',
]

In [4]:
# Root directory whose entries are listed to find the lyrics to process.
# Set the LYRICS_DIR environment variable to point the notebook at another
# location; the original hard-coded path remains the default.
LYRICS_DIR = os.environ.get('LYRICS_DIR', '/Users/umd-laptop/Code/geniusing/lyrics/')

In [5]:
# Corpus roots: every sub-directory of LYRICS_DIR.
# os.listdir returns entries in arbitrary order and also lists plain files
# (for example .DS_Store on macOS), so keep directories only and sort them to
# make the processing order deterministic across runs and machines.
paths = sorted(
    entry_path
    for entry_path in (os.path.join(LYRICS_DIR, name) for name in os.listdir(LYRICS_DIR))
    if os.path.isdir(entry_path)
)

In [6]:
def _parse_song_fileid(fid):
    """Split a corpus file id such as ``123-some-song.txt`` into ``(song_id, song_title)``."""
    # Strip only the final extension so dots inside the title are preserved.
    stem = fid.rsplit('.', 1)[0]
    # partition never raises: a file id without '-' yields an empty title.
    s_id, _, slug = stem.partition('-')
    return s_id, slug.replace('-', ' ')


def get_tagged_entities(dirpath):
    """
    Given a directory path,
    create a PlaintextCorpusReader over its ``.txt`` files, run the
    module-level ``tagger`` on each file and
    return a list of dictionaries representing tagged entities
    in the corpus.

    Each file id is split at its first ``-``: the part before it becomes
    ``song_id`` and the rest (extension removed, dashes turned into spaces)
    becomes ``song_title``.

    Consecutive tokens that share the same non-``'O'`` tag are joined with
    spaces into a single entity; tokens tagged ``'O'`` are skipped. Two
    adjacent entities of the same type are therefore reported as one.

    Start and finish timestamps are printed for progress tracking.

    Returns a list of dicts with the keys ``song_id``, ``song_title``,
    ``entity`` and ``entity_type``.
    """
    time_format = '%Y-%m-%d %H:%M:%S'
    entities = []
    # Raw string: '\.' is a regex escape, not a Python string escape.
    corp = PlaintextCorpusReader(dirpath, r'.*\.txt')

    print('{0}: Starting work on {1}'.format(datetime.datetime.now().strftime(time_format), dirpath))
    for fid in corp.fileids():
        s_id, s_title = _parse_song_fileid(fid)
        tagged = tagger.tag(corp.words(fid))

        # Based on: http://stackoverflow.com/a/30665014/1232820
        for tag, chunk in groupby(tagged, key=lambda pair: pair[1]):
            if tag == 'O':
                continue
            entities.append({
                'song_id': s_id,
                'song_title': s_title,
                'entity': " ".join(word for word, _ in chunk),
                'entity_type': tag
            })
    print('{0}: Finished processing {1}'.format(datetime.datetime.now().strftime(time_format), dirpath))
    return entities

In [7]:
# One list of entity dicts per entry in `paths`, in the same order as `paths`.
all_entities = list(map(get_tagged_entities, paths))

2017-04-21 07:40:38: Starting work on /Users/umd-laptop/Code/geniusing/lyrics/2Pac
2017-04-21 07:55:46: Finished processing /Users/umd-laptop/Code/geniusing/lyrics/2Pac
2017-04-21 07:55:46: Starting work on /Users/umd-laptop/Code/geniusing/lyrics/Chance The Rapper
2017-04-21 07:59:28: Finished processing /Users/umd-laptop/Code/geniusing/lyrics/Chance The Rapper
2017-04-21 07:59:28: Starting work on /Users/umd-laptop/Code/geniusing/lyrics/Common
2017-04-21 08:06:55: Finished processing /Users/umd-laptop/Code/geniusing/lyrics/Common
2017-04-21 08:06:55: Starting work on /Users/umd-laptop/Code/geniusing/lyrics/Crucial Conflict
2017-04-21 08:07:28: Finished processing /Users/umd-laptop/Code/geniusing/lyrics/Crucial Conflict
2017-04-21 08:07:28: Starting work on /Users/umd-laptop/Code/geniusing/lyrics/Digital Underground
2017-04-21 08:08:39: Finished processing /Users/umd-laptop/Code/geniusing/lyrics/Digital Underground
2017-04-21 08:08:39: Starting work on /Users/umd-laptop/Code/geniusing/

In [11]:
# Pair every artist with the entities extracted from that artist's directory.
# The artist is derived from the directory name itself rather than from
# position: sorted(ARTISTS) is case-sensitive (it puts 'DJ Jazzy Jeff ...'
# before 'Digital Underground') and does not follow the order in which the
# directories were processed, so a positional zip attaches entities to the
# wrong artist. Directory names are matched to ARTISTS case-insensitively so the
# canonical spelling is kept; an unmatched directory falls back to its own name.
# A list (not a bare zip iterator) keeps `merge` reusable when cells are re-run.
artist_by_key = {name.casefold(): name for name in ARTISTS}
merge = []
for path, artist_entities in zip(paths, all_entities):
    dir_name = os.path.basename(os.path.normpath(path))
    merge.append((artist_by_key.get(dir_name.casefold(), dir_name), artist_entities))

In [19]:
# Flatten the per-artist entity lists into one list of rows, adding an
# 'artist' key to a copy of each entity dict (the originals are not mutated).
results = []
for artist, artist_entities in merge:
    results.extend({**entity, 'artist': artist} for entity in artist_entities)

In [20]:
# Total number of tagged-entity rows collected in `results`.
len(results)

15819

In [29]:
# Write every row of `results` to ./entities.csv with a header line.
# newline='' is what the csv module expects of the file object (otherwise
# extra blank lines appear on platforms that translate line endings), and an
# explicit UTF-8 encoding keeps non-ASCII text independent of the locale.
with open('./entities.csv', 'w', newline='', encoding='utf-8') as csvfile:
    field_names = ['entity', 'entity_type', 'song_id', 'song_title', 'artist']
    writer = csv.DictWriter(csvfile, fieldnames=field_names)

    writer.writeheader()
    writer.writerows(results)