# Data Pre-processing

In [186]:
import os
import json

## Load lyric files

In [187]:
dataset_path = '../../dataset'

In [188]:
def get_song_files(root_dir: str) -> dict:
    return {int(os.path.splitext(os.path.basename(f))[0]): os.path.join(root_dir, f) for f in os.listdir(root_dir) if f.endswith('.json')}

In [189]:
def load_songs(file_dict: dict):
    song_dict = {}
    for song_id, path in file_dict.items():
        with open(path, 'r') as fp:
            song_dict[song_id] = json.load(fp)
    return song_dict

In [190]:
songid_to_file = get_song_files(os.path.join(dataset_path,'songs'))
songid_to_song = load_songs(songid_to_file)

## Cleanse lyrics

In [191]:
import collections
import itertools
import pickle
import re
from typing import List
import nltk

class LyricsVocab:
    # List of tokens that have reserved corpus index.
    reserved_tokens = ['<pad>', '<start>', '<end>', '<unk>']

    def __init__(self, songs: dict):

        # Process each song into list of words for each line.
        self.songid2lyrics = {songid: self.clean_lyrics(song['lyrics']) for songid,song in songs.items()}

        # Flatten all lyrics into single list.
        flat = list(self.flatten_multi_list([lyrics for songid,lyrics in self.songid2lyrics.items()]))

        # Generate counts for each word.
        self.counts = collections.Counter(flat)

        # Generate list of unique words.
        self.corpus = self.reserved_tokens + sorted(set(flat))

        # Build mappings for: int <--> token
        self.token2index = {}
        self.index2token = {}
        for i,token in enumerate(self.corpus):
            self.token2index[token] = i
            self.index2token[i] = token

        # Generate embedding based on word assignment.
        self.songid2embed = {songid: [[self.token2index[token] for token in line] for line in lyrics] for songid,lyrics in self.songid2lyrics.items()}

    def save(self, path: str, protocol: int = pickle.DEFAULT_PROTOCOL):
        """Save vocabulary to a pickle file.
        
        The pickle file is a dictionary which contains the following keys:
            index2token: Dictionary of index --> token
            embed: Dictionary of song ID --> 2D list for lyric integer tokens
            counts: Dictionary of token --> number of occurrences
        """
        store = {
            'index2token': self.index2token,
            'token2index': self.token2index,
            'embed': self.songid2embed,
            'counts': self.counts,
        }
        with open(path, 'wb') as fp:
            pickle.dump(store, fp, protocol=protocol)

    @classmethod
    def flatten_multi_list(cls, ml: list):
        for l in ml:
            if isinstance(l, list):
                yield from cls.flatten_multi_list(l)
            else:
                yield l

    @staticmethod
    def decontracted(phrase: str):
        """Remove English word contractions.

        Gleaned from: https://stackoverflow.com/a/47091490
        """
        # specific
        phrase = re.sub(r"won\'t", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)
        phrase = re.sub(r"wanna", "want to", phrase)
        phrase = re.sub(r"gotta", "got to", phrase)

        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        return phrase

    @classmethod
    def clean_lyrics(cls, lyric: str) -> List[List[str]]:
        lyric = lyric.lower() # Convert to common case.
        lyric = re.sub(r'\[[^\]]*\]', '', lyric) # Remove paranthetical content "[*]", like markers for chorus and verses.
        lyric = re.sub(r'\([^\)]*\)', '', lyric) # Remove paranthetical content "(*)", like markers for chorus and verses.
        lyric = lyric.strip() # Remove any extra newlines at the ends.
        lyric = cls.decontracted(lyric) # Remove contractions before tokenizer to handle special cases.

        # Preserve line structure because tokenizer will remove traditional newlines.
        lyric = re.sub(r"(?:\s*\n\s*)+", r'\n', lyric) # Remove repeated newlines.
        lyric = re.sub('\n', ' NEWLINE ', lyric)

        # Tokenize the entire song into list of words.
        tokens = nltk.tokenize.word_tokenize(lyric) # Split into word tokens.
        tokens = [word for word in tokens if word.isalpha()] # Careful to remove punct after contractions.

        # Group sentences together by line.
        tokens_lines = list(list(group) for key,group in itertools.groupby(tokens, lambda s: s == 'NEWLINE') if not key)
        return tokens_lines

In [192]:
# Create vocabulary object.
lv = LyricsVocab(songid_to_song)

In [193]:
print(f"{len(lv.songid2lyrics)} songs")
print(f"{len(lv.corpus)} unique words")

1339 songs
6275 unique words


In [194]:
songid = list(lv.songid2lyrics.keys())[0]
print(f"Song:      {lv.songid2lyrics[songid][0]}")
print(f"Embedding: {lv.songid2embed[songid][0]}")

Song:      ['stepping', 'out', 'in', 'faith']
Embedding: [5047, 3648, 2690, 1858]


## Write embedding to pickle file

In [195]:
path = os.path.join(dataset_path, 'lyrics.pickle')
lv.save(path)
print(f'Lyrics vocabulary saved: {path}')

Lyrics vocabulary saved: ../../dataset/lyrics.pickle
