In [147]:
import os
import json
import torch.nn
import torch.utils.data

## Load lyric files

In [148]:
# Root of the dataset, given relative to the notebook's working directory.
# Cells below join it with 'songs' (input JSON files) and 'embedding.pickle' (output).
dataset_path = '../dataset'

In [149]:
def get_song_files(root_dir: str) -> dict:
    """Map each song id to the path of its JSON file inside ``root_dir``.

    Only directory entries whose name ends in ``.json`` are considered. The
    song id is the file name without its extension, converted to ``int``, so
    a ``.json`` entry with a non-numeric name raises ``ValueError``.
    """
    id_to_path = {}
    for entry in os.listdir(root_dir):
        if not entry.endswith('.json'):
            continue
        stem, _extension = os.path.splitext(os.path.basename(entry))
        id_to_path[int(stem)] = os.path.join(root_dir, entry)
    return id_to_path

In [150]:
def load_songs(file_dict: dict):
    """Parse every song JSON file into memory.

    Args:
        file_dict: Mapping of song id to the path of that song's JSON file.

    Returns:
        dict mapping each song id to the object parsed from its file.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    song_dict = {}
    for song_id, path in file_dict.items():
        # Decode explicitly as UTF-8: without it, open() falls back to the
        # platform's locale encoding, which corrupts (or fails on) non-ASCII
        # lyrics on systems whose default is not UTF-8.
        with open(path, 'r', encoding='utf-8') as fp:
            song_dict[song_id] = json.load(fp)
    return song_dict

In [151]:
# Index the JSON files under <dataset_path>/songs by song id, then parse each one.
songid_to_file = get_song_files(os.path.join(dataset_path,'songs'))
songid_to_song = load_songs(songid_to_file)

## Cleanse lyrics

In [152]:
import re
from nltk.tokenize import word_tokenize
import string

def decontracted(phrase: str):
    """Expand common English contractions in ``phrase``.

    Both the ASCII apostrophe (') and the typographic right single quote
    (U+2019) are recognised, so text containing "smart quotes" is expanded
    the same way as plain ASCII text. "wanna" and "gotta" are only rewritten
    when they stand as whole words.

    Matching is case-sensitive. Note that "'s" is always expanded to " is",
    which also rewrites possessives.

    Gleaned from: https://stackoverflow.com/a/47091490

    Args:
        phrase: Text to expand.

    Returns:
        The text with every matched contraction replaced.
    """
    # Character class matching either apostrophe form.
    apos = "['\u2019]"

    # Order matters: the specific rules must run before the general suffix
    # rules, and "n't" must run before the bare "'t" rule.
    rules = (
        # specific
        (r"won" + apos + r"t", "will not"),
        (r"can" + apos + r"t", "can not"),
        (r"\bwanna\b", "want to"),  # \b keeps words like "wannabe" intact
        (r"\bgotta\b", "got to"),
        # general
        (r"n" + apos + r"t", " not"),
        (apos + r"re", " are"),
        (apos + r"s", " is"),
        (apos + r"d", " would"),
        (apos + r"ll", " will"),
        (apos + r"t", " not"),
        (apos + r"ve", " have"),
        (apos + r"m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

def clean_lyric(lyric: str, preserve_lines: bool = False):
    """Normalise a raw lyric string and split it into alphabetic word tokens.

    The text is lower-cased, "[...]" and "(...)" spans are removed,
    contractions are expanded with ``decontracted`` and the result is split
    with ``word_tokenize``; tokens that are not purely alphabetic are dropped.

    Args:
        lyric: Raw lyric text.
        preserve_lines: When true, each run of line breaks is kept as a single
            '\\n' token in the output.

    Returns:
        List of tokens.
    """
    text = lyric.lower()  # Convert to common case.
    text = re.sub(r'\[[^\]]*\]', '', text)  # Drop bracketed content "[*]", e.g. chorus/verse markers.
    text = re.sub(r'\([^\)]*\)', '', text)  # Drop parenthesised content "(*)".
    text = text.strip()  # Remove surrounding whitespace, including extra newlines.

    if preserve_lines:
        # Collapse each run of line breaks (and whitespace around it) into one
        # newline, then swap it for an alphabetic sentinel word so that it
        # survives tokenisation and the isalpha() filter below.
        text = re.sub(r"(?:\s*\n\s*)+", r'\n', text)
        text = re.sub('\n', ' NEWLINE ', text)

    # Expand contractions before tokenising so the pieces are plain words.
    text = decontracted(text)

    words = []
    for token in word_tokenize(text):
        if not token.isalpha():
            continue  # Punctuation is removed only after contractions are expanded.
        if preserve_lines and 'NEWLINE' in token:
            words.append('\n')  # Turn the sentinel back into a newline token.
        else:
            words.append(token)
    return words

In [153]:
# Tokenise every song's lyrics, keeping line breaks as '\n' tokens.
songid_to_lyrics = {
    songid: clean_lyric(song['lyrics'], preserve_lines=True)
    for songid, song in songid_to_song.items()
}

## Build vocabulary list

In [154]:
# Sorted list of the unique tokens across all songs.
# A set comprehension gathers the tokens in one pass; sum(list_of_lists, [])
# would copy the growing list at every step (quadratic in the total token count).
corpus = sorted({token for lyrics in songid_to_lyrics.values() for token in lyrics})

## Create integer mapping

In [155]:
# Build mappings for: int <--> token
# A token's integer id is its position in the sorted corpus.
int_to_token = dict(enumerate(corpus))
token_to_int = {token: index for index, token in enumerate(corpus)}

## Inspect vocabulary

In [156]:
# Only the last expression of a cell is displayed, so the vocabulary size
# has to be printed explicitly to show up in the output.
print(len(corpus))
corpus[:10]

['\n',
 'a',
 'aa',
 'aah',
 'abandon',
 'abandoned',
 'abatido',
 'abba',
 'abide',
 'abideth']

## Embed lyrics as integers

In [157]:
# Encode each song's token list as the corresponding list of integer ids.
songid_to_embed = {
    songid: list(map(token_to_int.__getitem__, lyrics))
    for songid, lyrics in songid_to_lyrics.items()
}

In [158]:
# Spot-check one song: its first ten tokens, then the same ten as integer ids.
# NOTE: 147168 is a hard-coded song id; these lookups raise KeyError if the
# loaded dataset does not contain that song.
print(songid_to_lyrics[147168][:10])
print(songid_to_embed[147168][:10])

['you', 'call', 'me', 'out', 'upon', 'the', 'waters', '\n', 'the', 'great']
[6232, 714, 3231, 3645, 5768, 5368, 5951, 0, 5368, 2320]


## Write embedding to pickle file

In [159]:
import pickle

# Persist the vocabulary (id -> token) together with the integer-encoded songs
# in a single pickle file under the dataset directory.
store_file = os.path.join(dataset_path, 'embedding.pickle')
store = dict(mapping=int_to_token, embedding=songid_to_embed)
with open(store_file, mode='wb') as fp:
    pickle.dump(store, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [56]:
# list(songid_to_song.keys())[3]

In [55]:
# songid_to_song[list(songid_to_song.keys())[3]]['lyrics'].split('\n')

In [4]:
## TOO EARLY FOR THIS. WAIT UNTIL DATA IS PRE-PROCESSED FIRST!
#
#
# class WorshipSongDataset(torch.utils.data.Dataset):
#     """Worship Song dataset from Genius.
#     """

#     def __init__(self, root_dir: str):
#         self.root_dir = root_dir
#         self.songid_to_filepath = self._scan_root_dir_for_files(root_dir)

#     def _scan_root_dir_for_files(self, root_dir) -> dict:
#         """Helper to load file names from root directory.

#         Returns a dictionary of key=song_id, value=file_path.
#         """
#         return {os.path.splitext(os.path.basename(f))[0]: os.path.join(root_dir, f) for f in os.listdir(root_dir) if os.path.isfile(os.path.join(root_dir, f))}

#     def __len__(self):
#         return len(self.songid_to_filepath)

#     def __getitem__(self, idx):
#         if torch.is_tensor(idx):
#             idx = idx.tolist()