# Data Pre-processing

In [160]:
import os
import json

## Load lyric files

In [161]:
# Root directory of the dataset (relative path); the 'songs' sub-directory is
# read from it and 'embedding.pickle' is written into it further down.
dataset_path = '../dataset'

In [162]:
def get_song_files(root_dir: str) -> dict:
    """Index the JSON files found directly inside ``root_dir`` by song id.

    The song id is the file name without its extension, converted to ``int``.

    Args:
        root_dir: Directory to scan (not recursive).

    Returns:
        Dict mapping ``int`` song id -> path of the file (``root_dir`` joined
        with the file name).

    Raises:
        ValueError: If a ``.json`` file name (minus extension) is not an integer.
    """
    songid_to_path = {}
    for filename in os.listdir(root_dir):
        # Only JSON files are song records; skip everything else.
        if not filename.endswith('.json'):
            continue
        stem, _extension = os.path.splitext(os.path.basename(filename))
        songid_to_path[int(stem)] = os.path.join(root_dir, filename)
    return songid_to_path

In [163]:
def load_songs(file_dict: dict):
    """Load every song JSON file listed in ``file_dict``.

    Args:
        file_dict: Dict mapping song id -> path of a JSON file.

    Returns:
        Dict mapping the same song ids -> the parsed JSON content of each file.

    Raises:
        OSError: If a file cannot be opened.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    song_dict = {}
    for song_id, path in file_dict.items():
        # Decode explicitly as UTF-8 so that non-ASCII lyric characters load
        # identically on every platform instead of depending on the locale's
        # default encoding.
        with open(path, 'r', encoding='utf-8') as fp:
            song_dict[song_id] = json.load(fp)
    return song_dict

In [164]:
# Index the song files under <dataset_path>/songs by song id, then parse each
# file so that songid_to_song maps song id -> parsed JSON content.
songid_to_file = get_song_files(os.path.join(dataset_path,'songs'))
songid_to_song = load_songs(songid_to_file)

## Cleanse lyrics

In [165]:
import re
from nltk.tokenize import word_tokenize
import string

def decontracted(phrase: str):
    """Expand common English contractions in ``phrase``.

    Both the ASCII apostrophe (') and the typographic apostrophe (U+2019) are
    recognised, so "don't" and "don’t" expand the same way. The informal forms
    "wanna" and "gotta" are only expanded when they appear as whole words.
    Patterns are case-sensitive and written in lower case.

    Gleaned from: https://stackoverflow.com/a/47091490

    Args:
        phrase: Text to process.

    Returns:
        The text with the contractions replaced by their expanded form.
    """
    # Character class matching either apostrophe variant.
    apos = "['\u2019]"

    # Order matters: the specific rules must run before the general suffix
    # rules, and "n't" must run before the bare "'t" fallback.
    rules = (
        # specific
        (rf"won{apos}t", "will not"),
        (rf"can{apos}t", "can not"),
        (r"\bwanna\b", "want to"),  # \b keeps words like "wannabe" intact
        (r"\bgotta\b", "got to"),
        # general
        (rf"n{apos}t", " not"),
        (rf"{apos}re", " are"),
        (rf"{apos}s", " is"),
        (rf"{apos}d", " would"),
        (rf"{apos}ll", " will"),
        (rf"{apos}t", " not"),
        (rf"{apos}ve", " have"),
        (rf"{apos}m", " am"),
    )
    for pattern, expansion in rules:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

def clean_lyric(lyric: str, preserve_lines: bool = False):
    """Normalise a raw lyric string into a list of alphabetic word tokens.

    Steps: lower-case, drop "[...]" and "(...)" spans, trim the ends, expand
    contractions, tokenise, and keep only purely alphabetic tokens.

    Args:
        lyric: Raw lyric text.
        preserve_lines: When True, each run of line breaks (with surrounding
            whitespace) is kept as a single '\\n' token in the result.

    Returns:
        List of tokens (lower-case words, plus '\\n' tokens if requested).
    """
    text = lyric.lower()
    # Strip bracketed "[...]" content first, then parenthesised "(...)" content.
    for enclosed in (r'\[[^\]]*\]', r'\([^\)]*\)'):
        text = re.sub(enclosed, '', text)
    text = text.strip()

    if preserve_lines:
        # Collapse each run of line breaks into one, then swap it for an
        # alphabetic placeholder that survives the isalpha() filter below.
        text = re.sub(r"(?:\s*\n\s*)+", r'\n', text)
        text = re.sub('\n', ' NEWLINE ', text)

    # Contractions are expanded before tokenising; punctuation is dropped after.
    words = [tok for tok in word_tokenize(decontracted(text)) if tok.isalpha()]

    if not preserve_lines:
        return words
    # Turn the placeholder back into a real newline token.
    return ['\n' if 'NEWLINE' in tok else tok for tok in words]

In [166]:
# Tokenise the 'lyrics' field of every song, keeping line breaks as '\n' tokens.
songid_to_lyrics = {
    song_id: clean_lyric(record['lyrics'], preserve_lines=True)
    for song_id, record in songid_to_song.items()
}

## Build vocabulary list

In [167]:
# Create the sorted list of unique tokens across all songs.
# Tokens are collected straight into a set: concatenating the per-song lists
# first would copy the growing list once per song (quadratic in corpus size).
corpus = sorted({token for lyrics in songid_to_lyrics.values() for token in lyrics})

## Create integer mapping

In [168]:
# Two lookup tables over the vocabulary: position -> token and token -> position.
int_to_token = dict(enumerate(corpus))
token_to_int = {token: index for index, token in enumerate(corpus)}

## Inspect vocabulary

In [169]:
# Vocabulary size is printed explicitly: a cell only displays its last
# expression, so a bare len(corpus) on the first line would be discarded.
print(len(corpus))
corpus[:10]

['\n',
 'a',
 'aa',
 'aah',
 'abandon',
 'abandoned',
 'abatido',
 'abba',
 'abide',
 'abideth']

## Embed lyrics as integers

In [170]:
# Replace every token of every song by its integer code from token_to_int.
songid_to_embed = {
    song_id: [token_to_int[word] for word in tokens]
    for song_id, tokens in songid_to_lyrics.items()
}

In [171]:
# Spot-check one song (id 147168): its first ten tokens, then the integer codes
# for the same ten positions.
print(songid_to_lyrics[147168][:10])
print(songid_to_embed[147168][:10])

['you', 'call', 'me', 'out', 'upon', 'the', 'waters', '\n', 'the', 'great']
[6232, 714, 3231, 3645, 5768, 5368, 5951, 0, 5368, 2320]


## Write embedding to pickle file

In [172]:
import pickle
# Bundle the int -> token table and the per-song integer sequences, then write
# the bundle to <dataset_path>/embedding.pickle.
store = dict(mapping=int_to_token, embedding=songid_to_embed)
store_file = os.path.join(dataset_path, 'embedding.pickle')
with open(store_file, 'wb') as fp:
    pickle.dump(store, fp, protocol=pickle.HIGHEST_PROTOCOL)

# Model Building

In [178]:
import torch
import torch.nn
import torch.utils.data

## Construct Dataset class

In [185]:
class WorshipLyricDataset(torch.utils.data.Dataset):
    """Worship song lyric dataset from Genius.

    Reads a pickle file holding a dict with two entries:

    * ``'mapping'``   -- stored on the instance as ``corpus``.
    * ``'embedding'`` -- dict of song id -> list of integer token codes.

    Samples are addressed by position ``0 .. len(dataset) - 1``, with songs
    ordered by ascending song id.
    """

    def __init__(self, embedding_file: str):
        """Load the pickled embedding.

        Args:
            embedding_file: Path of the pickle file to read.
        """
        self.embedding_file = embedding_file

        # Load the embedding.
        # NOTE(security): pickle.load can execute arbitrary code while
        # deserialising; only open pickle files from a trusted source.
        with open(embedding_file, 'rb') as fp:
            store = pickle.load(fp)

        self.corpus = store['mapping']
        self.songid_to_embed = store['embedding']
        # Position -> song id, in ascending song-id order.
        self.idx_to_songid = {idx: songid for idx,songid in enumerate(sorted(self.songid_to_embed.keys()))}

    def __len__(self):
        """Return the number of songs in the dataset."""
        return len(self.songid_to_embed)

    def __getitem__(self, idx):
        """Return the sample at position ``idx``.

        Returns:
            Dict with ``'embed'`` (``torch.int`` tensor of the song's token
            codes) and ``'songid'`` (the song's id).

        Raises:
            IndexError: If ``idx`` is not a valid position.
        """
        try:
            songid = self.idx_to_songid[idx]
        except KeyError:
            # Sequence-style iteration (``for sample in dataset``) stops on
            # IndexError; a KeyError would propagate as a crash instead.
            raise IndexError(f'dataset index out of range: {idx!r}') from None
        lyric = {
            'embed': torch.tensor(self.songid_to_embed[songid], dtype=torch.int),
            'songid': songid,
        }
        return lyric

In [186]:
# Build the dataset from the pickle file written earlier in the notebook.
lyric_dataset = WorshipLyricDataset(embedding_file=store_file)

In [189]:
# Show the first sample, i.e. the song with the smallest song id.
lyric_dataset[0]

{'embed': tensor([5399, 2773, 2067, 5366, 3066,  863, 6101, 3499, 1898,    0, 2067, 5366,
         3178, 5366, 1503, 3520, 2443,    1, 3849, 5465, 5025,    0, 2067, 5366,
         3066,  599, 3072, 6101,  112,    0,  732, 2646, 5316, 6232,    1, 5081,
         5316, 6232,    1, 5081,    0, 6232,  732, 2966, 3595, 3231,    0, 5376,
         2773,    1, 3178,    0, 5008, 3595, 5368, 1098,    0, 2448, 2427, 3499,
         2565,    0, 2448, 2427, 3499, 2060,    0,  179, 2543,  551, 4823,  236,
         2273,    0,  732, 3520, 6232, 2461, 2538, 1204, 3645,    0,  179, 5376,
         2773,    1, 2220,    0, 4601, 2067,    1, 1898,  179,    1, 2138,    0,
         3942, 5366, 5368, 5077, 4893, 6065, 1681,    0,  697, 2733, 3564, 5905,
          339,    0, 3611, 5765, 6235, 2466,  179, 4548,    0, 2646,  151, 2509,
            0, 6232, 1500, 3520, 2443, 5465, 6155,    0, 2646,  732, 4613, 6235,
         5311,    0, 2646, 6065,  395, 5376, 2687,    1, 2633, 6017, 6232,  714,
            0, 2141