# Data Pre-processing

In [350]:
import os
import json

## Load lyric files

In [351]:
dataset_path = '../dataset'

In [352]:
def get_song_files(root_dir: str) -> dict:
    return {int(os.path.splitext(os.path.basename(f))[0]): os.path.join(root_dir, f) for f in os.listdir(root_dir) if f.endswith('.json')}

In [353]:
def load_songs(file_dict: dict):
    song_dict = {}
    for song_id, path in file_dict.items():
        with open(path, 'r') as fp:
            song_dict[song_id] = json.load(fp)
    return song_dict

In [354]:
songid_to_file = get_song_files(os.path.join(dataset_path,'songs'))
songid_to_song = load_songs(songid_to_file)

## Cleanse lyrics

In [355]:
import re
from nltk.tokenize import word_tokenize
import string

def decontracted(phrase: str):
    """Remove English word contractions.

    Gleaned from: https://stackoverflow.com/a/47091490
    """
    # specific
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"wanna", "want to", phrase)
    phrase = re.sub(r"gotta", "got to", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

def clean_lyric(lyric: str, preserve_lines: bool = False):
    lyric = lyric.lower() # Convert to common case.
    lyric = re.sub(r'\[[^\]]*\]', '', lyric) # Remove paranthetical content "[*]", like markers for chorus and verses.
    lyric = re.sub(r'\([^\)]*\)', '', lyric) # Remove paranthetical content "(*)", like markers for chorus and verses.
    lyric = lyric.strip() # Remove any extra newlines at the ends.
    if preserve_lines:
        lyric = re.sub(r"(?:\s*\n\s*)+", r'\n', lyric)
        lyric = re.sub('\n', ' NEWLINE ', lyric)
    lyric = decontracted(lyric) # Remove contractions before tokenizer to handle special cases.
    tokens = word_tokenize(lyric) # Split into word tokens.
    tokens = [word for word in tokens if word.isalpha()] # Careful to remove punct after contractions.
    if preserve_lines:
        tokens = ['\n' if 'NEWLINE' in word else word for word in tokens]
    return tokens

In [356]:
songid_to_lyrics = {songid: clean_lyric(song['lyrics'], preserve_lines=True) for songid,song in songid_to_song.items()}

## Build vocabulary list

In [357]:
# Create unique set of words.
corpus = sorted(set(sum([lyrics for songid,lyrics in songid_to_lyrics.items()], [])))

## Create integer mapping

In [358]:
# Build mappings for: int <--> token
int_to_token = {i: token for i,token in enumerate(corpus)}
token_to_int = {token: i for i,token in int_to_token.items()}

## 

In [359]:
len(corpus)
corpus[:10]

['\n',
 'a',
 'aa',
 'aah',
 'abandon',
 'abandoned',
 'abatido',
 'abba',
 'abide',
 'abideth']

## Embed lyrics as integers

In [360]:
songid_to_embed = {songid: [token_to_int[token] for token in lyrics] for songid,lyrics in songid_to_lyrics.items()}

In [361]:
print(songid_to_lyrics[147168][:10])
print(songid_to_embed[147168][:10])

['you', 'call', 'me', 'out', 'upon', 'the', 'waters', '\n', 'the', 'great']
[6232, 714, 3231, 3645, 5768, 5368, 5951, 0, 5368, 2320]


## Write embedding to pickle file

In [362]:
import pickle
store = {
    'mapping': int_to_token,
    'embedding': songid_to_embed,
}
store_file = os.path.join(dataset_path,'embedding.pickle')
with open(store_file, 'wb') as fp:
    pickle.dump(store, fp, protocol=pickle.HIGHEST_PROTOCOL)

# Model Building

In [363]:
import torch
import torch.nn
import torch.optim
import torch.utils.data

## Construct Dataset class

In [364]:
class WorshipLyricDataset(torch.utils.data.Dataset):
    """Worhip Song dataset from Genius.
    """

    def __init__(self, embedding_file: str, sentence_length: int = None, sentence_step: int = 1):
        self.sentence_step = sentence_step
        self.sentence_length = sentence_length
        self.embedding_file = embedding_file

        # Load the embedding.
        with open(embedding_file, 'rb') as fp:
            store = pickle.load(fp)
        
        self.corpus = store['mapping']
        # self.songid_to_embed_all = store['embedding']
        # self.idx_to_songid = {idx: songid for idx,songid in enumerate(sorted(store['embedding'].keys()))}
        self.songids = sorted(store['embedding'].keys())

        # Break each lyric into contiguous sentences.
        self.songid_to_embed = {}
        self.songid_to_nextword = {}
        for songid,embed in store['embedding'].items():

            # Group embedding to contiguous sentence length.
            if sentence_length:
                self.songid_to_embed[songid] = []
                self.songid_to_nextword[songid] = []
                for i in range(0, len(embed) - self.sentence_length, self.sentence_step):
                    self.songid_to_embed[songid].append(embed[i:i+self.sentence_length])
                    self.songid_to_nextword[songid].append(embed[i+self.sentence_length])
                self.songid_to_embed[songid] = torch.tensor(self.songid_to_embed[songid], dtype=torch.long)
                self.songid_to_nextword[songid] = torch.tensor(self.songid_to_nextword[songid], dtype=torch.long)

            # Return original lyric embedding.
            else:
                self.songid_to_embed[songid] = torch.tensor(embed, dtype=torch.long)
                self.songid_to_nextword[songid] = torch.tensor([], dtype=torch.long)

        # Add padding based on longest song.
        self.embed_pad = torch.nn.utils.rnn.pad_sequence([self.songid_to_embed[songid] for songid in self.songids], batch_first=True, padding_value=0)
        self.nextword_pad = torch.nn.utils.rnn.pad_sequence([self.songid_to_nextword[songid] for songid in self.songids], batch_first=True, padding_value=0)
        # longest_size = max(embed.shape[0] for songid,embed in self.songid_to_embed)
        # for songid in self.songid_to_embed.keys():
        #     self.songid_to_embed.


    def __len__(self):
        return len(self.songid_to_embed)

    def __getitem__(self, idx):
        # songid = self.songids[idx] # Get song ID from dataset index.

        lyric = {
            # 'embed': self.songid_to_embed[songid],
            'embed': self.embed_pad[idx],
            # 'nextword': self.songid_to_nextword[songid],
            'nextword': self.nextword_pad[idx],
            # 'songid': songid,
            'songid': self.songids[idx],
        }
        return lyric

In [365]:
lyric_dataset = WorshipLyricDataset(embedding_file=store_file, sentence_length=5)

In [366]:
lyric_dataset[0]['embed'].shape, lyric_dataset[0]['nextword'].shape

(torch.Size([1284, 5]), torch.Size([1284]))

In [367]:
# Construct data loader.
dataloader = torch.utils.data.DataLoader(lyric_dataset, batch_size=1, shuffle=True, num_workers=0)

## Construct Model

In [409]:
class LyricGenerator(torch.nn.Module):
    def __init__(self, sentence_length: int, corpus_length: int, n_hidden: int = 128, n_layers: int = 2, drop_prob: float = 0.5):
        super().__init__()
        self.n_hidden = n_hidden
        self.lstm = torch.nn.LSTM(
            input_size=sentence_length,
            hidden_size=n_hidden,
            num_layers=n_layers,
            dropout=drop_prob,
            bidirectional=True,
            batch_first=True,
            )
        self.dropout = torch.nn.Dropout(p=drop_prob)
        self.fc = torch.nn.Linear(in_features=n_hidden, out_features=corpus_length)
    
    def forward(self, x):
        x = x.float() # Convert input to float type for LSTM.

        # Run inputs through LSTM.
        lstm_out, _ = self.lstm(x)
        print(lstm_out.shape)

        # Pass LSTM outputs through dropout layer.
        out = self.dropout(lstm_out)

        # Stack-up LSTM outputs.
        # out = out.contiguous().view(-1, self.n_hidden)
        out = out.view(-1, self.n_hidden)
        print(out.shape)

        out = self.fc(out)
        return out


## Train

In [410]:
def train(model, epoch, optim, criterion, loader, device='cpu'):
    """Helper to train the model."""
    model.train()
    for e in range(epoch):
        running_loss = 0.0
        for lyric in loader:

            # Send data to desired device.
            x = lyric['embed'].to(device)
            y = lyric['nextword'].to(device)

            # Evaluate the model.
            y_pred = model(x)

            # Compute losses.
            print(y_pred.shape, y.shape)
            loss = criterion(y_pred, y)

            # Zero the gradient, back-propagate, and step the optimizer.
            optim.zero_grad()
            loss.backward()
            optim.step()

            # Accumulate the loss for this epoch.
            running_loss += loss.item()

        # Report epoch results.
        print(f'Epoch {e}: loss {running_loss}')

In [411]:
import time
from contextlib import contextmanager
@contextmanager
def timing(description='Elapsed time'):
    """Context manager to print elapsed time from call."""
    start_time = time.time()
    yield
    stop_time = time.time()
    print(f"{description}: {stop_time - start_time} seconds")

In [416]:
# Set runtime device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the model.
corpus_length = len(lyric_dataset.corpus)
model = LyricGenerator(
    sentence_length=lyric_dataset.sentence_length,
    corpus_length=corpus_length,
    )

In [417]:
print(corpus_length)

6272


In [418]:
# Learning parameters.
epoch = 1
lr = 1e-2

# Train the model.
# Display training time too.
with timing():
    model.to(device)
    optim = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss(reduction='mean')
    train(model, loader=dataloader, epoch=epoch, optim=optim, criterion=criterion, device=device)

torch.Size([1, 1284, 256])
torch.Size([2568, 128])
torch.Size([2568, 6272]) torch.Size([1, 1284])


ValueError: Expected input batch_size (2568) to match target batch_size (1).