# Model Building

sources:

- https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
- https://towardsdatascience.com/generating-haiku-with-deep-learning-dbf5d18b4246
- https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/

In [1371]:
import os
import sys
import pickle
import torch
import torch.nn
import torch.optim
import torch.utils.data

In [1372]:
"""Google Drive"""
# # Mount Juliet's google drive
# from google.colab import drive
# drive.mount('/content/gdrive/')
# sys_path = '/content/gdrive/My Drive/project_ece_5424/'
# sys.path.append(sys_path)
# dataset_path = 'dataset'
# store_file = os.path.join(sys_path, dataset_path, 'embedding.pickle')

'Google Drive'

In [1373]:
"""Offline Usage"""
dataset_path = '../../dataset'
store_file = os.path.join(dataset_path,'lyrics.pickle')

## Construct Dataset class

In [1374]:
class WorshipLyricDataset(torch.utils.data.Dataset):
    """Worhip Song dataset from Genius.
    """

    def __init__(self, path: str):

        # Load the pre-processed pickle file.
        with open(path, 'rb') as fp:
            store = pickle.load(fp)
        
        # Unpack the pickle.
        self.index2token = store['index2token']
        self.token2index = store['token2index']
        self.counts = store['counts']
        self.corpus = store['corpus']
        self.vectors = [torch.LongTensor(vec) for vec in store['vectors']]
        self.syllables = torch.nn.functional.one_hot(torch.LongTensor(store['syllables'])) # One-hot encoded syllabl counts.

    def __len__(self):
        return len(self.vectors)

    def __getitem__(self, idx):
        # lyric = {
        #     'vector': self.vectors[idx],
        #     'syllables': self.syllables[idx],
        # }
        return (self.vectors[idx], self.syllables[idx],)

In [1375]:
# Construct the data object.
dataset = WorshipLyricDataset(path=store_file)

In [1376]:
def pad_collate(batch):
    """Pad batches from dataloader.

    This allows for more efficient padding,
    by only padding within each batch.
    """
    sentences, syllables = zip(*batch)
    sen_lens = torch.LongTensor([len(vec) for vec in sentences])
    sen_pad = torch.nn.utils.rnn.pad_sequence(sentences, batch_first=True, padding_value=0)
    syllables = torch.stack(syllables) # Convert tuple of tensors to single 2D tensor.
    syllables = syllables.reshape(syllables.size(0),1,syllables.size(1)) # Convert to 3D.
    syllables = syllables.repeat_interleave(sen_pad.size(1), dim=1) # Duplicate syllable count for every word in each sentence.
    # print('pad_collate','sen_lens',sen_lens.size())
    # print('pad_collate','sen_pad',sen_pad.size())
    # print('pad_collate','syllables',syllables.size())
    return (sen_pad,syllables,sen_lens,)

In [1377]:
# Construct data loader.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=32, shuffle=True, num_workers=0, collate_fn=pad_collate)

## Construct Model

We use an encode/decode architecture, with **encoder** and **decoder** layers, and also add an **attention** layer.

This architecture was adapted from the wondeful PyTorch tutorial ["NLP From Scratch: Translation with a Sequence to Sequence Network and Attention"](https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html).

sources:
- https://medium.com/@stepanulyanin/captioning-images-with-pytorch-bc592e5fd1a3
- https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
- https://suzyahyah.github.io/pytorch/2019/07/01/DataLoader-Pad-Pack-Sequence.html

### Encoder
The encoder outputs a value for each word in a given input sentence. For each input word, the encoder outputs the value vector and a hidden state, the hidden state is used for the next input word.

In [1378]:
from typing import List, Tuple

class SylEncoderNet(torch.nn.Module):
    """Encodes syllables to sentence-length feature space."""
    def __init__(self, syl_count: int, n_embed: int):
        super().__init__()

        # Dense layer sequence.
        self.dense = torch.nn.Sequential(
            torch.nn.Linear(in_features=syl_count, out_features=syl_count),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=syl_count, out_features=syl_count),
            torch.nn.PReLU(),
        )

        # Embedding layer.
        self.embed = torch.nn.Linear(in_features=syl_count, out_features=n_embed)


    def forward(self, syllables):
        output = self.dense(syllables.float())
        embeddings = self.embed(output)
        return embeddings

In [1379]:
class SenDecoderRNN(torch.nn.Module):
    def __init__(self, n_embed: int, n_hidden: int, n_vocab: int, n_layers: int, dropout: float = 0., bidirectional: bool = False):
        super().__init__()

        self.n_embed = n_embed
        self.n_hidden = n_hidden
        self.n_vocab = n_vocab
        self.n_layers = n_layers
        self.bidirectional = bidirectional
        self.n_dir = 2 if bidirectional else 1

        # LSTM layer.
        self.lstm = torch.nn.LSTM(
            input_size=n_embed,
            hidden_size=n_hidden,
            num_layers=n_layers,
            dropout=dropout,
            bidirectional=bidirectional,
            batch_first=True,
            )

        # Embedding layer.
        self.embed = torch.nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=n_embed,
        )

        # Word mapping fully-connected layer.
        self.fc = torch.nn.Linear(in_features=n_hidden, out_features=n_vocab)
    
    def forward(self, features: torch.Tensor, sentences: torch.Tensor, lens: torch.Tensor, hidden: torch.Tensor, cell: torch.Tensor):
        """
        Args:
            features (torch.Tensor): Embedded syllable features.
            sentences (torch.Tensor): Sentence word vectors.
            lens (torch.Tensor): True lengths of padded sentence vectors.
            hidden (torch.Tensor): Hidden state vector.
            cell (torch.Tensor): Cell state vector.
        """
        
        # Embed the sentence vectors as floating-point.
        #
        # inputs: (batch_size, sentence_length,)
        sentences_embed = self.embed(sentences)
        # embedded: (batch_size, sentence_length, embed_dim,)

        # print('features',features.size())
        # print('sentences',sentences.size())
        # print('lens',lens.size())
        # print('hidden',hidden.size())
        # print('sentences_embed',sentences_embed.size())
        # print('sentences_embed',0,sentences_embed[0][0])

        # Pack the embedding so that the paddings are ignored.
        sentences_embed_packed = torch.nn.utils.rnn.pack_padded_sequence(
            input=sentences_embed,
            lengths=lens, 
            batch_first=True,
            enforce_sorted=False,
            )
        print('sentences_embed_packed','data',sentences_embed_packed.data.size())

        # Pass the input feature vector as the first step.
        output_packed, (hidden, cell) = self.lstm(features, (hidden,cell,))

        output_packed, (hidden, cell) = self.lstm(sentences_embed_packed, (hidden,cell,))
        # print('output_packed','data',output_packed.data.size())
        # print('hidden',hidden.size())
        # print('cell',cell.size())

        # Get padded output
        output_padded, output_lens = torch.nn.utils.rnn.pad_packed_sequence(output_packed, batch_first=True)
        # print('output_padded',output_padded.size())
        # print('output_lens',output_lens.size())

        # Obtain word-level classification.
        output_padded_fc = self.fc(output_padded)
        # print('output_padded_fc',output_padded_fc.size())

        # Run packing on output layer.
        # return output_padded, output_lens, hidden
        return output_padded_fc, output_lens, (hidden, cell,)

    def init_hc(self, batch_size: int, device: str = 'cpu'):
        return torch.zeros((self.n_layers*self.n_dir, batch_size, self.n_hidden), device=device)

## Train

In [1380]:
import time
from contextlib import contextmanager
@contextmanager
def timing(description='Elapsed time'):
    """Context manager to print elapsed time from call."""
    start_time = time.time()
    yield
    stop_time = time.time()
    print(f"{description}: {stop_time - start_time} seconds")

In [1381]:
def train(encoder, decoder, loader, epochs, optimizer_encoder, optimizer_decoder, criterion, device='cpu'):
    encoder.to(device)
    decoder.to(device)

    encoder.train()
    decoder.train()

    # Initialize hidden output.
    decoder_hidden = decoder.init_hc(32, device=device)
    decoder_cell = decoder.init_hc(32, device=device)

    for e in range(epochs):
        running_loss = 0.0
        for sentences,syllables,sen_lens in loader:
            # print('sentences',sentences.size())
            # print('sen_lens',sen_lens.size())
            # print('syllables',syllables.size())

            # Zero the gradients
            optimizer_encoder.zero_grad()
            optimizer_decoder.zero_grad()

            # Encode syllables into feature space.
            features = encoder(syllables)

            # Decode.
            # outputs, out_lens, _ = decoder(features, sentences, sen_lens, decoder_hidden, decoder_cell)
            SOS_token = dataset.token2index['<sos>']
            decoder_input = SOS_token*torch.ones((sentences.size(0), sentences.size(1), 1,), dtype=torch.long, device=device)
            decoder_input_lens = torch.ones((sentences.size(0),), dtype=torch.long, device=device)

            print('decoder_input',decoder_input.size())
            print('decoder_input_lens',decoder_input_lens.size())

            # Teacher forcing.
            # Feed the target as the next input.
            loss = 0
            for i in range(sentences.size(1)):
                outputs, out_lens, _ = decoder(features, decoder_input, decoder_input_lens, decoder_hidden, decoder_cell)

                # Calculate batch loss.
                # print(outputs.view(-1, decoder.n_vocab).size())
                # print(sentences.contiguous().view(-1).size())
                loss += criterion(
                    outputs.view(-1, decoder.n_vocab),
                    sentences.contiguous().view(-1)[i],
                )
                decoder_input = sentences.contiguous().view(-1)[i]
                # loss += criterion(
                #     outputs.view(-1, decoder.n_vocab),
                #     sentences.contiguous().view(-1),
                # )
                # print(f"[{e}] loss: {loss}")

            # Back-propagate, and step the optimizers.
            loss.backward()
            optimizer_encoder.step()
            optimizer_decoder.step()

            # Accumulate the loss for this epoch.
            running_loss += loss.item()

            # topv, topi = outputs.topk(1)
            # for i in range(5):
            #     s = ' '.join([dataset.index2token[idx] for idx in topi[i]])
            #     print(topi[i].view(-1))
            #     print(s)
            #     print()

        # Report epoch results.
        print(f'Epoch {e}: loss {running_loss}')

In [1382]:
# Length of vocabulary.
n_words = len(dataset.index2token)
syl_count = len(dataset.syllables[0])

# Encoder.
encoder = SylEncoderNet(
    syl_count=syl_count,
    n_embed=syl_count,
)

# Decoder.
decoder = SenDecoderRNN(
    n_embed=syl_count,
    n_hidden=128,
    n_vocab=n_words,
    n_layers=1,
    dropout=0.,
    bidirectional=False,
)

In [1383]:
# Set runtime device.
device = torch.device('?cuda' if torch.cuda.is_available() else 'cpu')

In [1384]:
# Learning parameters.
epochs = 12
lr = 1e-2

# Train the model.
# Display training time too.
with timing():
    optim_encoder = torch.optim.Adam(encoder.parameters(), lr=lr)
    optim_decoder = torch.optim.Adam(decoder.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss(reduction='mean')
    train(encoder, decoder,
        loader=dataloader,
        epochs=epochs,
        optimizer_encoder=optim_encoder,
        optimizer_decoder=optim_decoder,
        criterion=criterion,
        device=device,
    )

decoder_input torch.Size([32, 15, 1])
decoder_input_lens torch.Size([32])
sentences_embed_packed data torch.Size([32, 1, 169])


RuntimeError: input must have 2 dimensions, got 3

## Evaluate

In [1329]:
def evaluate(encoder, decoder, syllables, device='cpu'):
    with torch.no_grad():
        encoder.eval()
        decoder.eval()

        encoder.to(device)
        decoder.to(device)

        # Convert syllables to one-hot.
        syllables_oh = torch.nn.functional.one_hot(syllables, num_classes=syl_count)
        syllables_oh.to(device)
        print('syllables_oh',syllables_oh.size())

        # Encode syllables into feature space.
        features = encoder(syllables_oh)
        print('features',features.size())

        # Initialize hidden output.
        decoder_hidden = decoder.init_hc(1, device=device)
        decoder_cell = decoder.init_hc(1, device=device)

        while True:
            # Decode.
            decoder_input = torch.LongTensor([[dataset.token2index['<sos>']]], device=device)
            decoder_input_lens = torch.LongTensor([1])
            outputs, out_lens, (decoder_hidden, decoder_cell,) = decoder(features, decoder_input, decoder_input_lens, decoder_hidden, decoder_cell)   

            # Build sentences.
            print(outputs)
            # topv, topi = outputs.topk(1)
            # print(topv, topi)
            break



In [1330]:
syllables = torch.LongTensor([7,5])
# print(syl_count)
evaluate(encoder, decoder, syllables, device=device)

syllables_oh torch.Size([2, 169])
features torch.Size([2, 169])
tensor([[[ 3.3785, -6.2565, -6.3329,  ..., -7.9525, -3.3959, -5.3591]]])
