# Setup

## Imports

In [1]:
import os.path

vai Modules

In [2]:
from vai.io import pickle_load

Keras Modules

In [3]:
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.
  return f(*args, **kwds)


PyTorch Modules

In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.autograd import Variable
from vai.torch.utils import cuda

## Define Useful Features

In [5]:
# Narrow DIR_DATA to the entry stored under the 'LJSpeech' key.
# NOTE(review): DIR_DATA is not defined or imported in the cells shown here - confirm
# where it comes from. Because the name is rebound to the looked-up value, running
# this cell a second time would index into that value rather than the original mapping.
DIR_DATA = DIR_DATA['LJSpeech']

In [6]:
# Load 'tokenizer.p' from the checkpoints directory with vai's pickle_load.
# NOTE(review): DIR_CHECKPOINTS is not defined in the cells shown here - confirm where
# it comes from. Presumably the file holds a fitted Keras Tokenizer (see `word_index` below).
tokenizer = pickle_load(os.path.join(DIR_CHECKPOINTS, 'tokenizer.p'))

In [7]:
# Token -> index table taken straight from the loaded tokenizer.
char_idx = tokenizer.word_index
# Number of distinct tokens known to the tokenizer.
# NOTE(review): Keras' Tokenizer.word_index is documented as 1-based; if these ids index
# an embedding table it likely needs vocab_size + 1 rows - confirm against the consumer.
vocab_size = len(char_idx)
# Reverse table (index -> token) for turning model ids back into text.
idx_char = {index: token for token, index in char_idx.items()}

## Load Data

In [8]:
# Load 'text_data.p' from the dataset directory with vai's pickle_load.
# Presumably this holds the text of each utterance - confirm against the cell that wrote it.
# NOTE(review): the variable name is misspelled ("uttarances"); it is left unchanged here
# because cells outside this view may reference it.
uttarances = pickle_load(os.path.join(DIR_DATA, 'text_data.p'))

# Create Model

## Define Hyperparameters

In [21]:
# Module-level hyperparameters. The model classes below capture these as default
# argument values at class-definition time, so changing a value here only takes
# effect after the class cells are re-run.

# Feature size of the encoder input/output and default query/key/value size in Attention.
embedding_dim = 256
# Default channel count of each ConvBlock (the convolution emits twice this many).
conv_channels = 64
# Default width of the 1-D convolution kernel in ConvBlock.
kernel_size = 5
# Default number of ConvBlocks stacked inside Encoder.
encoder_layers = 7
# Passed directly to nn.Dropout as `p`.
# NOTE(review): nn.Dropout's `p` is the probability of ZEROING an element, so 0.95 drops
# 95% of activations. If this figure was meant as a keep-probability, the intended
# value is 0.05 - confirm.
dropout_probability = 0.95
# Default hidden size of the query/key/value projections in Attention.
attention_size = 128

## Convolution Block

In [10]:
class ConvBlock(nn.Module):
    """Gated 1-D convolution block with a scaled residual connection.

    The input is passed through dropout, a convolution that doubles the channel
    count, and a gated linear unit that halves it again; the result is added to
    the original input and scaled by sqrt(0.5).

    Args:
        causal: If True, pad only on the left so that output step ``t`` depends
            on input steps ``<= t``. If False, pad both sides so the window is
            centred on ``t``.
        in_channels: Number of input (and output) channels.
        kernel_size: Width of the convolution kernel.
        dropout_probability: Probability of zeroing an input element (``p`` of
            ``nn.Dropout``).

    Shape:
        Input and output are ``(batch, in_channels, time)``.
    """

    def __init__(self, causal=False, in_channels=conv_channels, kernel_size=kernel_size, dropout_probability=dropout_probability):
        super().__init__()
        # Not in-place: an in-place dropout would overwrite the caller's tensor and
        # the residual branch in forward() would then add the *dropped* input.
        self.dropout = nn.Dropout(dropout_probability)

        # Padding is applied explicitly in forward() as (left, right) so that both
        # modes keep the time dimension unchanged. The original only built a
        # convolution for the non-causal case, leaving `self.conv` undefined when
        # causal=True.
        if causal:
            self.padding = (kernel_size - 1, 0)
        else:
            # Equal to symmetric (k - 1) // 2 padding for odd kernels; for even
            # kernels the extra step goes on the right so the length is preserved.
            self.padding = ((kernel_size - 1) // 2, kernel_size // 2)

        self.conv = nn.Conv1d(in_channels, 2 * in_channels, kernel_size)

    def forward(self, x):
        """Apply the block to ``x`` of shape ``(batch, in_channels, time)``."""
        out = F.pad(self.dropout(x), self.padding)
        # GLU splits the channels in half and computes a * sigmoid(b).
        out = F.glu(self.conv(out), dim=1)
        # Scale the residual sum by sqrt(0.5); plain Python float, no numpy needed.
        return (out + x) * (0.5 ** 0.5)

## Encoder

In [26]:
class Encoder(nn.Module):
    """Convolutional encoder producing attention keys and values.

    The input is projected to ``conv_channels``, passed through a stack of
    ``ConvBlock`` layers, and projected back to ``embedding_dim``.

    Args:
        embedding_dim: Feature size of the input and of the returned keys/values.
        conv_channels: Channel count used inside the convolution stack.
        encoder_layers: Number of ``ConvBlock`` layers.

    Shape:
        Input ``x`` is ``(batch, time, embedding_dim)``; both outputs have the
        same shape.
    """

    def __init__(self, embedding_dim=embedding_dim, conv_channels=conv_channels, encoder_layers=encoder_layers):
        super().__init__()
        self.fc_in = nn.Linear(embedding_dim, conv_channels)
        # One independent block per layer, held in an nn.ModuleList so that the
        # blocks' parameters are registered with this module (visible to
        # parameters()/state_dict() and affected by train()/eval()/.to()).
        # The previous `[block] * n` repeated a single instance in a plain list:
        # every layer shared the same weights and none were registered.
        # Device placement now follows the Encoder itself instead of being
        # applied to the blocks at construction time.
        self.conv_blocks = nn.ModuleList(
            [ConvBlock(in_channels=conv_channels) for _ in range(encoder_layers)]
        )
        self.fc_out = nn.Linear(conv_channels, embedding_dim)

    def forward(self, x):
        """Return ``(keys, values)`` for input ``x``."""
        # Conv1d expects (batch, channels, time), hence the transpose.
        out = self.fc_in(x).transpose(1, 2)
        for conv_block in self.conv_blocks:
            out = conv_block(out)
        keys = self.fc_out(out.transpose(1, 2))
        # Values are the keys plus the original input, scaled by sqrt(0.5).
        values = (keys + x) * (0.5 ** 0.5)
        return keys, values

## Attention Block

**TODO:**
* Add Positional Encodings
* Find dropout probabilities
* Find initialization strategy
* Use context normalization
* Apply windowed attention

In [107]:
class Attention(nn.Module):
    """Dot-product attention over encoder keys and values.

    Query, keys and values are each projected to ``hidden_size``. The softmax
    of the query-key dot products weights the values, and the result is
    projected back to ``embedding_dim``.

    Args:
        query_dim: Feature size of the query.
        embedding_dim: Feature size of the encoder keys/values and of the output.
        hidden_size: Size of the projected query/key/value space.
        dropout_probability: Probability of zeroing an attention weight (``p``
            of ``nn.Dropout``).
    """

    def __init__(self, query_dim=embedding_dim, embedding_dim=embedding_dim, hidden_size=attention_size,
                 dropout_probability=dropout_probability):
        super().__init__()
        self.fc_query = nn.Linear(query_dim, hidden_size)
        self.fc_keys = nn.Linear(embedding_dim, hidden_size)
        self.fc_values = nn.Linear(embedding_dim, hidden_size)
        self.fc_context = nn.Linear(hidden_size, embedding_dim)
        # Not in-place: softmax keeps its output for the backward pass, so
        # overwriting that tensor would make autograd fail.
        self.dropout = nn.Dropout(dropout_probability)

    def forward(self, query, encoder_context):
        """Compute the attention context.

        Args:
            query: Tensor of shape ``(batch, query_steps, query_dim)``.
            encoder_context: ``(keys, values)`` pair, each of shape
                ``(batch, encoder_steps, embedding_dim)``.

        Returns:
            Tensor of shape ``(batch, query_steps, embedding_dim)``.
        """
        keys, values = encoder_context

        query = self.fc_query(query)
        keys = self.fc_keys(keys)
        values = self.fc_values(values)

        # (batch, query_steps, encoder_steps) similarity scores.
        scores = query.bmm(keys.transpose(1, 2))
        # Normalise over the encoder steps; an explicit `dim` replaces the old
        # flatten-to-2D / implicit-dim softmax.
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, scaled by sqrt(encoder_steps).
        context = weights.bmm(values) / (values.size(1) ** 0.5)
        return self.fc_context(context)

## Decoder