In [1]:
from src.model.fastspeech_model import MultiHeadAttention

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import numpy as np
import torch
from torch import nn


class ScaledDotProductAttention2(nn.Module):
    ''' Scaled dot-product attention: softmax(q @ k^T / temperature) @ v.

    Args:
        temperature: divisor applied to the raw attention scores.
        attn_dropout: dropout probability applied to the attention weights.
    '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, q, k, v, mask=None):
        ''' Compute the attention weights and the weighted sum of the values.

        Args:
            q: queries, [..., len_q, d_k]
            k: keys,    [..., len_k, d_k]
            v: values,  [..., len_k, d_v]
            mask: optional boolean tensor broadcastable to [..., len_q, len_k];
                positions set to True are excluded from attention.

        Returns:
            (output, attn): output is [..., len_q, d_v] and attn is
            [..., len_q, len_k] (attention weights after dropout).

        Note:
            A query row whose keys are all masked gets a softmax over
            -inf only, which yields NaN for that row.
        '''
        # Raw similarity scores, scaled by the temperature: [..., len_q, len_k]
        attn = q @ k.transpose(-1, -2)
        attn = attn / self.temperature

        if mask is not None:
            # masked_fill is out-of-place; the result has to be kept,
            # otherwise the mask is silently ignored.
            attn = attn.masked_fill(mask, -torch.inf)

        attn = self.softmax(attn)
        attn = self.dropout(attn)
        output = attn @ v

        return output, attn


class MultiHeadAttention2(nn.Module):
    ''' Pre-norm multi-head attention with a residual connection on the query.

    Args:
        n_head: number of attention heads.
        d_model: feature size of the inputs and of the output.
        d_k: per-head size of the query/key projections.
        d_v: per-head size of the value projection.
        dropout: dropout probability applied after the output projection.
    '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head, self.d_k, self.d_v, self.d_model = n_head, d_k, d_v, d_model

        # One linear layer per role; all heads are packed along the output dim.
        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)

        self.attention = ScaledDotProductAttention2(temperature=d_k ** 0.5)
        self.layer_norm = nn.LayerNorm(d_model)

        # Output projection back to d_model.
        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)

        self.reset_parameters()

    def reset_parameters(self):
        ''' Re-draw the q/k/v projection weights from N(0, 2 / (d_model + d_head)). '''
        # normal distribution initialization better than kaiming(default in pytorch)
        projections = (
            (self.w_qs, self.d_k),
            (self.w_ks, self.d_k),
            (self.w_vs, self.d_v),
        )
        for proj, d_head in projections:
            nn.init.normal_(proj.weight, mean=0,
                            std=np.sqrt(2.0 / (self.d_model + d_head)))

    def forward(self, q, k, v, mask=None):
        ''' Attend from q over (k, v).

        Args:
            q, k, v: 3-D tensors [batch, length, d_model].
            mask: optional 3-D tensor; it is repeated across heads and
                forwarded to the attention module.

        Returns:
            (output, attn): output is [batch, len_q, d_model] (residual added),
            attn is [batch, n_head, len_q, len_k].
        '''
        # inspired by https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
        n_head = self.n_head

        _, len_q, _ = q.size()
        _, len_k, _ = k.size()
        sz_b, len_v, _ = v.size()

        skip = q

        def split_heads(x, proj, length, d_head):
            # pre-normalization, projection, then [b, len, n*d] -> [b, n, len, d]
            return proj(self.layer_norm(x)).view(sz_b, length, n_head, d_head).transpose(1, 2)

        q = split_heads(q, self.w_qs, len_q, self.d_k)
        k = split_heads(k, self.w_ks, len_k, self.d_k)
        v = split_heads(v, self.w_vs, len_v, self.d_v)

        # Give every head its own copy of the mask: b x n x .. x ..
        head_mask = None if mask is None else mask.unsqueeze(1).repeat(1, n_head, 1, 1)
        heads, attn = self.attention(q, k, v, mask=head_mask)

        # Merge the heads back: b x lq x (n*dv)
        merged = heads.transpose(1, 2).contiguous().view(sz_b, len_q, -1)

        projected = self.dropout(self.fc(merged))
        return projected + skip, attn

In [52]:
# Smoke test: run the re-implemented attention on random inputs and show the
# shapes of the output and of the attention weights.
q, k, v = (torch.randn(4 * 7, 11, 10) for _ in range(3))

mla2 = MultiHeadAttention2(n_head=7, d_model=10, d_k=5, d_v=5)
output2, attn2 = mla2(q, k, v)
output2.shape, attn2.shape

(torch.Size([28, 11, 10]), torch.Size([28, 7, 11, 11]))

In [53]:
# Pull out the batch size and the three sequence lengths; the feature size is
# discarded. sz_b ends up holding the batch size of v (last assignment wins).
sz_b, len_q, _ = q.shape
sz_b, len_k, _ = k.shape
sz_b, len_v, _ = v.shape

In [46]:
# Run the project's MultiHeadAttention on the same q, k, v for comparison.
# NOTE(review): the recorded run of this cell raised a RuntimeError in layer
# normalisation (normalized_shape=[4] vs. an input whose last dimension is 10).
# Presumably the second constructor argument is d_model, as in
# MultiHeadAttention2, and has to equal q.size(-1) — confirm against the
# project class. The input size in the recorded error also differs from the
# q built in the cell above, so this cell was likely run against stale
# kernel state — verify with Restart & Run All.
mla = MultiHeadAttention(4, 4, 4, 4)
output, attn = mla(q, k, v)

RuntimeError: Given normalized_shape=[4], expected input with shape [*, 4], but got input of size[112, 8, 10]

In [15]:
# Shapes returned by the project's MultiHeadAttention.
output.shape, attn.shape

(torch.Size([16, 8, 4]), torch.Size([64, 8, 8]))

In [71]:
# Random 4-D tensor used below to experiment with permute / transpose / view.
f = torch.randn((13, 20, 29, 3))

In [72]:
# permute reorders the axes: the old axis 2 becomes the leading axis.
f.permute(2, 0, 1, 3).shape

torch.Size([29, 13, 20, 3])

In [73]:
# transpose(1, 2) then swaps the two middle axes of the permuted tensor.
f.permute(2, 0, 1, 3).transpose(1, 2).shape

torch.Size([29, 20, 13, 3])

In [75]:
# Count the positions where a plain view agrees with view + transpose.
# Both sides have the same shape, but view only reinterprets memory while
# transpose moves axes, so the two are not the same tensor.
(f.view(29, 20, 3, 13) == f.view(29, 3, 20, 13).transpose(1, 2)).sum()

tensor(754)

In [76]:
# Small random integer matrix (values 1..9) for inspecting how view lays out rows.
a = torch.randint(low=1, high=10, size=(6, 8))

In [78]:
# Display the 6 x 8 matrix so it can be compared with its reshaped view below.
a

tensor([[4, 4, 8, 1, 9, 7, 6, 1],
        [9, 6, 3, 7, 8, 2, 8, 2],
        [3, 7, 3, 9, 1, 3, 1, 2],
        [9, 5, 8, 7, 5, 8, 9, 1],
        [4, 9, 3, 6, 6, 3, 9, 4],
        [4, 6, 8, 9, 1, 6, 4, 1]])

In [79]:
# Reinterpret the 6 x 8 matrix as 12 rows of 4: each original row is split
# into two consecutive rows (row-major order is kept).
a.view(-1, 4)

tensor([[4, 4, 8, 1],
        [9, 7, 6, 1],
        [9, 6, 3, 7],
        [8, 2, 8, 2],
        [3, 7, 3, 9],
        [1, 3, 1, 2],
        [9, 5, 8, 7],
        [5, 8, 9, 1],
        [4, 9, 3, 6],
        [6, 3, 9, 4],
        [4, 6, 8, 9],
        [1, 6, 4, 1]])

In [3]:
from src.model.fastspeech_model import FFTBlock
import torch

# Shape smoke test for one FFTBlock: the block must return a tensor with the
# same shape as its input.
batch_size, seq_len = 4, 12
hidden_size, intermediate_size = 16, 64
n_head = 4

# NOTE(review): the two `hidden_size // n_head` arguments are presumably the
# per-head key and value sizes, and the trailing tuples the conv kernel sizes
# and paddings — confirm against FFTBlock's signature.
fft_block = FFTBlock(
    hidden_size,
    intermediate_size,
    n_head,
    hidden_size // n_head,
    hidden_size // n_head,
    (1, 1),
    (0, 0),
)

inp_tensor = torch.rand((batch_size, seq_len, hidden_size), dtype=torch.float32)

# Only the first element of the block's return value is checked.
out_tensor = fft_block(inp_tensor)[0]

assert out_tensor.shape == inp_tensor.shape

In [1]:
from src.model.fastspeech2 import FastSpeech2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate FastSpeech2 with an explicit, fully spelled-out configuration.
model = FastSpeech2(
    max_seq_len=3000,
    # --- encoder ---
    encoder_n_layer=4,
    vocab_size=300,
    encoder_dim=256,
    PAD=0,
    encoder_conv1d_filter_size=1024,
    encoder_head=2,
    # --- decoder ---
    decoder_n_layer=4,
    decoder_dim=256,
    decoder_conv1d_filter_size=1024,
    decoder_head=2,
    # --- convolutions inside the FFT blocks ---
    fft_conv1d_kernel=[9, 1],
    fft_conv1d_padding=[4, 0],
    # --- pitch / energy / duration predictors ---
    pitch_predictor_filter_size=256,
    pitch_predictor_kernel_size=3,
    energy_predictor_filter_size=256,
    energy_predictor_kernel_size=3,
    dur_predictor_filter_size=256,
    dur_predictor_kernel_size=3,
    # --- pitch / energy embedding ranges ---
    # NOTE(review): the min/max values look like dataset statistics — confirm
    # they match the preprocessing of the corpus being used.
    num_embed=256,
    min_pitch=59.913448819015024,
    max_pitch=887.2688230720693,
    min_energy=15.023643,
    max_energy=91.4,
    # --- output ---
    num_mels=80,
    dropout=0.1,
)