In [239]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


device = 'cuda' if torch.cuda.is_available() else 'cpu'


### Building vocab

In [101]:
dataset_df = pd.read_csv("data/dataset_sentence_level.csv")
print(len(dataset_df))
dataset_df.head()
all_sentences = dataset_df['sentence'].values.tolist()
all_words = ['<PAD>', '<SOS>', '<EOS>'] #including special tokens

all_splitted_sentences = []
max_len = 0
for sentences in all_sentences:
    words = sentences.split()
    if len(words) > max_len:
        max_len = len(words)
    all_splitted_sentences.append(words)
    [all_words.append(w) for w in words if w not in all_words]

vocab = {w: idx for idx, w in enumerate(all_words)}

pad_idx = vocab['<PAD>']
sos_idx = vocab['<SOS>']
eos_idx = vocab['<EOS>']
f"Num words: {len(words)} - max sentence length {max_len}"

'Num words: 7 - max sentence length 78'

In [102]:

all_input_idx = []
all_target_idx = []
all_sequence_len = []
for i, sentence in enumerate(all_splitted_sentences):
    word_idx = [vocab[w] for w in sentence]
    # including <eos> and <sos> tokens
    input_idx = [sos_idx] + word_idx
    target_idx = word_idx + [eos_idx]
    # padding both sequences
    pad_len = (max_len + 1) - len(word_idx)
    pad_input_idx = input_idx + ([pad_idx] * pad_len)
    pad_target_idx = target_idx + ([pad_idx] * pad_len)

    all_input_idx.append(pad_input_idx)
    all_target_idx.append(pad_target_idx)

    all_sequence_len.append(len(input_idx))

tensor_input = torch.tensor(all_input_idx, device=device)
tensor_output = torch.tensor(all_target_idx, device=device)
tensor_len = torch.tensor(all_sequence_len, device=device)
dataset = TensorDataset(tensor_input, tensor_output, tensor_len)

## Autoencoder
- Incluir <SOS> token no dado de treino
- A inferência é autoregressiva, SOS serve para dar o start no processo iterativo

In [260]:
latent_dim = 10
categorical_dim = 10  # one-of-K vector

def sample_gumbel(shape, eps=1e-20):
    U = torch.rand(shape)
    return -Variable(torch.log(-torch.log(U + eps) + eps))

def gumbel_softmax_sample(logits, temperature):
    y = logits + sample_gumbel(logits.size())
    return F.softmax(y / temperature, dim=-1)

def gumbel_softmax(logits, temperature, hard=False):
    """
    ST-gumple-softmax
    input: [*, n_class]
    return: flatten --> [*, n_class] an one-hot vector
    """
    print(f"gumbel input size {logits.size()}")
    y = gumbel_softmax_sample(logits, temperature)
    print(f"gumbel size {y.size()}")

    if not hard:
        print(f"gumbel final size {y.view(-1, latent_dim * categorical_dim).size()}")
        return y.view(-1, latent_dim * categorical_dim)

    shape = y.size()
    _, ind = y.max(dim=-1)
    y_hard = torch.zeros_like(y).view(-1, shape[-1])
    y_hard.scatter_(1, ind.view(-1, 1), 1)
    y_hard = y_hard.view(*shape)
    # Set gradients w.r.t. y_hard gradients w.r.t. y
    y_hard = (y_hard - y).detach() + y

    return y_hard.view(-1, latent_dim * categorical_dim)

class Autoencoder(nn.Module):

    def __init__(self):
        super(Autoencoder, self).__init__()
        # encoder
        self.embedding = nn.Embedding(embedding_dim=100, num_embeddings=len(vocab), padding_idx=pad_idx)
        self.lstm_encoder = nn.GRU(batch_first=True, hidden_size=100, input_size=100, bidirectional=True)
        # VAE
        self.lstm_decoder = nn.GRU(batch_first=True, hidden_size=100, input_size=100)
        self.gumbel_input = nn.Linear(200, latent_dim*categorical_dim)
        # converte o z em um vetor para ser usado como h_t no decoder lstm
        self.z_embedding = nn.Linear(latent_dim * categorical_dim, 100)

        self.output_layer = nn.Linear(in_features=100, out_features=len(vocab)) #

    def encode(self, x, seq_len):
        x_emb = self.embedding(x)
        x_pack = pack_padded_sequence(x_emb, seq_len.numpy(), batch_first=True)
        print(type(x_pack))
        x, ht = self.lstm_encoder(x_pack)
        encoded_sequence = ht.view(ht.size(1), ht.size(2) * 2) # bidirectional -> + <-
        return x, encoded_sequence, x_pack

    def decoder(self, input, z, max_seq_len):
        z_emb = self.z_embedding(z).unsqueeze(0) # z_emb será usado como h inicial do LSTM decoder
        print(f"z_emb {z_emb.size()}")
        #x = input + z_emb - educating text-autoencoder
        #z_emb = z_emb.view(-1, 2, 100)
        print(f"z_emb after view {z_emb.size()}")
        hidden = (z_emb, z_emb) # h_t, c_t
        x, (h_t, c_t) = self.lstm_decoder(input, hidden)
        # TODO: incluir tamanho max da sequencia original para calcular o loss
        x, _ = pad_packed_sequence(x, batch_first=True, total_length=max_seq_len)
        x = self.output_layer(x)
        return x

    def forward(self, x, seq_len, temperature):
        # ordering by sequence length
        sorted_lengths, sorted_idx = torch.sort(seq_len, descending=True)
        x = x[sorted_idx]

        batch_size, max_seq_len, _ = x.size() # [batch_size, maximum seq_len from current batch, dim]

        x, encoded_sequence, x_pack = self.encode(x, sorted_lengths)
        q_y = self.gumbel_input(encoded_sequence)
        z = gumbel_softmax(q_y, temperature=temperature)
        x = self.decoder(x_pack, z, max_seq_len)
        return x, q_y


In [262]:
def loss_function(y, y_hat, qy):
    recon_loss = nn.CrossEntropyLoss(y, y_hat)

    qy_softmax = F.softmax(qy)
    log_ratio = torch.log(qy_softmax * categorical_dim - 1e-20)
    KLD = torch.sum(qy * log_ratio, dim=-1).sum()
    return recon_loss + KLD

model = Autoencoder()
train_dataloader = DataLoader(dataset, batch_size=32)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

temp = 1.
temp_min = 0.5
ANNEAL_RATE = 0.00003

for batch_idx, batch in enumerate(train_dataloader):
    x, y, seq_lens = batch
    optimizer.zero_grad()
    y_hat, qy = model(x, seq_lens, temperature=temp)
    loss = loss_function(y=y, y_hat=y_hat, qy=qy)
    loss.backward()
    optimizer.step()
    if batch_idx % 100 == 1:
        temp = np.maximum(temp * np.exp(-ANNEAL_RATE * batch_idx), temp_min)
    break


ValueError: not enough values to unpack (expected 3, got 2)

In [252]:
torch.rand(20,20).unsqueeze(-1).size()

torch.Size([20, 20, 1])

In [222]:


import numpy as np
np.log(1/categorical_dim)

-2.3025850929940455

In [210]:
KLD = torch.sum(q_y * log_ratio, dim=-1).mean()
KLD

tensor(-2.2991, grad_fn=<MeanBackward0>)

# in this case
- log equivalence: log(x) - log (y) = log(x/y)
- $$log(q_y) - log(\frac{1}{N}) = log(\frac{q_y}{\frac{1}{N}}) = log(q_y * N)$$

In [232]:
log_qy = torch.log(q_y)
g = torch.log(torch.tensor([1/categorical_dim])) # log(q_y) - log(1/cat_dim)
(log_qy - g).sum()

tensor(-230.6127, grad_fn=<SumBackward0>)

In [234]:
log_ratio = torch.log(q_y * categorical_dim + 1e-20) # log(q_y * 1/cat_dim)
log_ratio.sum()

tensor(-230.6127, grad_fn=<SumBackward0>)

In [276]:
# Example of target with class indices
loss = nn.CrossEntropyLoss(reduction='none')
input = torch.randn(3, 5, requires_grad=True).unsqueeze(0)
target = torch.empty(3, dtype=torch.long).random_(5).unsqueeze(0)
print(input.size(), target.size())
output = loss(input, target)
output

torch.Size([1, 3, 5]) torch.Size([1, 3])


RuntimeError: Expected target size [1, 5], got [1, 3]

In [279]:
x = torch.randn(96, 16, 160)
y = torch.empty(96, 16, dtype=torch.long).random_(160)
print(x.view(-1, 160).size(), y.view(-1).size())
criterion = nn.CrossEntropyLoss()
loss = criterion(x.view(-1, 160), y.view(-1))
loss

torch.Size([1536, 160]) torch.Size([1536])


tensor(5.5871)