In [189]:
import numpy as np
import torch
import math
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorboardX as tbx

from src.models import BiLM, SoftmaxLoss
from src.preprocessing import Tokenizer

In [2]:
%load_ext autoreload
%autoreload 2

## BiLM test

In [3]:
sent = torch.tensor([[1, 3, 4, 5, 2, 0]
                       , [1, 4, 3, 6, 4, 2]])

In [4]:
inputs = sent.transpose(1, 0).view(-1, 2)

In [215]:
sent.shape

torch.Size([2, 6])

In [216]:
inputs.shape

torch.Size([6, 2])

In [241]:
bi_lm_model = BiLM(100, 10, 7)

loss_func = SoftmaxLoss()

forward_output, backword_output, c = bi_lm_model(inputs)

loss = loss_func(forward_output, backword_output, sent)

optimizer = torch.optim.SGD(bi_lm_model.parameters(), lr=0.01, momentum=0.9)

In [242]:
for epoch in range(10):
    epoch_loss = 0.0
    for batch in range(10):
        #inputs, target = batch
        
        optimizer.zero_grad()
        
        forward_output, backword_output, c = bi_lm_model(inputs)
        loss = loss_func(forward_output, backword_output, sent)
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.data
    

tensor(2.8676)
tensor(2.8327)
tensor(2.7853)
tensor(2.7352)
tensor(2.6856)
tensor(2.6375)
tensor(2.5910)
tensor(2.5462)
tensor(2.5031)
tensor(2.4617)


In [247]:
c[0, :, :], c[1, :, :]

(tensor([[ 0.1117, -0.0517, -0.1969,  0.2721, -0.1754,  0.4400, -0.3322,  0.1445,
          -0.0375,  0.5117],
         [ 0.7798,  0.2518, -0.4081,  0.4856, -0.0018,  0.7685,  0.0832,  0.0889,
          -0.4110,  1.3958]], grad_fn=<SliceBackward>),
 tensor([[-1.3118, -0.1223,  0.1932, -0.8429,  0.0526, -0.4303, -0.2412,  0.4959,
          -0.3689,  0.3246],
         [-1.8531, -0.0904,  0.0867, -0.9414, -0.1572, -0.5205, -0.3179,  0.4663,
          -0.6213,  0.2895]], grad_fn=<SliceBackward>))

## Preprocessing test

In [183]:
sentences = [
    ['All', 'work', 'and', 'no', 'play'],
    ['makes', 'Jack', 'a', 'dull', 'boy', '.'],
    ['MAKE', 'AMERICA', 'GREAT', 'AGAIN'],
    ['Poyoi']
]

In [184]:
def attach_BOS_EOS(sentences):
    _sents = sentences.copy()
    for s in _sents:
        s.insert(0, '<BOS>')
        s.append('<EOS>')
    
    return _sents

In [185]:
tokenizer = Tokenizer()

tokenizer.fit_word(sentences)

sentences = attach_BOS_EOS(sentences)
sentences = tokenizer.transform_word(sentences)
sentences

[[1, 4, 5, 6, 7, 8, 2],
 [1, 9, 10, 11, 12, 13, 14, 2],
 [1, 15, 16, 17, 18, 2],
 [1, 19, 2]]

In [186]:
def batch_generator(data, batch_size):
    data_size = len(data)
    num_batches = math.ceil(data_size / batch_size)
    
    shuffle_indices = np.random.permutation(np.arange(data_size))
    shuffle_data = np.array(data)[shuffle_indices]
    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        batch_X = shuffle_data[start_index:end_index]
        batch_X = pad_sequences(batch_X, padding='post')
        
        yield (batch_num + 1), batch_X

In [7]:
for i, x in batch_generator(sentences, 2):
    print(i)
    print(x)

1
[[ 1  9 10 11 12 13 14  2]
 [ 1 15 16 17 18  2  0  0]]
2
[[ 1 19  2  0  0  0  0]
 [ 1  4  5  6  7  8  2]]


In [206]:
bi_lm_model = BiLM(100, 10, len(tokenizer.vocab_word))
loss_func = SoftmaxLoss()
optimizer = torch.optim.SGD(bi_lm_model.parameters(), lr=0.01, momentum=0.9)

In [207]:
#writer = tbx.SummaryWriter()

batch_size = 2
num_batches = math.ceil(len(sentences) / batch_size)

for epoch in range(5):
    epoch_loss = 0.0
    for i, data in batch_generator(sentences, batch_size):
        # data shape: (batch_size, timestamp)
        data = torch.tensor(data).long()
        
        # inputs shape: (timestamp, batch_size)
        inputs = data.transpose(1, 0).view(-1, data.shape[0])
            
        optimizer.zero_grad()
        forward_output, backword_output, c = bi_lm_model(inputs)
        loss = loss_func(forward_output, backword_output, data)
        loss.backward()
        optimizer.step()
        
        #writer.add_scalar('loss', loss.data, global_step=(epoch * batch_size + i))
        epoch_loss += loss.data
    
    print(epoch_loss / num_batches)
    
#writer.close()

tensor(0.4018)
tensor(0.4035)
tensor(0.3825)
tensor(0.4014)
tensor(0.4423)
