In [52]:
import torchtext
import torch
from torchtext.data.utils import get_tokenizer
# Text field: tokenizes with the "basic_english" tokenizer, lower-cases,
# and wraps each example in <sos> / <eos> markers.
TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
                            init_token='<sos>',
                            eos_token='<eos>',
                            lower=True)
# WikiText-2 train / validation / test splits, processed through TEXT.
train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(TEXT)
# The vocabulary is built from the training split only.
TEXT.build_vocab(train_txt)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def batchify(data, bsz):
    """Numericalize a dataset split and arrange it into ``bsz`` parallel columns.

    Args:
        data: dataset split whose first example's ``text`` holds the token stream.
        bsz: number of columns (independent sequences) to split the stream into.

    Returns:
        A contiguous tensor on ``device``; trailing tokens that do not fill a
        complete row across all ``bsz`` columns are discarded.
    """
    token_ids = TEXT.numericalize([data.examples[0].text])
    # How many rows each of the bsz columns will hold.
    rows_per_column = token_ids.size(0) // bsz
    # Trim the remainder that does not divide evenly into bsz columns.
    usable = token_ids.narrow(0, 0, rows_per_column * bsz)
    # Cut the stream into bsz consecutive segments, then transpose so that
    # every column is one segment.
    columns = usable.view(bsz, -1).t().contiguous()
    return columns.to(device)

# Number of parallel columns for training and for evaluation batches.
batch_size = 20
eval_batch_size = 10
# Each split becomes one long tensor with `bsz` columns, moved to `device` by batchify.
train_data = batchify(train_txt, batch_size)
val_data = batchify(val_txt, eval_batch_size)
test_data = batchify(test_txt, eval_batch_size)

In [53]:
from transformer import TransformerModel
ntokens = len(TEXT.vocab.stoi)  # vocabulary size
emsize = 200                    # embedding dimension
nhid = 200                      # dimension of the feed-forward network inside nn.TransformerEncoder
nlayers = 2                     # number of nn.TransformerEncoderLayer layers inside nn.TransformerEncoder
nhead = 2                       # number of heads in the multi-head attention model
dropout = 0.2                   # dropout value
# NOTE(review): use_posenc=False presumably disables positional encoding in the
# project-local TransformerModel -- confirm against transformer.py.
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers, dropout, use_posenc=False).to(device)

In [54]:
def onehot(vec):
    """One-hot encode token ids over the ``ntokens`` vocabulary.

    Args:
        vec: integer-valued tensor of token ids in ``[0, ntokens)``; any shape.

    Returns:
        A ``torch.float`` tensor on ``device`` with shape ``vec.shape + (ntokens,)``.

    The encoding is built directly with ``one_hot`` rather than by indexing a
    dense ``ntokens x ntokens`` identity matrix, which would allocate several
    gigabytes on every call for a vocabulary of this size.
    """
    encoded = torch.nn.functional.one_hot(vec.long(), num_classes=ntokens)
    return encoded.to(device=device, dtype=torch.float)

In [55]:
bptt = 35  # maximum number of rows (time steps) per chunk


def get_batch(source, i):
    """Slice one input chunk and its next-token targets out of ``source``.

    Args:
        source: batchified tensor whose first dimension is the time axis.
        i: row offset at which the chunk starts.

    Returns:
        ``(data, target)`` where ``data`` is ``source[i:i+seq_len]`` and
        ``target`` is the same window shifted forward by one row, flattened
        to 1-D. ``seq_len`` is ``bptt`` except near the end of ``source``,
        where it shrinks so the shifted window stays in range.
    """
    span = min(bptt, len(source) - 1 - i)
    stop = i + span
    inputs = source[i:stop]
    labels = source[i + 1:stop + 1].view(-1)
    return inputs, labels

In [56]:
# Pull the first training chunk (offset 0) so its shapes can be inspected below.
data, targets = get_batch(train_data, 0)

In [57]:
# Inputs are laid out as (bptt, batch_size).
data.shape

torch.Size([35, 20])

In [60]:
# Targets are flattened by get_batch: 35 * 20 = 700 entries.
targets.shape

torch.Size([700])

In [58]:
# Cross-entropy between the flattened vocabulary logits and the target token ids.
criterion = torch.nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
# Multiply the learning rate by 0.95 on every scheduler.step() call.
# step_size is documented as an int, so pass 1 rather than 1.0.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

import time
def train():
    """Run one training epoch over ``train_data`` and log progress periodically.

    Relies on notebook-level globals: ``model``, ``train_data``, ``bptt``,
    ``get_batch``, ``criterion``, ``optimizer``, ``scheduler``, ``TEXT`` and
    the epoch-loop variable ``epoch`` (read only for the log line).
    Returns nothing; the model and optimizer are updated in place.
    """
    # Imported here so the function does not depend on a later cell having run.
    import math

    model.train()  # switch to training mode
    ntokens = len(TEXT.vocab.stoi)
    log_interval = 200  # number of batches between progress lines
    total_loss = 0.
    start_time = time.time()
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        # Clip the global gradient norm to 0.5 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        total_loss += loss.item()
        # NOTE: the first logged window covers batches 0..log_interval
        # (log_interval + 1 batches); later windows cover exactly log_interval.
        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            elapsed = time.time() - start_time
            # get_last_lr() reports the rate the optimizer is actually using;
            # get_lr() is not meant to be called outside scheduler.step().
            print('| epoch {:3d} | {:5d}/{:5d} batches | '
                  'lr {:02.2f} | ms/batch {:5.2f} | '
                  'loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch, len(train_data) // bptt, scheduler.get_last_lr()[0],
                    elapsed * 1000 / log_interval,
                    cur_loss, math.exp(cur_loss)))
            total_loss = 0
            start_time = time.time()

def evaluate(eval_model, data_source):
    """Compute the average per-row loss of ``eval_model`` over ``data_source``.

    Args:
        eval_model: model to evaluate; it is switched to eval mode.
        data_source: batchified tensor, iterated in chunks of ``bptt`` rows.

    Returns:
        The sum of chunk losses, each weighted by its number of rows, divided
        by ``len(data_source) - 1``.
    """
    eval_model.eval()  # switch to evaluation mode
    vocab_size = len(TEXT.vocab.stoi)
    last_row = data_source.size(0) - 1
    weighted_loss = 0.
    with torch.no_grad():
        for offset in range(0, last_row, bptt):
            inputs, labels = get_batch(data_source, offset)
            logits = eval_model(inputs).view(-1, vocab_size)
            # criterion returns a per-chunk mean, so weight it by the chunk
            # length to keep a short final chunk from being over-counted.
            weighted_loss += len(inputs) * criterion(logits, labels).item()
    return weighted_loss / (len(data_source) - 1)

In [59]:
import copy
import math

best_val_loss = float("inf")
epochs = 3
best_model = None

# Train for `epochs` epochs, validating after each one and keeping a snapshot
# of the model with the lowest validation loss.
for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(model, val_data)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
          'valid ppl {:8.2f}'.format(epoch, (time.time() - epoch_start_time),
                                     val_loss, math.exp(val_loss)))
    print('-' * 89)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Take a copy: a plain `best_model = model` only aliases the live
        # model, which later epochs keep updating in place.
        best_model = copy.deepcopy(model)

    # Decay the learning rate once per epoch.
    scheduler.step()

| epoch   1 |   200/ 2981 batches | lr 5.00 | ms/batch 17.22 | loss  9.84 | ppl 18696.91
| epoch   1 |   400/ 2981 batches | lr 5.00 | ms/batch 14.78 | loss  9.60 | ppl 14758.77
| epoch   1 |   600/ 2981 batches | lr 5.00 | ms/batch 14.83 | loss  9.60 | ppl 14729.40
| epoch   1 |   800/ 2981 batches | lr 5.00 | ms/batch 14.76 | loss  9.60 | ppl 14784.71
| epoch   1 |  1000/ 2981 batches | lr 5.00 | ms/batch 14.76 | loss  9.60 | ppl 14718.75
| epoch   1 |  1200/ 2981 batches | lr 5.00 | ms/batch 14.84 | loss  9.60 | ppl 14788.90
| epoch   1 |  1400/ 2981 batches | lr 5.00 | ms/batch 14.79 | loss  9.60 | ppl 14824.71
| epoch   1 |  1600/ 2981 batches | lr 5.00 | ms/batch 14.79 | loss  9.61 | ppl 14918.87
| epoch   1 |  1800/ 2981 batches | lr 5.00 | ms/batch 14.88 | loss  9.60 | ppl 14767.52
| epoch   1 |  2000/ 2981 batches | lr 5.00 | ms/batch 14.80 | loss  9.60 | ppl 14782.57
| epoch   1 |  2200/ 2981 batches | lr 5.00 | ms/batch 14.79 | loss  9.60 | ppl 14800.43
| epoch   1 |  2400/ 

KeyboardInterrupt: 

In [11]:
# Scratch forward pass on the sample batch.
# NOTE(review): this cell's execution count (In [11]) is lower than the cells
# above it, so it ran against kernel state from an earlier session order.
A = model(data)

In [12]:
# The model output has one trailing axis of size 28785 per input position;
# train()/evaluate() flatten it with view(-1, ntokens).
A.shape

torch.Size([35, 20, 28785])

In [35]:
# Repeat of the earlier input-shape check on the sample batch.
data.shape

torch.Size([35, 20])

In [16]:
# Same forward pass as the `A = model(data)` cell, bound to a different name.
output = model(data)

In [20]:
# Same shape as `A` above.
output.shape

torch.Size([35, 20, 28785])

In [22]:
# Scratch experiment with binary cross-entropy; none of the training code
# shown in this notebook uses it.
bceloss = torch.nn.BCELoss()

In [24]:
# BCE of three predicted probabilities against soft targets of 0.5.
bceloss(torch.Tensor([0.1,0.7,0.8]),torch.Tensor([0.5,0.5,0.5]))

tensor(0.9669)

In [31]:
def generate_square_subsequent_mask(sz):
    """Build an additive ``(sz, sz)`` float mask that hides future positions.

    Entry ``[row, col]`` is ``0.0`` when ``col <= row`` and ``-inf`` when
    ``col > row``.
    """
    blocked = torch.full((sz, sz), float('-inf'))
    # Keep -inf strictly above the main diagonal; everything else becomes 0.
    return torch.triu(blocked, diagonal=1)

In [33]:
# Sanity check: the mask is square, (sz, sz).
generate_square_subsequent_mask(10).shape

torch.Size([10, 10])