### Tokenzier 사용하기

In [6]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from typing import Iterable, List
from model import transformer
from data import fr_to_en
import utils
import torch.nn as nn
import pandas as pd
import json
import torch


### Data_load
fr_train = utils.open_text_set("data/training/train.fr")
en_train = utils.open_text_set("data/training/train.en")
vocab_transform, token_transfrom = utils.make_vocab(fr_train, en_train)

decoder_en = {v:k for k,v in vocab_transform['en'].get_stoi().items()}
decoder_fr = {v:k for k,v in vocab_transform['fr'].get_stoi().items()}

# param
SRC_LANGUAGE = "fr"
TGT_LANGUAGE = "en"


In [39]:
# xavier 
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# loss_fn
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=1)

# optimzer
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

def collate_fn(batch_iter: Iterable):
    """
    Data_Loader에서 사용하는 매서드
    """
    text_transform = {}
    for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
        '''
        token_result->vocab_result->tensor_transform

        변수를 정한다음 함수쓰듯 값을 넣으면 3번을 연속으로 수행
        x = sequntial_transforms(transform)
        x('je') => [2,0,3]
        '''
        text_transform[ln] = utils.sequential_transforms(
            token_transfrom[ln],  # 토큰화(Tokenization)
            vocab_transform[ln],  # 수치화(Numericalization)
            utils.tensor_transform,
        )  # BOS/EOS를 추가하고 텐서를 생성
    
    src_batch, tgt_batch = [], []
    for src_sample, tgt_sample in batch_iter:
        src_batch.append(text_transform[SRC_LANGUAGE](src_sample))
        tgt_batch.append(text_transform[TGT_LANGUAGE](tgt_sample))

    PAD_IDX = 1
    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX)
    tgt_batch = pad_sequence(tgt_batch, padding_value=PAD_IDX)
    return src_batch.T, tgt_batch.T


def helper_what_sen(src,trg,logits,i,c=10,sen_num=0) : 
    if i % c == 0 :
        src_sen = ' '.join([decoder_fr[i] for i in src.tolist()[sen_num] if decoder_fr[i][0] != '<' ])
        trg_sen = ' '.join([decoder_en[i] for i in trg.tolist()[sen_num] if decoder_en[i][0] != '<' ])
        prob = logits.squeeze(0).max(dim=-1, keepdim=False)[1][sen_num]
        prd_sen = ' '.join([decoder_en[i] for i in prob.tolist() if decoder_en[i] != '<' ])
        print(f'{i}번째 중 {sen_num}번째 문장')
        print('src : ',src_sen)
        print('prd : ',prd_sen)
        print('trg : ',trg_sen)
    return None
    
def train_epoch(model,optimizer) : 
    model.train()
    losses = 0
    # Load_Dataset
    dataset= fr_to_en(set_type='training')

    # Data_loader
    batch_size = 128
    train_dataloader = DataLoader(dataset,batch_size,collate_fn=collate_fn)

    j = [i for i in train_dataloader]

    # for src,tgt in j[:3] :
    for i,(src,tgt) in enumerate(train_dataloader) :
        src = src.to(device)
        tgt = tgt.to(device)
        logits = model(src,tgt)
        helper_what_sen(src,tgt,logits,i)
        optimizer.zero_grad()
        # tgt_out = tgt[1:,:]
        loss = loss_fn(logits.reshape(-1,logits.shape[-1]),tgt.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)


def evaluate(model):
    model.eval()
    losses = 0
    # Load_Dataset
    dataset= fr_to_en(set_type='validation')

    # Data_loader
    batch_size = 128
    val_dataloader = DataLoader(dataset,batch_size,collate_fn=collate_fn)

    for i,(src,tgt) in enumerate(val_dataloader) :
        
        src = src.to(device)
        tgt = tgt.to(device)
        logits = model(src,tgt)

        helper_what_sen(src,tgt,logits,i)
        
        loss = loss_fn(logits.reshape(-1,logits.shape[-1]),tgt.reshape(-1))

        losses += loss.item()

    return losses / len(val_dataloader)




### 훈련이 잘되고 있는지 확인하는 helper 만들기

In [88]:
def helper_what_sen(src,trg,logits,i,c=10,sen_num=0) : 
    if i % c == 0 :
        src_sen = ' '.join([decoder_fr[i] for i in src.tolist()[sen_num] if decoder_fr[i][0] != '<' ])
        trg_sen = ' '.join([decoder_en[i] for i in trg.tolist()[sen_num] if decoder_en[i][0] != '<' ])
        prob = logits.squeeze(0).max(dim=-1, keepdim=False)[1][0]
        prd_sen = ' '.join([decoder_en[i] for i in prob.tolist() if decoder_en[i] != '<' ])
        print(f'{i}번째 중 {sen_num}번째 문장')
        print('src : ',src_sen)
        print('prd : ',prd_sen)
        print('trg : ',trg_sen)
    return None


def evaluate(model):
    model.eval()
    losses = 0
    # Load_Dataset
    dataset= fr_to_en(set_type='validation')

    # Data_loader
    batch_size = 128
    val_dataloader = DataLoader(dataset,batch_size,collate_fn=collate_fn)

    for i,(src,tgt) in enumerate(val_dataloader) :
        
        src = src.to(device)
        tgt = tgt.to(device)
        logits = model(src,tgt)
        helper_what_sen(src,tgt,logits,i,c=1)
        loss = loss_fn(logits.reshape(-1,logits.shape[-1]),tgt.reshape(-1))
        if i % 10 == 0 :
            global logits_2
            logits_2 =logits

        losses += loss.item()

    return losses / len(val_dataloader)
# prob = logits.squeeze(0).max(dim=-1, keepdim=False)[1] # logits.reshape(-1,logits.shape[-1])[1] 이거와 결과가 같네;; 신기하네

In [116]:
print(logits_2.reshape(-1,logits_2.shape[-1]).max(dim=1)[0].size())

print(logits_2.squeeze(0).max(dim=-1, keepdim=False)[0].size())

torch.Size([3840])
torch.Size([128, 30])


In [105]:
prob = logits_2.reshape(-1,logits_2.shape[-1]).max(dim=1)[-1]

print(logits_2.reshape(-1,logits_2.shape[-1])[0])
# prd_sen = ' '.join([decoder_en[i] for i in prob.tolist() if decoder_en[i] != '<' ])

# prd_sen

tensor([-26.6254, -30.9601,   0.0000,  ..., -29.1219, -29.2782, -27.4871],
       grad_fn=<SelectBackward0>)


In [91]:
prob.size()

torch.Size([118, 33])

In [89]:
import torch

model = torch.load('model/model3.pt',map_location=torch.device('cpu'))
model.device = 'cpu'
model.eval()

val_loss = evaluate(model)

# from timeit import default_timer as timer
# NUM_EPOCHS = 1

# for epoch in range(1, NUM_EPOCHS+1):
#     start_time = timer()
#     train_loss = train_epoch(model, optimizer)
#     end_time = timer()
#     val_loss = evaluate(model)
#     print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

Dataset is validation
0번째 중 0번째 문장
src :  Un groupe d' hommes chargent du coton dans un camion
prd :  <bos> A group of men are fighting attempts onto a truck <eos> 2012 2012 2012 2012 2012 2012 <eos> 2012 2012 2012 2012 2012 gain gain 2012 2012 2012 2012
trg :  A group of men are loading cotton onto a truck
1번째 중 0번째 문장
src :  Un homme dans un petit bateau blanc sur un lac .
prd :  <bos> Man in a small white boat on a lake . <eos> 2012 2012 2012 scoop 2012 2012 2012 2012 2012 2012 2012 substance 2012 2012 2012 substance 2012 2012 2012
trg :  Man in a small white boat on a lake .
2번째 중 0번째 문장
src :  Un chien noir dans l' herbe , tenant un objet en plastique blanc dans sa gueule .
prd :  <bos> A black dog standing in some grass holding a white plastic palm in its mouth . <eos> complex 2012 substance 2012 2012 2012 substance substance 2012 2012 puck substance 2012 gain
trg :  A black dog standing in some grass holding a white plastic item in its mouth .
3번째 중 0번째 문장
src :  Deux garçons à 

In [114]:
src = torch.tensor([[2,2012, 4805, 1536,3]])
ys = torch.ones(1, 1).fill_(2).type(torch.long)
b = model.encode(src)
max_len = 10
for i in range(max_len-1):
  memory= b
  out = model.decode(src,ys,memory)
  _, next_word = torch.max(out,dim=-1)
  next_word = next_word.item()
  ys = torch.cat([ys,
                  torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
  if next_word == 3:
      break
print(ys)
  

Model_Parameter
{'src_vocab_size': 11509, 'trg_vocab_size': 10837, 'src_pad_idx': 1, 'trg_pad_idx': 1, 'embed_size': 512, 'num_layers': 3, 'forward_expansion': 2, 'heads': 8, 'dropout': 0.1, 'device': 'cpu', 'max_length': 140}
5 5
1 5
1 5


RuntimeError: shape '[2, 8, 5, 64]' is invalid for input of size 2560

In [None]:
# param
SRC_LANGUAGE = "fr"
TGT_LANGUAGE = "en"

with open('config/transformer.json', 'r') as file:
    param = json.load(file)
    print('Model_Parameter')
    print(param)    
model = ntransformer(**param)

def train_epoch(model,optimizer) : 
    model.train()
    losses = 0
    # Load_Dataset
    dataset= fr_to_en(set_type='training')

    # Data_loader
    batch_size = 128
    train_dataloader = DataLoader(dataset,batch_size,collate_fn=collate_fn)

    j = [i for i in train_dataloader]

    # for src,tgt in train_dataloader :
    for src,tgt in train_dataloader :
        
        # tgt_input = tgt[:, :-1]
        logits = model(src,tgt)
        optimizer.zero_grad()

        # tgt_out = tgt[:,1:]
        loss = loss_fn(logits.reshape(-1,logits.shape[-1]),tgt.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()

    return losses / len(train_dataloader)
        

train_loss = train_epoch(model, optimizer)

In [88]:
def decode_to_en(result) :
    encoding_val = result.argmax().item()
    return decoder_en[encoding_val]

def decode_to_fr(result) :
    if type(result) == torch.Tensor() :
        encoding_val = result.argmax().item()
    elif type(result) == int :
        encoding_val = result
    return decoder_fr[encoding_val]


decode_to_en(b)
decode_to_fr(4805)

'suis'

In [89]:
vocab_transform['fr'](['bleu','vert'])


[56, 108]

In [6]:
def evaluate(model):
    model.eval()
    losses = 0
    # Load_Dataset
    dataset= fr_to_en(set_type='validation')

    # Data_loader
    batch_size = 128
    val_dataloader = DataLoader(dataset,batch_size,collate_fn=collate_fn)

    for i,(src,tgt) in enumerate(val_dataloader) :
        tgt_input = tgt[:, :-1]
        
        logits = model(src,tgt)
        optimizer.zero_grad()

        tgt_out = tgt[:,1:]
        loss = loss_fn(logits.reshape(-1,logits.shape[-1]),tgt.reshape(-1))
        loss.backward()

        optimizer.step()
        losses += loss.item()
        if i % 20 == 0 :
            print(f'{i}번째 진행')
    return losses / len(val_dataloader)

evaluate(model)

0번째 진행


4.730595707893372

In [None]:
from timeit import default_timer as timer
NUM_EPOCHS = 1

for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(transformer, optimizer)
    end_time = timer()
    val_loss = evaluate(transformer)
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f}, Val loss: {val_loss:.3f}, "f"Epoch time = {(end_time - start_time):.3f}s"))

In [None]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)
    for i in range(max_len-1):
        memory = memory.to(DEVICE)
        tgt_mask = (generate_square_subsequent_mask(ys.size(0))
                    .type(torch.bool)).to(DEVICE)
        out = model.decode(ys, memory, tgt_mask)
        out = out.transpose(0, 1)
        prob = model.generator(out[:, -1])
        _, next_word = torch.max(prob, dim=1)
        next_word = next_word.item()

        ys = torch.cat([ys,
                        torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=0)
        if next_word == EOS_IDX:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model: torch.nn.Module, src_sentence: str):
    model.eval()
    text_transform = {}
    for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
        text_transform[ln] = ut.sequential_transforms(
            token_transfrom[ln],  # 토큰화(Tokenization)
            vocab_transform[ln],  # 수치화(Numericalization)
            ut.tensor_transform,
        )  # BOS/EOS를 추가하고 텐서를 생성
    src = text_transform[SRC_LANGUAGE](src_sentence).view(-1, 1)
    model(src)

    return " ".join(vocab_transform[TGT_LANGUAGE].lookup_tokens(list(tgt_tokens.cpu().numpy()))).replace("<bos>", "").replace("<eos>", "")

[2012, 4805, 1536]