# Training on GPU

In [1]:
# Mount Google Drive so the project files are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Make the project directory the working directory; later cells use relative
# paths such as 'config/data.json' and 'results/model/'.
import os
os.chdir('/content/drive/MyDrive/Programming/Projects/kor-to-eng-translation')
# List the directory contents to confirm the chdir landed in the project root.
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
config	eval.py  README.md  train_gpu.ipynb  transformer
data	lib	 results    train.py


In [2]:
# Install konlpy into the runtime (no version is pinned here).
!pip install konlpy 



In [3]:
# List the working directory again (same listing as the first cell).
!ls

config	eval.py  README.md  train_gpu.ipynb  transformer
data	lib	 results    train.py


In [4]:
#@title Transformer layers { display-mode: "form" }
"""
Word Embedding & Positional Embedding
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F


# Module-level compute device: CUDA when available, otherwise CPU.
# Several helpers in this cell move their results onto it via `.to(device)`.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

#####################################

class Affine(nn.Module):
    """Fully connected layer computing ``inp @ W + b``.

    The output passes through ReLU unless ``linear=True`` is requested.
    ``W`` is Xavier-normal initialised and ``b`` is drawn from U[0, 1).
    """

    def __init__(self, i_dim, o_dim):
        """
        Args:
            i_dim (int): size of the last dimension of the input
            o_dim (int): size of the last dimension of the output
        """
        super(Affine, self).__init__()
        self.W = nn.Parameter(nn.init.xavier_normal_(torch.empty(i_dim, o_dim)))
        self.b = nn.Parameter(nn.init.uniform_(torch.empty(o_dim)))

    def forward(self, inp, linear=False):
        """
        Args:
            inp (Tensor): [..., i_dim], e.g. [bsize, maxlen, emb_size]
            linear (bool): if True, skip the ReLU and return the raw projection
        Returns: [..., o_dim], e.g. [bsize, maxlen, hid_size]
        """
        # torch.matmul broadcasts over leading dimensions, so both branches
        # accept 2-D and 3-D inputs (torch.mm only accepts 2-D matrices).
        out = torch.matmul(inp, self.W) + self.b
        if linear:
            return out
        return F.relu(out)


class NormLayer(nn.Module):
    """Layer normalisation over the feature (last) dimension, followed by a learned Affine map."""

    def __init__(self, d_inp, eps=1e-05):
        """
        Args:
            d_inp (int): feature dimension being normalised
            eps (float): added to the variance for numerical stability
        """
        super(NormLayer, self).__init__()
        self.eps = eps
        # NOTE(review): Affine applies ReLU by default, so this layer's output
        # is non-negative -- confirm that this is intended.
        self.gamma = Affine(d_inp, d_inp)

    def forward(self, x):
        """
        Args:
            x (Tensor): [bsize, maxlen, dim]
        Returns: [bsize, maxlen, dim]
        """
        # Statistics are computed per position over the feature dimension.
        # Reducing over the whole tensor would make every output depend on
        # the other sequences in the batch and on their padding.
        mean = x.mean(dim=-1, keepdim=True)
        # Biased variance stays finite even when dim == 1.
        var = x.var(dim=-1, unbiased=False, keepdim=True)
        return self.gamma((x - mean) / torch.sqrt(var + self.eps))


class Attention(nn.Module):
    """Single-head scaled dot-product attention with its own Q/K/V projections."""

    def __init__(self, d_inp, d_q, d_k, d_v):
        """
        Args:
            d_inp (int): feature size of the incoming query/key/value tensors
            d_q (int): projected query size (must equal d_k for the dot product)
            d_k (int): projected key size
            d_v (int): projected value size
        """
        super(Attention, self).__init__()
        # NOTE(review): Affine applies ReLU by default, so the projections
        # are non-negative -- confirm that this is intended.
        self.Wq = Affine(d_inp, d_q)
        self.Wk = Affine(d_inp, d_k)
        self.Wv = Affine(d_inp, d_v)

    def forward(self, query, key, value, mask=None):
        """
        Args:
            query (Tensor): [bsize, q_len, d_inp]
            key (Tensor): [bsize, k_len, d_inp]
            value (Tensor): [bsize, k_len, d_inp]
            mask (Tensor, optional): broadcastable to [bsize, q_len, k_len];
                positions where it equals 0 are excluded from attention
        Returns: [bsize, q_len, d_v]
        """
        wq = self.Wq(query)
        wk = self.Wk(key)
        wv = self.Wv(value)
        # Scale by sqrt(d_k) of the *projected* keys. A plain Python float
        # keeps the computation on whatever device the inputs live on.
        scale = wk.size(-1) ** 0.5
        # [bsize, q_len, d_q] @ [bsize, d_k, k_len] = [bsize, q_len, k_len]
        energy = torch.bmm(wq, wk.transpose(1, 2)) / scale
        if mask is not None:
            energy = energy.masked_fill(mask == 0, -1e10)
        attn_dstr = F.softmax(energy, dim=2)
        # [bsize, q_len, k_len] @ [bsize, k_len, d_v] = [bsize, q_len, d_v]
        return torch.bmm(attn_dstr, wv)


##############################

class PositionalEmbedding(nn.Module):
    """
    Token embedding plus positional encoding.
    Adds a position-dependent signal to the word embeddings so the model
    can learn sequence order, then applies dropout (p=0.1).
    """

    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.embedding = WordEmbedding(vocab_size, emb_dim)
        self.dropout = nn.Dropout(p=0.1)

    def forward(self, inp):
        """
        Args:
            inp (Tensor): [bsize, maxlen] token indices
        Returns: [bsize, maxlen, emb_dim]
        """
        # Author's note (translated): "missed dividing the embedding values
        # by the dim value" -- the scaling itself lives in WordEmbedding.
        # [bsize, maxlen, emb_dim]
        tok_emb = self.embedding(inp)
        # [bsize, maxlen, emb_dim], sized to match the token embeddings
        pos_emb = positional_embedding(
            tok_emb.size(0), tok_emb.size(1), tok_emb.size(2))
        summed = tok_emb + pos_emb
        return self.dropout(summed)


class WordEmbedding(nn.Module):
    """Token embedding lookup scaled by sqrt(emb_dim)."""

    def __init__(self, vocab_size, emb_dim):
        """
        Args:
            vocab_size (int): number of rows in the embedding table
            emb_dim (int): embedding dimension
        """
        super(WordEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)

    def forward(self, inp):
        """
        Args:
            inp (Tensor): [bsize, maxlen] token indices
        Returns: [bsize, maxlen, emb_dim]
        """
        # The scale depends only on the embedding dimension, never on the
        # batch shape, so a sentence embeds identically whatever the batch
        # size. Multiplying by sqrt(d_model) follows "Attention Is All You
        # Need", section 3.4.
        scale = self.embedding.embedding_dim ** 0.5
        return self.embedding(inp) * scale


def positional_embedding(bsize, maxlen, d_m):
    """Build a [bsize, maxlen, d_m] batch of identical positional encodings on `device`."""
    single = positional_encoding(maxlen, d_m)
    # Stack the same [maxlen, d_m] table once per batch element.
    copies = [single for _ in range(bsize)]
    batched = torch.stack(copies)
    return batched.to(device)


def positional_encoding(maxlen, dim):
    """Sinusoidal positional encoding: a unique value per position and dimension.

    Column ``2k`` holds ``sin(pos / 10000 ** (2k / dim))`` and column
    ``2k + 1`` holds the matching ``cos``. Both even and odd ``dim`` are
    supported.

    Args:
        maxlen (int): number of positions
        dim (int): encoding dimension
    Returns: float tensor [maxlen, dim] on the module-level `device`
    """
    # Angles are computed in float64 and written into a float32 table.
    # [maxlen, 1]
    pos = torch.arange(maxlen, dtype=torch.float64).unsqueeze(1)
    # One frequency per sin/cos pair: ceil(dim / 2) entries.
    even = torch.arange(0, dim, 2, dtype=torch.float64)
    inv_freq = torch.pow(10000.0, -even / dim)
    # [maxlen, ceil(dim / 2)]
    angles = pos * inv_freq
    # [maxlen, dim]
    pe = torch.zeros(maxlen, dim)
    pe[:, 0::2] = torch.sin(angles).to(pe.dtype)
    # For odd `dim` there is one fewer cosine column than sine columns, so
    # the last frequency is dropped to match the destination slice.
    pe[:, 1::2] = torch.cos(angles[:, :dim // 2]).to(pe.dtype)
    return pe.to(device)


########################


class MultiHeadAttention(nn.Module):
    """Multi-head attention followed by dropout, a residual add and normalisation."""

    def __init__(self, d_m, n_head=4):
        super().__init__()
        assert d_m % n_head == 0
        # Each head works on an equal slice of the model width.
        d_k = int(d_m / n_head)
        self.head = n_head
        self.Wo = Affine(d_m, d_m)
        per_head = [Attention(d_m, d_k, d_k, d_k) for _ in range(n_head)]
        self.attn_layers = nn.ModuleList(per_head)
        self.dropout = nn.Dropout(p=0.01)
        self.addnorm = NormLayer(d_m)

    def forward(self, query, key, value, mask):
        """
        Args:
            query (Tensor): [batch size, maxlen, d_m]
            key (Tensor): [batch size, maxlen, d_m]
            value (Tensor): [batch size, maxlen, d_m]
            mask (Tensor): [batch size, ?, maxlen]
        Returns: [batch size, maxlen, d_m]
        """
        # Original author's TODO (translated): check that this loop is correct.
        # Every head sees the full inputs; each output is [batch size, maxlen, d_k].
        head_outputs = [attn(query, key, value, mask) for attn in self.attn_layers]
        # Concatenate to [batch size, maxlen, d_k * head] and mix with Wo.
        merged = torch.cat(head_outputs, dim=2)
        projected = self.dropout(self.Wo(merged))
        # Residual connection around the attention block, then normalise.
        return self.addnorm(query + projected)


class PositionWiseFFLayer(nn.Module):
    """Position-wise feed-forward block followed by dropout, a residual add and normalisation."""

    def __init__(self, d_m, d_ff):
        super().__init__()
        self.W1 = Affine(d_m, d_ff)
        self.W2 = Affine(d_ff, d_m)
        self.dropout = nn.Dropout(p=0.01)
        self.addnorm = NormLayer(d_m)

    def forward(self, inp):
        """
        Args:
            inp (Tensor): [batch size, maxlen, d_m]
        Returns: [batch size, maxlen, d_m]
        """
        # Expand to [batch size, maxlen, d_ff].
        hidden = torch.relu(self.W1(inp))
        # Project back to [batch size, maxlen, d_m] and regularise.
        projected = self.dropout(self.W2(hidden))
        # Residual connection, then normalise: [batch size, maxlen, d_m].
        return self.addnorm(inp + projected)

#######################


class Encoder(nn.Module):
    """One encoder layer: self-attention followed by a position-wise feed-forward block."""

    def __init__(self, d_m, d_ff):
        super().__init__()
        self.multi_attn = MultiHeadAttention(d_m)
        self.pw_ff = PositionWiseFFLayer(d_m, d_ff)

    def forward(self, inp, mask):
        """
        Args:
            inp (Tensor): [batch size, maxlen, d_m]
            mask (Tensor): [batch size, 1, maxlen]
        Returns: [batch size, maxlen, d_m]
        """
        # Sub-layer 1: self-attention (query, key and value are all `inp`).
        attended = self.multi_attn(inp, inp, inp, mask)
        # Sub-layer 2: position-wise feed-forward.
        return self.pw_ff(attended)


class Decoder(nn.Module):
    """One decoder layer: masked self-attention, encoder-decoder attention, feed-forward.

    The two attention sub-layers have separate parameters. Checkpoints saved
    from a layout with a single shared `multi_attn` module will not contain
    the `enc_dec_attn.*` keys.
    """

    def __init__(self, inp_dim, d_m, d_ff):
        """
        Args:
            inp_dim (int): kept for signature compatibility; not used to size
                any sub-layer (it must not be forwarded as the head count)
            d_m (int): model width
            d_ff (int): hidden width of the feed-forward block
        """
        super(Decoder, self).__init__()
        # Masked self-attention over the target sequence.
        self.multi_attn = MultiHeadAttention(d_m)
        # Attention from decoder states (queries) to encoder outputs (keys/values).
        self.enc_dec_attn = MultiHeadAttention(d_m)
        self.pw_ff = PositionWiseFFLayer(d_m, d_ff)

    def forward(self, inp, enc_out, src_mask, trg_mask):
        """
        Args:
            inp (Tensor): [batch size, maxlen, d_m]
            enc_out (Tensor): [batch size, maxlen, d_m]
            src_mask (Tensor): [batch size, 1, maxlen]
            trg_mask (Tensor): [batch size, maxlen, maxlen]
        Returns: [batch size, maxlen, d_m]
        """
        # Sub-layer 1: masked self attention -> [batch size, maxlen, d_m]
        out = self.multi_attn(inp, inp, inp, trg_mask)
        # Sub-layer 2: encoder-decoder attention -> [batch size, maxlen, d_m]
        out = self.enc_dec_attn(out, enc_out, enc_out, src_mask)
        # Sub-layer 3: position-wise feed-forward -> [batch size, maxlen, d_m]
        out = self.pw_ff(out)
        return out



##############################


class BatchNorm(nn.Module):
    """Batch normalisation over the (batch, position) axes of a [batch, maxlen, hidden] tensor.

    Running statistics are registered as buffers, so they follow
    ``module.to(device)`` and are included in ``state_dict()``.
    """

    def __init__(self, num_feature, eps=0.01, momentum=0.9):
        """
        Args:
            num_feature (int): size of the hidden (last) dimension
            eps (float): added to the variance for numerical stability
            momentum (float): weight kept from the previous running statistic
        """
        super(BatchNorm, self).__init__()
        shape = 1, 1, num_feature  # (batch, maxlen, hidd), norm target is hidd
        self.eps = eps
        self.momentum = momentum
        self.gamma = nn.Parameter(nn.init.xavier_normal_(torch.empty(shape)))
        self.beta = nn.Parameter(nn.init.xavier_normal_(torch.empty(shape)))

        # Running statistics are not trained; both start at 0.
        self.register_buffer('moving_mean', torch.zeros(shape))
        self.register_buffer('moving_var', torch.zeros(shape))

    def update_movings(self, mean, var):
        """Blend the current batch statistics into the running statistics."""
        # Detach first: otherwise each update keeps the whole autograd graph
        # of every previous batch alive through the running statistics.
        mean = mean.detach()
        var = var.detach()
        keep = self.momentum
        self.moving_mean = keep * self.moving_mean.to(mean.device) + (1 - keep) * mean
        self.moving_var = keep * self.moving_var.to(var.device) + (1 - keep) * var

    def forward(self, batch):
        """
        Args:
            batch (Tensor): [batch, maxlen, hidden]
        Returns: tensor of the same shape as `batch`
        """
        # Use the running statistics in eval mode and whenever gradients are
        # disabled (e.g. inside torch.no_grad()).
        if not self.training or not torch.is_grad_enabled():
            # Copy the running statistics to the input's device if needed.
            self.moving_mean = self.moving_mean.to(batch.device)
            self.moving_var = self.moving_var.to(batch.device)
            normed = (batch - self.moving_mean) / torch.sqrt(self.moving_var + self.eps)
        else:
            mean = torch.mean(batch, dim=(0, 1), keepdim=True)
            var = torch.var(batch, dim=(0, 1), keepdim=True)
            normed = (batch - mean) / torch.sqrt(var + self.eps)
            self.update_movings(mean, var)
        return self.gamma * normed + self.beta


class LabelSmoothingLoss(nn.NLLLoss):
    """Negative log-likelihood with label smoothing.

    Combines the NLL of the target class (weight ``1 - a``) with the summed
    negative log-probability over all classes divided by the class count
    (weight ``a``). Positions whose target equals ``ignore_index`` are
    dropped before the loss is computed.
    """

    def __init__(self, a: float = 0.01, reduction='mean', ignore_index=-100):
        """
        Args:
            a (float): smoothing weight
            reduction (str): 'mean', 'sum', or anything else for no reduction
                of the two terms (the final result is still averaged)
            ignore_index (int): target value marking positions to skip
        """
        super(LabelSmoothingLoss, self).__init__(
            ignore_index=ignore_index, reduction=reduction)
        self.a = a
        self.reduction = reduction
        self.ignore_index = ignore_index

    def forward(self, pred, trg):
        """
        Args:
            pred (Tensor): [N, K] scores; log_softmax is applied over the last dim
            trg (Tensor): [N] target class indices
        Returns: scalar loss tensor that gradients can flow through
        """
        # forward must not be wrapped in torch.no_grad(): the returned loss
        # has to keep its autograd graph for loss.backward() to work.
        K = pred.size(-1)  # class number
        trg_idx = trg != self.ignore_index  # identify not PAD
        trg = trg[trg_idx]

        log_pred = F.log_softmax(pred[trg_idx], dim=-1)
        # Smoothing term: negative log-probability summed over every class.
        loss = -torch.sum(log_pred, dim=-1)
        if self.reduction == 'mean':
            loss = torch.mean(loss)
        elif self.reduction == 'sum':
            loss = torch.sum(loss)
        nll_loss = F.nll_loss(log_pred, trg, reduction=self.reduction)
        loss = nll_loss * (1 - self.a) + self.a * (loss / K)
        return loss.mean()


class CrossEntropyLoss(nn.Module):
    """Placeholder loss module: stores its settings but `forward` is not implemented.

    NOTE(review): `forward` currently returns None and is wrapped in
    `torch.no_grad()`, so it cannot be used for training as written.
    The training wrapper in this notebook uses `nn.CrossEntropyLoss`.
    """

    def __init__(self, a: float = 0.01, reduction='mean', ignore_index=-100):
        super(CrossEntropyLoss, self).__init__()
        # Stored configuration; none of these values are read by `forward` yet.
        self.a = a
        self.reduction = reduction
        self.ignore_index = ignore_index

    @torch.no_grad()
    def forward(self, pred, trg):
        """Unimplemented stub; returns None. TODO: implement or remove."""
        pass



###################


class Transformer(nn.Module):
    """Assemble embedding, encoder, decoder and output layers into a Transformer."""

    def __init__(self, d_m, inp_vocab_size, out_vocab_size, d_ff, n=3):
        """
        Args:
            d_m (int): model width
            inp_vocab_size (int): source vocabulary size
            out_vocab_size (int): target vocabulary size
            d_ff (int): hidden width of the feed-forward blocks
            n (int): number of encoder layers and of decoder layers
        """
        super(Transformer, self).__init__()
        self.inp_emb = PositionalEmbedding(inp_vocab_size, d_m)
        self.out_emb = PositionalEmbedding(out_vocab_size, d_m)
        self.enc_layers = nn.ModuleList(
            [Encoder(d_m, d_ff) for _ in range(n)])
        self.dec_layers = nn.ModuleList(
            [Decoder(d_m, d_m, d_ff) for _ in range(n)])
        self.affine = Affine(d_m, out_vocab_size)
        self.n = n

    def _vocab_logits(self, hidden):
        """Project [bsize, maxlen, d_m] hidden states to raw vocabulary logits.

        The projection is linear: a ReLU here would clamp every negative
        logit to 0 before the softmax. The input is flattened to 2-D first
        so that this works with a 2-D-only linear path in Affine.
        """
        bsize, maxlen, d_m = hidden.shape
        flat = self.affine(hidden.reshape(bsize * maxlen, d_m), linear=True)
        return flat.view(bsize, maxlen, flat.size(-1))

    def encoder(self, inp_batch, src_mask):
        """
        Args:
            inp_batch (Tensor): [batch size, maxlen]
            src_mask (Tensor): [bsize, 1, maxlen]
        Returns: [batch size, maxlen, d_m]
        """
        # [batch size, maxlen, d_m]
        enc = self.inp_emb(inp_batch)
        for layer in self.enc_layers:
            # [batch size, maxlen, d_m]
            enc = layer(enc, src_mask)
        return enc

    def forward(self, inp_batch, out_batch):
        """
        Args:
            inp_batch (Tensor): [batch size, maxlen]
            out_batch (Tensor): [batch size, maxlen]
        Returns: [batch size, maxlen, vocab_size] log-probabilities
        """
        # Encoder
        src_mask = mask_not_pad(inp_batch)
        # [batch size, maxlen, d_m]
        enc = self.encoder(inp_batch, src_mask)

        # Decoder
        trg_mask = mask_get_dec(out_batch)
        # [batch size, maxlen, d_m]
        dec = self.out_emb(out_batch)
        for layer in self.dec_layers:
            # [batch size, maxlen, d_m]
            dec = layer(dec, enc, src_mask, trg_mask)
        # [batch size, maxlen, vocab_size]
        return F.log_softmax(self._vocab_logits(dec), dim=2)

    @torch.no_grad()
    def predict(self, inp_batch, maxlen=None):
        """
        Args:
            inp_batch (Tensor): [batch size, maxlen]
            maxlen (int, optional): accepted so callers may pass `maxlen=`;
                it is not used and the output length follows the input length
        Returns: nested list [batch size][maxlen] of predicted token indices
        """
        # NOTE(review): this projects the encoder output straight to the
        # vocabulary; the decoder layers are not used on this path.
        src_mask = mask_not_pad(inp_batch)
        # [batch size, maxlen, d_m]
        enc = self.encoder(inp_batch, src_mask)
        # [batch size, maxlen, d_m] @ [d_m, vocab_size]
        # => [batch size, maxlen, vocab_size]
        rst = F.log_softmax(self._vocab_logits(enc), dim=2)
        return torch.argmax(rst, dim=-1).tolist()


def mask_not_pad(x):
    """
    Mark True at every position that is NOT padding (index > 0)
    Args:
        x (Tensor): [bsize, maxlen] with word idx
    Returns: [bsize, 1, maxlen] bool tensor on `device`; False where idx <= 0
    """
    is_token = x.gt(0)
    # Insert a singleton axis so the mask broadcasts over query positions.
    return is_token.unsqueeze(1).to(device)


def mask_get_dec(x):
    """
    Build the decoder mask: causal (lower-triangular) and zero at PAD keys
    Args:
        x (Tensor): [bsize, maxlen] with word idx
    Returns: [bsize, maxlen, maxlen] float tensor of 1s (attend) and 0s (blocked)
    """
    bsize, maxlen = x.size(0), x.size(1)
    # [bsize, 1, maxlen], True where the token is not PAD
    not_pad = mask_not_pad(x)
    # [maxlen, maxlen], position i may look at positions <= i
    causal = torch.ones(maxlen, maxlen).tril().to(device)
    # [bsize, maxlen, maxlen], one copy of the causal mask per sequence
    causal = causal[None].repeat(bsize, 1, 1)
    # Zero out the columns that correspond to PAD keys.
    combined = causal.masked_fill(torch.logical_not(not_pad), 0)
    return combined.to(device)


In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import math

from lib.data_batchify import TrainCorpus, collate_fn
from lib.data_preprocess import preprocessor


@torch.no_grad()
def accuracy(pred, target):
    """Fraction of rows whose highest-scoring class equals the target.

    Args:
        pred (Tensor): [N, num_classes] scores
        target (Tensor): [N] class indices
    Returns: float in [0, 1]; 0.0 when `target` is empty
    """
    total = len(target)
    if total == 0:
        return 0.0
    # Tensor-level sum: Python's built-in sum() would iterate over the
    # tensor one element at a time.
    correct = (pred.argmax(1) == target).sum().item()
    return correct / total


class LangTranslator:
    """Training / inference wrapper around a Korean-to-English translation model.

    Holds the model, both vocabularies, the loss, the optimizer and the
    learning-rate scheduler, and provides train / predict / translate /
    save / load helpers.
    """

    def __init__(self, model, ko_vocab, en_vocab, dconf, mconf, device):
        """
        Args:
            model: network exposing `__call__(src, trg)`, `predict`, `parameters`
                and `state_dict`
            ko_vocab: source vocabulary, indexable by token
            en_vocab: target vocabulary, indexable by token; also provides
                `to_idx2word()` and `get_word(idx)`
            dconf: data configuration object
            mconf: model configuration; `lr`, `batch_size` and `epoch` are read
            device: torch device that batches are moved to
        """
        self.dconf = dconf
        self.mconf = mconf

        self.ko_vocab = ko_vocab
        self.en_vocab = en_vocab
        self.dataset = None
        self.dataload = None

        self.device = device
        self.model = model

        self.loss = nn.CrossEntropyLoss()
        self.optim = optim.Adam(params=self.model.parameters(), lr=self.mconf.lr)
        self.lrscheder = optim.lr_scheduler.ReduceLROnPlateau(self.optim, patience=5)

    def train(self, ko_corpus, en_corpus):
        """Train the model on parallel tokenised corpora.

        Prints `epoch, total loss, mean accuracy, perplexity` after each epoch
        and steps the LR scheduler on the epoch's total loss.
        """
        train_set = self.trainset_form(ko_corpus, en_corpus, self.ko_vocab, self.en_vocab)
        self.dataset = TrainCorpus(train_set)
        self.dataload = DataLoader(self.dataset,
                                   batch_size=self.mconf.batch_size,
                                   num_workers=0, collate_fn=collate_fn)
        self.mconf.ko_size, self.mconf.en_size = len(self.ko_vocab) + 1, len(self.en_vocab) + 1

        self.model.train()
        for epoch in tqdm(range(self.mconf.epoch), desc='epoch'):
            # Both running totals are per-epoch; resetting only the loss
            # would make the reported accuracy grow without bound.
            total_loss = 0
            total_acc = 0
            for batch in tqdm(self.dataload, desc="step", total=len(self.dataload)):
                ko, en = map(lambda ds: ds.to(self.device), batch)
                self.optim.zero_grad()
                # Teacher forcing: feed tokens [0, T-1), predict tokens [1, T).
                en_xs = en[:, :-1]
                en_ts = en[:, 1:]
                pred = self.model(ko, en_xs)
                # Flatten to [N, vocab] and [N] for the loss.
                pred, en_ts = pred.view(-1, pred.shape[2]), en_ts.reshape(-1)
                b_loss = self.loss(pred, en_ts)
                b_loss.backward()
                self.optim.step()

                total_acc += accuracy(pred, en_ts)
                total_loss += b_loss.item()
                del ko, en, en_xs, en_ts, pred
                torch.cuda.empty_cache()

            itersize = max(1, math.ceil(len(self.dataset) / self.mconf.batch_size))
            try:
                ppl = math.exp(total_loss / itersize)
            except OverflowError:
                # A very large mean loss overflows exp(); report infinity.
                ppl = float('inf')
            print(epoch, total_loss, total_acc / itersize, ppl)
            self.lrscheder.step(total_loss)
        self.en_vocab.to_idx2word()

    def trainset_form(self, ko_corpus, en_corpus, ko_vocab, en_vocab):
        """ form train data - word to idx """
        return [
            [[ko_vocab[x] for x in ko], [en_vocab[x] for x in en]]
            for ko, en in zip(ko_corpus, en_corpus)
        ]

    def predset_form(self, corpus, vocab):
        """ form evaluate data - word to idx """
        return [[vocab[x] for x in sent] for sent in corpus]

    def predict(self, corpus):
        """ predict trained model """
        ko_corpus = preprocessor(corpus, lang='ko')
        pred_set = self.predset_form(ko_corpus, self.ko_vocab)
        pred_set = [torch.tensor(data) for data in pred_set]
        dataset = torch.nn.utils.rnn.pad_sequence(pred_set, batch_first=True)
        # The input must live on the same device as the model parameters.
        dataset = dataset.to(self.device)
        pred = self.model.predict(dataset, maxlen=dataset.size(1))
        return pred

    def translate(self, kor: list):
        """ Translate Korean to English """
        pred = self.predict(kor)
        rst = []
        for sent_idx in pred:
            # Skip index 0, the value pad_sequence pads with in predict().
            sent = [self.en_vocab.get_word(idx) for idx in sent_idx if idx != 0]
            rst.append(sent)
        return rst

    def save(self, fname: str):
        """ save model """
        torch.save({
            'model': self.model.state_dict(),
            'optim': self.optim.state_dict(),
            'ko_vocab': self.ko_vocab,
            'en_vocab': self.en_vocab
        }, 'results/model/' + fname)

    def load(self, fname: str, retrain=False):
        """ load model

        Raises:
            RuntimeError: if no model instance is attached to load weights into
        """
        if self.model is None:
            raise RuntimeError("cannot load a checkpoint: no model is attached")
        # NOTE: the checkpoint holds pickled vocab objects; only load files
        # from a trusted source. map_location lets a checkpoint saved on GPU
        # be restored on a CPU-only runtime.
        checkpoint = torch.load('results/model/' + fname, map_location=self.device)
        self.model.load_state_dict(checkpoint['model'])
        if self.optim and retrain:
            self.optim.load_state_dict(checkpoint['optim'])
        self.ko_vocab = checkpoint['ko_vocab']
        self.en_vocab = checkpoint['en_vocab']
        self.en_vocab.to_idx2word()
        self.model.eval()
        print(len(self.ko_vocab), len(self.en_vocab))

    def __repr__(self):
        """Return a text summary of the model and optimizer state dicts."""
        # __repr__ must return a string; printing and returning None makes
        # repr(obj) raise TypeError.
        lines = ["Model's state_dict:"]
        model_state = self.model.state_dict()
        for param_tensor in model_state:
            lines.append(f"{param_tensor} \t {model_state[param_tensor].size()}")

        lines.append("Optimizer's state_dict:")
        optim_state = self.optim.state_dict()
        for var_name in optim_state:
            lines.append(f"{var_name} \t {optim_state[var_name]}")
        return "\n".join(lines)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
import torch
# from transformer.lib.model.transformer import Transformer

from lib.util import Config
# from lib.kor2eng import LangTranslator
from lib.util import load_data
from lib.data_preprocess import Vocab, preprocessor
from lib.model.seq2seq import BiLSTMSeq2Seq

# import os
# cwd = os.getcwd()
# print(cwd)

# import os
# arr = os.listdir()
# print(arr)

# load configs
dconf_path = 'config/data.json'
mconf_path = 'config/lm.json'
dconf = Config(dconf_path)
mconf = Config(mconf_path)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

print('Using device:', device)

ko_data = load_data(dconf.train_ko_path)
en_data = load_data(dconf.train_en_path)
print(len(ko_data), len(en_data))
# load & preprocess corpus (only the first half of each file is used)
ko_corpus = preprocessor(ko_data[:len(ko_data) // 2], lang='ko')
en_corpus = preprocessor(en_data[:len(en_data) // 2], lang='en')

# load vocab
ko_vocab = Vocab(dconf.min_cnt)
en_vocab = Vocab(dconf.min_cnt)
ko_vocab.load(ko_corpus)
en_vocab.load(en_corpus)

# # define lm model
# if mconf.model == 'transformer':
#     model = Transformer(mconf.d_m, len(ko_vocab) + 1, len(en_vocab) + 1,
#                         mconf.d_m * 4, n_layer=3)
# else:
#     model = BiLSTMSeq2Seq(len(ko_vocab) + 1, len(en_vocab) + 1,
#                           mconf.emb_dim, mconf.d_m)
# The +1 on each vocabulary size leaves room for one extra index.
model = Transformer(mconf.d_m, len(ko_vocab) + 1, len(en_vocab) + 1, mconf.d_m * 4, n=3)
model.to(device)

# load translator and train
lm = LangTranslator(model, ko_vocab, en_vocab, dconf, mconf, device)
# Source is Korean, target is English. Passing the Korean corpus as the
# target would train a Korean-to-Korean model using the English vocab.
lm.train(ko_corpus, en_corpus)

# save model
lm.save('trained.pth')
mconf.save(mconf_path)

test = ['또 하나 필요한 것은 훌륭한 영어 실력이다.',
        '경찰은 월요일 밤 집무실을 찾아 증거를 압수했다.']
print(lm.translate(test))

Using device: cuda
5000 5000


epoch:   0%|          | 0/100 [00:00<?, ?it/s]
step:   0%|          | 0/6 [00:00<?, ?it/s]
epoch:   0%|          | 0/100 [00:00<?, ?it/s]


RuntimeError: ignored

In [None]:
# Capture the output lines of `nvidia-smi` (IPython shell capture).
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# If the output contains 'failed', tell the user how to enable a GPU runtime;
# otherwise print the GPU report.
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

In [None]:
# Release cached, currently unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()