In [None]:
pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive so files under /content/drive are reachable from the kernel.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import sentencepiece as spm
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from random import random, randrange, randint, shuffle, choice
from typing import Optional
import torch.optim as optim
import torch.nn.functional as F
import os
from tqdm import tqdm, tqdm_notebook, trange
import json
import numpy as np

In [None]:
# Load the Korean-English corpus from Drive.
#
# NOTE: the parsed data is bound to the name `json` because later cells index
# `json['data']`; this shadows the `json` module imported at the top of the
# notebook. Importing the module under a private alias keeps this cell
# re-runnable even after the name has been rebound.
import json as _json_module

# The context manager closes the file handle; the encoding is explicit because
# the corpus contains Korean text.
with open('/content/drive/MyDrive/한영기술.json', encoding='utf-8') as path:
    json = _json_module.load(path)

In [None]:
# Collect up to N_SAMPLES sentence pairs: Korean in text1, English in text2.
# Slicing (rather than indexing range(10000)) avoids an IndexError when the
# corpus holds fewer entries than requested.
N_SAMPLES = 10000
text1 = []
text2 = []
for pair in json['data'][:N_SAMPLES]:
    text1.append(pair['ko'])
    text2.append(pair['en'])

In [None]:
# Write one "<korean> , <english>" pair per line.
text = '/content/drive/MyDrive/text.txt'
# The context manager closes the file even if a write fails, and the explicit
# UTF-8 encoding matches how this file is read back later (encoding='UTF8').
with open(text, 'w', encoding='utf-8') as f:
    for ko_sentence, en_sentence in zip(text1, text2):
        f.write(ko_sentence + " , " + en_sentence + "\n")

In [None]:
# Train a BPE SentencePiece model on the parallel-text file.
corpus = "/content/drive/MyDrive/text.txt"
prefix = "ko_en"
vocab_size = 32000

# The trainer receives a single space-separated flag string.
# The "+ 7" presumably accounts for the four reserved pieces and the three
# user-defined symbols listed below -- TODO confirm.
trainer_flags = [
    f"--input={corpus}",
    f"--model_prefix={prefix}",
    f"--vocab_size={vocab_size + 7}",
    "--model_type=bpe",
    "--max_sentence_length=99999",                  # maximum sentence length
    "--pad_id=0", "--pad_piece=[PAD]",              # pad (0)
    "--unk_id=1", "--unk_piece=[UNK]",              # unknown (1)
    "--bos_id=2", "--bos_piece=[BOS]",              # begin of sequence (2)
    "--eos_id=3", "--eos_piece=[EOS]",              # end of sequence (3)
    "--user_defined_symbols=[SEP],[CLS],[MASK]",    # user-defined tokens
]
spm.SentencePieceTrainer.train(" ".join(trainer_flags))

In [None]:
# Load the trained tokenizer and round-trip a sample sentence as a sanity check.
vocab_file = "ko_en.model"
vocab = spm.SentencePieceProcessor()
vocab.load(vocab_file)

samples = ['i love you , asd hamburger']
for sample in samples:
    sample_pieces = vocab.encode_as_pieces(sample)
    sample_ids = vocab.encode_as_ids(sample)
    decoded = vocab.Decode(sample_ids)
    # Same five lines as before: text, pieces, ids, decoded text, blank line.
    print(sample, sample_pieces, sample_ids, decoded, "", sep="\n")

i love you , asd hamburger
['▁i', '▁love', '▁you', '▁,', '▁as', 'd', '▁h', 'amb', 'ur', 'ger']
[9990, 12394, 1918, 33, 99, 31057, 64, 6139, 72, 5958]
i love you , asd hamburger



In [None]:
class Config(dict):
    """Dictionary whose entries can also be read and written as attributes.

    Attribute access is routed straight to the dict item protocol, so reading
    a missing attribute raises ``KeyError`` rather than ``AttributeError``.
    """
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

    @classmethod
    def load(cls, file):
        """Build a config from a JSON file.

        Args:
            file: Path of a JSON file containing a single object.

        Returns:
            An instance of ``cls`` holding the file's key/value pairs.
        """
        # Imported locally on purpose: elsewhere in this notebook the global
        # name `json` is rebound to the loaded dataset, which would make a
        # module-level `json.loads` lookup fail here.
        import json as json_module

        with open(file, 'r', encoding='utf-8') as f:
            # `cls(...)` instead of `Config(...)` so subclasses load as themselves.
            return cls(json_module.load(f))

In [None]:
# Model hyperparameters; the vocabulary size is taken from the loaded tokenizer.
config = Config(
    n_dec_vocab=len(vocab),
    n_dec_seq=128,
    n_layer=1,
    d_hidn=768,
    i_pad=0,
    d_ff=1024,
    n_head=16,
    d_head=48,
    dropout=0.1,
    layer_norm_epsilon=1e-12,
)
print(config)

{'n_dec_vocab': 32007, 'n_dec_seq': 128, 'n_layer': 1, 'd_hidn': 768, 'i_pad': 0, 'd_ff': 1024, 'n_head': 16, 'd_head': 48, 'dropout': 0.1, 'layer_norm_epsilon': 1e-12}


In [None]:
""" sinusoid position encoding """
def get_sinusoid_encoding_table(n_seq, d_hidn):
    """Build a sinusoidal position-encoding table.

    Args:
        n_seq: Number of positions (rows).
        d_hidn: Encoding width (columns).

    Returns:
        A NumPy array of shape (n_seq, d_hidn); even columns hold the sine and
        odd columns the cosine of ``position / 10000 ** (2 * (col // 2) / d_hidn)``.
    """
    # Columns 2k and 2k+1 share one divisor, so compute each divisor only once.
    divisors = [np.power(10000, 2 * (col // 2) / d_hidn) for col in range(d_hidn)]
    table = np.array([[pos / div for div in divisors] for pos in range(n_seq)])

    table[:, 0::2] = np.sin(table[:, 0::2])  # even columns -> sin
    table[:, 1::2] = np.cos(table[:, 1::2])  # odd columns -> cos
    return table


""" attention pad mask """
def get_attn_pad_mask(seq_q, seq_k, i_pad):
    """Mark key positions that hold the padding id.

    Args:
        seq_q: 2-D tensor of query token ids; only its length is used.
        seq_k: 2-D tensor of key token ids.
        i_pad: Padding token id.

    Returns:
        Boolean tensor of shape (batch, len_q, len_k) that is True wherever the
        key token equals ``i_pad`` (the same row is repeated for every query).
    """
    _, len_q = seq_q.size()
    batch_size, len_k = seq_k.size()
    key_is_pad = seq_k.data.eq(i_pad)
    # Add a query axis, then broadcast it without copying.
    return key_is_pad.unsqueeze(1).expand(batch_size, len_q, len_k)


""" attention decoder mask """
def get_attn_decoder_mask(seq):
    """Build the look-ahead mask for a batch of sequences.

    Args:
        seq: 2-D tensor (batch, n_seq); only its shape, dtype and device matter.

    Returns:
        Tensor of shape (batch, n_seq, n_seq) with the dtype of ``seq`` holding
        1 strictly above the diagonal (future positions) and 0 elsewhere.
    """
    batch_size, seq_len = seq.size(0), seq.size(1)
    all_ones = torch.ones_like(seq).unsqueeze(-1).expand(batch_size, seq_len, seq_len)
    # Keep only the strictly upper-triangular part of each (n_seq, n_seq) matrix.
    return torch.triu(all_ones, diagonal=1)


""" scale dot product attention """
class ScaledDotProductAttention(nn.Module):
    """Scaled dot-product attention with masking and dropout on the weights.

    Reads ``config.dropout`` and ``config.d_head``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(config.dropout)
        self.scale = 1 / (self.config.d_head ** 0.5)

    def forward(self, Q, K, V, attn_mask):
        """Attend from ``Q`` over ``K``/``V``.

        Args:
            Q, K, V: Tensors whose last two axes are (sequence, feature).
            attn_mask: Boolean tensor matching the score shape; True entries
                are filled with -1e9 before the softmax.

        Returns:
            ``(context, attn_prob)`` where ``attn_prob`` is the softmax over
            the key axis (after dropout) and ``context = attn_prob @ V``.
        """
        # (bs, n_head, n_q_seq, n_k_seq)
        scores = torch.matmul(Q, K.transpose(-1, -2)) * self.scale
        scores = scores.masked_fill(attn_mask, -1e9)
        # (bs, n_head, n_q_seq, n_k_seq)
        attn_prob = self.dropout(torch.softmax(scores, dim=-1))
        # (bs, n_head, n_q_seq, d_v)
        context = torch.matmul(attn_prob, V)
        return context, attn_prob


""" multi head attention """
class MultiHeadAttention(nn.Module):
    """Multi-head attention: project, split into heads, attend, merge, project.

    Reads ``config.d_hidn``, ``config.n_head``, ``config.d_head`` and
    ``config.dropout``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        proj_dim = self.config.n_head * self.config.d_head

        self.W_Q = nn.Linear(self.config.d_hidn, proj_dim)
        self.W_K = nn.Linear(self.config.d_hidn, proj_dim)
        self.W_V = nn.Linear(self.config.d_hidn, proj_dim)
        self.scaled_dot_attn = ScaledDotProductAttention(self.config)
        self.linear = nn.Linear(proj_dim, self.config.d_hidn)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, Q, K, V, attn_mask):
        """Run attention over all heads.

        Args:
            Q, K, V: Inputs whose first axis is the batch and whose last axis
                has size ``d_hidn``.
            attn_mask: Mask of shape (bs, n_q_seq, n_k_seq); it is repeated for
                every head.

        Returns:
            ``(output, attn_prob)``: the projected context with last axis
            ``d_hidn`` and the per-head attention weights.
        """
        batch_size = Q.size(0)
        n_head, d_head = self.config.n_head, self.config.d_head

        def split_heads(projected):
            # (bs, n_seq, n_head * d_head) -> (bs, n_head, n_seq, d_head)
            return projected.view(batch_size, -1, n_head, d_head).transpose(1, 2)

        q_s = split_heads(self.W_Q(Q))
        k_s = split_heads(self.W_K(K))
        v_s = split_heads(self.W_V(V))

        # (bs, n_head, n_q_seq, n_k_seq)
        attn_mask = attn_mask.unsqueeze(1).repeat(1, n_head, 1, 1)

        # (bs, n_head, n_q_seq, d_head), (bs, n_head, n_q_seq, n_k_seq)
        context, attn_prob = self.scaled_dot_attn(q_s, k_s, v_s, attn_mask)
        # Merge the heads back: (bs, n_q_seq, n_head * d_head)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, n_head * d_head)
        # (bs, n_q_seq, d_hidn)
        output = self.dropout(self.linear(context))
        return output, attn_prob


""" feed forward """
class PoswiseFeedForwardNet(nn.Module):
    """Position-wise feed-forward block built from two kernel-size-1 convolutions.

    Reads ``config.d_hidn``, ``config.d_ff`` and ``config.dropout``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.conv1 = nn.Conv1d(in_channels=self.config.d_hidn, out_channels=self.config.d_ff, kernel_size=1)
        self.conv2 = nn.Conv1d(in_channels=self.config.d_ff, out_channels=self.config.d_hidn, kernel_size=1)
        self.active = F.gelu
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, inputs):
        """Apply conv1 -> GELU -> conv2 -> dropout.

        Args:
            inputs: Tensor of shape (bs, n_seq, d_hidn).

        Returns:
            Tensor of shape (bs, n_seq, d_hidn).
        """
        # Conv1d expects channels first: (bs, d_hidn, n_seq) -> (bs, d_ff, n_seq)
        hidden = self.conv1(inputs.transpose(1, 2))
        hidden = self.active(hidden)
        # Back to (bs, n_seq, d_hidn)
        output = self.conv2(hidden).transpose(1, 2)
        return self.dropout(output)


In [None]:
""" decoder layer """
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention and a feed-forward net.

    Each sub-layer is wrapped in a residual connection followed by LayerNorm.
    Reads ``config.d_hidn`` and ``config.layer_norm_epsilon`` (plus whatever
    the sub-modules read).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        d_hidn = self.config.d_hidn
        eps = self.config.layer_norm_epsilon

        self.self_attn = MultiHeadAttention(self.config)
        self.layer_norm1 = nn.LayerNorm(d_hidn, eps=eps)
        self.pos_ffn = PoswiseFeedForwardNet(self.config)
        self.layer_norm3 = nn.LayerNorm(d_hidn, eps=eps)

    def forward(self, dec_inputs, self_attn_mask):
        """Run the block.

        Args:
            dec_inputs: Tensor of shape (bs, n_dec_seq, d_hidn).
            self_attn_mask: Mask handed to the self-attention module.

        Returns:
            ``(outputs, self_attn_prob)``: the block output with the shape of
            ``dec_inputs`` and the self-attention weights.
        """
        # Self-attention: query, key and value are all the block input.
        attn_out, self_attn_prob = self.self_attn(dec_inputs, dec_inputs, dec_inputs, self_attn_mask)
        attn_out = self.layer_norm1(dec_inputs + attn_out)
        # Feed-forward sub-layer with its own residual + norm.
        ffn_out = self.pos_ffn(attn_out)
        ffn_out = self.layer_norm3(attn_out + ffn_out)
        return ffn_out, self_attn_prob

In [None]:
""" decoder """
class Decoder(nn.Module):
    """Token + position embedding followed by a stack of decoder layers.

    Reads ``config.n_dec_vocab``, ``config.d_hidn``, ``config.n_dec_seq``,
    ``config.n_layer`` and ``config.i_pad``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.dec_emb = nn.Embedding(self.config.n_dec_vocab, self.config.d_hidn)
        # One extra row because position indices start at 1 (index 0 is used
        # for padding tokens); the table is frozen, not trained.
        sinusoid_table = torch.FloatTensor(
            get_sinusoid_encoding_table(self.config.n_dec_seq + 1, self.config.d_hidn)
        )
        self.pos_emb = nn.Embedding.from_pretrained(sinusoid_table, freeze=True)

        self.layers = nn.ModuleList([DecoderLayer(self.config) for _ in range(self.config.n_layer)])

    def forward(self, dec_inputs):
        """Encode a batch of token ids.

        Args:
            dec_inputs: Integer tensor of shape (bs, n_dec_seq).

        Returns:
            ``(dec_outputs, self_attn_probs)``: the final hidden states
            (bs, n_dec_seq, d_hidn) and a list with each layer's attention
            weights.
        """
        batch_size, seq_len = dec_inputs.size(0), dec_inputs.size(1)

        # Positions 1..seq_len for real tokens, 0 for padding tokens.
        positions = torch.arange(seq_len, device=dec_inputs.device, dtype=dec_inputs.dtype)
        positions = positions.expand(batch_size, seq_len).contiguous() + 1
        positions.masked_fill_(dec_inputs.eq(self.config.i_pad), 0)

        # (bs, n_dec_seq, d_hidn)
        dec_outputs = self.dec_emb(dec_inputs) + self.pos_emb(positions)

        # A position is masked when its key is padding OR lies in the future.
        # Both masks are (bs, n_dec_seq, n_dec_seq).
        pad_mask = get_attn_pad_mask(dec_inputs, dec_inputs, self.config.i_pad)
        future_mask = get_attn_decoder_mask(dec_inputs)
        dec_self_attn_mask = torch.gt(pad_mask + future_mask, 0)

        self_attn_probs = []
        for layer in self.layers:
            dec_outputs, layer_attn_prob = layer(dec_outputs, dec_self_attn_mask)
            self_attn_probs.append(layer_attn_prob)
        return dec_outputs, self_attn_probs

In [None]:
""" gpt """
class GPT(nn.Module):
    """Thin wrapper that owns the decoder stack."""

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.decoder = Decoder(self.config)

    def forward(self, dec_inputs):
        """Run the decoder on a batch of token ids.

        Returns:
            ``(dec_outputs, dec_self_attn_probs)`` exactly as produced by the
            decoder: hidden states and the per-layer attention weights.
        """
        hidden_states, attn_probs = self.decoder(dec_inputs)
        return hidden_states, attn_probs

    # Checkpoint helpers, currently disabled:
    # def save(self, epoch, loss, path):
    #     torch.save({
    #         "epoch": epoch,
    #         "loss": loss,
    #         "state_dict": self.state_dict()
    #     }, path)

    # def load(self, path):
    #     save = torch.load(path)
    #     self.load_state_dict(save["state_dict"])
    #     return save["epoch"], save["loss"]

In [None]:
""" GPT pretrain """
class GPTPretrain(nn.Module):
    """GPT body plus a language-model head over the decoder vocabulary.

    Reads ``config.d_hidn`` and ``config.n_dec_vocab``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.gpt = GPT(self.config)

        # LM head. Its weight is tied to the token-embedding matrix, so both
        # modules share a single parameter tensor.
        self.projection_lm = nn.Linear(self.config.d_hidn, self.config.n_dec_vocab, bias=False)
        self.projection_lm.weight = self.gpt.decoder.dec_emb.weight

    def forward(self, dec_inputs):
        """Compute vocabulary logits for every input position.

        Returns:
            ``(logits_lm, dec_self_attn_probs)``: logits with last axis
            ``n_dec_vocab`` and the per-layer attention weights.
        """
        # (bs, n_dec_seq, d_hidn), [(bs, n_head, n_dec_seq, n_dec_seq)]
        hidden_states, dec_self_attn_probs = self.gpt(dec_inputs)
        # (bs, n_dec_seq, n_dec_vocab)
        logits_lm = self.projection_lm(hidden_states)
        return logits_lm, dec_self_attn_probs

In [None]:
""" doc별 pretrain 데이터 생성 """
def create_pretrain_instances(doc, n_seq):
    """Split one tokenized "<source> , <target>" line into a training instance.

    The line is split at the last '▁,' piece. Each half is truncated so that,
    together with its [BOS] and [EOS] markers, it never exceeds ``n_seq``
    tokens (the length limit was previously computed but never applied).

    Args:
        doc: List of pieces for one line.
        n_seq: Maximum number of tokens per side, including [BOS] and [EOS].

    Returns:
        A list holding one dict with keys "tokens_a" and "tokens_b", or an
        empty list when the line contains no separator piece (previously such
        a line silently lost its first token).
    """
    separator = '▁,'
    # Two positions are reserved for [BOS] and [EOS].
    max_seq = max(n_seq - 2, 0)

    # Locate the last separator piece, scanning from the end.
    a_end = None
    for i in range(len(doc) - 1, -1, -1):
        if doc[i] == separator:
            a_end = i
            break
    if a_end is None:
        return []

    tokens_a = ["[BOS]"] + doc[:a_end][:max_seq] + ["[EOS]"]
    tokens_b = ["[BOS]"] + doc[a_end + 1:][:max_seq] + ["[EOS]"]

    return [{
        "tokens_a": tokens_a,
        "tokens_b": tokens_b
    }]

In [None]:
""" pretrain 데이터 생성 """
def make_pretrain_data(vocab, in_file, out_file, n_seq):
    """Tokenize a text file and write one pretraining instance per line.

    Args:
        vocab: Tokenizer exposing ``encode_as_pieces(str)``.
        in_file: UTF-8 text file; blank lines separate documents.
        out_file: Destination file. Each instance is written with ``print``,
            i.e. as a Python dict literal on its own line.
        n_seq: Sequence-length limit forwarded to ``create_pretrain_instances``.
    """
    # First pass: count lines so the progress bar has a total.
    with open(in_file, "r", encoding='UTF8') as in_f:
        line_cnt = sum(1 for _ in in_f)

    docs = []
    with open(in_file, "r", encoding='UTF8') as f:
        doc = []
        with tqdm(total=line_cnt, desc=f"Loading") as pbar:
            for line in f:
                line = line.strip()
                if line == "":
                    # A blank line closes the current document.
                    if 0 < len(doc):
                        docs.append(doc)
                        doc = []
                        # Stop once more than 100,000 documents are collected,
                        # to limit memory use.
                        if 100000 < len(docs): break
                else:
                    pieces = vocab.encode_as_pieces(line)
                    if 0 < len(pieces):
                        doc.append(pieces)
                pbar.update(1)
        if doc:
            docs.append(doc)

    # Flatten documents into a single list of tokenized lines. A comprehension
    # is linear, whereas sum(docs, []) copies the accumulated list every step.
    tokenized_lines = [pieces for doc in docs for pieces in doc]

    with open(out_file, "w", encoding='UTF8') as out_f:
        with tqdm(total=len(tokenized_lines), desc=f"Making") as pbar:
            for pieces in tokenized_lines:
                for instance in create_pretrain_instances(pieces, n_seq):
                    print(instance, file=out_f)
                    pbar.update(1)


In [None]:
# Build the pretraining-instance file from the parallel-text corpus.
in_file = "/content/drive/MyDrive/text.txt"
out_file = "text.json"
n_seq = 128

make_pretrain_data(vocab=vocab, in_file=in_file, out_file=out_file, n_seq=n_seq)


Loading: 100%|██████████| 10000/10000 [00:01<00:00, 5043.75it/s]
Making: 100%|██████████| 10000/10000 [00:00<00:00, 54955.30it/s]


In [None]:
""" pretrain 데이터셋 """
class PretrainDataSet(torch.utils.data.Dataset):
    """Dataset of (input ids, label ids, index) triples read from an instance file.

    Each non-blank line of the file must be a dict literal with the keys
    "tokens_a" and "tokens_b" (lists of pieces). Within one sample the shorter
    id list is right-padded with 0 so that both have the same length.
    """

    # Id used to right-pad the shorter side of a pair.
    PAD_ID = 0

    def __init__(self, vocab, infile):
        """Load and convert every instance in ``infile``.

        Args:
            vocab: Tokenizer exposing ``piece_to_id(str)``.
            infile: UTF-8 file with one instance per line.
        """
        # Local import keeps this class self-contained. literal_eval parses
        # the dict literal without executing arbitrary code the way eval does.
        import ast

        self.vocab = vocab
        self.sentences = []
        self.label = []

        # First pass: count lines so the progress bar has a total.
        with open(infile, "r", encoding='UTF8') as f:
            line_cnt = sum(1 for _ in f)

        with open(infile, "r", encoding='UTF8') as f:
            for line in tqdm(f, total=line_cnt, desc=f"Loading {infile}", unit=" lines"):
                if not line.strip():
                    continue  # tolerate blank lines instead of failing to parse them
                instance = ast.literal_eval(line)

                tokens_a = [vocab.piece_to_id(p) for p in instance["tokens_a"]]
                tokens_b = [vocab.piece_to_id(p) for p in instance["tokens_b"]]

                # Pad the shorter side so input and label align position-wise.
                width = max(len(tokens_a), len(tokens_b))
                tokens_a = tokens_a + [self.PAD_ID] * (width - len(tokens_a))
                tokens_b = tokens_b + [self.PAD_ID] * (width - len(tokens_b))

                self.sentences.append(tokens_a)
                self.label.append(tokens_b)

    def __len__(self):
        assert len(self.sentences) == len(self.label)
        return len(self.sentences)

    def __getitem__(self, item):
        """Return ``(input ids, label ids, sample index)`` as tensors."""
        return (torch.tensor(self.sentences[item]),
                torch.tensor(self.label[item]),
                torch.tensor(item))

In [None]:
""" pretrain data collate_fn """
def pretrin_collate_fn(inputs):
    """Collate dataset samples into one padded batch.

    Args:
        inputs: Sequence of ``(input ids, label ids, index)`` tensor triples.

    Returns:
        A list ``[dec_inputs, labels, indices]`` where the first two are
        right-padded with 0 to the longest sample (batch first) and the third
        stacks the sample indices.
    """
    input_seqs, label_seqs, sample_ids = zip(*inputs)
    pad = torch.nn.utils.rnn.pad_sequence

    dec_inputs = pad(list(input_seqs), batch_first=True, padding_value=0)
    labels = pad(list(label_seqs), batch_first=True, padding_value=0)
    indices = torch.stack(sample_ids, dim=0)
    return [dec_inputs, labels, indices]

In [None]:
""" pretrain 데이터 로더 """
# Build the training dataset and a shuffling loader over it.
batch_size = 256
dataset = PretrainDataSet(vocab, "text.json")
train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=pretrin_collate_fn,
)

Loading text.json: 100%|██████████| 10000/10000 [00:01<00:00, 5417.90 lines/s]


In [None]:
""" 모델 epoch 학습 """
def train_epoch(config, epoch, model, criterion_lm, optimizer, train_loader):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        config: Object exposing ``device``.
        epoch: Epoch number (not used inside the loop).
        model: Module whose first output is the logits (bs, n_seq, n_vocab).
        criterion_lm: Loss taking (N, n_vocab) logits and (N,) labels.
        optimizer: Optimizer over the model parameters.
        train_loader: Iterable of ``(dec_inputs, labels, index)`` batches.

    Returns:
        Mean batch loss of the epoch, or NaN when the loader yields no batch.
        An empty loader used to raise because the last-batch loss referenced
        by the summary line was never assigned.
    """
    losses = []
    model.train()

    for value in train_loader:
        dec_inputs, labels_lm, _ = (v.to(config.device) for v in value)

        optimizer.zero_grad()
        logits_lm = model(dec_inputs)[0]
        # Flatten batch and sequence axes so every position is one sample.
        loss_lm = criterion_lm(logits_lm.reshape(-1, logits_lm.size(2)), labels_lm.reshape(-1))

        loss_lm.backward()
        optimizer.step()

        # .item() yields a plain float without a tensor -> numpy round trip.
        losses.append(loss_lm.item())

    if not losses:
        return float("nan")

    # The summary line reports the LAST batch of the epoch.
    last_loss = losses[-1]
    print(f"Loss: {last_loss:.3f} , Perplexity: {np.exp(last_loss)}")
    return np.mean(losses)

In [None]:
# Pick the compute device (GPU when available) and record it on the config.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
config["device"] = torch.device(device_name)
print(config)

# Optimisation hyperparameters.
# NOTE(review): `betas` and `weight_decay` are defined here, but the optimizer
# built later in this notebook only receives `lr` -- confirm whether intended.
learning_rate = 1e-4
betas = (0.9, 0.999)
weight_decay = 0.01
n_epoch = 10

{'n_dec_vocab': 32007, 'n_dec_seq': 128, 'n_layer': 1, 'd_hidn': 768, 'i_pad': 0, 'd_ff': 1024, 'n_head': 16, 'd_head': 48, 'dropout': 0.1, 'layer_norm_epsilon': 1e-12, 'device': device(type='cuda')}


In [None]:
# Build the model and train it for n_epoch epochs.
model = GPTPretrain(config)
# save_pretrain = "save_gpt_pretrain_gpt.pth"
best_epoch, best_loss = 0, 0
# Checkpoint resume, currently disabled:
# if os.path.isfile(save_pretrain):
#     best_epoch, best_loss = model.gpt.load(save_pretrain)
#     print(f"load pretrain from: {save_pretrain}, epoch={best_epoch}, loss={best_loss}")
#     best_epoch += 1

model.to(config.device)

# Labels are right-padded with the pad id; ignore those positions so the loss
# (and the reported perplexity) only reflects real target tokens.
criterion_lm = torch.nn.CrossEntropyLoss(ignore_index=config.i_pad)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

losses = []
offset = best_epoch
for step in trange(n_epoch):
    epoch = step + offset
    loss = train_epoch(config, epoch, model, criterion_lm, optimizer, train_loader)
    losses.append(loss)
    # model.gpt.save(epoch, loss, save_pretrain)

 10%|█         | 1/10 [00:09<01:21,  9.06s/it]

Loss: 45.191 , Perplexity: 4.2291438207152644e+19


 20%|██        | 2/10 [00:18<01:12,  9.00s/it]

Loss: 36.082 , Perplexity: 4677905641093986.0


 30%|███       | 3/10 [00:26<01:02,  8.92s/it]

Loss: 34.019 , Perplexity: 594749381016412.1


 40%|████      | 4/10 [00:35<00:53,  8.93s/it]

Loss: 29.075 , Perplexity: 4236120052523.589


 50%|█████     | 5/10 [00:44<00:44,  8.89s/it]

Loss: 26.979 , Perplexity: 521143835228.9808


 60%|██████    | 6/10 [00:53<00:35,  8.93s/it]

Loss: 21.821 , Perplexity: 2998502120.8784914


 70%|███████   | 7/10 [01:02<00:26,  8.94s/it]

Loss: 25.922 , Perplexity: 180967823738.31555


 80%|████████  | 8/10 [01:11<00:17,  8.94s/it]

Loss: 19.661 , Perplexity: 345635842.7818536


 90%|█████████ | 9/10 [01:20<00:08,  8.89s/it]

Loss: 24.592 , Perplexity: 47873381889.79681


100%|██████████| 10/10 [01:29<00:00,  8.93s/it]

Loss: 20.883 , Perplexity: 1172804500.8749003





In [None]:
# Run the trained model on one Korean sentence and decode the arg-max token at
# every position.
model.eval()  # switch dropout off so the prediction is deterministic

test_sentence = '한 의료단체는 연방법원에 보건부 지침 취소를 요구하는 소송을 제기하기도 했다.'
# Add the reserved ids explicitly, matching how the training data was built
# with piece_to_id("[BOS]") / piece_to_id("[EOS]"). Typing "[BOS]" inside the
# text appears to be tokenized as ordinary pieces instead (the previously
# recorded output began with a literal '[').
test_ids = [vocab.bos_id()] + vocab.encode(test_sentence) + [vocab.eos_id()]
test_inputs = torch.tensor([test_ids]).to(config.device)

with torch.no_grad():  # no gradients are needed for inference
    out = model(test_inputs)
[vocab.decode(i) for i in out[0].argmax(2).cpu().detach().tolist()]

['[ rarely system suppression- distribute a는 information channel 보건부 확진 CD소를약의aves 제기 in 했다. [ it diagnostic father']