<a href="https://colab.research.google.com/github/syoung7388/NlpBasicModel/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install sentencepiece 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


**1. 라이브러리**

In [None]:
import pandas as pd
import numpy as np
import re
import sentencepiece as spm
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import Transformer
from torch import nn
import torch
import math
from tqdm import tqdm
from tqdm.notebook import tqdm

**2. HyperParameter**

In [None]:
# Tokenizer / data settings
vocab_size = 16000  # later cells use vocab_size + 7 for the tokenizer and the model
max_len = 40        # fixed length every encoded sentence is padded/truncated to
batch_size = 64

# Optimisation settings
lr = 1e-4
epoch = 30

# Model settings
# NOTE: the name `hidde_size` (sic) is kept because the model-construction cell
# refers to it. It is passed to TFModel as the Transformer embedding width, while
# `d_model` is passed through as the feed-forward width (dim_feedforward).
hidde_size = 256
head = 8
d_model = 512
n_layers = 2
dropout = 0.1


# Seed
seed = 2891
# Call the seeding function. Assigning to `torch.manual_seed` would replace the
# function with an int and leave the random number generator unseeded.
torch.manual_seed(seed);

**3. GPU 설정**

In [None]:
! nvidia-smi
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

Tue Aug  9 14:09:58 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    27W /  70W |   2454MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

**4. Data 준비**

* Data 불러오기

In [None]:
# Download the chatbot CSV; the first column is used as questions (Q) and the
# second column as answers (A).
train_data = pd.read_csv('https://raw.githubusercontent.com/Doheon/Chatbot-Transformer/main/ChatBotData.csv')
Q = train_data.iloc[:, 0]
A = train_data.iloc[:, 1]

# Surround ? . ! , with spaces so each punctuation mark stands apart from words.
questions = [re.sub(r"([?.!,])", r" \1 ", question) for question in Q]
answers = [re.sub(r"([?.!,])", r" \1 ", answer) for answer in A]

# Preview the first five preprocessed questions and answers.
print("[Q]", questions[:5], "", "[A]", answers[:5], sep="\n")

[Q]
['12시 땡 ! ', '1지망 학교 떨어졌어', '3박4일 놀러가고 싶다', '3박4일 정도 놀러가고 싶다', 'PPL 심하네']

[A]
['하루가 또 가네요 . ', '위로해 드립니다 . ', '여행은 언제나 좋죠 . ', '여행은 언제나 좋죠 . ', '눈살이 찌푸려지죠 . ']


* Tokenizing

In [None]:
# Write the tokenizer training corpus, one sentence per line (questions first,
# then answers). Joining the combined list guarantees a newline between the last
# question and the first answer; two separate writes would glue them together.
with open('all.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(questions + answers))

corpus = "all.txt"  # training text file
prefix = "chatbot"  # prefix of the trained model file (chatbot.model)
# The "+ 7" presumably leaves room for the 4 reserved ids (pad, unk, bos, eos)
# and the 3 user-defined symbols declared below.
spm.SentencePieceTrainer.train(
    f"--input={corpus} --model_prefix={prefix} --vocab_size={vocab_size + 7}" + 
    " --model_type=bpe" +
    " --max_sentence_length=999999" + # maximum sentence length
    " --pad_id=0 --pad_piece=[PAD]" + # pad (0)
    " --unk_id=1 --unk_piece=[UNK]" + # unknown (1)
    " --bos_id=2 --bos_piece=[BOS]" + # begin of sequence (2)
    " --eos_id=3 --eos_piece=[EOS]" + # end of sequence (3)
    " --user_defined_symbols=[SEP],[CLS],[MASK]") # user-defined tokens

# Load the trained tokenizer model
vocab = spm.SentencePieceProcessor()
vocab.load(f"{prefix}.model")

# Sanity check: tokenize one sample sentence into pieces and ids
line = "안녕하세요 만나서 반갑습니다"
pieces = vocab.encode_as_pieces(line)
ids = vocab.encode_as_ids(line)

print(line)
print(pieces)
print(ids)

안녕하세요 만나서 반갑습니다
['▁안녕하세요', '▁만나서', '▁반갑습니다']
[4626, 1930, 8499]


In [None]:
# Ids wrapped around every encoded sentence. They match the bos_id=2 and
# eos_id=3 values passed to the SentencePiece trainer. Kept as one-element
# lists so they can be concatenated directly with encoded id lists.
START_TOKEN, END_TOKEN = [2], [3]

# Tokenize / integer-encode / add start and end tokens / pad
def _encode_and_pad(sentence):
  """Encode one sentence into a fixed-length id array of size `max_len`.

  The sentence is encoded with the module-level `vocab`, wrapped in
  START_TOKEN / END_TOKEN and right-padded with 0 (the pad id). When the
  wrapped sequence is longer than `max_len` it is truncated, and the final
  kept position is set to the end token so the sequence stays terminated.
  """
  ids = START_TOKEN + vocab.encode_as_ids(sentence) + END_TOKEN
  if len(ids) > max_len:
    # Plain slicing would silently drop the end token on long sentences.
    ids = ids[:max_len - 1] + END_TOKEN
  padded = np.zeros(max_len, dtype=int)
  padded[:len(ids)] = ids
  return padded


def tokenize_and_filter(inputs, outputs):
  """Encode paired sentences into fixed-length integer arrays.

  Args:
    inputs: iterable of source sentences (questions).
    outputs: iterable of target sentences (answers), paired with `inputs`
      by position; extra items in the longer iterable are ignored by `zip`.

  Returns:
    A tuple `(tokenized_inputs, tokenized_outputs)`; each is a list of
    numpy integer arrays of length `max_len`.
  """
  tokenized_inputs, tokenized_outputs = [], []
  for (sentence1, sentence2) in zip(inputs, outputs):
    tokenized_inputs.append(_encode_and_pad(sentence1))
    tokenized_outputs.append(_encode_and_pad(sentence2))

  return tokenized_inputs, tokenized_outputs

# Encode the whole corpus and show the first encoded question and answer.
questions_encode, answers_encode = tokenize_and_filter(questions, answers)
print(questions_encode[0], answers_encode[0], sep="\n")

[    2  5566 14968  3210   111     3     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
[   2 5192  217 5936    7    3    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]


* Dataset, Dataloader 준비

In [None]:
class MyDataset(Dataset):
    """Question/answer pairs served as tensors for teacher-forced training.

    Each item is a triple `(encoder input, decoder input, label)`:
    the full encoded question, the encoded answer without its last position,
    and the encoded answer without its first position (shifted by one).
    """

    def __init__(self, questions_encode, answers_encode):
        self.enc_inputs = questions_encode
        # Decoder inputs and labels are both taken from the encoded answers;
        # the one-step shift between them is applied in __getitem__.
        self.dec_inputs = answers_encode
        self.labels = answers_encode

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        source = torch.tensor(self.enc_inputs[idx])
        decoder_in = torch.tensor(self.dec_inputs[idx][:-1])
        target = torch.tensor(self.labels[idx][1:])
        return source, decoder_in, target



# Split by position: the first 10000 encoded pairs are used for training and
# the remainder for testing.
train_dataset = MyDataset(questions_encode[:10000], answers_encode[:10000])
test_dataset = MyDataset(questions_encode[10000:], answers_encode[10000:])

# Training batches are shuffled; test samples are served one at a time, in order.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

print(f"[Dataset] train: {len(train_dataset)}, test: {len(test_dataset)}")
print(f"[DataLoader] train: {len(train_dataloader)}, test: {len(test_dataloader)}")


[Dataset] train: 10000, test: 1823
[DataLoader] train: 157, test: 1823


**5. Model 설계**

* Mask


In [None]:

def generate_square_subsequent_mask(size):
    """Return a (size, size) float mask that hides later positions.

    Entries on and below the diagonal are 0.0; entries strictly above the
    diagonal are -inf, so row i can only attend to columns 0..i when the
    mask is added to attention scores.
    """
    blocked = torch.full((size, size), float('-inf'))
    # Keep -inf only strictly above the diagonal; everything else becomes 0.
    return torch.triu(blocked, diagonal=1)

def generate_mask(enc_inputs, dec_inputs):
    """Build the attention and padding masks for one batch.

    Args:
        enc_inputs: integer tensor of encoder token ids; dimension 1 is read
            as the source sequence length.
        dec_inputs: integer tensor of decoder token ids; dimension 1 is read
            as the target sequence length.

    Returns:
        `(src_mask, src_padding_mask, trg_mask, trg_padding_mask)`, all moved
        to the module-level `device`:
        - src_mask: (src_len, src_len) float tensor of zeros, i.e. no
          restriction on encoder self-attention.
        - src_padding_mask: bool tensor, True where `enc_inputs` equals 0.
        - trg_mask: (trg_len, trg_len) causal mask for the decoder.
        - trg_padding_mask: bool tensor, True where `dec_inputs` equals 0.
    """
    src_len = enc_inputs.shape[1]
    # The encoder reads the whole question at once, so it must not be causally
    # masked; an all-zero additive mask leaves every position visible.
    src_mask = torch.zeros(src_len, src_len).to(device)
    src_padding_mask = torch.eq(enc_inputs, 0).to(device)
    # Only the decoder is autoregressive and needs the subsequent-position mask.
    trg_mask = generate_square_subsequent_mask(dec_inputs.shape[1]).to(device)
    trg_padding_mask = torch.eq(dec_inputs, 0).to(device)
    return src_mask, src_padding_mask, trg_mask, trg_padding_mask

* PositionalEncoding

In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to an input, then apply dropout.

    The table is stored in the buffer `pe` with shape (max_len, 1, d_model) and
    is sliced by the size of the input's first dimension, so the first
    dimension of the input is treated as the position axis.
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # One row per position, one column per frequency.
        positions = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        inv_freq = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = positions * inv_freq

        # Even feature columns hold the sine, odd columns the cosine.
        table = torch.zeros(max_len, d_model)
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        # Registered as a buffer: follows the module across devices but is not
        # a trainable parameter.
        self.register_buffer('pe', table.unsqueeze(1))

    def forward(self, x):
        encoded = x + self.pe[:x.size(0), :]
        return self.dropout(encoded)



* Transformer

In [None]:
class TFModel(nn.Module):
    """Encoder-decoder Transformer over token ids with separate embeddings.

    Args:
        tot_vocab_size: number of token ids (embedding rows and output logits).
        hidden_size: embedding width; passed to `Transformer` as its model width.
        head: number of attention heads.
        d_model: passed to `Transformer` as `dim_feedforward` (despite the name).
        n_layers: number of encoder layers and of decoder layers.
        dropout: dropout rate for the transformer and the positional encodings.
    """

    def __init__(self, tot_vocab_size, hidden_size, head, d_model, n_layers, dropout=0.5):
        super(TFModel, self).__init__()
        self.transformer = Transformer(hidden_size, head, dim_feedforward=d_model, num_encoder_layers=n_layers, num_decoder_layers=n_layers,dropout=dropout)
        self.enc_pos = PositionalEncoding(hidden_size, dropout)
        self.enc_embedding = nn.Embedding(tot_vocab_size, hidden_size)
        self.dec_pos = PositionalEncoding(hidden_size, dropout)
        self.dec_embedding = nn.Embedding(tot_vocab_size, hidden_size)
        self.hidden_size = hidden_size
        self.tot_vocab_size = tot_vocab_size
        self.linear = nn.Linear(hidden_size, tot_vocab_size)


    def forward(self, src, trg, src_mask, trg_mask, src_pad_mask, trg_pad_mask):
        """Compute next-token logits.

        Args:
            src: (batch, src_len) source token ids.
            trg: (batch, trg_len) decoder input token ids.
            src_mask: (src_len, src_len) attention mask for the encoder.
            trg_mask: (trg_len, trg_len) attention mask for the decoder.
            src_pad_mask: (batch, src_len) bool mask, True at padded source positions.
            trg_pad_mask: (batch, trg_len) bool mask, True at padded target positions.

        Returns:
            Logits of shape (trg_len, batch, tot_vocab_size).
        """
        # Switch to sequence-first layout BEFORE adding positional encodings.
        # PositionalEncoding slices its table by the size of dimension 0, so on
        # a batch-first tensor every token of a sample would receive the same
        # encoding, selected by the sample's index in the batch.
        src = self.enc_embedding(src).transpose(0, 1) * math.sqrt(self.hidden_size)
        src = self.enc_pos(src)

        trg = self.dec_embedding(trg).transpose(0, 1) * math.sqrt(self.hidden_size)
        trg = self.dec_pos(trg)

        output = self.transformer(
            src,
            trg,
            src_mask,
            trg_mask,
            src_key_padding_mask=src_pad_mask,
            tgt_key_padding_mask=trg_pad_mask,
            # Also keep decoder cross-attention away from padded source positions.
            memory_key_padding_mask=src_pad_mask,
        )
        output = self.linear(output)

        return output


In [None]:
# Build the model. The vocabulary is vocab_size + 7, the same size requested
# from the tokenizer trainer. `hidde_size` is the embedding width and `d_model`
# is forwarded as the feed-forward width.
model = TFModel(
    tot_vocab_size=vocab_size + 7,
    hidden_size=hidde_size,
    head=head,
    d_model=d_model,
    n_layers=n_layers,
    dropout=dropout,
)
model = model.to(device)

**6. 모델 훈련 (Training)**

In [None]:
# Id 0 is the padding id used when sentences are padded to max_len. Excluding it
# from the loss stops the (mostly padded) targets from dominating the average.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

model.train()


for e in range(1, epoch+1):
    # Plain Python float: accumulating `loss.item()` avoids summing
    # graph-attached tensors across the whole epoch.
    tot_loss = 0.0
    for (enc_inputs, dec_inputs, outputs) in tqdm(train_dataloader):
        optimizer.zero_grad()

        src_mask, src_padding_mask, trg_mask, trg_padding_mask = generate_mask(enc_inputs, dec_inputs)

        result = model(enc_inputs.to(device), dec_inputs.to(device), src_mask, trg_mask, src_padding_mask,trg_padding_mask)

        # Model output is (trg_len, batch, vocab); CrossEntropyLoss expects the
        # class dimension second, i.e. (batch, vocab, trg_len).
        loss = criterion(result.permute(1,2,0), outputs.to(device).long())
        loss.backward()
        optimizer.step()
        tot_loss += loss.item()

    print(f"EPOCH [{e}/{epoch}] LOSS [{tot_loss / len(train_dataloader):.5f}] ")



  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [1/30] LOSS [1.93539] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [2/30] LOSS [0.95729] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [3/30] LOSS [0.88435] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [4/30] LOSS [0.85428] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [5/30] LOSS [0.83344] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [6/30] LOSS [0.81399] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [7/30] LOSS [0.79613] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [8/30] LOSS [0.78031] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [9/30] LOSS [0.76491] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [10/30] LOSS [0.74975] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [11/30] LOSS [0.73289] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [12/30] LOSS [0.71689] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [13/30] LOSS [0.70111] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [14/30] LOSS [0.68519] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [15/30] LOSS [0.66955] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [16/30] LOSS [0.65296] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [17/30] LOSS [0.63838] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [18/30] LOSS [0.62292] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [19/30] LOSS [0.60812] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [20/30] LOSS [0.59420] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [21/30] LOSS [0.57927] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [22/30] LOSS [0.56529] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [23/30] LOSS [0.55144] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [24/30] LOSS [0.53757] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [25/30] LOSS [0.52341] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [26/30] LOSS [0.51001] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [27/30] LOSS [0.49672] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [28/30] LOSS [0.48368] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [29/30] LOSS [0.46926] 


  0%|          | 0/157 [00:00<?, ?it/s]

EPOCH [30/30] LOSS [0.45553] 


**7. 모델 검증 (Testing)**

In [None]:

def preprocess_sentence(sentence):
    """Pad each of ? . ! , with a space on both sides, then trim the ends."""
    spaced = re.sub(r"([?.!,])", r" \1 ", sentence)
    return spaced.strip()

def evaluate(sentence):
    """Greedily decode a reply for `sentence` with the module-level `model`.

    The sentence is preprocessed, encoded with `vocab`, and wrapped in
    START_TOKEN / END_TOKEN. Starting from the start token, the most likely
    next id is appended for at most `max_len` steps, stopping early when the
    end token is predicted.

    Returns:
        1-D numpy array of generated ids, beginning with the start token and
        not including the end token.
    """
    sentence = preprocess_sentence(sentence)
    # Named `enc_input` rather than `input` to avoid shadowing the builtin.
    enc_input = torch.tensor([START_TOKEN + vocab.encode_as_ids(sentence) + END_TOKEN]).to(device)
    output = torch.tensor([START_TOKEN]).to(device)

    # Start decoding
    model.eval()
    # Inference only: skip autograd bookkeeping on every decoding step.
    with torch.no_grad():
        for _ in range(max_len):
            src_mask, src_padding_mask, trg_mask, trg_padding_mask = generate_mask(enc_input, output)

            # Model output is (trg_len, batch, vocab); make it batch-first.
            predictions = model(enc_input, output, src_mask, trg_mask, src_padding_mask, trg_padding_mask).transpose(0, 1)
            # Most likely id at the current (last) time step, shape (1, 1).
            predicted_id = predictions[:, -1:, :].argmax(dim=-1)

            # Stop as soon as the end token is predicted.
            if predicted_id.item() == END_TOKEN[0]:
                break

            # Append the prediction; it becomes part of the next decoder input.
            output = torch.cat([output, predicted_id], dim=1)

    return torch.squeeze(output, dim=0).cpu().numpy()

def predict(sentence):
    """Generate a reply for `sentence`, print the exchange, and return the reply.

    Ids produced by `evaluate` are kept only when they are below
    vocab_size + 7, converted to plain ints, and decoded with `vocab`.
    """
    generated_ids = evaluate(sentence)
    valid_ids = [int(token_id) for token_id in generated_ids if token_id < vocab_size+7]
    predicted_sentence = vocab.Decode(valid_ids)

    print('Input: {}'.format(sentence))
    print('Output: {}'.format(predicted_sentence))

    return predicted_sentence



In [None]:
# Try the trained model on a question; predict() prints the input and the
# decoded reply and returns the reply string.
predict("난 뭘 해야 할까?")

Input: 난 뭘 해야 할까?
Output: 직접 물어보세요 .


'직접 물어보세요 .'

In [None]:
# Try the trained model on a statement; predict() prints the input and the
# decoded reply and returns the reply string.
predict("나 여행가고 싶어")

Input: 나 여행가고 싶어
Output: 저도 보고 싶어요 .


'저도 보고 싶어요 .'

In [None]:
# Try the trained model on the greeting used earlier as the tokenizer sample;
# predict() prints the input and the decoded reply and returns the reply string.
predict("안녕하세요 만나서 반갑습니다")

Input: 안녕하세요 만나서 반갑습니다
Output: 그런 사람 고쳐쓰는 거 아니에요 .


'그런 사람 고쳐쓰는 거 아니에요 .'