In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m85.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


In [2]:
from transformers import AutoTokenizer
from transformers import GPT2LMHeadModel

In [6]:
tokenizer = AutoTokenizer.from_pretrained('skt/kogpt2-base-v2', bos_token='</s>', eos_token='</s>', pad_token='<pad>')
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
print(tokenizer.bos_token_id)
print(tokenizer.eos_token_id)
print(tokenizer.pad_token_id)
print('-' * 10)
print(tokenizer.decode(1))
print(tokenizer.decode(2))
print(tokenizer.decode(3))
print(tokenizer.decode(4))

1
1
3
----------
</s>
<usr>
<pad>
<sys>


In [8]:
import pandas as pd
import tqdm
import urllib.request

In [9]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
train_data = pd.read_csv('ChatBotData.csv')

In [10]:
len(train_data)

11823

In [11]:
batch_size = 32

In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

class ChatDataset(Dataset):
    def __init__(self, train_data, tokenizer):
        self.train_data = train_data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.train_data)

    def __getitem__(self, idx):
        question = self.train_data.Q.iloc[idx]
        answer = self.train_data.A.iloc[idx]
        bos_token = self.tokenizer.bos_token_id
        eos_token = self.tokenizer.eos_token_id
        sent = self.tokenizer.encode('<usr>' + question + '<sys>' + answer, add_special_tokens=False)
        return torch.tensor([bos_token] + sent + [eos_token], dtype=torch.long)

def collate_fn(batch):
    return torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=tokenizer.pad_token_id)

batch_size = 32
chat_dataset = ChatDataset(train_data, tokenizer)
data_loader = DataLoader(chat_dataset, batch_size=batch_size, collate_fn=collate_fn)

In [14]:
import tqdm

optimizer = torch.optim.Adam(model.parameters(), lr=3e-5, eps=1e-08)

steps = len(train_data) // batch_size + 1
print(steps)

EPOCHS = 3

370


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(EPOCHS):
    epoch_loss = 0

    for batch in tqdm.tqdm(data_loader, total=steps):
        batch = batch.to(device)
        labels = batch.clone()
        optimizer.zero_grad()
        result = model(input_ids=batch, labels=labels)
        loss = result.loss
        batch_loss = loss.mean()
        
        batch_loss.backward()
        optimizer.step()
        epoch_loss += batch_loss.item() / steps

    print('[Epoch: {:>4}] cost = {:>.9}'.format(epoch + 1, epoch_loss))

100%|██████████| 370/370 [01:41<00:00,  3.64it/s]


[Epoch:    1] cost = 2.12702096


100%|██████████| 370/370 [01:46<00:00,  3.48it/s]


[Epoch:    2] cost = 1.69815452


100%|██████████| 370/370 [01:46<00:00,  3.46it/s]

[Epoch:    3] cost = 1.37507791





In [16]:
text = '오늘도 좋은 하루!'

In [17]:
sent = '<usr>' + text + '<sys>'

In [18]:
input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent)
input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)

In [19]:
output = model.generate(input_ids, max_length=50, early_stopping=True, eos_token_id=tokenizer.eos_token_id)

In [23]:
decoded_sentence = tokenizer.decode(output[0].tolist())

In [24]:
decoded_sentence.split('<sys> ')[1].replace('</s>', '')

'좋은 하루네요.'

In [26]:
output = model.generate(input_ids, max_length=50, do_sample=True, top_k=10)
tokenizer.decode(output[0].tolist())

'</s><usr> 오늘도 좋은 하루!<sys> 좋은 하루!</s>'

In [76]:
def return_answer_by_chatbot(user_text):
    sent = '<usr>' + user_text + '<sys>'
    input_ids = [tokenizer.bos_token_id] + tokenizer.encode(sent, add_special_tokens=False)
    input_ids = torch.tensor([input_ids], dtype=torch.long).to(device)
    output = model.generate(input_ids, max_length=50, do_sample=True, top_k=2)
    sentence = tokenizer.decode(output[0].tolist())
    chatbot_response = sentence.split('<sys> ')[1].replace('</s>', '')
    return chatbot_response

In [67]:
return_answer_by_chatbot('안녕! 반가워~')

'반가워요.'

In [68]:
return_answer_by_chatbot('너는 누구야?')

'저는 짝사랑이에요.'

In [69]:
return_answer_by_chatbot('사랑해')

'사랑해 보세요.'

In [77]:
return_answer_by_chatbot('나랑 영화보자')

'영화 한 편 보세요.'

In [71]:
return_answer_by_chatbot('너무 심심한데 나랑 놀자')

'짝사랑은 영원할 거예요.'

In [72]:
return_answer_by_chatbot('영화 해리포터 재밌어?')

'영화나 드라마는 다 비슷하죠.'

In [73]:
return_answer_by_chatbot('너 딥 러닝 잘해?')

'잘할 수 있어요.'

In [74]:
return_answer_by_chatbot('너 취했어?')

'취했어요.'

In [75]:
return_answer_by_chatbot('커피 한 잔 할까?')

'커피가 좋죠.'