# 실습 심화 - 간단한 챗봇 구현

## `transformers` 설치

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.11.0-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 13.5 MB/s 
Collecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 72.1 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 78.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 78.2 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 

<br>

## 데이터 다운로드

In [1]:
# Download the Korean chatbot Q/A dataset; the CSV used in this notebook is
# read from ./Chatbot_data/ChatbotData.csv after cloning.
# Source: https://github.com/songys/Chatbot_data
!git clone https://github.com/songys/Chatbot_data.git

Cloning into 'Chatbot_data'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (29/29), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 47 (delta 17), reused 3 (delta 1), pack-reused 18[K
Unpacking objects: 100% (47/47), done.


<br>

## Dataset 구축

In [10]:
import pandas as pd

# Load the question/answer pairs from the cloned repository into a DataFrame.
df = pd.read_csv('./Chatbot_data/ChatbotData.csv')
# Preview the first rows (the notebook renders the cell's last expression).
df.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [11]:
# Extract the question ('Q') and answer ('A') columns as plain Python lists;
# the two lists are index-aligned because they come from the same rows.
question_list, answer_list = (df[column].tolist() for column in ('Q', 'A'))

# Sanity check: both lists should report the same length.
print(len(question_list), len(answer_list))

11823 11823


<br>

## 모델 및 토크나이저 정의

In [12]:
import torch
from transformers import AutoTokenizer, AutoModel

# Name of the pretrained checkpoint used for both the tokenizer and the model.
MODEL_NAME = 'bert-base-multilingual-cased'

# Tokenizer and model are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# AutoModel loads the bare encoder; the checkpoint's pretraining-head weights
# (cls.*) are not used, which is what the load-time warning reports.
model = AutoModel.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/681M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<br>

## [CLS] token을 얻기 위한 함수 정의

In [13]:
def get_cls_token(sent):
    """Return the final-layer [CLS] hidden state for one or more sentences.

    Args:
        sent: A single sentence (str) or a list of sentences. Each input is
            truncated to at most 128 tokens (special tokens included).

    Returns:
        numpy.ndarray of shape (n_sentences, hidden_size); n_sentences is 1
        when a single string is passed.
    """
    # Inference only: switch off dropout so repeated calls are deterministic.
    model.eval()
    tokenized_sent = tokenizer(
        sent,
        return_tensors='pt',
        # Padding is a no-op for a single sentence but is required to build a
        # rectangular tensor when a list of different-length sentences is given.
        padding=True,
        truncation=True,
        add_special_tokens=True,
        max_length=128
    )
    # Keep the inputs on the same device as the model (CPU or GPU).
    tokenized_sent = tokenized_sent.to(model.device)
    with torch.no_grad():
        outputs = model(**tokenized_sent)
    # Position 0 of every sequence is the [CLS] token; no .detach() is needed
    # because nothing computed under no_grad tracks gradients.
    cls_hidden = outputs.last_hidden_state[:, 0, :].cpu().numpy()
    return cls_hidden

<br>

## 데이터셋의 질문 문장 벡터 생성

In [15]:
from tqdm import tqdm
import numpy as np

# Embed every dataset question, one call per question (tqdm shows progress).
# Each get_cls_token call on a single string yields a (1, hidden_size) array.
per_question_vectors = [get_cls_token(question) for question in tqdm(question_list)]

# Stacking gives (n_questions, 1, hidden_size); drop the singleton axis so that
# each row is one question vector.
question_cls_hiddens = np.squeeze(np.array(per_question_vectors), axis=1)
print(question_cls_hiddens.shape)

100%|██████████| 11823/11823 [31:15<00:00,  6.30it/s]


(11823, 768)


In [16]:
# Save the question vectors to disk so the slow embedding step need not be
# repeated; np.save appends the .npy extension to the given path.
np.save('./question_cls_hiddens_saved', question_cls_hiddens)

<br>

## 챗봇 함수 정의

In [17]:
# Reload the previously saved question vectors from the .npy file.
question_cls_hiddens = np.load('./question_cls_hiddens_saved.npy')

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

def chatbot():
    """Prompt for a question and print the answer of the closest stored question.

    The typed question is embedded with get_cls_token and compared, by cosine
    similarity, against every row of question_cls_hiddens. The answer stored at
    the index of the highest score is printed; nothing is returned.
    """
    user_query_vector = get_cls_token(input('질문을 입력하세요: '))

    # Cosine similarity of the query against each stored question vector.
    similarity_scores = cosine_similarity(user_query_vector, question_cls_hiddens)

    # argmax over the flattened scores selects the best-matching question.
    best_match_idx = similarity_scores.argmax()
    print(answer_list[best_match_idx])

<br>

## 챗봇 함수 사용

In [20]:
# Interactive demo: prompts for a question and prints the retrieved answer.
chatbot()

질문을 입력하세요: 오늘 날씨 어때요?
날씨 어플에 물어보세요.


In [21]:
# Run the chatbot again with a different question.
chatbot()

질문을 입력하세요: 요즘 날씨가 추워졌어요
안전 귀가 하세요.


In [22]:
# NOTE: chatbot() always prints the answer of the highest-scoring stored
# question (plain argmax, no similarity threshold), so a question unrelated to
# the dataset still receives some reply.
chatbot()

질문을 입력하세요: 서울역은 어디에 있죠?
어떻게든 참는게 좋을 거예요.
