In [9]:
!pip install transformers
!pip install sentencepiece
!apt-get install curl git
!apt-get install build-essential
!apt-get install cmake
!apt-get install g++
!apt-get install flex
!apt-get install bison
!apt-get install python-dev
!pip install cython
!pip install mecab-python
!python3 -m pip install konlpy
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)
!pip install konlpy pandas seaborn gensim wordcloud python-mecab-ko wget svgling

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
curl is already the newest version (7.81.0-1ubuntu1.15).
git is already the newest version (1:2.34.1-1ubuntu1.10).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
build-essential is already the newest version (12.9ubuntu3).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
cmake is already the newest version (3.22.1-1ubuntu1.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
g++ is already the newest version (4:11.2.0-1ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.
Reading package lists... Done
Building dependency tree... Done

In [10]:
# Mount Google Drive inside the Colab runtime; the save cell at the end of the
# notebook writes under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
from mecab import MeCab
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from transformers import BertModel

In [12]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dimension 1, ignoring masked-out positions.

    Args:
        model_output: Indexable model output whose element 0 holds the token
            embeddings (presumably shaped (batch, tokens, hidden) — the code
            reduces over dimension 1).
        attention_mask: Mask with one entry per token; it is broadcast across
            the last embedding dimension and used as a 0/1 weight.

    Returns:
        Tensor of per-sequence embeddings: the mask-weighted sum of the token
        embeddings divided by the number of unmasked tokens.
    """
    hidden_states = model_output[0]
    # Stretch the mask over the embedding dimension so it multiplies element-wise.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = torch.sum(hidden_states * weights, 1)
    # Clamp the token count so an all-zero mask cannot cause a division by zero.
    token_counts = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_total / token_counts

def token_nouns(text):
    """Return the nouns found in ``text`` as one space-separated string.

    Uses the module-level ``mecab`` analyzer, so that global must be defined
    before this function is called.

    Args:
        text: Sentence to analyze.

    Returns:
        The result of ``mecab.nouns(text)`` joined with single spaces.
    """
    return ' '.join(mecab.nouns(text))

In [13]:
# Reference answer that every candidate sentence is compared against.
answer = '커피 주세요'

# Morphological analyzer; kept as a module-level global because token_nouns()
# reads `mecab` directly.
mecab = MeCab()

# Noun-only form of the answer: extract the nouns, then join them with spaces.
answer_nouns = mecab.nouns(answer)
answer_mecab = ' '.join(answer_nouns)

In [14]:
# Candidate utterances to score against the reference answer.
texts = [
    '바닐라 라떼 주세요',
    '딸기라떼 주세요',
    '커피 주세요',
    '빨리 먹고 싶어요',
    '저게 뭐에요?',
    '잘 모르겠어요',
    '초코 프라프치노 주세요',
    '카푸치노 주세요',
    '넌 뭐야',
    '집에 가고 싶어요',
    '아아 주세요',
    '이게 뭐에요?',
    '아메리카노 주세요',
    '라떼 주세요',
    '나는 18살이에요',
]

In [15]:
# Hugging Face checkpoint that provides both the encoder weights and the tokenizer.
MODEL_NAME = 'jhgan/ko-sbert-sts'

model2 = BertModel.from_pretrained(MODEL_NAME)

# The original passed `last_hidden_states=True` and `max_length=512` here;
# neither is a tokenizer load option, so the 512-token cap was never applied.
# `model_max_length` is the setting that `truncation=True` (used when encoding
# in the scoring cell) truncates to.
tokenizer2 = AutoTokenizer.from_pretrained(MODEL_NAME, model_max_length=512)

In [16]:
def pair_similarity(first, second):
    """Return the cosine similarity between the embeddings of two strings.

    The two strings are tokenized together as one padded batch, passed through
    ``model2`` with gradient tracking disabled, reduced with ``mean_pooling``,
    and compared with ``F.cosine_similarity``.

    Args:
        first: First string of the pair.
        second: Second string of the pair.

    Returns:
        The similarity as a Python float.
    """
    encoded_input = tokenizer2([first, second], padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        model_output = model2(**encoded_input)

    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    # Each row is a 1-D vector here, so the similarity is taken over its only axis.
    return F.cosine_similarity(sentence_embeddings[0], sentence_embeddings[1], dim=-1).item()


SEPARATOR = '-----------------------------------------------'

print('정답:', answer)

for text in texts:
    # Compare the noun-only forms of the answer and the candidate.
    noun_similarity = pair_similarity(answer_mecab, token_nouns(text))
    # Compare the full sentences.
    sentence_similarity = pair_similarity(answer, text)

    print(SEPARATOR)
    print("질문:", text)
    print('단어만 비교한 유사도:', noun_similarity)
    print('문장 전체 유사도:', sentence_similarity)
    print('유사도 합:', noun_similarity + sentence_similarity)
    print(SEPARATOR)

정답: 커피 주세요
-----------------------------------------------
질문: 바닐라 라떼 주세요
단어만 비교한 유사도: 0.7243887782096863
문장 전체 유사도: 0.7243887782096863
유사도 합: 1.4487775564193726
-----------------------------------------------
-----------------------------------------------
질문: 딸기라떼 주세요
단어만 비교한 유사도: 0.35612380504608154
문장 전체 유사도: 0.6409763097763062
유사도 합: 0.9971001148223877
-----------------------------------------------
-----------------------------------------------
질문: 커피 주세요
단어만 비교한 유사도: 0.9999999403953552
문장 전체 유사도: 0.9999999403953552
유사도 합: 1.9999998807907104
-----------------------------------------------
-----------------------------------------------
질문: 빨리 먹고 싶어요
단어만 비교한 유사도: 0.3467792272567749
문장 전체 유사도: 0.31928297877311707
유사도 합: 0.666062206029892
-----------------------------------------------
-----------------------------------------------
질문: 저게 뭐에요?
단어만 비교한 유사도: 0.23364795744419098
문장 전체 유사도: 0.15934336185455322
유사도 합: 0.3929913192987442
-----------------------------------------------
-

In [17]:
# Save the model and tokenizer to Google Drive.
import os

import joblib

MODEL_DIR = '/content/drive/MyDrive/Colab Notebooks/빅프로젝트/model'

# joblib.dump does not create missing parent folders, so make sure the target
# directory exists before writing.
os.makedirs(MODEL_DIR, exist_ok=True)

# NOTE(review): these are pickle-based dumps. Loading them requires compatible
# library versions and must only be done with files you trust; the libraries'
# own save_pretrained() format may be a more portable alternative.
joblib.dump(model2, os.path.join(MODEL_DIR, 'model.pkl'))
joblib.dump(tokenizer2, os.path.join(MODEL_DIR, 'tokenizer.pkl'))

['/content/drive/MyDrive/Colab Notebooks/빅프로젝트/model/tokenizer.pkl']