In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

import re
import os
import io
import time
import random

from sklearn.model_selection import train_test_split

print(tf.__version__)

2.2.0


## 데이터 로드 

In [2]:
# data load
path_to_file = os.getenv('HOME')+'/aiffel/Chatbot_data-master/ChatbotData .csv'
data = pd.read_csv(path_to_file)

In [3]:
data

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0
...,...,...,...
11818,훔쳐보는 것도 눈치 보임.,티가 나니까 눈치가 보이는 거죠!,2
11819,훔쳐보는 것도 눈치 보임.,훔쳐보는 거 티나나봐요.,2
11820,흑기사 해주는 짝남.,설렜겠어요.,2
11821,힘든 연애 좋은 연애라는게 무슨 차이일까?,잘 헤어질 수 있는 사이 여부인 거 같아요.,2


In [4]:
# data check
src = []
tgt = []
for s,t in zip(data['Q'],data['A']):
    src.append(str(s))
    tgt.append(str(t))

for s,t in zip(src[:20],tgt[:20]):
    print(">> ", s, "+",t)

>>  12시 땡! + 하루가 또 가네요.
>>  1지망 학교 떨어졌어 + 위로해 드립니다.
>>  3박4일 놀러가고 싶다 + 여행은 언제나 좋죠.
>>  3박4일 정도 놀러가고 싶다 + 여행은 언제나 좋죠.
>>  PPL 심하네 + 눈살이 찌푸려지죠.
>>  SD카드 망가졌어 + 다시 새로 사는 게 마음 편해요.
>>  SD카드 안돼 + 다시 새로 사는 게 마음 편해요.
>>  SNS 맞팔 왜 안하지ㅠㅠ + 잘 모르고 있을 수도 있어요.
>>  SNS 시간낭비인 거 아는데 매일 하는 중 + 시간을 정하고 해보세요.
>>  SNS 시간낭비인데 자꾸 보게됨 + 시간을 정하고 해보세요.
>>  SNS보면 나만 빼고 다 행복해보여 + 자랑하는 자리니까요.
>>  가끔 궁금해 + 그 사람도 그럴 거예요.
>>  가끔 뭐하는지 궁금해 + 그 사람도 그럴 거예요.
>>  가끔은 혼자인게 좋다 + 혼자를 즐기세요.
>>  가난한 자의 설움 + 돈은 다시 들어올 거예요.
>>  가만 있어도 땀난다 + 땀을 식혀주세요.
>>  가상화폐 쫄딱 망함 + 어서 잊고 새출발 하세요.
>>  가스불 켜고 나갔어 + 빨리 집에 돌아가서 끄고 나오세요.
>>  가스불 켜놓고 나온거 같아 + 빨리 집에 돌아가서 끄고 나오세요.
>>  가스비 너무 많이 나왔다. + 다음 달에는 더 절약해봐요.


In [5]:
def preprocess_sentence(sentence):
    sentence = sentence.lower().strip()

    sentence = re.sub(r"([?.!,])", r" \1 ", sentence)
    sentence = re.sub(r'[" "]+', " ", sentence)
    sentence = re.sub(r"[^0-9ㄱ-ㅎㅏ-ㅣ가-힣a-zA-Z?.!,]+", " ", sentence)

    sentence = sentence.strip()
    
    return sentence

## 중복치, 대문자 소문자, 토큰화 

In [6]:
vocab_size = 20000
from konlpy.tag import Mecab
from collections import Counter
tokenizer = Mecab()
max_len = 50

def build_corpus(src, tgt, l, num_words=vocab_size, dup=0):
    if dup == 0:
        sen_idx = {}
        src_u = []
        tgt_u = []

        for sen1,sen2 in zip(src,tgt):
            if sen1 not in sen_idx:
                sen_idx[sen1] = 1
                src_u.append(sen1)
                tgt_u.append(sen2)

        sen_idx = {}
        src = []
        tgt = []

        for sen1,sen2 in zip(src_u,tgt_u):
            if sen2 not in sen_idx:
                sen_idx[sen2] = 1
                src.append(sen1)
                tgt.append(sen2)

    
    
    src_p = []
    tgt_p = []
    for s,t in zip(src,tgt):
        src_p.append(preprocess_sentence(s))
        tgt_p.append(preprocess_sentence(t))
    
    src_tok = []
    tgt_tok = []
    word_tok = []
    
    for s in src_p:
        tmp = tokenizer.morphs(s)
        src_tok.append(tmp)
        word_tok.append(tmp)
    
    for t in tgt_p:
        tmp = tokenizer.morphs(t)
        tgt_tok.append(tmp)
        word_tok.append(tmp)
        
    words = np.concatenate(word_tok).tolist()
    counter = Counter(words)
    counter = counter.most_common(num_words-4)
    vocab = ['<PAD>', '<BOS>', '<UNK>', '<EOS>'] + [key for key, _ in counter]
    # 사전 구성
    word_to_index = {word:index for index, word in enumerate(vocab)}

    def wordlist_to_indexlist(wordlist):
        return [word_to_index[word] if word in word_to_index else word_to_index['<UNK>'] for word in wordlist]
    # 변환 text to index
    src_data = list(map(wordlist_to_indexlist, src_tok))
    tgt_data = list(map(wordlist_to_indexlist, tgt_tok))
    
    src_l = []
    tgt_l = []
    
    for s,t in zip(src_data,tgt_data):
        if len(s) <= l and len(t) <= l:
            src_l.append(s)
            tgt_l.append(t)
            
    
    
    return src_l, tgt_l, word_to_index

que_corpus, ans_corpus, word_to_index = build_corpus(src,tgt,max_len)
for q,a in zip(que_corpus[:20], ans_corpus[:20]):
    print(q, a)

[2057, 209, 2581, 105] [264, 9, 136, 9, 39, 4]
[284, 3559, 599, 1043, 13] [624, 17, 1492, 4]
[279, 2058, 480, 65, 271, 261, 9, 11, 44, 35] [275, 16, 672, 10, 34, 4]
[3560, 1126, 36] [5180, 5, 5181, 19, 34, 4]
[3561, 1535, 3562, 13] [121, 2000, 188, 6, 24, 51, 5182, 4]
[544, 190, 1044, 132, 42, 7, 19, 627] [57, 116, 11, 15, 8, 38, 23, 15, 30, 4]
[544, 73, 1536, 102, 14, 21, 46, 412, 7, 6, 146] [73, 8, 2452, 11, 17, 18, 12, 4]
[544, 18, 37, 22, 66, 1127, 11, 35, 152, 17, 521] [2864, 7, 6, 846, 254, 4]
[494, 318, 17] [90, 27, 23, 217, 14, 28, 4]
[494, 16, 180, 102, 24, 10, 35] [180, 31, 969, 12, 4]
[3563, 33, 206, 40, 3564] [266, 16, 121, 3278, 14, 28, 4]
[3565, 15, 239, 2059, 1757] [2059, 8, 3279, 56, 12, 4]
[3566, 3567, 3568, 2582] [291, 123, 11, 513, 1162, 7, 12, 4]
[2060, 1045, 1758, 11, 1537, 13] [523, 228, 26, 844, 143, 982, 11, 562, 12, 4]
[2060, 325, 54, 78, 825, 35, 4] [424, 140, 26, 6, 67, 1873, 17, 49, 4]
[2060, 325, 3569, 863, 1128, 25, 13] [608, 7, 24, 707, 105]
[454, 436, 33

In [7]:

index_to_word = {index:word for word, index in word_to_index.items()}

## Augmentation 

In [8]:
from gensim.models import Word2Vec
wv = Word2Vec.load(os.getenv('HOME') + '/aiffel/ko/ko.bin')

In [9]:
def lexical_sub(sentence, word2vec, top=0):
    import random

    res = ""
    toks = sentence.split()
    
    try:
        _from = random.choice(toks)
        _to = word2vec.most_similar(_from)[top][0]

    except:   # 단어장에 없는 단어
        return None

    for tok in toks:
        if tok is _from: res += _to + " "
        else: res += tok + " "

    return res

In [10]:
# augmentation
from tqdm import tqdm_notebook

src_corpus = []
tgt_corpus = []
for idx in tqdm_notebook(range(7731)):
    
    old_src = []
    for w in que_corpus[idx]:
        ow = index_to_word[w]
        old_src.append(ow)
    old_src = ' '.join(old_src)

    old_tgt = []
    for w in ans_corpus[idx]:
        ow = index_to_word[w]
        old_tgt.append(ow)
    old_tgt = ' '.join(old_tgt)

    
    new_src = [None]*3
    new_tgt = [None]*3
    
    new_src[0] = old_src
    new_tgt[0] = old_tgt
    
    new_src[1] = lexical_sub(old_src, wv)
    new_src[2] = lexical_sub(old_src, wv, 1)
#    new_src[3] = lexical_sub(old_src, wv, 2)
#    new_src[4] = lexical_sub(old_src, wv, 3)
    
    new_tgt[1] = lexical_sub(old_tgt, wv)
    new_tgt[2] = lexical_sub(old_tgt, wv, 1)
#    new_tgt[3] = lexical_sub(old_tgt, wv, 2)
#    new_tgt[4] = lexical_sub(old_tgt, wv, 3)
    
    
    for i in new_src:
        for j in new_tgt:
            if i is not None and j is not None:
                src_corpus.append(i)
                tgt_corpus.append(j)
    
    
#     if new_src_1 is not None and new_tgt_1 is not None:
#         src_corpus.append(new_src_1)
#         tgt_corpus.append(new_tgt_1)
        
#     if new_src_2 is not None and new_tgt_2 is not None:
#         src_corpus.append(new_src_2)
#         tgt_corpus.append(new_tgt_2)
        
#     if new_src_3 is not None and new_tgt_3 is not None:
#         src_corpus.append(new_src_3)
#         tgt_corpus.append(new_tgt_3)
        
#     if new_src_4 is not None and new_tgt_4 is not None:
#         src_corpus.append(new_src_4)
#         tgt_corpus.append(new_tgt_4)
    
print(src_corpus[:20])
print(src[:20])
print(len(src_corpus))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/7731 [00:00<?, ?it/s]

  if __name__ == '__main__':


['12 시 땡 !', '12 시 땡 !', '12 시 땡 !', '12 시 땡 스페셜 ', '12 시 땡 스페셜 ', '12 시 땡 스페셜 ', '1 지망 학교 떨어졌 어', '1 중퇴 학교 떨어졌 어 ', '1 지망 강습소 떨어졌 어 ', '3 박 4 일 놀 러 가 고 싶 다', '3 박 4 일 놀 러 가 고 싶 다', '3 박 4 일 놀 러 가 고 싶 다', '3 박 4 일 살 러 가 고 싶 다 ', '3 박 4 일 살 러 가 고 싶 다 ', '3 박 4 일 살 러 가 고 싶 다 ', '3 박 4 초순 놀 러 가 고 싶 다 ', '3 박 4 초순 놀 러 가 고 싶 다 ', '3 박 4 초순 놀 러 가 고 싶 다 ', 'ppl 심하 네', 'ppl 심하 네']
['12시 땡!', '1지망 학교 떨어졌어', '3박4일 놀러가고 싶다', '3박4일 정도 놀러가고 싶다', 'PPL 심하네', 'SD카드 망가졌어', 'SD카드 안돼', 'SNS 맞팔 왜 안하지ㅠㅠ', 'SNS 시간낭비인 거 아는데 매일 하는 중', 'SNS 시간낭비인데 자꾸 보게됨', 'SNS보면 나만 빼고 다 행복해보여', '가끔 궁금해', '가끔 뭐하는지 궁금해', '가끔은 혼자인게 좋다', '가난한 자의 설움', '가만 있어도 땀난다', '가상화폐 쫄딱 망함', '가스불 켜고 나갔어', '가스불 켜놓고 나온거 같아', '가스비 너무 많이 나왔다.']
58005


In [11]:
new_que_corpus, new_ans_corpus, word_to_index = build_corpus(src_corpus,tgt_corpus,max_len,dup=1)


In [12]:
len(new_que_corpus)

58005

In [13]:
index_to_word = {index:word for word, index in word_to_index.items()}

In [14]:

ans = []
for a in new_ans_corpus:
    ans.append([word_to_index["<BOS>"]] + a + [word_to_index["<EOS>"]])

In [15]:
enc_tensor = tf.keras.preprocessing.sequence.pad_sequences(new_que_corpus, padding='post')
dec_tensor = tf.keras.preprocessing.sequence.pad_sequences(ans, padding='post')

enc_train, enc_val, dec_train, dec_val = \
train_test_split(enc_tensor, dec_tensor, test_size=0.01)

print("enc_train :", len(enc_train), "enc_val :", len(enc_val))
print("dec_train :", len(dec_train), "dec_val :",len(dec_val))

enc_train : 57424 enc_val : 581
dec_train : 57424 dec_val : 581


In [16]:
len(enc_train[0])

34

In [17]:

len(dec_train[0])

43

## 모델 

In [18]:
def positional_encoding(pos, d_model):
    def cal_angle(position, i):
        return position / np.power(10000, int(i) / d_model)

    def get_posi_angle_vec(position):
        return [cal_angle(position, i) for i in range(d_model)]

    sinusoid_table = np.array([get_posi_angle_vec(pos_i) for pos_i in range(pos)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])

    return sinusoid_table
print("슝=3")

슝=3


In [19]:
# Mask  생성하기
def generate_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

def generate_causality_mask(src_len, tgt_len):
    mask = 1 - np.cumsum(np.eye(src_len, tgt_len), 0)
    return tf.cast(mask, tf.float32)

def generate_masks(src, tgt):
    enc_mask = generate_padding_mask(src)
    dec_mask = generate_padding_mask(tgt)

    dec_causality_mask = generate_causality_mask(tgt.shape[1], tgt.shape[1])
    dec_mask = tf.maximum(dec_mask, dec_causality_mask)

    dec_enc_causality_mask = generate_causality_mask(tgt.shape[1], src.shape[1])
    dec_enc_mask = tf.maximum(enc_mask, dec_enc_causality_mask)

    return enc_mask, dec_enc_mask, dec_mask
print("슝=3")


슝=3


In [20]:
# Multi Head Attention 구현
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        self.depth = d_model // self.num_heads

        self.W_q = tf.keras.layers.Dense(d_model)
        self.W_k = tf.keras.layers.Dense(d_model)
        self.W_v = tf.keras.layers.Dense(d_model)

        self.linear = tf.keras.layers.Dense(d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask):
        d_k = tf.cast(K.shape[-1], tf.float32)
        QK = tf.matmul(Q, K, transpose_b=True)

        scaled_qk = QK / tf.math.sqrt(d_k)

        if mask is not None: scaled_qk += (mask * -1e9)  

        attentions = tf.nn.softmax(scaled_qk, axis=-1)
        out = tf.matmul(attentions, V)

        return out, attentions


    def split_heads(self, x):
        bsz = x.shape[0]
        split_x = tf.reshape(x, (bsz, -1, self.num_heads, self.depth))
        split_x = tf.transpose(split_x, perm=[0, 2, 1, 3])

        return split_x

    def combine_heads(self, x):
        bsz = x.shape[0]
        combined_x = tf.transpose(x, perm=[0, 2, 1, 3])
        combined_x = tf.reshape(combined_x, (bsz, -1, self.d_model))

        return combined_x


    def call(self, Q, K, V, mask):
        WQ = self.W_q(Q)
        WK = self.W_k(K)
        WV = self.W_v(V)

        WQ_splits = self.split_heads(WQ)
        WK_splits = self.split_heads(WK)
        WV_splits = self.split_heads(WV)

        out, attention_weights = self.scaled_dot_product_attention(
            WQ_splits, WK_splits, WV_splits, mask)

        out = self.combine_heads(out)
        out = self.linear(out)

        return out, attention_weights
print("슝=3")

슝=3


In [21]:
class PoswiseFeedForwardNet(tf.keras.layers.Layer):
    def __init__(self, d_model, d_ff):
        super(PoswiseFeedForwardNet, self).__init__()
        self.d_model = d_model
        self.d_ff = d_ff

        self.fc1 = tf.keras.layers.Dense(d_ff, activation='relu')
        self.fc2 = tf.keras.layers.Dense(d_model)

    def call(self, x):
        out = self.fc1(x)
        out = self.fc2(out)

        return out
print("슝=3")

슝=3


In [22]:

class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, n_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.enc_self_attn = MultiHeadAttention(d_model, n_heads)
        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, mask):

        """
        Multi-Head Attention
        """
        residual = x
        out = self.norm_1(x)
        out, enc_attn = self.enc_self_attn(out, out, out, mask)
        out = self.do(out)
        out += residual

        """
        Position-Wise Feed Forward Network
        """
        residual = out
        out = self.norm_2(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, enc_attn
print("슝=3")

슝=3


In [23]:
# Decoder 레이어 구현
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.dec_self_attn = MultiHeadAttention(d_model, num_heads)
        self.enc_dec_attn = MultiHeadAttention(d_model, num_heads)

        self.ffn = PoswiseFeedForwardNet(d_model, d_ff)

        self.norm_1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm_3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, enc_out, causality_mask, padding_mask):

        """
        Masked Multi-Head Attention
        """
        residual = x
        out = self.norm_1(x)
        out, dec_attn = self.dec_self_attn(out, out, out, padding_mask)
        out = self.do(out)
        out += residual

        """
        Multi-Head Attention
        """
        residual = out
        out = self.norm_2(out)
        out, dec_enc_attn = self.dec_self_attn(out, enc_out, enc_out, causality_mask)
        out = self.do(out)
        out += residual

        """
        Position-Wise Feed Forward Network
        """
        residual = out
        out = self.norm_3(out)
        out = self.ffn(out)
        out = self.do(out)
        out += residual

        return out, dec_attn, dec_enc_attn
print("슝=3")

슝=3


In [24]:
# Encoder 구현
class Encoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Encoder, self).__init__()
        self.n_layers = n_layers
        self.enc_layers = [EncoderLayer(d_model, n_heads, d_ff, dropout) 
                        for _ in range(n_layers)]

        self.do = tf.keras.layers.Dropout(dropout)

    def call(self, x, mask):
        out = x

        enc_attns = list()
        for i in range(self.n_layers):
            out, enc_attn = self.enc_layers[i](out, mask)
            enc_attns.append(enc_attn)

        return out, enc_attns
print("슝=3")

슝=3


In [25]:
class Decoder(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    dropout):
        super(Decoder, self).__init__()
        self.n_layers = n_layers
        self.dec_layers = [DecoderLayer(d_model, n_heads, d_ff, dropout) 
                            for _ in range(n_layers)]


    def call(self, x, enc_out, causality_mask, padding_mask):
        out = x

        dec_attns = list()
        dec_enc_attns = list()
        for i in range(self.n_layers):
            out, dec_attn, dec_enc_attn = \
            self.dec_layers[i](out, enc_out, causality_mask, padding_mask)

            dec_attns.append(dec_attn)
            dec_enc_attns.append(dec_enc_attn)

        return out, dec_attns, dec_enc_attns
print("슝=3")

슝=3


## 학습 

In [26]:
class Transformer(tf.keras.Model):
    def __init__(self,
                    n_layers,
                    d_model,
                    n_heads,
                    d_ff,
                    src_vocab_size,
                    tgt_vocab_size,
                    pos_len,
                    dropout=0.2,
                    shared_fc=True,
                    shared_emb=False):
        super(Transformer, self).__init__()

        self.d_model = tf.cast(d_model, tf.float32)

        if shared_emb:
            self.enc_emb = self.dec_emb = \
            tf.keras.layers.Embedding(src_vocab_size, d_model)
        else:
            self.enc_emb = tf.keras.layers.Embedding(src_vocab_size, d_model)
            self.dec_emb = tf.keras.layers.Embedding(tgt_vocab_size, d_model)

        self.pos_encoding = positional_encoding(pos_len, d_model)
        self.do = tf.keras.layers.Dropout(dropout)

        self.encoder = Encoder(n_layers, d_model, n_heads, d_ff, dropout)
        self.decoder = Decoder(n_layers, d_model, n_heads, d_ff, dropout)

        self.fc = tf.keras.layers.Dense(tgt_vocab_size)

        self.shared_fc = shared_fc

        if shared_fc:
            self.fc.set_weights(tf.transpose(self.dec_emb.weights))

    def embedding(self, emb, x):
        seq_len = x.shape[1]

        out = emb(x)

        if self.shared_fc: out *= tf.math.sqrt(self.d_model)

        out += self.pos_encoding[np.newaxis, ...][:, :seq_len, :]
        out = self.do(out)

        return out


    def call(self, enc_in, dec_in, enc_mask, causality_mask, dec_mask):
        enc_in = self.embedding(self.enc_emb, enc_in)
        dec_in = self.embedding(self.dec_emb, dec_in)

        enc_out, enc_attns = self.encoder(enc_in, enc_mask)

        dec_out, dec_attns, dec_enc_attns = \
        self.decoder(dec_in, enc_out, causality_mask, dec_mask)

        logits = self.fc(dec_out)

        return logits, enc_attns, dec_attns, dec_enc_attns

print("슝=3")

슝=3


In [27]:
# 주어진 하이퍼파라미터로 Transformer 인스턴스 생성
transformer = Transformer(
    n_layers=2,
    d_model=128,
    n_heads=8,
    d_ff=256,
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    pos_len=42,
    dropout=0.3,
    shared_fc=True,
    shared_emb=True)

d_model = 128
print("슝=3")

슝=3


In [28]:
class LearningRateScheduler(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(LearningRateScheduler, self).__init__()

        self.d_model = d_model
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = step ** -0.5
        arg2 = step * (self.warmup_steps ** -1.5)

        return (self.d_model ** -0.5) * tf.math.minimum(arg1, arg2)

print("슝=3")

슝=3


In [29]:
learning_rate = LearningRateScheduler(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate,
                                        beta_1=0.9,
                                        beta_2=0.98, 
                                        epsilon=1e-9)
print("슝=3")

슝=3


In [30]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

print("슝=3")

슝=3


In [31]:
# Train Step 정의
@tf.function()
def train_step(src, tgt, model, optimizer):
    tgt_in = tgt[:, :-1]  # Decoder의 input
    gold = tgt[:, 1:]     # Decoder의 output과 비교하기 위해 right shift를 통해 생성한 최종 타겟

    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt_in)

    with tf.GradientTape() as tape:
        predictions, enc_attns, dec_attns, dec_enc_attns = \
        model(src, tgt_in, enc_mask, dec_enc_mask, dec_mask)
        loss = loss_function(gold, predictions)

    gradients = tape.gradient(loss, model.trainable_variables)    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    return loss, enc_attns, dec_attns, dec_enc_attns

print("슝=3")

슝=3


In [32]:
# 훈련시키기
from tqdm import tqdm_notebook 

BATCH_SIZE = 64
EPOCHS = 5

for epoch in range(EPOCHS):
    total_loss = 0

    idx_list = list(range(0, enc_train.shape[0], BATCH_SIZE))
    random.shuffle(idx_list)
    t = tqdm_notebook(idx_list)

    for (batch, idx) in enumerate(t):
        batch_loss, enc_attns, dec_attns, dec_enc_attns = \
        train_step(enc_train[idx:idx+BATCH_SIZE],
                    dec_train[idx:idx+BATCH_SIZE],
                    transformer,
                    optimizer)

        total_loss += batch_loss

        t.set_description_str('Epoch %2d' % (epoch + 1))
        t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (batch + 1)))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  if sys.path[0] == '':


  0%|          | 0/898 [00:00<?, ?it/s]

  0%|          | 0/898 [00:00<?, ?it/s]

  0%|          | 0/898 [00:00<?, ?it/s]

  0%|          | 0/898 [00:00<?, ?it/s]

  0%|          | 0/898 [00:00<?, ?it/s]

In [33]:
# !pip install nltk # nltk가 설치되어 있지 않은 경우 주석 해제
from nltk.translate.bleu_score import sentence_bleu

reference = "많 은 자연어 처리 연구자 들 이 트랜스포머 를 선호 한다".split()
candidate = "적 은 자연어 학 개발자 들 가 트랜스포머 을 선호 한다 요".split()

print("원문:", reference)
print("번역문:", candidate)
print("BLEU Score:", sentence_bleu([reference], candidate))

원문: ['많', '은', '자연어', '처리', '연구자', '들', '이', '트랜스포머', '를', '선호', '한다']
번역문: ['적', '은', '자연어', '학', '개발자', '들', '가', '트랜스포머', '을', '선호', '한다', '요']
BLEU Score: 8.190757052088229e-155


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [34]:
print("1-gram:", sentence_bleu([reference], candidate, weights=[1, 0, 0, 0]))
print("2-gram:", sentence_bleu([reference], candidate, weights=[0, 1, 0, 0]))
print("3-gram:", sentence_bleu([reference], candidate, weights=[0, 0, 1, 0]))
print("4-gram:", sentence_bleu([reference], candidate, weights=[0, 0, 0, 1]))

1-gram: 0.5
2-gram: 0.18181818181818182
3-gram: 2.2250738585072626e-308
4-gram: 2.2250738585072626e-308


## 평가 

In [35]:
from nltk.translate.bleu_score import SmoothingFunction

def calculate_bleu(reference, candidate, weights=[0.25, 0.25, 0.25, 0.25]):
    return sentence_bleu([reference],
                         candidate,
                         weights=weights,
                         smoothing_function=SmoothingFunction().method1)  # smoothing_function 적용

print("BLEU-1:", calculate_bleu(reference, candidate, weights=[1, 0, 0, 0]))
print("BLEU-2:", calculate_bleu(reference, candidate, weights=[0, 1, 0, 0]))
print("BLEU-3:", calculate_bleu(reference, candidate, weights=[0, 0, 1, 0]))
print("BLEU-4:", calculate_bleu(reference, candidate, weights=[0, 0, 0, 1]))

print("\nBLEU-Total:", calculate_bleu(reference, candidate))

BLEU-1: 0.5
BLEU-2: 0.18181818181818182
BLEU-3: 0.010000000000000004
BLEU-4: 0.011111111111111112

BLEU-Total: 0.05637560315259291


In [36]:
# translate()

def evaluate(sentence, model, src_tokenizer, tgt_tokenizer):
    sentence = preprocess_sentence(sentence)
    
    print(sentence)
    s = src_tokenizer.morphs(sentence)
    tokens = []
    for i in s:
        tokens.append(word_to_index[i])
    pieces = tokens        
        
#     pieces = src_tokenizer.encode_as_pieces(sentence)
#     tokens = src_tokenizer.encode_as_ids(sentence)
        
    _input = tf.keras.preprocessing.sequence.pad_sequences([tokens],
                                                           maxlen=enc_train.shape[-1],
                                                           padding='post')
    
    ids = []
    output = tf.expand_dims([1], 0)
    for i in range(dec_train.shape[-1]):
        enc_padding_mask, combined_mask, dec_padding_mask = \
        generate_masks(_input, output)

        predictions, enc_attns, dec_attns, dec_enc_attns =\
        model(_input, 
              output,
              enc_padding_mask,
              combined_mask,
              dec_padding_mask)

        predicted_id = \
        tf.argmax(tf.math.softmax(predictions, axis=-1)[0, -1]).numpy().item()

        if predicted_id == 3:
            #result = tgt_tokenizer.decode_ids(ids)
            result = []
            for i in ids:
                result.append(index_to_word[i])
            print(result)
            
            return pieces, result, enc_attns, dec_attns, dec_enc_attns

        ids.append(predicted_id)
        output = tf.concat([output, tf.expand_dims([predicted_id], 0)], axis=-1)

    #result = tgt_tokenizer.decode_ids(ids)
    result = ''
    for i in ids:
        result += index_to_word[i]
    
    return pieces, result, enc_attns, dec_attns, dec_enc_attns

def translate(sentence, model, src_tokenizer, tgt_tokenizer):
    pieces, result, enc_attns, dec_attns, dec_enc_attns = \
    evaluate(sentence, model, src_tokenizer, tgt_tokenizer)

    return result
print("슝=3")

슝=3


In [37]:
def eval_bleu(src_corpus, tgt_corpus, verbose=True):
    total_score = 0.0
    sample_size = len(tgt_corpus)

    for idx in tqdm_notebook(range(sample_size)):
        src_tokens = src_corpus[idx]
        tgt_tokens = tgt_corpus[idx]

#         src_sentence = tokenizer.decode_ids((src_tokens.tolist()))
#         tgt_sentence = tokenizer.decode_ids((tgt_tokens.tolist()))

        src_sentence = []
        tgt_sentence = []
        
        for w in src_tokens:
            if w != 0:
                ow = index_to_word[w]
                src_sentence.append(ow)

        src_sentence = ' '.join(src_sentence)
        
        for w in tgt_tokens:
            if w != 0 and w != 1 and w != 3:
                ow = index_to_word[w]
                tgt_sentence.append(ow)
   
        tgt_sentence = ' '.join(tgt_sentence)
        
        reference = preprocess_sentence(tgt_sentence).split()
        candidate = translate(src_sentence, transformer, tokenizer, tokenizer)
        
        score = sentence_bleu([reference], candidate,
                              smoothing_function=SmoothingFunction().method1)
        total_score += score

        if verbose:
            print("Source Sentence: ", src_sentence)
            print("Model Prediction: ", candidate)
            print("Real: ", reference)
            print("Score: %lf\n" % score)

    print("Num of Sample:", sample_size)
    print("Total Score:", total_score / sample_size)
print("슝=3")

슝=3


In [38]:
eval_bleu(enc_val[:3], dec_val[:3], True)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/3 [00:00<?, ?it/s]

헤어진 사람 이 생각나 는데 어떻게 견디 지
['다른', '관심사', '를', '만들', '어', '보', '세요', '.']
Source Sentence:  헤어진 사람 이 생각나 는데 어떻게 견디 지
Model Prediction:  ['다른', '관심사', '를', '만들', '어', '보', '세요', '.']
Real:  ['다른', '관심사', '를', '만들', '어', '보', '세요', '.', '취미', '생활', '도', '많', '은', '도움', '이', '됩니다', '.']
Score: 0.324652

이젠 증오 가 될 거 같 테
['충분히', '이해', '해요', '.', '하지만', '나', '에게', '때문', '독', '이', '될', '수', '있', '어요', '.']
Source Sentence:  이젠 증오 가 될 거 같 테
Model Prediction:  ['충분히', '이해', '해요', '.', '하지만', '나', '에게', '때문', '독', '이', '될', '수', '있', '어요', '.']
Real:  ['충분히', '인식', '해요', '.', '하지만', '나', '에게', '도', '독', '이', '될', '수', '있', '어요', '.']
Score: 0.660633

주택 청약 가입
['내', '집', '확보', '의', '한', '걸음', '을', '내딛', '으셨', '네요', '.']
Source Sentence:  주택 청약 가입
Model Prediction:  ['내', '집', '확보', '의', '한', '걸음', '을', '내딛', '으셨', '네요', '.']
Real:  ['내', '집', '마련', '의', '한', '걸음', '을', '내딛', '으셨', '네요', '.']
Score: 0.741945

Num of Sample: 3
Total Score: 0.5757433312325374


In [40]:
translate("지루하다, 놀러가고 싶어.", transformer, tokenizer, tokenizer)

지루하다 , 놀러가고 싶어 .
['차', '전화', '도', '제', '가', '있', '죠', '.']


['차', '전화', '도', '제', '가', '있', '죠', '.']

In [41]:
translate("오늘 일찍 일어났더니 피곤하다.", transformer, tokenizer, tokenizer)

오늘 일찍 일어났더니 피곤하다 .
['오늘', '도', '중요', '하', '지', '않', '아요', '.']


['오늘', '도', '중요', '하', '지', '않', '아요', '.']

In [42]:
translate("간만에 여자친구랑 데이트 하기로 했어.", transformer, tokenizer, tokenizer)

간만에 여자친구랑 데이트 하기로 했어 .
['아파하', '지', '않', '았', '나', '들', '은', '위', '로', '되', '죠', '.']


['아파하', '지', '않', '았', '나', '들', '은', '위', '로', '되', '죠', '.']

In [43]:
translate("집에 있는다는 소리야.", transformer, tokenizer, tokenizer)

집에 있는다는 소리야 .
['뭐', '든', '하', '는', '게', '먹', '죠', '.']


['뭐', '든', '하', '는', '게', '먹', '죠', '.']

## 회고 

훈련할 때 차원이 맞지 않는 에러가 계속 발생해서 오랫동안 고생했다. 정상적으로 수행했다고 생각했는데 빠진 부분을 찾지 못해서 아쉬웠다. CV에서만 augmentation이 가능한줄 알았는데, NLP에서도 word2vec을 통해서 비슷한 어휘로 대체 하는 방법이 있다는 것이 놀라울 따름이었다. going deeper 노드를 진행하면 진행할수록 꼼꼼한 복습이 필요하겠다는 생각이 들었다.