# Attention

### seq2seq 한계

-> 인코더 : 입력 시퀸스를 받아 고정된 크기의 컨텍스트 벡터로 변환(타임스탭의 마지막 상태를 활용)

-> 디코더 : 인코더가 생성한 컨텍스트 벡터를 전달받아 출력 시퀸스를 생성

===> 정보 손실 : 입력 시퀸스를 고정된 크기의 컨텍스트 벡터로 변환 ➡️ 압축

===> 기울기 소실 : 긴 시퀸스를 처리할 때 초반 부분의 정보를 충분히 학습할 수 없다.

#### 어텐션 또한 입력 시퀸스의 모든 타임 스탭을 참조하여 디코더가 출력 시퀸스를 생성할 때, 각 타임스탭의 중요도를 부여하여 긴 시퀸스도 처리할 수 있다.

In [22]:
dict = {'2024' : 'LSTM', '2025' : 'Transformer'} # {key : value)

print(dict['2024'])
print(dict['2025'])

LSTM
Transformer


In [23]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Concatenate, Dropout
from tensorflow.keras import Input, Model

In [24]:
vocab_size = 10000
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=vocab_size)

In [25]:
print('리뷰의 최대 길이 : ', max(len(l) for l in X_train))
print('리뷰의 평균 길이 : ', sum(map(len, X_train))/len(X_train))

리뷰의 최대 길이 :  2494
리뷰의 평균 길이 :  238.71364


In [26]:
max_len = 500
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [27]:
import tensorflow as tf

In [28]:
class Attention(tf.keras.Model):
    def __init__(self, units):
        super(Attention, self).__init__()
        self.W1 = Dense(units) # 가중치로 입력 값 변환
        self.W2 = Dense(units) # 가중치로 입력 값 변환
        self.V = Dense(1) # 스코어를 계산하기 위한 Dense

    def call(self, values, query):
        # query shape = (batch_size, hidden_size)
        hidden_with_time_axis = tf.expand_dims(query, 1) # 이후의 연산을 위한 차원을 변경

        # score 계산 : values에 W1, hidden_with_time_axis W2를 적용하고 tanh 활성화 함수를 통과시킨 후 V 를 적용시켜 스코어를 계산
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(hidden_with_time_axis)))

        attention_weights = tf.nn.softmax(score, axis=1) # softmax 함수를 적용하여 스코어를 확률 분포로 변환

        context_vector = attention_weights * values # weights과 values를 곱하여 컨텐스트 벡터를 계산
        context_vector = tf.reduce_sum(context_vector, axis=1) # 차원을 기준으로 합하여 최종 context_vector를 계산

        return context_vector, attention_weights

In [29]:
# 모델 생성
from tensorflow.keras.layers import Dense, Embedding, Bidirectional, LSTM, Concatenate, Dropout
from tensorflow.keras import Input, Model
from tensorflow.keras import optimizers

In [30]:
sequence_input = Input(shape=(max_len,), dtype='int32')
embedded_sequences = Embedding(vocab_size, 128)(sequence_input)
lstm = Bidirectional(LSTM(64, dropout=0.5, return_sequences = True))(embedded_sequences)

2024-07-16 11:14:33.389310: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-16 11:14:33.390318: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-16 11:14:33.390982: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [31]:
# 순방향 은닉상태와 셀 상태(forword_h, forword_c), 역방향 은닉상태와 셀상태(backword_h, backword_c)

lstm, forward_h, forward_c, backward_h, backward_c = Bidirectional(LSTM(64, dropout=0.5, return_sequences=True, return_state=True))(lstm)

2024-07-16 11:14:33.600182: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-16 11:14:33.600839: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-16 11:14:33.602090: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [32]:
print(lstm.shape, forward_h.shape, forward_c.shape, backward_h.shape, backward_c.shape)

(None, 500, 128) (None, 64) (None, 64) (None, 64) (None, 64)


In [33]:
state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

In [34]:
del dict

In [35]:
attention = Attention(64)
context_vector, attention_weights = attention(lstm, state_h)

In [36]:
dense1 = Dense(20, activation='relu')(context_vector)
dropout = Dropout(0.5)(dense1)
output = Dense(1, activation='sigmoid')(dropout)
model = Model(inputs=sequence_input, outputs = output)

In [37]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [38]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 500)]        0           []                               
                                                                                                  
 embedding_1 (Embedding)        (None, 500, 128)     1280000     ['input_2[0][0]']                
                                                                                                  
 bidirectional_2 (Bidirectional  (None, 500, 128)    98816       ['embedding_1[0][0]']            
 )                                                                                                
                                                                                                  
 bidirectional_3 (Bidirectional  [(None, 500, 128),  98816       ['bidirectional_2[0][0]']  

In [39]:
history = model.fit(X_train, y_train, epochs=3, batch_size=256, validation_data=(X_test, y_test), verbose=1)

Epoch 1/3


2024-07-16 11:14:35.454463: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-16 11:14:35.455366: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-16 11:14:35.455963: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2024-07-16 11:25:05.895845: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-16 11:25:05.897046: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-16 11:25:05.897710: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/3
Epoch 3/3


In [40]:
print('Test Acc :', model.evaluate(X_test, y_test)[1])

Test Acc : 0.8784000277519226


In [3]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from keras.layers import Layer
import tensorflow.keras.backend as K

In [4]:
# Bagdangu Attention(바다나우 어텐션) : Query가 t 시점이 아닌 t-1 시점의 디코더 셀의 은닉상태
class AttentionLayer(Layer):
    # 생성자 : 부모클래스 Layer를 호출하여 초기화(build 메서드에서 설정을 수행하므로 생성자에서 작업을 하지 않는다.)
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    # build : 클래스가 처음 사용될 때 한 번 호출되는 메서드
    def buile(self, input_shape):
        # assert문은 특정 조건이 참인지 아닌지 검증(거짓이면 오류 발생)
        assert isinstance(input_shape, list) # input_shape이 list 타입인지?

        # 가중치 W_a, U_a, V_a를 초기화한다.
        # W_a : 인코더 출력에 대한 가중치 행렬
        # U_a : 디코더 히든 상태에 대한 가중치 행렬
        # V_a : attention score를 계산하기 위한 가중치 벡터
        self.W_a = self.add_weight(name='W_a', shape=tf.TensorShape((input_shape[0][2], input_shape[0][2])), initializer='uniform', trainable=True)
        self.U_a = self.add_weight(name='U_a', shape=tf.TensorShape((input_shape[1][2], input_shape[0][2])), initializer='uniform', trainable=True)
        self.V_a = self.add_weight(name='V_a', shape=tf.TensorShape((input_shape[0][2], 1)), initializer='uniform', trainable=True)

        super(AttentionLayer, self).build(input_shape) # 부모클래스의 build 메서드 호출

    def call(self, inputs, verbose=False):
        assert type(inputs) == list # input이 list 타입인지?

        # 인코더와 디코더 출력 시퀸스 저장(inputs에는 인코더와 디코더의 출력을 갖고 있다.)
        encoder_out_seq, decoder_out_seq = inputs 
        if verbose: # 지워주어도 상관 없는 부분
            print(encoder_out_seq.shape)
            print(decoder_out_seq.shape)

        # 단일 디코더 계산 
        def energy_step(inputs, states):
            # inputs : batch_size * 1 * de_in_dim(현재 디코더 상태)
            # states : batch_size * 1 * de_latent_dim(전체 상태)
            
            assert_msg = 'States must be an iterable. Got {} of type {}'.format(states, type(states))
            assert isinstance(states, list) or instance(states, tuple), assert_msg

            # 텐서를 형성하는데 필요한 매개변수
            en_seq_len, en_hidden = encoder_out_seq.shape[1], encoder_out_seq.shape[2]
            de_hidden = inputs.shape[-1]

            # W(a) * s : s(인코더 출력)
            W_a_dot_s = K.dot(decoder_out_seq, self.W_a)

            # U(a) * h(j) : h(j)(현재 디코더 상태)
            U_a_dot_h = K.expand_dims(K.dot(inputs, self.U_a), 1)

            if verbose:
                print(U_a_dot_h.shape)

            # 두 결과의 합에 tanh 함수 적용
            Ws_plus_Uh = K.tanh(W_a_dot_s + U_a_dot_h)

            if verbose:
                print(Ws_plus_Uh.shape)

            # 배치크기 세팅 softmax를 이용하여 attention score를 계산
            e_i = K.squeeze(K.dot(Ws_plus_Uh, self.V_a), axis=1)
            e_i = K.softmax(e_i)

            if verbose:
                print(e_i.shape)

            return e_i, [e_i]

        # Attention score(e_i)를 사용하여 컨텍스트 벡터(c(i)) 생성(계산)
        def context_step(inputs, states):
            assert_msg = 'States must be an iterable. Got {} of type {}'.format(states, type(states))
            assert isinstance(states, list) or isinstance(states, tuple), assert_msg

            # attention score를 가중치로 사용하여 인코더 출력의 가중합을 계산해 컨텍스트 벡터를 만든다.
            c_i = K.sum(encoder_out_seq * K.expand_dums(inputs, -1), axis=1)
            if verbose:
                print(c_i.shape)
                
            return c_i, [c_i]

        # 가짜 상태 : K.rnn 함수를 초기화하기 위해 사용
        fake_state_c = K.sum(encoder_out_seq, axis=1)
        fake_state_e = K.sum(encodre_out_seq, axis=2)

        # energy_step을 통해 attention score를 계산(rnn을 사용하여 디코더 시퀸스를 반복하여 각 타임스탭에 대한 점수)
        last_out, e_outputs, _ = K.rnn(energy_step, decoder_out_seq, [fake_state_e],)

        # 계산된 attention score를 통해 컨텍스트 벡터를 계산한다.
        last_out, c_outputs, _ = K.rnn(context_step, e_outputs, [fake_state_c], )

        # c_outputs : 각 디코더 타임스탭에 대한 컨텍스트 벡터, e_outputs : 각 디코더 타임스탭에 대한 attention score
        return c_outputs, e_outputs

    # 레이어의 출력형태를 정의하는 메서드
    def compute_output_shape(self, input_shape):
        return [
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[1][2])), # 컨텍스트 벡터의 형태
            tf.TensorShape((input_shape[1][0], input_shape[1][1], input_shape[0][1])) # attention score 형태
        ]

In [5]:
lines = open('movie_lines.txt', encoding='utf-8', errors='ignore').read().split('\n')

convers = open('movie_conversations.txt', encoding='utf-8', errors='ignore').read().split('\n')

In [6]:
len(lines), len(convers)

(304714, 83098)

In [7]:
import re

In [8]:
exchn = []
for conver in convers:
    exchn.append(conver.split(' +++$+++ ')[-1][1:-1].replace("'", " ").replace(",", "").split())

exchn[:3]

[['L194', 'L195', 'L196', 'L197'],
 ['L198', 'L199'],
 ['L200', 'L201', 'L202', 'L203']]

In [9]:
diag = {}
for line in lines:
    # print(line.split(' +++$+++ ')[0])
    # print(line.split(' +++$+++ ')[-1])
    diag[line.split(' +++$+++ ')[0]] = line.split(' +++$+++ ')[-1]

# print(diag)

In [10]:
del(lines, convers, conver, line)

In [11]:
questions = []
answers = []

for conver in exchn:
    for i in range(len(conver)-1):
        questions.append(diag[conver[i]])
        answers.append(diag[conver[i+1]])

In [12]:
questions[:3], answers[:3]

(['Can we make this quick?  Roxanne Korrine and Andrew Barrett are having an incredibly horrendous public break- up on the quad.  Again.',
  "Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.'],
 ["Well, I thought we'd start with pronunciation, if that's okay with you.",
  'Not the hacking and gagging and spitting part.  Please.',
  "Okay... then how 'bout we try out some French cuisine.  Saturday?  Night?"])

In [13]:
del(diag, exchn, conver, i)

In [14]:
sorted_ques = []
sorted_ans = []

for i in range(len(questions)):
    if len(questions[i]) < 13:
        sorted_ques.append(questions[i])
        sorted_ans.append(answers[i])

In [15]:
def clean_text(txt):
    txt = txt.lower()
    txt = re.sub(r"i'm'", "i am", txt)
    txt = re.sub(r"he's", "he is", txt)
    txt = re.sub(r"she's", "she is", txt)
    txt = re.sub(r"that's", "that is", txt)
    txt = re.sub(r"what's", "what is", txt)
    txt = re.sub(r"where's", "where is", txt)
    txt = re.sub(r"\'ll", "will", txt)
    txt = re.sub(r"\'re", "are", txt)
    txt = re.sub(r"\'d", "would", txt)
    txt = re.sub(r"won't", "will not", txt)
    txt = re.sub(r"can't", "can not", txt)
    txt = re.sub(r"[^\w\s]", "", txt) # 숫자, 문자, 언더바(\n), 공백(\s)을 제외하고 모두 제거
    return txt

In [16]:
clean_ques = []
clean_ans = []

for line in sorted_ques:
    clean_ques.append(clean_text(line))

for line in sorted_ans:
    clean_ans.append(clean_text(line))

In [17]:
clean_ques[:3], clean_ans[:3]

(['cameron', 'why', 'there'],
 ['the thing is cameron  im at the mercy of a particularly hideous breed of loser  my sister  i can not date until she does',
  'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
  'where'])

In [18]:
del(answers, questions, line)

In [19]:
del(sorted_ans, sorted_ques)

In [20]:
clean_ans[:3]

['the thing is cameron  im at the mercy of a particularly hideous breed of loser  my sister  i can not date until she does',
 'unsolved mystery  she used to be really popular when she started high school then it was just like she got sick of it or something',
 'where']

In [21]:
for i in range(len(clean_ans)):
    clean_ans[i] = ' '.join(clean_ans[i].split()[:11])

In [22]:
clean_ans[:3]

['the thing is cameron im at the mercy of a particularly',
 'unsolved mystery she used to be really popular when she started',
 'where']

In [23]:
clean_ans = clean_ans[:30000]
clean_ques = clean_ques[:30000]

In [24]:
clean_total = clean_ans + clean_ques

In [25]:
word2count = {}

for line in clean_total:
    words = line.split()
    for word in words:
        if word in word2count:
            word2count[word] += 1
        else:
            word2count[word] = 1

In [26]:
print(word2count['dog'])
print(word2count['cat'])

38
20


In [27]:
#print(word2count)

In [28]:
del(word, line)

In [29]:
# 빈도수 낮은 단어 삭제
thresh = 5

vocab = {}
word_num = 0

for word, count in word2count.items():
    if count >= thresh:
        vocab[word] = word_num
        word_num += 1

In [30]:
vocab['dog'], vocab['cat'], word_num

(380, 1565, 3076)

In [31]:
del(word2count, word, count, thresh, word_num)

In [32]:
for i in range(len(clean_ans)):
    clean_ans[i] = '<SOS> ' + clean_ans[i] + ' <EOS>'

In [33]:
tokens = ['<PAD>', '<EOS>', '<OUT>', '<SOS>']
# 토큰 뒷 부분에 하나씩 삽입
x = len(vocab)
for token in tokens:
    vocab[token] = x
    x += 1

In [34]:
for token in tokens:
    print(vocab[token])

3076
3077
3078
3079


In [35]:
vocab['cameron'] = vocab['<PAD>']
vocab['<PAD>'] = 0

In [36]:
print(vocab['cameron'], vocab['<PAD>'])

3076 0


In [37]:
del(token, tokens, x, i)

In [38]:
inv_vocab = {w:v for v, w in vocab.items()}

In [39]:
print(inv_vocab[0], inv_vocab[3023])

<PAD> nix


In [40]:
encoder_inp = []
for line in clean_ques:
    lst = []
    for word in line.split():
        if word not in vocab:
            lst.append(vocab['<OUT>'])
        else:
            lst.append(vocab[word])

    encoder_inp.append(lst)

In [41]:
decoder_inp = []
for line in clean_ans:
    lst = []
    for word in line.split():
        if word not in vocab:
            lst.append(vocab['<OUT>'])
        else:
            lst.append(vocab[word])
    decoder_inp.append(lst)

In [42]:
print(encoder_inp[:3], decoder_inp[:3])

[[3076], [170], [246]] [[3079, 0, 1, 2, 3076, 4, 5, 0, 3078, 6, 7, 8, 3077], [3079, 3078, 9, 10, 11, 12, 13, 14, 15, 16, 10, 17, 3077], [3079, 18, 3077]]


In [43]:
del(clean_ans, clean_ques, line, lst, word)

In [44]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

encoder_inp = pad_sequences(encoder_inp, 13, padding='post', truncating='post')
decoder_inp = pad_sequences(decoder_inp, 13, padding='post', truncating='post')

In [45]:
decoder_final_output = []
for i in decoder_inp:
    decoder_final_output.append(i[1:])

decoder_final_output = pad_sequences(decoder_final_output, 13, padding='post', truncating='post')

In [46]:
print(encoder_inp.shape, decoder_inp.shape, decoder_final_output.shape)

(30000, 13) (30000, 13) (30000, 13)


In [47]:
del(i)

In [48]:
VOCAB_SIZE = len(vocab)
MAX_LEN = 13

In [49]:
inv_vocab[13]

'be'

In [50]:
from tensorflow.keras.utils import to_categorical

decoder_final_output = to_categorical(decoder_final_output, VOCAB_SIZE)

In [51]:
decoder_final_output.shape

(30000, 13, 3080)

In [52]:
embeddings_index = {}

with open('glove.6B.50d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

In [53]:
embedding_dimention = 50

def embedding_matrix_creater(embedding_dimention, word_index):
    embedding_matrix = np.zeros((len(word_index)+1, embedding_dimention))

    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

embedding_matrix = embedding_matrix_creater(embedding_dimention, vocab)

In [54]:
del(embeddings_index)

In [55]:
embedding_matrix.shape

(3081, 50)

In [56]:
embedding_matrix[13]

array([ 9.11019981e-01, -2.28719994e-01,  2.07699999e-01, -2.02370003e-01,
        5.06969988e-01, -5.78930005e-02, -4.17290002e-01, -7.53410012e-02,
       -3.04540008e-01, -3.28600011e-03,  4.44810003e-01,  4.18179989e-01,
       -3.34089994e-01,  3.29170004e-02,  9.88720000e-01,  9.19839978e-01,
        4.05209988e-01,  1.92499999e-02, -1.05200000e-01, -7.98650026e-01,
       -3.64030004e-01, -8.79950002e-02,  7.21819997e-01,  1.11139998e-01,
        2.15299994e-01, -1.94110000e+00, -2.63760000e-01,  4.45499986e-01,
        2.75860012e-01, -2.11040005e-01,  4.02120018e+00, -6.19429983e-02,
       -3.21339995e-01, -8.19220006e-01,  2.10800007e-01, -2.04139993e-01,
        7.26249993e-01,  4.75169986e-01, -3.98530006e-01, -3.91680002e-01,
       -3.45809996e-01,  2.59280000e-02,  1.30720004e-01,  7.35620022e-01,
       -1.51989996e-01, -1.84389994e-01, -6.71280026e-01,  1.66920006e-01,
       -5.00629991e-02,  1.92410007e-01])

In [57]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, Input, Bidirectional, Concatenate, Dropout, Attention

In [58]:
embed = Embedding(VOCAB_SIZE+1, embedding_dimention, trainable=True)

embed.build((None,)) # 임베딩 레이어의 입력크기를 설정 (None,) : 크기가 가변적
embed.set_weights([embedding_matrix]) # 가중치 설정

In [59]:
enc_inp = Input(shape=(MAX_LEN, ))

enc_embed = embed(enc_inp)
enc_lstm = Bidirectional(LSTM(512, return_state=True, dropout=0.1, return_sequences=True))

encoder_outputs, forward_h, forward_c, backward_h, backward_c = enc_lstm(enc_embed)

state_h = Concatenate()([forward_h, backward_h])
state_c = Concatenate()([forward_c, backward_c])

enc_states = [state_h, state_c]

2024-07-17 09:13:28.401222: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-17 09:13:28.402519: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-17 09:13:28.403241: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [60]:
dec_inp = Input(shape=(MAX_LEN, ))

dec_embed = embed(dec_inp)
dec_lstm = LSTM(1024, return_state=True, dropout=0.1, return_sequences=True)
output, _, _ = dec_lstm(dec_embed, initial_state=enc_states)

2024-07-17 09:13:29.764986: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-17 09:13:29.766056: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-17 09:13:29.766776: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [61]:
from tensorflow.keras.layers import Attention
attn_layer = AttentionLayer()
attn_op, attn_state = attn_layer([encoder_outputs, output]) # 컨텍스트 벡터, attention score
decoder_concat_input = Concatenate(axis=-1)([output, attn_op])

NameError: Exception encountered when calling layer "attention_layer" (type AttentionLayer).

in user code:

    File "/var/folders/mr/zpw8mcz14pj_gkyqs4k_zl4c0000gn/T/ipykernel_8881/2801044672.py", line 81, in call  *
        fake_state_e = K.sum(encodre_out_seq, axis=2)

    NameError: name 'encodre_out_seq' is not defined


Call arguments received by layer "attention_layer" (type AttentionLayer):
  • inputs=['tf.Tensor(shape=(None, 13, 1024), dtype=float32)', 'tf.Tensor(shape=(None, 13, 1024), dtype=float32)']
  • verbose=False

In [None]:
dec_dense = Dense(VOCAB_SIZE, activation = 'softmax')
final_output = dec_dense(decoder_concat_input)

model = Model([enc_inp, dec_inp], final_output)

In [None]:
model.summary()

In [None]:
import keras
import tensorflow as tf

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [None]:
model.fit([encoder_inp, decoder_inp], decoder_final_output, epochs=10, batch_size=64, validation_split=0.1)

In [None]:
enc_model = tf.keras.models.Model(enc_inp, [encoder_outputs, enc_states])

decoder_state_input_h = tf.keras.layers.Input(shape=(1024, ))
decoder_state_input_c = tf.keras.layers.Input(shape=(1024, ))

decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = dec_lstm(dec_embed, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]

dec_model = tf.keras.models.Model([dec_inp, decoder_states_inputs], [decoder_outputs] + decoder_states)

In [None]:
print('#'*30)
print('#       Start Chatting       #')
print('#'*30)

prepro1 = ''

while prepro1 != 'q' : # q를 입력하면 프로그램 종료
    prepro1 = input('you : ')
    try :
        prepro1 = clean_text(prepro1) # 입력문장에 대한 전처리
        prepro = [prepro1]

        txt = []
        for x in prepro: # 토큰화
            lst = []
            for y in x.split():
                try:
                    lst.append(vocab[y]) # 정수
                except:
                    lst.append(vocab['<OUT>']) # 없으면 <OUT> 토큰 입력
            txt.append(lst)
        txt = pad_sequences(txt, MAX_LEN, padding='post') # 패딩

        enc_op, stat = enc_model.predict(txt) # 인코더 모델로 출력을 생성 (출력 시퀸스, 상태)
        empty_target_seq = np.zeros((1,1)) # 초기 디코더 입력 시퀸스 초기화
        empty_target_seq[0,0] = vocab['<SOS>']
        stop_condition = False
        decoded_translation = ''

        while not stop_condition :
            dec_outputs, h, c = dec_model.predict([empty_target_seq] + stat) # 디코더 모델로 출력 생성

            attn_op, attn_state = attn_layer([enc_op, dec_outputs]) # attention layer를 사용하여 컨텍스트 벡터, 가중치 반환
            decoder_concat_input = Concatenate(axis=-1)([dec_outputs, attn_op])
            decoder_concat_input = dec_dense(decoder_concat_input) # 디코더 모델 출력 시퀸스와 컨텍스트 벡터를 denselayer로 변환

            sampled_word_index = np.argmax(decoder_concat_input[0,-1,:]) # 가장 높은 확률을 가진 단어를 가져온다.
            sampled_word = inv_vocab[sampled_word_index] + ' '

            if sampled_word != '<EOS> ':
                decoded_translation += sampled_word

            if sampled_word == '<EOS> ' or len(decoded_translation.split()) > MAX_LEN:
                stop_condition = True

            # 다음 입력 준비
            empty_target_seq = np.zeros((1,1))
            empty_target_seq[0,0] = sampled_word_index
            stat = [h,c]

        print('chatbot :', decoded_translation)
        print('='*30)

    except:
        print('Please try again')