attention.py 출처 : https://github.com/thushv89/attention_keras

In [None]:
import numpy as np
import re
import pandas as pd
import csv
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Mount the user's Google Drive at /content/drive so the dataset can be read by path.
# Colab-only: the call prompts for an authorization code the first time it runs.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Load the review dataset from the mounted Drive.
reviews = pd.read_csv("/content/drive/My Drive/Colab Notebooks/ai_school/data/Musical_instruments_reviews.csv")

# Keep only the review body and its summary, and drop rows where either is missing.
reviews = reviews.loc[:, ['reviewText', 'summary']]
reviews = reviews.dropna(axis=0)

# Normalise both text columns: lower-case, then turn every non-word character
# into a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
# The pattern is a raw string so that "\w" is not an invalid escape sequence.
for column in ('reviewText', 'summary'):
    reviews[column] = reviews[column].str.lower().str.replace(r'[^\w]', ' ', regex=True)

print(reviews)

In [None]:
# Encoder side: every review split on whitespace into a list of tokens.
encoder_input = [text.split() for text in reviews['reviewText']]

# Decoder side: the same summaries are used twice -
# the decoder input is prefixed with "<start>", the decoder target is suffixed with "<end>".
decoder_input = [("<start> " + text).split() for text in reviews['summary']]
decoder_output = [(text + " <end>").split() for text in reviews['summary']]

In [None]:
# Review vocabulary: fit on the tokenised reviews, then replace tokens by integer ids.
tokenizer_re = Tokenizer()
tokenizer_re.fit_on_texts(encoder_input)
encoder_input = tokenizer_re.texts_to_sequences(encoder_input)

# Summary vocabulary: fitted on both decoder sequence sets (in this order) so that
# the tokens of the "<start>"-prefixed and the "<end>"-suffixed variants are all seen.
tokenizer_su = Tokenizer()
for token_lists in (decoder_input, decoder_output):
    tokenizer_su.fit_on_texts(token_lists)

decoder_input, decoder_output = (
    tokenizer_su.texts_to_sequences(decoder_input),
    tokenizer_su.texts_to_sequences(decoder_output),
)

In [None]:
# Pad each of the three sequence sets at the end ("post"); no maxlen is given,
# so each set is padded independently of the other two.
encoder_input, decoder_input, decoder_output = (
    pad_sequences(sequences, padding="post")
    for sequences in (encoder_input, decoder_input, decoder_output)
)

In [None]:
# Show the shapes of the padded encoder and decoder input arrays.
for padded in (encoder_input, decoder_input):
    print(padded.shape)

In [None]:
# Both directions of the summary vocabulary: word -> id and id -> word.
su_to_index, index_to_su = tokenizer_su.word_index, tokenizer_su.index_word

In [None]:
# Hold out the last `test_size` samples for validation; everything before them
# is training data. The same two slices are applied to all three arrays so the
# samples stay aligned.
test_size = 2500
train_part = slice(None, -test_size)
test_part = slice(-test_size, None)

encoder_input_train, encoder_input_test = encoder_input[train_part], encoder_input[test_part]
decoder_input_train, decoder_input_test = decoder_input[train_part], decoder_input[test_part]
decoder_output_train, decoder_output_test = decoder_output[train_part], decoder_output[test_part]

In [None]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Masking, Concatenate
from tensorflow.keras.models import Model

In [None]:
# Encoder: embed the review token ids and run them through an LSTM.
# The sequence length comes from the padded data rather than a hard-coded number,
# so the model keeps matching the arrays when preprocessing changes the longest review.
encoder_inputs = Input(shape=(encoder_input.shape[1],))
encoder_embed = Embedding(len(tokenizer_re.word_index)+1, 50)(encoder_inputs)
# NOTE(review): Masking is applied to the embedded vectors, not to the token ids, so a
# padded position is only masked if its embedding vector is entirely 0.
# Embedding(mask_zero=True) is the usual way to mask padding id 0 - confirm the intent.
encoder_mask = Masking(mask_value=0)(encoder_embed)
# return_sequences=True makes the LSTM return the hidden state of every time step,
# which the attention computation needs; return_state=True adds the final h and c states.
encoder_outputs, h_state, c_state = LSTM(50, return_state=True, return_sequences=True)(encoder_mask)

In [None]:
# Decoder: embed the summary token ids and run them through an LSTM that starts
# from the encoder's final hidden and cell states.
# The sequence length comes from the padded data rather than a hard-coded number,
# so the model keeps matching the arrays when preprocessing changes the longest summary.
decoder_inputs = Input(shape=(decoder_input.shape[1],))
decoder_embed = Embedding(len(tokenizer_su.word_index)+1, 50)(decoder_inputs)
# NOTE(review): Masking is applied to the embedded vectors, not to the token ids, so a
# padded position is only masked if its embedding vector is entirely 0.
# Embedding(mask_zero=True) is the usual way to mask padding id 0 - confirm the intent.
decoder_mask = Masking(mask_value=0)(decoder_embed)
# The layer object is kept in a variable so it can be called again with other initial states.
decoder_lstm = LSTM(50, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_mask, initial_state=[h_state, c_state])

In [None]:
from attention import AttentionLayer

# Create the attention layer once and keep the object so it can be applied again later.
attn_layer = AttentionLayer()
# Per the original author's note: attn_out is the attention value (the weighted sum of
# the encoder hidden states) and attn_states holds the attention weights.
attn_out, attn_states = attn_layer([encoder_outputs, decoder_outputs])
# Join the decoder hidden states with the attention value to form the vector
# that the output layer classifies.
decoder_concat_input = Concatenate()([decoder_outputs, attn_out])

# One output unit per summary-vocabulary id, plus one for id 0.
summary_vocab_size = len(tokenizer_su.word_index) + 1
decoder_dense = Dense(summary_vocab_size, activation='softmax')
decoder_softmax_outputs = decoder_dense(decoder_concat_input)

In [None]:
# Training model: (review ids, summary ids with "<start>") -> per-step word distribution.
model = Model([encoder_inputs, decoder_inputs], decoder_softmax_outputs)
model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['acc'])

# Train on the leading part of the data and validate on the held-out tail.
train_features = [encoder_input_train, decoder_input_train]
validation_set = ([encoder_input_test, decoder_input_test], decoder_output_test)
model.fit(
    x=train_features,
    y=decoder_output_train,
    validation_data=validation_set,
    batch_size=128,
    epochs=10,
)

In [None]:
# Inference encoder: besides the final h/c states it also outputs encoder_outputs,
# because the attention computation needs them.
encoder_model = Model(inputs=encoder_inputs, outputs=[encoder_outputs, h_state, c_state])

In [None]:
# Inference decoder: reuses the trained decoder LSTM, attention layer and output
# layer, but takes its initial states and the encoder outputs as explicit inputs.
# All sizes are read from the training graph instead of being repeated as literals,
# so the two graphs cannot drift apart.
state_size = h_state.shape[-1]
encoder_h_state = Input(shape=(state_size,))
encoder_c_state = Input(shape=(state_size,))

pd_decoder_outputs, pd_h_state, pd_c_state = decoder_lstm(decoder_mask, initial_state=[encoder_h_state, encoder_c_state])

# Attention
# The placeholder has the per-sample shape of encoder_outputs:
# (number of time steps including padding, hidden-state size).
pd_encoder_outputs = Input(shape=tuple(encoder_outputs.shape[1:]))
pd_attn_out, pd_attn_states = attn_layer([pd_encoder_outputs, pd_decoder_outputs])
pd_decoder_concat = Concatenate()([pd_decoder_outputs, pd_attn_out])

pd_decoder_softmax_outputs = decoder_dense(pd_decoder_concat)

# Attention is computed inside the decoder model, so the encoder outputs are one of
# its inputs; the updated h/c states are returned for the next decoding step.
decoder_model = Model([decoder_inputs, pd_encoder_outputs, encoder_h_state, encoder_c_state], [pd_decoder_softmax_outputs, pd_h_state, pd_c_state])

In [None]:
# Read one review from stdin and summarise it with greedy decoding.
input_stc = input()
# Normalise the text the same way as the training reviews (lower-case, non-word
# characters become spaces); otherwise words with attached punctuation would not
# be found in the review vocabulary.
token_stc = re.sub(r'[^\w]', ' ', input_stc.lower()).split()
encode_stc = tokenizer_re.texts_to_sequences([token_stc])
# Pad (or truncate) to the sequence length the encoder was built with.
pad_stc = pad_sequences(encode_stc, maxlen=int(encoder_inputs.shape[1]), padding="post")

# The encoder yields three outputs: the hidden states of all time steps, and the
# hidden and cell state of the last time step.
en_out, en_hidden, en_cell = encoder_model.predict(pad_stc)

# Decoding starts from the "<start>" token.
predicted_seq = np.zeros((1,1))
predicted_seq[0, 0] = su_to_index['<start>']

decoded_stc = []

# NOTE(review): decoder_inputs is declared with a fixed sequence length while a
# length-1 sequence is fed here; this relies on Keras tolerating the mismatch - confirm.
# The loop is capped at the padded summary length so that it always terminates,
# even if the model never predicts "<end>".
max_summary_len = decoder_input.shape[1]

for _ in range(max_summary_len):
    # en_out is passed as well, because attention is computed inside the decoder model.
    output_words, h, c = decoder_model.predict([predicted_seq, en_out, en_hidden, en_cell])

    predicted_id = int(np.argmax(output_words[0, 0]))
    # Ids without a vocabulary entry (e.g. padding id 0) yield None instead of a KeyError.
    predicted_word = index_to_su.get(predicted_id)

    if predicted_word is None or predicted_word == '<end>':
        break

    decoded_stc.append(predicted_word)

    # Feed the predicted word and the updated states into the next step.
    predicted_seq = np.zeros((1,1))
    predicted_seq[0, 0] = predicted_id

    en_hidden = h
    en_cell = c

print(' '.join(decoded_stc))