<a href="https://colab.research.google.com/github/yulbeom/portfolio/blob/main/%EC%9E%90%EC%97%B0%EC%96%B49_%EC%88%98%EC%97%85%EB%82%B4%EC%9A%A9_%EC%A0%95%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the tab-separated sentence-pair file. Because `names` is supplied and no
# header is requested, the first line of the file is read as data.
lines = pd.read_csv(
    './fra.txt',
    sep='\t',
    names=['src', 'tar', 'lic'],
)
lines

In [None]:
# Show the last five source sentences together with their character counts.
for i in lines['src'].tail(5):
  print(i)
  print('문장의 길이: ', len(i))

In [None]:
# Character length of every source / target sentence, stored as new columns.
lines['length_src'] = lines['src'].apply(len)
lines['length_tar'] = lines['tar'].apply(len)

In [None]:
# Display the DataFrame, including the length_src / length_tar columns.
lines

In [None]:
# Mean, median and maximum of the source sentence lengths, followed by a
# 100-bin histogram of the same column.
print('src 평균: %.1f' % lines['length_src'].mean())
print('src 중앙값: ', lines['length_src'].median())
print('src 최대값: ', lines['length_src'].max())
lines['length_src'].plot.hist(bins=100)

In [None]:
# Mean, median and maximum of the target sentence lengths, followed by a
# 100-bin histogram of the same column.
# The labels previously read 'taret'; the typo is corrected to 'target'.
print('target 평균: %.1f' %(lines.length_tar.mean()))
print('target 중앙값: ', lines.length_tar.median())
print('target 최대값: ', lines.length_tar.max())
lines.length_tar.plot(kind='hist', bins=100)

In [None]:
# Inspect ten randomly sampled rows.
lines.sample(10)

In [None]:
# Keep only the pairs whose source sentence is at most 15 characters long.
# NOTE(review): the variable name says 30 but the threshold is 15 — confirm
# which one is intended.
lines_30 = lines[lines['length_src'] <= 15]

In [None]:
# (rows, columns) of the length-filtered frame.
lines_30.shape

In [None]:
# Renumber the rows from 0 after filtering. The previous index is kept as a new
# 'index' column, which a later cell drops.
lines_30 = lines_30.reset_index()

In [None]:
# Display the re-indexed frame.
lines_30

In [None]:
# List the column names of the filtered frame.
lines_30.columns

In [None]:
# Remove the helper columns in place so that only `src` and `tar` remain.
lines_30.drop(
    columns=['index', 'lic', 'length_src', 'length_tar'],
    inplace=True,
)

In [None]:
# Spot-check ten random rows of the filtered frame.
lines_30.sample(10)

In [None]:
# Start-of-sequence marker <sos> : \t
# End-of-sequence marker   <eos> : \n
# Every target sentence is wrapped as '\t ' + sentence + ' \n'.
# Re-running this cell wraps the sentences a second time, because it reads and
# overwrites the same column.
lines_30['tar'] = lines_30['tar'].map(
    lambda sentence: ''.join(('\t ', sentence, ' \n'))
)

In [None]:
# Spot-check ten random rows of the filtered frame again.
lines_30.sample(10)

In [None]:
# Collect the distinct characters that occur in the source sentences ...
src_vocab = set()
for line in lines_30['src']:
  src_vocab.update(line)

# ... and in the target sentences.
tar_vocab = set()
for line in lines_30['tar']:
  tar_vocab.update(line)

In [None]:
# Print both character collections. While they are still sets, the printed
# order is arbitrary.
print('src_vocab:', '\n', src_vocab)
#print('src_vocab length: ', len(src_vocab), '\n')
print('tar_vocab:', '\n', tar_vocab)
#print('tar_vocab length: ', len(tar_vocab))

In [None]:
# One slot more than the number of distinct characters: the character indices
# built later in this file start at 1, which leaves index 0 unused by any
# character.
src_vocab_size = 1 + len(src_vocab)
tar_vocab_size = 1 + len(tar_vocab)
print('src_vocab_size: ', src_vocab_size)
print('tar_vocab_size: ', tar_vocab_size)

In [None]:
# Turn each vocabulary into a sorted list so that character order (and hence
# the index assignment derived from it) is deterministic.
src_vocab = list(src_vocab)
src_vocab.sort()
tar_vocab = list(tar_vocab)
tar_vocab.sort()

In [None]:
# Peek at entries 50..74 of each vocabulary.
print(src_vocab[50:75])
print(tar_vocab[50:75])

In [None]:
# The first two entries of each vocabulary.
print(src_vocab[:2])
print(tar_vocab[:2])

In [None]:
# Map every character to an integer index, numbering from 1 in vocabulary order.
src_to_index = {char: idx for idx, char in enumerate(src_vocab, start=1)}
tar_to_index = {char: idx for idx, char in enumerate(tar_vocab, start=1)}

In [None]:
# Show the full character -> index mappings.
print(src_to_index)
print(tar_to_index)

In [None]:
# Index assigned to the character 'a' in the source mapping.
src_to_index['a']

In [None]:
# Integer-encode every source sentence character by character.
encoder_input = [
    [src_to_index[char] for char in line]
    for line in lines_30.src
]

In [None]:
# Compare the first five encoded source sentences with their original text.
print('src 문장 인코딩: ', encoder_input[:5])
print('src 문장 원본  : ', '\n', lines_30.src[:5])

In [None]:
# Encoded form of the last source sentence.
print('src 문장 인코딩 last_one: ', encoder_input[-1])

In [None]:
# Integer-encode every target sentence (including its \t and \n markers)
# character by character.
decoder_input = [
    [tar_to_index[char] for char in line]
    for line in lines_30.tar
]

In [None]:
# Compare the first five encoded target sentences with their original text.
# The labels previously said 'src' although this cell prints the decoder
# (target) data.
print('tar 문장 인코딩: ', decoder_input[:5])
print('tar 문장 원본  : ', '\n', lines_30.tar[:5])

In [None]:
# The decoder target is the target sentence without its first character, i.e.
# the decoder input shifted left by one timestep.
decoder_target = [
    [tar_to_index[char] for char in line[1:]]
    for line in lines_30.tar
]

In [None]:
# First five samples of each sequence collection, for side-by-side comparison.
print(encoder_input[:5])
print(decoder_input[:5])
print(decoder_target[:5])

In [None]:
# Longest source / target sentence, measured in characters.
max_src_len = max(map(len, lines_30.src))
max_tar_len = max(map(len, lines_30.tar))
print('max_src_len: ', max_src_len)
print('max_tar_len: ', max_tar_len)

In [None]:
# Pad every sequence at the end (padding='post') up to the longest sentence
# length. decoder_target is padded to max_tar_len as well, so it ends up with
# the same number of timesteps as decoder_input.
encoder_input = pad_sequences(encoder_input, maxlen=max_src_len, padding='post')
decoder_input = pad_sequences(decoder_input, maxlen=max_tar_len, padding='post')
decoder_target = pad_sequences(decoder_target, maxlen=max_tar_len, padding='post')

In [None]:
# First padded sample of each array, followed by its length.
for padded in (encoder_input, decoder_input, decoder_target):
  print(padded[0])
  print(len(padded[0]))

In [None]:
# One-hot encode the padded integer sequences.
# The number of classes is passed explicitly instead of being inferred from the
# largest index present in the data, so the last axis always equals the vocab
# size used by the model's Input layers (shape=(None, *_vocab_size)).
encoder_input = to_categorical(encoder_input, num_classes=src_vocab_size)
decoder_input = to_categorical(decoder_input, num_classes=tar_vocab_size)
decoder_target = to_categorical(decoder_target, num_classes=tar_vocab_size)

In [None]:
# Report each one-hot array's shape next to the sample count, maximum length
# and vocab size, so they can be compared by eye.
print('encoder input dim: ', encoder_input.shape)
print('number of samples: ', lines_30.shape[0])
print('max_src_len: ', max_src_len)
print('src_vocab_size: ', src_vocab_size)
print('-'*40)
print('decoder input dim: ', decoder_input.shape)
print('number of samples: ', lines_30.shape[0])
print('max_tar_len: ', max_tar_len)
print('tar_vocab_size: ', tar_vocab_size)
print('-'*40)
print('decoder target dim: ', decoder_target.shape)

In [None]:
from tensorflow.keras.layers import Input, SimpleRNN, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import numpy as np

In [None]:
# Compare the symbolic tensors produced by different ways of calling Input.
# NOTE(review): `(82)` is the int 82, not a one-element tuple, so the first two
# calls presumably receive the same value — confirm `(82,)` was not intended.
print(Input(82))
print(Input(shape=(82)))
print(Input(shape=(None, src_vocab_size)))

In [None]:
# encoder_input_test: Input built from a bare int; in the visible code it is
# referenced only by a commented-out experiment.
# encoder_inputs: the encoder's input. shape=(None, src_vocab_size) leaves the
# number of timesteps unspecified, with one src_vocab_size-wide vector per step.
encoder_input_test = Input(82)
encoder_inputs = Input(shape=(None, src_vocab_size))

In [None]:
# Print what an LSTM returns under each combination of return_sequences and
# return_state. Each line constructs a new layer that is not bound to a name.
print(LSTM(units=256)(encoder_inputs))
print(LSTM(units=256, return_sequences=True, return_state=False)(encoder_inputs))
print(LSTM(units=256, return_sequences=False, return_state=True)(encoder_inputs))
print(LSTM(units=256, return_sequences=True, return_state=True)(encoder_inputs))

In [None]:
#print(LSTM(units=256)(encoder_input_test))  -> raises an error

In [None]:
# Both layers are configured with return_state=True, so calling them also
# returns their final state(s). Only encoder_lstm is called in the visible
# code; encoder_rnn appears only in a commented-out line.
encoder_rnn = SimpleRNN(256, return_state=True)
encoder_lstm = LSTM(units=256, return_state=True)

In [None]:
# LSTM(units=256, return_state=True)(Input(shape=(None, src_vocab_size)))
# With return_state=True the call is unpacked into the output plus the hidden
# state (state_h) and the cell state (state_c).
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
# encoder_rnn(encoder_inputs)

In [None]:
# Inspect the three symbolic tensors returned by the encoder LSTM.
print('encoder_outputs: ', encoder_outputs)
print('state_h: ', state_h)
print('state_c: ', state_c)

In [None]:
# Keep only the two states; they are later passed to the decoder LSTM as its
# initial_state.
encoder_states = [state_h, state_c]
print(encoder_states) # context vector

In [None]:
# Decoder input: unspecified number of timesteps, one tar_vocab_size-wide
# vector per step.
print(tar_vocab_size)
decoder_inputs = Input(shape=(None, tar_vocab_size))

In [None]:
# Display the symbolic decoder input tensor.
decoder_inputs

In [None]:
# return_sequences=True yields an output for every timestep; return_state=True
# additionally returns the final states.
decoder_lstm = LSTM(units=256, return_state=True, return_sequences=True)

In [None]:
# Run the decoder LSTM starting from the encoder's states. The two returned
# states are not needed here and are bound to `_`.
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)

In [None]:
# Bare literal; presumably a scratch cell used to demonstrate `_` below.
10

In [None]:
# NOTE(review): `_` was bound by the tuple unpacking of the decoder_lstm call
# earlier in this file, so this likely shows that state tensor rather than the
# previous cell's output — confirm.
_

In [None]:
# NOTE(review): multiplies whatever `_` currently holds by 10 — scratch
# experiment; confirm it is still needed.
_ * 10

In [None]:
# Symbolic decoder output as currently bound to decoder_outputs.
print(decoder_outputs)

In [None]:
# Output layer: a softmax over tar_vocab_size classes.
print(tar_vocab_size)
decoder_softmax_layer = Dense(tar_vocab_size, activation='softmax')

In [None]:
# Apply the softmax layer; decoder_outputs is rebound to the result.
decoder_outputs = decoder_softmax_layer(decoder_outputs)

In [None]:
# Dense(tar_vocab_size, activation='softmax')(LSTM(units=256, return_state=True, return_sequences=True)(Input(shape=(None, tar_vocab_size))))

In [None]:
# Symbolic decoder output as currently bound to decoder_outputs.
print(decoder_outputs)

In [None]:
# Training model: takes the encoder and decoder inputs and produces
# decoder_outputs.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
print(model)

In [None]:
# categorical_crossentropy pairs with the one-hot targets produced by
# to_categorical earlier in this file.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
# Print the layer / parameter summary of the training model.
model.summary()

In [None]:
# Train on the (encoder input, decoder input) pair against decoder_target,
# with a 0.2 fraction of the data held out for validation.
model.fit(x=[encoder_input, decoder_input], y=decoder_target,
          batch_size=64, epochs=40, validation_split=0.2)

In [None]:
# Save only the model's weights (not the architecture) to a .h5 file.
model.save_weights('ed_fra_eng_0705.h5')

In [None]:
# Summary of the training model, shown again for reference.
model.summary() # training

In [None]:
# Tensors that the inference-time encoder/decoder models below are built from.
print(encoder_inputs)
print(decoder_inputs)
print(encoder_states)

In [None]:
# Inference encoder: maps an input sequence to the LSTM states held in
# encoder_states ([state_h, state_c]).
encoder_model = Model(inputs=encoder_inputs , outputs=encoder_states)

In [None]:
# Summary of the inference encoder.
encoder_model.summary()

In [None]:
# Inputs through which the previous hidden / cell state is fed to the decoder
# at inference time. The width 256 matches the units of decoder_lstm.
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_state_input = [decoder_state_input_h, decoder_state_input_c]
print(decoder_state_input_h)
print(decoder_state_input_c)

In [None]:
# Call the same decoder_lstm layer again, now seeded from the explicit state
# inputs instead of encoder_states. This rebinds decoder_outputs, state_h and
# state_c.
decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_input)

In [None]:
# Inference decoder: takes the one-hot input plus the previous states and
# returns the softmax output together with the updated states.
decoder_states = [state_h, state_c]
decoder_outputs = decoder_softmax_layer(decoder_outputs)
decoder_model = Model(
    inputs=[decoder_inputs, *decoder_state_input],
    outputs=[decoder_outputs, *decoder_states],
)

In [None]:
# Summary of the inference decoder.
decoder_model.summary()

In [None]:
# Reverse lookups: integer index -> character.
index_to_src = {idx: char for char, idx in src_to_index.items()}
index_to_tar = {idx: char for char, idx in tar_to_index.items()}

In [None]:
# Summary of the inference encoder (repeated).
encoder_model.summary()

In [None]:
# Shape of the full encoder input versus a one-sample slice; slicing with [:1]
# keeps the leading (sample) axis.
print(encoder_input.shape)
print(encoder_input[:1].shape)

In [None]:
# Run the inference encoder on the first sample to obtain its states.
states_value = encoder_model.predict(encoder_input[:1])

In [None]:
# Inspect the encoder's prediction for the first sample: the number of
# returned arrays, then each array (labelled h and c) with its shape.
print('encoder_states_value')
print('*'*75)
print('len(states_value): ', len(states_value))
print('-'*70)
print('h: ', states_value[0])
print('h dim: ', states_value[0].shape)
print('-'*70)
print('c: ', states_value[1])
print('c dim: ', states_value[1].shape)
print('-'*70)

In [None]:
# Build the <sos> input value.
# The np.zeros prints are illustrations; target_seq is an all-zero array of
# shape (1, 1, tar_vocab_size) whose <sos> position is set in a later cell.
print(np.zeros((2,2)))
print(tar_vocab_size)
print(np.zeros((1,1,tar_vocab_size)))
target_seq = np.zeros((1,1,tar_vocab_size))
print(target_seq.shape)

In [None]:
# Index assigned to the <sos> character '\t'.
tar_to_index['\t']

In [None]:
# Mark the <sos> character in the one-hot vector. The position is looked up
# instead of hard-coding 1, so it stays correct if the vocabulary ordering
# changes (decode_sequence uses the same lookup).
target_seq[0, 0, tar_to_index['\t']] = 1

In [None]:
# Show the <sos> input array and its shape.
print(target_seq)
print(target_seq.shape)

In [None]:
# Dump everything that is fed to the decoder model at prediction time:
# section 1 is the <sos> input array, section 2 the encoder's two state arrays
# (2.1 = states_value[0], 2.2 = states_value[1]).
print('1 target_seq (<sos>, decoder model 예측 시 입력)')
print('*' * 75)
print('target_seq:', '\n', target_seq)
print('target_seq[0][0]: ', target_seq[0][0])
print('len(target_seq[0][0]): ', len(target_seq[0][0]))
print('target_seq.shape: ', target_seq.shape)
print('*' * 75)
print('2 states_value (인코더 모델 히든, 셀 상태 출력을 예측 시 입력)')
print('*' * 75)
print('states_value:', '\n', states_value)
print('len(states_value):', len(states_value))
print('*' * 75)
print('2.1 히든 상태값 (state_value[0])')
print('-' * 70)
print(states_value[0])
print('states_value[0].shape:', states_value[0].shape)
print('-' * 70, '\n')

print('2.2 셀 상태값 (state_value[1])')
print('-' * 70)
print(states_value[1])
print('states_value[1].shape:', states_value[1].shape)
print('-' * 70, '\n')

In [None]:
# Assemble the decoder model's input list — the <sos> array followed by the
# encoder's two state arrays — then print each part with its shape.
pre_input = [target_seq] + states_value
print(pre_input)
print(len(pre_input))
print('sos: ', pre_input[0])
print('sos dim: ', pre_input[0].shape)
print('h: ', pre_input[1])
print('h dim: ', pre_input[1].shape)
print('c: ', pre_input[2])
print('c dim: ', pre_input[2].shape)

In [None]:
# One decoding step: the decoder model returns its softmax output together
# with the updated hidden and cell states.
output_tokens, h, c = decoder_model.predict(pre_input)

In [None]:
# Inspect the first decoding step: the raw distribution, its shape and sum,
# then the most probable index and the character it maps to.
print('output_tokens:', '\n', output_tokens)
print('output_tokens.shape: ', output_tokens.shape)
print('sum of output_tokens: ', np.sum(output_tokens))
sampled_token_index = np.argmax(output_tokens)
print('max value index: ', sampled_token_index)
print('index_to_tar: ', index_to_tar)
sampled_char = index_to_tar[sampled_token_index]
print('character: ', sampled_char)

In [None]:
def decode_sequence(input_seq):
  """Greedily decode one encoded source sentence into target characters.

  Args:
    input_seq: Encoder input passed unchanged to `encoder_model.predict`.
      The decoder input is built with a leading dimension of 1, so a single
      sample (e.g. `encoder_input[i:i+1]`) is expected.

  Returns:
    The predicted characters joined into one string. Decoding stops after the
    end marker '\n' has been emitted (it is included in the result), once more
    than `max_tar_len` characters have been produced, or when the most
    probable index has no character in `index_to_tar` (index 0 is not assigned
    to any character); in the last case nothing is appended for that step.
  """
  # Obtain the encoder states for the input; they seed the decoder.
  states_value = encoder_model.predict(input_seq)

  # One-hot vector for the <sos> character.
  target_seq = np.zeros((1, 1, tar_vocab_size))
  target_seq[0, 0, tar_to_index['\t']] = 1.

  decoded_chars = []

  while True:
    # Use the previous step's states as the initial state of this step.
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

    # Pick the most probable class of the last timestep.
    sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
    sampled_char = index_to_tar.get(sampled_token_index)

    # An index without a character (such as 0) previously raised KeyError;
    # treat it as the end of the sentence instead.
    if sampled_char is None:
      break

    decoded_chars.append(sampled_char)

    # Stop on <eos> or once the maximum target length is exceeded.
    if sampled_char == '\n' or len(decoded_chars) > max_tar_len:
      break

    # Feed the predicted character and the updated states into the next step.
    target_seq = np.zeros((1, 1, tar_vocab_size))
    target_seq[0, 0, sampled_token_index] = 1.
    states_value = [h, c]

  return ''.join(decoded_chars)

In [None]:
# Show sample 10 and its predicted translation. The row is taken from
# lines_30, the frame encoder_input was built from, so the printed sentence and
# the encoded input refer to the same sample.
print(lines_30.iloc[10])
print(decode_sequence(encoder_input[10:11]))

In [None]:
# Print source, reference and predicted sentence for samples 1..99.
# [2:-1] removes the leading '\t ' and the trailing '\n' from the reference;
# [1:-1] removes the first and last character of the prediction.
for i in range(1, 100):
  print('입력문장:', lines_30['src'][i])
  print('출력문장:', lines_30['tar'][i][2:-1])
  predicted = decode_sequence(encoder_input[i:i + 1])
  print('예측문장:', predicted[1:-1])
  print('-' * 30)