## Visualizing A Neural Machine Translation Model (Mechanics of Seq2seq Models With Attention)
https://nlpinkorean.github.io/visualizing-neural-machine-translation-mechanics-of-seq2seq-models-with-attention/

In [1]:
import os
import shutil
import zipfile

import pandas as pd
import tensorflow as tf
import urllib3
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
!wget http://www.manythings.org/anki/fra-eng.zip

--2023-09-06 05:24:51--  http://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7757635 (7.4M) [application/zip]
Saving to: ‘fra-eng.zip’


2023-09-06 05:24:52 (14.5 MB/s) - ‘fra-eng.zip’ saved [7757635/7757635]



In [3]:
!unzip ./fra-eng.zip

Archive:  ./fra-eng.zip
  inflating: _about.txt              
  inflating: fra.txt                 


In [4]:
lines = pd.read_csv('fra.txt', names = ['src', 'tar', 'lic'], sep = '\t')
del lines['lic']
print('전체 샘플의 개수: ', len(lines))

전체 샘플의 개수:  227815


In [5]:
lines = lines.loc[:, 'src':'tar']
lines = lines[0:60000] # 6만개만 저장
lines.sample(10)

Unnamed: 0,src,tar
25228,Break time's over.,Fin de la récréation.
52768,We can't do that now.,Nous ne pouvons pas faire cela maintenant.
10170,You'll go far.,Vous irez loin.
36226,These are my terms.,Voici mes conditions.
39468,Don't wake the baby.,Ne réveille pas le bébé.
12523,Please come in.,"Entrez, je vous prie."
6372,They're dead.,Elles sont décédées.
267,Get out.,Dégagez !
17910,Tom kept eating.,Tom continuait à manger.
52868,We met last Thursday.,Nous nous sommes rencontrées jeudi dernier.


In [6]:
lines['tar'] = lines.tar.apply(lambda x : '\t '+ x + ' \n')
# '\t ' 시작토큰 대용, ' \n' 종료토큰 대용
lines.sample(10)

Unnamed: 0,src,tar
22584,Stop nit-picking.,\t Arrête de chercher la petite bête. \n
32656,He heard the sound.,\t Il a entendu le son. \n
35820,Take a closer look.,\t Regardez de plus près ! \n
56647,I had to go to Boston.,\t Je devais aller à Boston. \n
48386,I have gained weight.,\t J'ai pris du poids. \n
30811,Where do we begin?,\t Par où commençons-nous ? \n
56886,I know you are clever.,\t Je sais que tu es intelligent. \n
30937,Who's your father?,\t Qui est ton père ? \n
7629,I am not deaf.,\t Je ne suis pas sourd. \n
26947,I made a proposal.,\t J'ai fait une proposition. \n


In [7]:
src_vocab = set()
for line in lines.src: # 1줄씩 읽음
  for char in line: # 1개의 문자씩 읽음
    src_vocab.add(char)

tar_vocab = set()
for line in lines.tar:
  for char in line:
    tar_vocab.add(char)

In [8]:
src_vocab_size = len(src_vocab) + 1
tar_vocab_size = len(tar_vocab) + 1
print('source 문장의 char 집합: ', src_vocab_size)
print('target 문장의 char 집합: ', tar_vocab_size)

source 문장의 char 집합:  80
target 문장의 char 집합:  104


In [9]:
scr_vocab = sorted(list(src_vocab))
tar_vocab = sorted(list(tar_vocab))
print(scr_vocab[45:])
print(tar_vocab[45:])

['W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'é', 'ï', '’', '€']
['T', 'U', 'V', 'W', 'X', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\xa0', '«', '»', 'À', 'Ç', 'É', 'Ê', 'Ô', 'à', 'â', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ô', 'ù', 'û', 'œ', '\u2009', '‘', '’', '\u202f', '‽']


In [10]:
src_to_index = dict([(word, i+1) for i, word in enumerate(src_vocab)])
tar_to_index = dict([(word, i+1) for i, word in enumerate(tar_vocab)])
print(src_to_index)
print(tar_to_index)

{'C': 1, '"': 2, "'": 3, 'P': 4, 's': 5, 'M': 6, '9': 7, '8': 8, '4': 9, 'a': 10, '/': 11, 'é': 12, '.': 13, 'n': 14, 'U': 15, 'h': 16, 'N': 17, 'x': 18, 'X': 19, 'o': 20, 'l': 21, 't': 22, '1': 23, 'c': 24, '-': 25, '%': 26, '0': 27, 'E': 28, 'Z': 29, '5': 30, 'e': 31, 'i': 32, 'y': 33, 'b': 34, 'T': 35, ' ': 36, 'f': 37, '$': 38, 'I': 39, 'S': 40, 'r': 41, 'q': 42, 'Y': 43, '7': 44, '6': 45, ':': 46, 'ï': 47, '€': 48, '!': 49, 'u': 50, 'Q': 51, 'A': 52, 'H': 53, 'L': 54, 'j': 55, '3': 56, '?': 57, 'V': 58, 'w': 59, 'g': 60, 'F': 61, 'W': 62, '’': 63, 'J': 64, 'G': 65, ',': 66, '2': 67, 'O': 68, 'p': 69, 'k': 70, 'd': 71, 'R': 72, 'D': 73, 'm': 74, 'v': 75, 'B': 76, 'z': 77, 'K': 78, '&': 79}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, ',': 12, '-': 13, '.': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 3

In [11]:
encoder_input = []

for line in lines.src:
  encoded_line = []
  # 각 줄에서 1개의 char
  for char in line:
    # 각 char을 정수로 변환
    encoded_line.append(src_to_index[char])
  encoder_input.append(encoded_line)
print('source 문장의 정수 인코딩: ', encoder_input[:5])

source 문장의 정수 인코딩:  [[65, 20, 13], [65, 20, 13], [65, 20, 13], [65, 20, 13], [53, 32, 13]]


In [12]:
decoder_input = []

for line in lines.tar:
  encoded_line = []
  for char in line:
    encoded_line.append(tar_to_index[char])
  decoder_input.append(encoded_line)
print('target 문장의 정수 인코딩: ', decoder_input[:5])

target 문장의 정수 인코딩:  [[1, 3, 48, 52, 3, 4, 3, 2], [1, 3, 39, 52, 69, 54, 59, 56, 14, 3, 2], [1, 3, 31, 65, 3, 69, 66, 72, 71, 56, 3, 4, 3, 2], [1, 3, 28, 66, 72, 58, 56, 3, 4, 3, 2], [1, 3, 45, 52, 63, 72, 71, 3, 4, 3, 2]]


In [13]:
decoder_target = []

for line in lines.tar:
  timestep = 0
  encoded_line = []
  for char in line:
    if timestep > 0:
      encoded_line.append(tar_to_index[char])
    timestep = timestep + 1
  decoder_target.append(encoded_line)
print('target 문장레이블의 정수인코딩: ', decoder_target[:5])

target 문장레이블의 정수인코딩:  [[3, 48, 52, 3, 4, 3, 2], [3, 39, 52, 69, 54, 59, 56, 14, 3, 2], [3, 31, 65, 3, 69, 66, 72, 71, 56, 3, 4, 3, 2], [3, 28, 66, 72, 58, 56, 3, 4, 3, 2], [3, 45, 52, 63, 72, 71, 3, 4, 3, 2]]


In [14]:
max_src_len = max([len(line) for line in lines.src])
max_tar_len = max([len(line) for line in lines.tar])
print('source 문장의 최대길이: ', max_src_len)
print('target 문장의 최대길이: ', max_tar_len)

source 문장의 최대길이:  22
target 문장의 최대길이:  76


In [15]:
encoder_input = pad_sequences(encoder_input, maxlen = max_src_len,
                              padding = 'post')
decoder_input = pad_sequences(decoder_input, maxlen = max_tar_len,
                              padding = 'post')
decoder_target = pad_sequences(decoder_target, maxlen = max_tar_len,
                              padding = 'post')

In [16]:
encoder_input.shape

(60000, 22)

In [17]:
encoder_input[:2]

array([[65, 20, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0],
       [65, 20, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0]], dtype=int32)

In [18]:
encoder_input = to_categorical(encoder_input)
decoder_input = to_categorical(decoder_input)
decoder_target = to_categorical(decoder_target)

In [19]:
encoder_input.shape

(60000, 22, 80)

In [20]:
encoder_input[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [21]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
import numpy as np

In [22]:
encoder_inputs = Input(shape = (None, src_vocab_size))
encoder_lstm = LSTM(units = 256, return_state = True)

# encoder_outputs은 여기서 불필요
encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)

# LSTM은 RNN과는 달리 상태가 두 개. 은닉 상태와 셀 상태
encoder_states = [state_h, state_c]

In [23]:
decoder_inputs = Input(shape = (None, tar_vocab_size))
decoder_lstm = LSTM(units = 256, return_sequences = True, return_state = True)

# 디코더에게 인코더의 은닉상태, 셀상태를 전달
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state = encoder_states)

decoder_softmax_layer = Dense(tar_vocab_size, activation = 'softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy')

In [24]:
model.fit(x = [encoder_input, decoder_input], y = decoder_target,
          batch_size = 64, epochs = 40, validation_split = 0.2)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x7e849af9aec0>

In [25]:
encoder_model = Model(inputs = encoder_inputs, outputs = encoder_states)

In [26]:
decoder_state_input_h = Input(shape = (256, ))
decoder_state_input_c = Input(shape = (256, ))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs,
                                                 initial_state = decoder_states_inputs)

# 훈련과정에서와 달리 LSTM의 리턴하는 은닉상태와 셀상태를 버리지 않음.
decoder_states = [state_h, state_c]
decoder_outputs = decoder_softmax_layer(decoder_outputs)
decoder_model = Model(inputs = [decoder_inputs] + decoder_states_inputs,
                      outputs = [decoder_outputs] + decoder_states)

In [27]:
index_to_src = dict((i, char) for char, i in src_to_index.items())
index_to_tar = dict((i, char) for char, i in tar_to_index.items())

In [28]:
index_to_src

{1: 'C',
 2: '"',
 3: "'",
 4: 'P',
 5: 's',
 6: 'M',
 7: '9',
 8: '8',
 9: '4',
 10: 'a',
 11: '/',
 12: 'é',
 13: '.',
 14: 'n',
 15: 'U',
 16: 'h',
 17: 'N',
 18: 'x',
 19: 'X',
 20: 'o',
 21: 'l',
 22: 't',
 23: '1',
 24: 'c',
 25: '-',
 26: '%',
 27: '0',
 28: 'E',
 29: 'Z',
 30: '5',
 31: 'e',
 32: 'i',
 33: 'y',
 34: 'b',
 35: 'T',
 36: ' ',
 37: 'f',
 38: '$',
 39: 'I',
 40: 'S',
 41: 'r',
 42: 'q',
 43: 'Y',
 44: '7',
 45: '6',
 46: ':',
 47: 'ï',
 48: '€',
 49: '!',
 50: 'u',
 51: 'Q',
 52: 'A',
 53: 'H',
 54: 'L',
 55: 'j',
 56: '3',
 57: '?',
 58: 'V',
 59: 'w',
 60: 'g',
 61: 'F',
 62: 'W',
 63: '’',
 64: 'J',
 65: 'G',
 66: ',',
 67: '2',
 68: 'O',
 69: 'p',
 70: 'k',
 71: 'd',
 72: 'R',
 73: 'D',
 74: 'm',
 75: 'v',
 76: 'B',
 77: 'z',
 78: 'K',
 79: '&'}

In [29]:
def decode_sequence(input_seq):
  # 입력으로부터 인코더의 상태를 얻음
  states_value = encoder_model.predict(input_seq)

  # <SOS>에 해당하는 원-핫 벡터 생성
  target_seq = np.zeros((1, 1, tar_vocab_size))
  target_seq[0, 0, tar_to_index['\t']] = 1.

  stop_condition = False
  decoded_sentence = ''

  # stop_condition이 True가 될 때까지 루프 반복
  while not stop_condition:
    # 이점시점의 상태 state_value를 현시점의 초기 상태로 사용
    output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

    # 예측 결과를 문자로 변환
    sampled_token_index = np.argmax(output_tokens[0, -1, :])
    sampled_char = index_to_tar[sampled_token_index]
    # 현재 시점의 예측 문자를 예측 문장에 추가
    decoded_sentence += sampled_char

    # <eos>에 도달하거나 최대 길이를 넘기면 중단
    if(sampled_char == '\n' or
       len(decoded_sentence) > max_tar_len):
      stop_condition = True

    # 현재 시점의 예측 결과를 다음 시점의 입력으로 사용하기 위해 저장
    target_seq = np.zeros((1, 1, tar_vocab_size))
    target_seq[0, 0, sampled_token_index] = 1.

    # 현재 시점의 상태를 다음 시점의 상태로 사용하기 위해 저장
    states_value = [h, c]

  return decoded_sentence

In [30]:
for seq_index in [3, 50, 100, 300, 1001]: # 입력 문장의 인덱스
  input_seq = encoder_input[seq_index:seq_index + 1]
  decoded_sentence = decode_sequence(input_seq)
  print(35 * '-')
  print('입력 문장: ', lines.src[seq_index])
  print('정답 문장: ', lines.tar[seq_index][2:len(lines.tar[seq_index]) - 1])
  print('번역 문장: ', decoded_sentence[1:len(decoded_sentence) - 1])

-----------------------------------
입력 문장:  Go.
정답 문장:  Bouge ! 
번역 문장:  Commencez ! 
-----------------------------------
입력 문장:  Hello!
정답 문장:  Bonjour ! 
번역 문장:  Aide-moi ! 
-----------------------------------
입력 문장:  Got it!
정답 문장:  J'ai pigé ! 
번역 문장:  Décange ! 
-----------------------------------
입력 문장:  Go home.
정답 문장:  Rentre à la maison. 
번역 문장:  Allez au chien ! 
-----------------------------------
입력 문장:  Get going.
정답 문장:  En avant. 
번역 문장:  Dépasse-toi ! 
