# 시퀸스-투-시퀸스(Sequence-to-Sequence, Seq2Seq)
: 하나의 시퀸스를 입력받아 또 다른 시퀸스를 출력하는 모델

1. 인코더(Encoder) : 입력 시퀸스를 고정된 길이의 컨텍스트 벡터(Context Vector)로 변환한다.
- RNN, LSTM, GRU와 같은 순환 신경망이 사용된다.
- 인코더는 입력 시퀀스를 시간 단계별로 처리하여 각 단계의 은닉 상태를 생성한다.

2. 디코더(Decoder) : 인코더에서 생성된 컨텍스트 벡터를 사용하여 시퀀스를 생성한다.
- RNN, LSTM, GRU와 같은 순환 신경망이 사용된다.
- 인코더의 마지막 은닉 상태를 초기 상태로 사용하고, 이전 단계의 출력을 다음 시간 단계의 입력으로 사용한다.

In [3]:
import requests
import zipfile
import os

url = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
filename = 'spa-eng.zip'
response = requests.get(url)

with open(filename, 'wb') as file:
    file.write(response.content)

with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall('.')

os.remove(filename)

In [4]:
text_file = 'spa-eng/spa.txt'

with open(text_file, encoding='utf-8') as f:
    lines = f.read().split('\n')[:-1]

text_pairs = []
for line in lines:
    english, spanish = line.split('\t')
    spanish = '[start] ' + spanish + ' [end]'
    text_pairs.append((english, spanish))

In [5]:
import random
print(random.choice(text_pairs))

("Tom's new girlfriend is quite attractive.", '[start] La nueva novia de Tom es bastante atractiva. [end]')


In [10]:
import random
random.shuffle(text_pairs)
num_val_samples = int(0.15 * len(text_pairs))
num_train_samples = len(text_pairs) - 2 * num_val_samples

train_pairs = text_pairs[:num_train_samples]
val_pairs = text_pairs[num_train_samples:num_train_samples + num_val_samples]
test_pairs = text_pairs[num_train_samples + num_val_samples:]

In [11]:
print(len(train_pairs), len(val_pairs), len(test_pairs))

83276 17844 17844


In [17]:
# 영어와 스페인어 벡터화 처리
import tensorflow as tf
from tensorflow.keras import layers
import string
import re

# 특수문자 제거(제거할 특수문자 지정)
strip_chars = string.punctuation + '¿'
strip_chars = strip_chars.replace('[', '')
strip_chars = strip_chars.replace(']', '')

# 사용자 정의 전처리 함수
def custom_standardization(input_string):
    lowercase = tf.strings.lower(input_string) # 문자열을 소문자로 변환
    return tf.strings.regex_replace(lowercase, f'[{re.escape(strip_chars)}]', '') # 특수문자 제거

vocab_size = 15000
sequence_length = 20

source_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length,
)

target_vectorization = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length + 1, # Q. 스페인어에만 +1이 들어가 있는 이유
    # A. 시작 토큰 추가, 종료 토큰 예측
    standardize=custom_standardization,
)

train_english_texts = [pair[0] for pair in train_pairs]
train_spanish_texts = [pair[1] for pair in train_pairs]
source_vectorization.adapt(train_english_texts)
target_vectorization.adapt(train_spanish_texts)

In [18]:
train_english_texts[0], train_spanish_texts[0]

("Tom didn't feel well, but he went to work anyway.",
 '[start] Tom no se sentía bien, pero de todos modos fue a trabajar. [end]')

In [19]:
batch_size = 64

def format_dataset(eng, spa):
    eng = source_vectorization(eng)
    spa = target_vectorization(spa)
    return ({
        'english':eng,
        'spanish':spa[:, :-1], # 종료 토큰 제외 (Seq2Seq 모델의 입력)
    }, spa[:, 1:]) # 시작 토큰 제외 (Seq2Seq 모델의 출력 타겟)

def make_dataset(paris):
    eng_texts, spa_texts = zip(*paris)
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts)) # 각각 텐서 플로우 데이터 셋으로 변환
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls = 4)
    return dataset.shuffle(42).prefetch(16).cache() # 데이터 셋 셔플, prefetch(16) : 16개의 배치를 미리 가져오고 캐싱을 수행한다.

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [20]:
for inputs, targets in train_ds.take(1):
    print(f"inputs['english'].shape : {inputs['english'].shape}")
    print(f"inputs['spanish'].shape : {inputs['spanish'].shape}")
    print(f"targets.shape : {targets.shape}")

inputs['english'].shape : (64, 20)
inputs['spanish'].shape : (64, 20)
targets.shape : (64, 20)


2024-06-20 14:21:30.938832: W tensorflow/core/kernels/data/cache_dataset_ops.cc:854] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [21]:
from tensorflow import keras

embed_dim = 256
latent_dim = 1024

source = keras.Input(shape=(None, ), dtype='int64', name='english')
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(source)

encoded_source = layers.Bidirectional(layers.GRU(latent_dim), merge_mode='sum')(x)

In [26]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Define constants (example values, you should set these appropriately)
vocab_size = 15000
sequence_length = 20
embed_dim = 256
latent_dim = 512

# Define the source input
source = keras.Input(shape=(None,), dtype='int64', name='english')

# Define the encoder (this should be previously defined)
# Assuming encoded_source is obtained from an encoder model
# Example encoder definition (should be replaced with your actual encoder model)
encoder_embedding = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim, mask_zero=True)(source)
encoder_gru = layers.GRU(latent_dim, return_state=True)
encoded_source, state_h = encoder_gru(encoder_embedding)

# Define the past target input
past_target = keras.Input(shape=(None,), dtype='int64', name='spanish')
x = layers.Embedding(vocab_size, embed_dim, mask_zero=True)(past_target)

# Define the decoder GRU
decoder_gru = layers.GRU(latent_dim, return_sequences=True, return_state=True)
x, _ = decoder_gru(x, initial_state=state_h)
x = layers.Dropout(0.5)(x)

# Define the output layer
target_next_step = layers.Dense(vocab_size, activation='softmax')(x)

# Define the seq2seq model
seq2seq_rnn = keras.Model([source, past_target], target_next_step)

# Print the model summary
seq2seq_rnn.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 spanish (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 embedding_6 (Embedding)     (None, None, 256)            3840000   ['english[0][0]']             
                                                                                                  
 embedding_7 (Embedding)     (None, None, 256)            3840000   ['spanish[0][0]']             
                                                                                            

In [None]:
seq2seq_rnn.compile(
    optimizer = 'rmsprop',
    loss = 'sparse_catehorical_crossentropy',
    metrics=['accuracy'])

seq2seq_rnn.fit(train_df, epochs=15, validation_data=val_ds)

In [33]:
import numpy as np
import random
from tensorflow.keras.models import load_model  

# 데이터 준비
spa_vocab = target_vectorization.get_vocabulary() # 스페인어 단어 집합
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab)) # 인덱스 단어 매핑
max_decoded_sentence_length = 20 # 최대 길이

# 문장 번역 함수
# 20번동안 문장 해석
def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence]) # 입력된 영어 문장 토큰화
    decoded_sentence = '[start]' # 시작 토큰

    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence]) # 현재까지 생성된 스페인어 문장의 토큰
        next_token_predictions = seq2seq_rnn.predict([tokenized_input_sentence, tokenized_target_sentence])
        # 입력된 영어 문장과 현재까지 생성된 스페인어 문장을 통해 다음 토큰 예측
        sampled_token_index = np.argmax(next_token_predictions[0, i, :])
        # 예측 결과 중 가장 확률이 높은 토큰(인덱스를 처리해준다.)
        sampled_token = spa_index_lookup[sampled_token_index]
        # 선택된 토큰을 실제 스페인어로 변환
        decoded_sentence += ' ' + sampled_token
        # 공백을 붙여 출력해주겠다.
        # 해석 중간에 종료 토큰을 만나면 break
        if sampled_token == '[end]': # 종료 토큰
            break
    
    return decoded_sentence # 번역된 문장 반환

test_eng_texts = [pair[0] for pair in test_pairs]

for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print('-'*45)
    print("Input sentence:", input_sentence)
    print("Decoded sentence:", decode_sequence(input_sentence))

---------------------------------------------
Input sentence: Stop moving!
Decoded sentence: [start] terminamos ¡ah ríes extremo pagando traigas naciste científico reconocería vampiros trago rinda sentaron dándose estupendo jugó abordaron terminarme tiritando usemos
---------------------------------------------
Input sentence: This CD belongs to my son.
Decoded sentence: [start] ¡aléjate prisión revisas almorzando álbum abandonada respondé ipod presidenta benjamín echaras ¡apúrate incertidumbre antoja pesadillas voltios contenía inapropiado podido pegué
---------------------------------------------
Input sentence: Tom is generous and kind.
Decoded sentence: [start] jugaban empeorar atrapar hambre hipoteca aspecto ausente flotar trofeo vístete conocido evitarse tráete nombró importar bar tócalo guárdame enseño abrace
---------------------------------------------
Input sentence: I didn't know Tom couldn't speak French.
Decoded sentence: [start] terminamos vuélvelo drásticamente subió ¡pe

In [58]:
# 트랜스포머를 사용한 Seq2Seq
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# TransformerDecoder 클래스 수정
class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.attention_2 = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential([
            layers.Dense(dense_dim, activation='relu'),
            layers.Dense(embed_dim),
        ])
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self._compute_causal_mask(inputs)
        padding_mask = tf.cast(mask, dtype=tf.float32)[:, tf.newaxis, :] if mask is not None else None
        combined_mask = tf.minimum(padding_mask, causal_mask) if padding_mask is not None else causal_mask

        attn_output_1 = self.attention_1(inputs, inputs, attention_mask=combined_mask)
        out_1 = self.layernorm_1(inputs + attn_output_1)

        attn_output_2 = self.attention_2(out_1, encoder_outputs, attention_mask=padding_mask)
        out_2 = self.layernorm_2(out_1 + attn_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def _compute_causal_mask(self, inputs):
        input_shape = tf.shape(inputs)
        batch_size, seq_length = input_shape[0], input_shape[1]
        i = tf.range(seq_length)[:, tf.newaxis]
        j = tf.range(seq_length)
        mask = i >= j
        mask = tf.cast(mask, dtype=tf.float32)
        mask = tf.reshape(mask, (1, seq_length, seq_length))
        return mask

In [59]:
# 위치 정보를 포함한 위치 인베딩층 작업
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)
        
    def get_config(self):
        config = super().get_config()
        config.update({
            'sequence_length': self.sequence_length,
            'input_dim': self.input_dim,
            'output_dim': self.output_dim,
        })
        return config

In [60]:
# 트랜스포터 인코더 제작
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.attention = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation='relu'),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attn_output = self.attention(inputs, inputs, attention_mask=mask)
        out_1 = self.layernorm_1(inputs + attn_output)
        proj_output = self.dense_proj(out_1)
        return self.layernorm_2(out_1 + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'dense_dim': self.dense_dim,
            'num_heads': self.num_heads,
        })
        return config

In [61]:
embed_dim = 256
dense_dim = 2048
num_heads = 8
sequence_length = 20
vocab_size = 15000

# 입력 정의
encoder_inputs = keras.Input(shape=(None,), dtype='int64', name='english')
decoder_inputs = keras.Input(shape=(None,), dtype='int64', name='spanish')

# 인코더 정의
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)

# 디코더 정의
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
x = layers.Dropout(0.5)(x)

# 출력 레이어 정의
decoder_outputs = layers.Dense(vocab_size, activation='softmax')(x)

# 모델 정의
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# 모델 요약
transformer.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 english (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 spanish (InputLayer)        [(None, None)]               0         []                            
                                                                                                  
 positional_embedding_9 (Po  (None, None, 256)            3845120   ['english[0][0]']             
 sitionalEmbedding)                                                                               
                                                                                                  
 positional_embedding_10 (P  (None, None, 256)            3845120   ['spanish[0][0]']       

In [63]:
transformer.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
transformer.fit(train_ds, epochs=10, validation_data=val_ds)

Epoch 1/10
  80/1302 [>.............................] - ETA: 1:37:47 - loss: 2.8665 - accuracy: 0.6653


KeyboardInterrupt



In [66]:
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow import keras

# vocabulary 설정 및 인덱스 조회
spa_vocab = target_vectorization.get_vocabulary()
spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab))
max_decoded_sentence_length = 20

def decode_sequence(input_sentence):
    tokenized_input_sentence = source_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = target_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = spa_index_lookup[sampled_token_index]
        decoded_sentence += ' ' + sampled_token
        if sampled_token == '[end]':
            break
    return decoded_sentence

# 테스트 쌍에서 영어 텍스트 가져오기
test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(20):
    input_sentence = random.choice(test_eng_texts)
    print('-' * 45)
    print(input_sentence)
    print(decode_sequence(input_sentence))


---------------------------------------------
I never for a moment imagined that I would win.
[start] tom tom no [end]
---------------------------------------------
Tom emptied the water out of the bottle before he refilled it with fresh water.
[start] tom tom no [end]
---------------------------------------------
She's going to sit on the yellow couch.
[start] tom tom no [end]
---------------------------------------------
Come again.
[start] tom tom no [end]
---------------------------------------------
Let's play basketball after school.
[start] tom tom no [end]
---------------------------------------------
Despite all his wealth, he is stingy.
[start] tom tom no [end]
---------------------------------------------
Where do you know each other from?
[start] tom tom no [end]
---------------------------------------------
There was a castle here many years ago.
[start] tom tom no [end]
---------------------------------------------
Where does the airport bus leave from?
[start] tom tom no