<a href="https://colab.research.google.com/github/xhdixhfl/last_project/blob/main/%EC%98%81%EC%96%B4-%ED%94%84%EB%9E%91%EC%8A%A4%EC%96%B4%20%EA%B8%B0%EA%B3%84%20%EB%B2%88%EC%97%AD%20%EB%AA%A8%EB%8D%B8%EB%A7%81.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 데이터 셋 로딩

In [1]:
## 데이터 셋 로딩
# https://www.manythings.org/anki의 영어 - 프랑스 번역 셋 이용
!wget http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip
!unzip -q fra-eng.zip

--2023-01-06 01:24:47--  http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.79.128, 108.177.127.128, 172.217.218.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.79.128|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3423204 (3.3M) [application/zip]
Saving to: ‘fra-eng.zip’


2023-01-06 01:24:48 (4.78 MB/s) - ‘fra-eng.zip’ saved [3423204/3423204]



In [3]:
# 데이터 살펴보기
text = 'fra.txt'
with open(text) as f:
    lines = f.read().split('\n')[:-1]
text_pairs = []
for line in lines: # 라인별 처리
    eng, fra = line.split('\t')
    fra = '[start]' + fra + '[end]'
    text_pairs.append((eng, fra))
    
# 랜덤 문장보기    
import random
print(random.choice(text_pairs))

("I'm extremely happy.", '[start]Je suis extrêmement heureux.[end]')


## 데이터 세트 분리

In [4]:
# 셔플후 세트 분리
random.shuffle(text_pairs)
num_val_sam = int(0.15 * len(text_pairs))
num_train_sam = len(text_pairs) - 2 * num_val_sam
train_pairs = text_pairs[: num_train_sam]
val_pairs = text_pairs[num_train_sam : num_train_sam + num_val_sam]
test_pairs = text_pairs[num_train_sam + num_val_sam : ]

## TV층 (TextVectorization)준비
- 영어층, 프랑스어층
- 문자열 전처리 방식 커스텀 (fra_standard)

In [6]:
# 라이브러리 
import tensorflow as tf
from tensorflow.keras import layers
import string
import re


# 프랑스어 TV층에 적용하기 위해 특수 문자들 삭제
str_chars = string.punctuation 
str_chars = str_chars.replace("[","")
str_chars = str_chars.replace("]", "")
# 문자열 표준화 함수 정의
def fra_standard(input_string):
    lowercase = tf.strings.lower(input_string)
    return tf.strings.regex_replace(
    lowercase, f"[{re.escape(str_chars)}]", "")

# 범위제한을 위한 작업(예시라서 제한함)
vocab_size = 15000
sequence_length = 20

#영어층
source_vec = layers.TextVectorization(
    max_tokens = vocab_size,
    output_mode = 'int',
    output_sequence_length = sequence_length,
)
# french layer
target_vec = layers.TextVectorization(
    max_tokens = vocab_size,
    output_mode = 'int',
    output_sequence_length = sequence_length + 1,
    standardize = fra_standard
)

# 훈련후 어휘 사전 만들기
train_eng_texts = [pair[0] for pair in train_pairs]
train_fra_texts = [pair[1] for pair in train_pairs]
source_vec.adapt(train_eng_texts)
target_vec.adapt(train_fra_texts)

## 번역작업을 위한 데이터셋 준비

In [8]:
batch_size = 64

def format_dataset(eng, fra):
    eng = source_vec(eng)
    fra = target_vec(fra)
    return ({ # 이 dict가 inputs
        'english': eng,
        'french' : fra[:,:-1],
    }, fra[:, 1:]) # fra가 target

def make_dataset(pairs):
    eng_texts, fra_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    fra_texts = list(fra_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, fra_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset, num_parallel_calls = 4)
    return dataset.shuffle(2048).prefetch(16).cache() # 전처리 속도리를 높이기 위한 캐싱

train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [11]:
# 크기 확인
for inputs, targets in train_ds.take(1):
  print(f"inputs['eng'].shape : {inputs['english'].shape}")
  print(f"inputs['fra'].shape : {inputs['french'].shape}")
  print(f"targets.shape: {targets.shape}")

inputs['eng'].shape : (64, 20)
inputs['fra'].shape : (64, 20)
targets.shape: (64, 20)


## RNN 을 사용한 StoS모델
- 시퀀스_투_시퀀스 모델

In [12]:
from tensorflow import keras
from tensorflow.keras import layers

# GRU기반 인코더
embed_dim = 256
latent_dim = 1024
# 영어 소스 문장이 입력됨 (이름 지정시 입력 디셔너리로 모델 훈련 가능(키값))
source = keras.Input(shape=(None,), dtype='int64', name= 'english')
x = layers.Embedding(vocab_size, embed_dim, mask_zero = True)(source) # 마스킹 중요
encoded_source = layers.Bidirectional(
    layers.GRU(latent_dim), merge_mode = 'sum'
)(x) # 양방향 GRU의 마지막 출력

- GRU기반 디코더와 엔드 투 엔드 모델

In [14]:
past_tar = keras.Input(shape=(None,), dtype = 'int64', name = 'french') # 타깃 시퀀스
x = layers.Embedding(vocab_size, embed_dim, mask_zero = True)(past_tar)
decoder_gru = layers.GRU(latent_dim, return_sequences = True)
x = decoder_gru(x, initial_state = encoded_source) # 디코더 GRU의 초기 상태
x = layers.Dropout( 0.5)(x)
target_next_step = layers.Dense(vocab_size, activation = 'softmax')(x) # 다음 토큰 예측
s2s_rnn = keras.Model([source, past_tar], target_next_step)
# 엔투엔 모델은 소스 시퀀스와 타겟 시퀀스를 한 스템 앞의 타깃 시퀀스에 매핑

In [15]:
# rnn 기반 s2s모델 훈련
s2s_rnn.compile(
    optimizer = 'rmsprop',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy']
)
s2s_rnn.fit(train_ds, epochs = 15, validation_data = val_ds)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f169c662d30>

## RNN인코더와 디코더로 새로운 문장 번역

In [17]:
import numpy as np

# 예측된 인덱스를 문자열 ㅇ토큰으로 변환(딕셔너리로)
fra_vocab = target_vec.get_vocabulary()
fra_idx_lookup = dict(zip(range(len(fra_vocab)), fra_vocab))
max_decoded_sentence_length = 20

# 함수정의
def decode_sequence(input_sen):
  tokenized_input_sen = source_vec([input_sen])
  decoded_sen = '[start]'
  for i in range(max_decoded_sentence_length):
    tokenized_target_sen = target_vec([decoded_sen])
    next_token_preds =s2s_rnn.predict(
        [tokenized_input_sen, tokenized_target_sen]
    ) 
    sampled_token_idx = np.argmax(next_token_preds[0, i, :])  # 다음 토큰 샘플링
    sampled_token = fra_idx_lookup[sampled_token_idx] # 예측된 토큰을 문자열로
    decoded_sen += " " + sampled_token # 생성된 문장 추가
    if sampled_token == "[end]":     # 종료 조건 (최대길이 도달 또는 end)
      break
  return decoded_sen

test_eng_texts = [pair[0] for pair in text_pairs]
for _ in range(20):
  input_sen = random.choice(test_eng_texts)
  print("-")
  print(input_sen)
  print(decode_sequence(input_sen))



-
Our fighters averaged 430 missions a day.
[start] [UNK] a une [UNK] de [UNK]  [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
-
I had fun.
[start] me suis [UNK]  [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
-
I ran out of the house.
[start] [UNK] de la maison[end]  [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
-
We all agree with you.
[start] nous sommes tous [UNK]  [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
-
Tom is almost ready to go.
[start] est presque prêt à y aller[end]  [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
-
We all knew it.
[start] le [UNK]  [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK]
-
I never saw Tom again.
[start] na jamais vu tom[end]  [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] 

##  트랜스 포머를 사용한 s2s모델
- 트랜스포머 디코더 클래스 정의

In [26]:
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers

class TransformerDecoder(layers.Layer):
  def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
    super().__init__(**kwargs)
    self.embed_dim = embed_dim # 입력 토큰 벡터의 크기 
    self.dense_dim = dense_dim # 내부 밀집 층의 크기
    self.num_heads = num_heads # 어텐션 해드 개수
    self.attention_1 = layers.MultiHeadAttention(
        num_heads = num_heads, key_dim = embed_dim
    )
    self.attention_2 = layers.MultiHeadAttention(
        num_heads = num_heads, key_dim = embed_dim
    )
    self.dense_proj = keras.Sequential(
        [layers.Dense(dense_dim, activation = 'relu'),
         layers.Dense(embed_dim),]
    )
    self.layernorm_1 = layers.LayerNormalization()
    self.layernorm_2 = layers.LayerNormalization()
    self.layernorm_3 = layers.LayerNormalization()
    self.supports_masking = True

# 모델 저장을 위한 직렬화(직렬형태여애 저장이 가능)
  def get_config(self):
    config = super().get_config()
    config.update({
        'embed_dim' : self.embed_dim,
        'num_heads' : self.num_heads,
        'dense_dim' : self.dense_dim,
    })
    return config

# 코잘마스킹을 생성하는 함수  (코잘 패딩을고려하여 어텐션 층 전달을 위한 행렬을 만듦)
  def get_causal_attention_mask(self, inputs):
    input_shape = tf.shape(inputs)
    batch_size, sequence_length = input_shape[0], input_shape[1]
    i = tf.range(sequence_length)[:, tf.newaxis] 
    j = tf.range(sequence_length)
    mask = tf.cast(i >= j, dtype ='int32') # 절반은 1이고, 나머지는 0인  행렬
    mask = tf.reshape(mask, (1, input_shape[1], input_shape[1]))
    mult = tf.concat(
            [tf.expand_dims(batch_size, -1),
             tf.constant([1, 1], dtype=tf.int32)], axis=0)
    return tf.tile(mask, mult)


# 정방향 패스를 구현하는 완전한 연산함수?
  def call(self, inputs, encoder_outputs, mask = None): 
    causal_mask = self.get_causal_attention_mask(inputs) # 코잘 마스킹 추출
    if mask is not None:
      padding_mask = tf.cast(
          mask[:,tf.newaxis, :], dtype ='int32'
      )
      padding_mask = tf.minimum(padding_mask, causal_mask) # 두 마스킹을 함침
      attention_output_1 = self.attention_1(
          query = inputs,
          value = inputs,
          key = inputs,
          attention_mask = causal_mask
      ) # 코잘 마스킹을 타깃 시퀀스에 대해 셀프 어텐션을 수행하는 첫번째 어텐션 층으로 전달
      attention_output_1 = self.layernorm_1(inputs + attention_output_1) 
      attention_output_2 = self.attention_2(
          query = attention_output_1,
          value = encoder_outputs,
          key = encoder_outputs,
          attention_mask = padding_mask
      ) # 마스킹 소시 시퀀스와 타깃 시퀀스를 연관 시키는 두번째 어텐션 층
      attention_output_2 = self.layernorm_2(
        attention_output_1 + attention_output_2)
      proj_output = self.dense_proj(attention_output_2)
      return self.layernorm_3(attention_output_2 + proj_output) 


## 기계번역을 위한 트랜스포머

In [19]:
# 위치 임베딩 층
class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=input_dim, output_dim=output_dim)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim)
        self.sequence_length = sequence_length
        self.input_dim = input_dim
        self.output_dim = output_dim

    def call(self, inputs):
        length = tf.shape(inputs)[-1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        return tf.math.not_equal(inputs, 0)

    def get_config(self):
        config = super(PositionalEmbedding, self).get_config()
        config.update({
            "output_dim": self.output_dim,
            "sequence_length": self.sequence_length,
            "input_dim": self.input_dim,
        })
        return config

In [20]:
# 엔드투엔드 트랜스포머
class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim)
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation="relu"),
             layers.Dense(embed_dim),]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]
        attention_output = self.attention(
            inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "dense_dim": self.dense_dim,
        })
        return config

In [29]:
embed_dim = 256
dense_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="english")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, dense_dim, num_heads)(x) # 소스 문장 인코딩

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="french")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, dense_dim, num_heads)(x, encoder_outputs)
# 타깃 시퀀스를 인코딩하고 인코딩된 소스 문장과 합침
x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x) # 출력위치의 단어 예측
transformer = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [30]:
# s2s트랜스포머 훈련
transformer.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"])
transformer.fit(train_ds, epochs=30, validation_data=val_ds)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f16867836a0>

# 트랜스포머 모델을 사용한 문장번역(시투시)

In [31]:
import numpy as np

fra_vocab = target_vec.get_vocabulary()
fra_idx_lookup = dict(zip(range(len(fra_vocab)), fra_vocab))
max_decoded_sentence_length = 20

def decode_sen(input_sen):
  tokenized_input_sen = source_vec([input_sen])
  decoded_sen = "[start]"
  for i in range(max_decoded_sentence_length):
    tokenized_target_sen = target_vec(
        [decoded_sen])[:,:-1]
    preds = transformer(
        [tokenized_input_sen, tokenized_target_sen]
    )
    sampled_token_idx = np.argmax(preds[0,i,:]) # 다음 토큰을 샘플링
    sampled_token = fra_idx_lookup[sampled_token_idx]
    decoded_sen += " " + sampled_token
    if sampled_token == '[end]':
      break
  return decoded_sen

test_eng_texts = [pair[0] for pair in test_pairs]
for _ in range(20):
  input_sen = random.choice(test_eng_texts)
  print("-")
  print(input_sen)
  print(decode_sen(input_sen))

-
Are you pleased with your new job?
[start] êtesvous ravi votre nouveau [UNK]  [end]
-
I almost left my umbrella in the train.
[start] presque mon [UNK] dans le train[end]     dans      dans  [end]
-
I just don't want to hurt anyone.
[start] tout simplement pas faire mal à personne[end]       à   à   à
-
I can't tell you the truth.
[start] ne pouvezvous pas te dire la vérité[end]             
-
I have to stay for a while.
[start] devraisje rester un moment[end]             [end]
-
Do I look presentable?
[start] [UNK] [UNK]  [end]
-
I thought that was a great story.
[start] [UNK] que cétait une belle histoire [UNK]             
-
Tom gave this apple to me.
[start] me [UNK] cette carte  pour [end]
-
Don't make me angry.
[start] le [UNK]  en [end]
-
People are complicated.
[start] ont des gens[end]              [end]
-
I'd say you did well.
[start] le fait pas bien[end]          [end]
-
You're very clever.
[start] sommes très [UNK]              [end]
-
I don't even know where.
[start] ne