In [29]:
import re
import unicodedata
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [30]:
def load_preprocessed_data():
    encoder_input, decoder_input, decoder_target = [], [], []

    with open('fra.txt', 'r', encoding='utf-8') as lines:
        for i, line in enumerate(lines):
            src_line, tar_line, _ = line.strip().split('\t')  # source, target 가져오기

            src_line = [w for w in preprocess_sentence(src_line).split()]  # 단어 기준

            tar_line = preprocess_sentence(tar_line)  # 문장 기준
            tar_line_in = [w for w in ('<sos> ' + tar_line).split()]
            tar_line_out = [w for w in (tar_line + ' <eos>').split()]
            encoder_input.append(src_line)
            decoder_input.append(tar_line_in)
            decoder_target.append(tar_line_out)

            if i == 59999:
                break

    return encoder_input, decoder_input, decoder_target

In [31]:
def unicode_to_ascii(s):
    # 프랑스어 악센트 삭제
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

In [32]:
def preprocess_sentence(sent):
    sent = unicode_to_ascii(sent.lower())
    sent = re.sub(r'([?.!,¿])', r' \1', sent) # 구두점이 나오면 공백 추가
    sent = re.sub(r'[^a-zA-Z!.?]+', r' ', sent) # 알파벳, !, ., ? 제외하고는 전부 삭제
    sent = re.sub(r'\s+', ' ', sent) # 공백이 여러개 나오면 하나의 공백으로 변환
    return sent.strip()

In [33]:
en_sent = u"Have you had dinner?"
fr_sent = u"Avez-vous déjà diné?"

print('전처리 전 영어 문장 : ', en_sent)
print('전처리 후 영어 문장 : ', preprocess_sentence(en_sent))
print('전처리 전 불어 문장 : ', fr_sent)
print('전처리 후 불어 문장 : ', preprocess_sentence(fr_sent))

전처리 전 영어 문장 :  Have you had dinner?
전처리 후 영어 문장 :  have you had dinner ?
전처리 전 불어 문장 :  Avez-vous déjà diné?
전처리 후 불어 문장 :  avez vous deja dine ?


In [34]:
en_sent = u"Have you had dinner?"
fr_sent = u"Avez-vous déjà diné?"

print('전처리 전 영어 문장 : ', en_sent)
print('전처리 후 영어 문장 : ', preprocess_sentence(en_sent))
print('전처리 전 불어 문장 : ', fr_sent)
print('전처리 후 불어 문장 : ', preprocess_sentence(fr_sent))

전처리 전 영어 문장 :  Have you had dinner?
전처리 후 영어 문장 :  have you had dinner ?
전처리 전 불어 문장 :  Avez-vous déjà diné?
전처리 후 불어 문장 :  avez vous deja dine ?


In [35]:
sents_en_in, sents_fra_in, sents_fra_out = load_preprocessed_data()

In [36]:
print('인코더의 입력 : ', sents_en_in[:5])
print('디코더의 입력 : ', sents_fra_in[:5])
print('디코더의 레이블 : ', sents_fra_out[:5])

인코더의 입력 :  [['go', '.'], ['go', '.'], ['go', '.'], ['go', '.'], ['hi', '.']]
디코더의 입력 :  [['<sos>', 'va', '!'], ['<sos>', 'marche', '.'], ['<sos>', 'en', 'route', '!'], ['<sos>', 'bouge', '!'], ['<sos>', 'salut', '!']]
디코더의 레이블 :  [['va', '!', '<eos>'], ['marche', '.', '<eos>'], ['en', 'route', '!', '<eos>'], ['bouge', '!', '<eos>'], ['salut', '!', '<eos>']]


In [37]:
tokenizer_en = Tokenizer(filters='', lower=False)
tokenizer_en.fit_on_texts(sents_en_in)

encoder_input = tokenizer_en.texts_to_sequences(sents_en_in)
encoder_input = pad_sequences(encoder_input, padding='post')

tokenizer_fra = Tokenizer(filters='', lower=False)
tokenizer_fra.fit_on_texts(sents_fra_in)
tokenizer_fra.fit_on_texts(sents_fra_out)

decoder_input = tokenizer_fra.texts_to_sequences(sents_fra_in)
decoder_input = pad_sequences(decoder_input, padding='post')

decoder_target = tokenizer_fra.texts_to_sequences(sents_fra_out)
decoder_target = pad_sequences(decoder_target, padding='post')

In [38]:
print(encoder_input.shape, decoder_input.shape, decoder_target.shape)

(60000, 8) (60000, 17) (60000, 17)


In [39]:
src_to_index = tokenizer_en.word_index
index_to_src = tokenizer_en.index_word
tar_to_index = tokenizer_fra.word_index
index_to_tar = tokenizer_fra.index_word

In [40]:
max_src_len = encoder_input.shape[1]
max_tar_len = decoder_input.shape[1]

print(max_src_len, max_tar_len)

8 17


In [41]:
src_vocab_size = len(tokenizer_en.word_index)+1
tar_vocab_size = len(tokenizer_fra.word_index)+1

print(src_vocab_size, tar_vocab_size)

6443 10934


In [42]:
encoder_input[:5]

array([[ 32,   1,   0,   0,   0,   0,   0,   0],
       [ 32,   1,   0,   0,   0,   0,   0,   0],
       [ 32,   1,   0,   0,   0,   0,   0,   0],
       [ 32,   1,   0,   0,   0,   0,   0,   0],
       [984,   1,   0,   0,   0,   0,   0,   0]], dtype=int32)

In [43]:
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print(indices)

[44980 15862 35909 ... 31098 38134 43181]


In [44]:
encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]

In [45]:
encoder_input[:5]

array([[   6,  171,   10, 1964,    1,    0,    0,    0],
       [   2,   78,  222,    1,    0,    0,    0,    0],
       [4218,   69, 1023,    1,    0,    0,    0,    0],
       [   3,   12,  190,    1,    0,    0,    0,    0],
       [  43,   19,  264,   26,  881,    1,    0,    0]], dtype=int32)

In [46]:
n_of_val = 6000

encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

In [47]:
print(encoder_input_train.shape, decoder_input_train.shape, decoder_target_train.shape)
print(encoder_input_test.shape, decoder_input_test.shape, decoder_target_test.shape)

(54000, 8) (54000, 17) (54000, 17)
(6000, 8) (6000, 17) (6000, 17)


In [48]:
# 모델 설계
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Masking
from tensorflow.keras.models import Model

In [49]:
embedding_dim = 128
hidden_units = 64

In [50]:
# 인코더
encoder_inputs = Input(shape=(None, ))
enc_emb = Embedding(src_vocab_size, embedding_dim)(encoder_inputs) # 임베딩 층
enc_masking = Masking(mask_value=0.0)(enc_emb) # padding 0을 연산에서 제외
encoder_lstm = LSTM(hidden_units, return_state=True) 
encoder_outputs, state_h, state_c = encoder_lstm(enc_masking)
encoder_states = [state_h, state_c]

In [51]:
decoder_inputs = Input(shape=(None, ))
dec_emb_layer = Embedding(tar_vocab_size, hidden_units)
dec_emb = dec_emb_layer(decoder_inputs)
dec_masking = Masking(mask_value=0.0)(dec_emb)

decoder_lstm = LSTM(hidden_units, return_sequences=True, return_state=True)

# 인코더의 상태를 디코더의 초기 상태로 사용
decoder_outputs, _, _ = decoder_lstm(dec_masking, initial_state=encoder_states)

# 모든 시점에 대한 결과를 소프트 맥스 함수로 단어 예측
decoder_dense = Dense(tar_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [52]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [53]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['acc'])

In [54]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, None, 128)    824704      ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, None, 64)     699776      ['input_4[0][0]']                
                                                                                            

In [55]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)

In [56]:
model.fit(x=[encoder_input_train, decoder_input_train], y=decoder_target_train, batch_size=128, epochs=50, 
          validation_data=([encoder_input_test, decoder_input_test], decoder_target_test) , callbacks=[es])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 43: early stopping


<keras.callbacks.History at 0x16109b130>

In [80]:
encoder_model = Model(encoder_inputs, encoder_states)

In [58]:
decoder_state_input_h = Input(shape=(hidden_units,))
decoder_state_input_c = Input(shape=(hidden_units,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state=decoder_states_inputs)
decoder_state2 = [state_h2, state_c2]

decoder_outputs2 = decoder_dense(decoder_outputs2)

2024-07-15 16:38:50.013309: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-15 16:38:50.013945: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-15 16:38:50.014799: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [60]:
decoder_model = Model([decoder_inputs] + decoder_states_inputs, [decoder_outputs2] + decoder_state2)

In [61]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq) # 인코더 모델이기 때문에 인코더 상태를 반환받는다

    # <SOS>에 해당하는 정수 생성(문장의 시작)
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tar_to_index['<sos>']

    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value) # 이전 시점의 상태를 현 시점의 초기 상태

        # 가장 높은 확률의 단어 찾기
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = index_to_tar[sampled_token_index]

        decoded_sentence += ' ' + sampled_char

        if (sampled_char == '<eos>' or len(decoded_sentence) > 76):
            stop_condition = True

        # 현 시점의 예측을 다음 시점의 입력으로 사용
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index

        # 현 시점의 상태를 다음 시점의 초기 상태로 사용
        states_value = [h, c]

    return decoded_sentence

In [62]:
def seq_to_src(input_seq): # 정수 데이터를 문자 데이터로
    sentence = ''
    for encoded_word in input_seq:
        if(encoded_word!=0):
            sentence += index_to_src[encoded_word] + ' '
    return sentence

In [63]:
def seq_to_tar(input_seq):
    sentence = ''
    for encoded_word in input_seq:
        if(encoded_word != 0 and encoded_word != tar_to_index['<sos>'] and encoded_word != tar_to_index['<eos>']):
            sentence += index_to_tar[encoded_word] + ' '

    return sentence

In [66]:
for seq_index in [1,2,3,4,5,6,7,8]:
    input_seq = encoder_input_train[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)

    print('입력 문장 : ', seq_to_src(encoder_input_train[seq_index])
    print('정답 문장 : ', seq_to_tar(decoder_input_train[seq_index])
    print('번역 문장 : ', decoded_sentence[1:-5])

SyntaxError: invalid syntax (4151176579.py, line 6)

In [65]:
for seq_index in [1,2,3,4,5,6,7,8]:
    input_seq = encoder_input_test[seq_index: seq_index+1]
    decoded_sentence = decode_sequence(input_seq)

    print('입력 문장 : ', seq_to_src(encoder_input_test[seq_index])
    print('정답 문장 : ', seq_to_tar(decoder_input_test[seq_index])
    print('번역 문장 : ', decoded_sentence[1:-5])

SyntaxError: invalid syntax (2990389357.py, line 6)

In [67]:
import numpy as np
from collections import Counter
from nltk import ngrams

In [68]:
def simple_count(tokens, n): # 단순히 n-gram을 세주는 코드
    return Counter(ngrams(tokens, n))

In [69]:
candidate = 'It is a guide to action which ensures that the military always obeys the commands of the party.'
tokens = candidate.split() # split 처리
result = simple_count(tokens, 1) # split 처리 count

print(result)

Counter({('the',): 3, ('It',): 1, ('is',): 1, ('a',): 1, ('guide',): 1, ('to',): 1, ('action',): 1, ('which',): 1, ('ensures',): 1, ('that',): 1, ('military',): 1, ('always',): 1, ('obeys',): 1, ('commands',): 1, ('of',): 1, ('party.',): 1})


In [70]:
candidate = 'the the the the the the the the'
tokens = candidate.split()
result = simple_count(tokens, 1)

print(result)

Counter({('the',): 8})


In [71]:
# count clip
def count_clip(candidate, reference_list, n): # n-gram의 등장횟수를 보정(clipping)하여 정확한 평가를 할 수 있게 돕는다.
    ca_cnt = simple_count(candidate, n) # condidate : 번역된 문장, reference_list : 참조 문장들, n : n-gram
    # condidate에서 n-gram
    max_ref_cnt_dict = dict()

    for ref in reference_list:
        # 각 reference 문장에서 n-gram
        ref_cnt = simple_count(ref, n)

        # 각 reference 문장에서 n-gram의 최대 횟수를 계산
        for n_gram in ref_cnt:
            if n_gram in max_ref_cnt_dict:
                max_ref_cnt_dict[n_gram] = max(ref_cnt[n_gram], max_ref_cnt_dict[n_gram])
            else:
                max_ref_cnt_dict[n_gram] = ref_cnt[n_gram]

    return {n_gram : min(ca_cnt.get(n_gram, 0), max_ref_cnt_dict.get(n_gram, 0)) for n_gram in ca_cnt} # 참조문장의 횟수를 초과하지 않는다.

In [72]:
references = ['the cat is on the mat', 'there is a cat on the mat']

result = count_clip(candidate.split(), list(map(lambda ref : ref.split(), references)), 1)
print(result)

{('the',): 2}


In [73]:
def modified_precision(candidate, reference_list, n):
    # 분지
    clip_cnt = count_clip(candidate, reference_list, n)
    total_clip_cnt = sum(clip_cnt.values())

    # 분모
    cnt = simple_count(candidate, n)
    total_cnt = sum(cnt.values())

    if total_cnt == 0:
        total_cnt = 1

    return total_clip_cnt / total_cnt # count_clip의 합 / 단순 count 합  = 보정된 정밀도

In [74]:
result = modified_precision(candidate.split(), list(map(lambda ref : ref.split(), references)), n=1)
print(result)

0.25


In [75]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

reference = [['this', 'is', 'a', 'small', 'test']]
candidate = ['this', 'is', 'a', 'test']

smoothie = SmoothingFunction().method4
bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothie)

print(bleu_score)

0.3610854011224141


In [76]:
def closest_ref_length(candidate, reference_list):
    ca_len = len(candidate) # 번역된 문장 길이
    ref_lens = len(ref) for ref in reference_list) # 참조 문장들의 길이

        closest_ref_len = min(ref_lens, key=lambda ref_len : (abs(ref_len - ca_len), ref_len))
        return closest_ref_len

SyntaxError: invalid syntax (1661692658.py, line 3)

In [77]:
def brevity_penalty(candidate, reference_list):
    ca_len = len(candidate)
    ref_len = closest_ref_length(candidate, reference_list)

    if ca_len > ref_len:
        return 1

    elif ca_len == 0:
        return 0

    else:
        return np.exp(1-ref_len/ca_len)

In [78]:
def bleu_score(candidate, reference_list, weights=[0.25, 0.25, 0.25, 0.25]):
    bp = brevity_penalty(candidate, reference_list)

    p_n = [modified_precision(candidate, reference_list, n=n) for n, _ in enumerate(weights, start=1)] # 정밀도 계산
    score = np.sum([w_i * np.log(p_i) if p_i != 0 else 0 for w_i, p_i in zip(weights, p_n)])

    return bp*np.exp(score)

# 4. 점수 합산 : n-gram의 정밀도를 기하평균으로 계산한다.
# -> BLEU = BP * exp(∑ w(n) * log(p(n)))
# -> w(n)은 각 n-gram의 가중치

In [79]:
import nltk.translate.bleu_score as bleu

candidate = 'It is a guide to action which ensures that the military always obeys the commands of the party.'
references = [
    'It is a guide to action that ensures that the military will forever heed Party commands',
    'It is the guiding principle which guarantees the military forces always being under the command of the Party',
    'It is the practical guide for the army always to heed the directions of the party'
]

print('실습 코드 BLEU : ', bleu_score(candidate.split(), list(map(lambda ref : ref.split(), references))))
print('NLTK의 BLEU : ', bleu.sentence_bleu(list(map(lambda ref : ref.split(), references)), candidate,split()))

NameError: name 'closest_ref_length' is not defined