## 자연어 처리 Exp 4
## 201800839 김수연

In [1]:
import numpy as np
import pandas as pd
import re
import shutil
import os
import unicodedata
import urllib3
import zipfile
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
http = urllib3.PoolManager()
url = 'http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)
with http.request('GET', url, preload_content = False) as r, open(zipfilename, 'wb') as out_file:
    shutil.copyfileobj(r, out_file)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)

In [3]:
num_samples = 33000

In [4]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

In [5]:
def preprocess_sentence(sent):
    sent = unicode_to_ascii(sent.lower())
    
    sent = re.sub(r"([?.!,¿])", r" \1", sent)
    
    sent = re.sub(r"[^a-zA-Z!.?]+", r" ", sent)
    
    sent = re.sub(r"\s+", " ", sent)
    
    return sent

In [6]:
en_sent = u"Have you had dinner?"
fr_sent = u"Avez-vous déjà diné?"
print(preprocess_sentence(en_sent))
print(preprocess_sentence(fr_sent).encode('utf-8'))

have you had dinner ?
b'avez vous deja dine ?'


In [7]:
def load_preprocessed_data():
    encoder_input, decoder_input, decoder_target = [], [], []
    
    with open("fra.txt", "r", encoding = 'utf8') as lines:
        for i, line in enumerate(lines):
            src_line, tar_line, _ = line.strip().split('\t')
            
            src_line_input = [w for w in preprocess_sentence(src_line).split()]
            
            tar_line = preprocess_sentence(tar_line)
            tar_line_input = [w for w in ("<sos> " + tar_line).split()]
            tar_line_target = [w for w in (tar_line + " <eos>").split()]
            
            encoder_input.append(src_line_input)
            decoder_input.append(tar_line_input)
            decoder_target.append(tar_line_target)
            
            if i == num_samples - 1:
                break
                
    return encoder_input, decoder_input, decoder_target

In [8]:
sents_en_in, sents_fra_in, sents_fra_out = load_preprocessed_data()
print(sents_en_in[:5])
print(sents_fra_in[:5])
print(sents_fra_out[:5])

[['go', '.'], ['hi', '.'], ['hi', '.'], ['run', '!'], ['run', '!']]
[['<sos>', 'va', '!'], ['<sos>', 'salut', '!'], ['<sos>', 'salut', '.'], ['<sos>', 'cours', '!'], ['<sos>', 'courez', '!']]
[['va', '!', '<eos>'], ['salut', '!', '<eos>'], ['salut', '.', '<eos>'], ['cours', '!', '<eos>'], ['courez', '!', '<eos>']]


In [9]:
tokenizer_en = Tokenizer(filters = "", lower = False)
tokenizer_en.fit_on_texts(sents_en_in)
encoder_input = tokenizer_en.texts_to_sequences(sents_en_in)

tokenizer_fra = Tokenizer(filters = "", lower = False)
tokenizer_fra.fit_on_texts(sents_fra_in)
tokenizer_fra.fit_on_texts(sents_fra_out)
decoder_input = tokenizer_fra.texts_to_sequences(sents_fra_in)
decoder_target = tokenizer_fra.texts_to_sequences(sents_fra_out)

In [10]:
encoder_input = pad_sequences(encoder_input, padding = "post")
decoder_input = pad_sequences(decoder_input, padding = "post")
decoder_target = pad_sequences(decoder_target, padding = "post")

In [11]:
print(encoder_input.shape)
print(decoder_input.shape)
print(decoder_target.shape)

(33000, 8)
(33000, 16)
(33000, 16)


In [12]:
src_vocab_size = len(tokenizer_en.word_index) + 1
tar_vocab_size = len(tokenizer_fra.word_index) + 1
print("영어 단어 집합의 크기 : {:d}, 프랑스어 단어 집합의 크기 : {:d}".format(src_vocab_size, tar_vocab_size))


영어 단어 집합의 크기 : 4678, 프랑스어 단어 집합의 크기 : 8032


In [13]:
src_to_index = tokenizer_en.word_index
index_to_src = tokenizer_en.index_word

tar_to_index = tokenizer_fra.word_index
index_to_tar = tokenizer_fra.index_word

In [14]:
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print(indices)

[17653 24948 26770 ...  5351 31728 13102]


In [15]:
encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]

In [16]:
encoder_input[839]

array([ 20, 117,  11, 165,   1,   0,   0,   0])

In [17]:
decoder_input[839]

array([  2,  25,  26,   5,  10, 314,  12,   0,   0,   0,   0,   0,   0,
         0,   0,   0])

In [18]:
decoder_target[839]

array([ 25,  26,   5,  10, 314,  12,   3,   0,   0,   0,   0,   0,   0,
         0,   0,   0])

In [19]:
n_of_val = int(33000*0.1)
print(n_of_val)

3300


In [20]:
encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

In [21]:
print(encoder_input_train.shape)
print(decoder_input_train.shape)
print(decoder_target_train.shape)
print(encoder_input_test.shape)
print(decoder_input_test.shape)
print(decoder_target_test.shape)

(29700, 8)
(29700, 16)
(29700, 16)
(3300, 8)
(3300, 16)
(3300, 16)


In [22]:
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Masking
from tensorflow.keras.models import Model

In [23]:
latent_dim = 50

In [24]:
# 인코더
encoder_inputs = Input(shape = (None, ))
enc_emb = Embedding(src_vocab_size, latent_dim)(encoder_inputs)
enc_masking = Masking(mask_value = 0.0)(enc_emb)
encoder_lstm = LSTM(latent_dim, return_state = True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_masking)
encoder_states = [state_h, state_c]

In [25]:
# 디코더
decoder_inputs = Input(shape = (None, ))
dec_emb_layer = Embedding(tar_vocab_size, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)
dec_masking = Masking(mask_value = 0.0)(dec_emb)

decoder_lstm = LSTM(latent_dim, return_sequences = True, return_state = True)

decoder_outputs, _, _ = decoder_lstm(dec_masking, initial_state = encoder_states)

decoder_dense = Dense(tar_vocab_size, activation = 'softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [26]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [27]:
model.compile(optimizer = 'rmsprop',
              loss = 'sparse_categorical_crossentropy',
              metrics = ['acc'])

In [28]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 50)     233900      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 50)     401600      input_2[0][0]                    
_______________________________________________________________________________________

In [29]:
model.fit(x = [encoder_input_train, decoder_input_train],
          y = decoder_target_train,
          validation_data = ([encoder_input_test, decoder_input_test], decoder_target_test),
          batch_size = 128, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x23aaec7af08>

In [30]:
encoder_model = Model(encoder_inputs, encoder_states)

In [31]:
decoder_state_input_h = Input(shape = (latent_dim, ))
decoder_state_input_c = Input(shape = (latent_dim, ))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

dec_emb2 = dec_emb_layer(decoder_inputs)

decoder_outputs2, state_h2, state_c2 = decoder_lstm(dec_emb2, initial_state = decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]

decoder_outputs2 = decoder_dense(decoder_outputs2)

In [32]:
decoder_model = Model([decoder_inputs] + decoder_states_inputs,
                      [decoder_outputs2] + decoder_states2)

In [33]:
def decode_sequence(input_seq):
    states_value = encoder_model.predict(input_seq)
    
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = tar_to_index['<sos>']
    
    stop_condition = False
    decoded_sentence = ''
    
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)
        
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = index_to_tar[sampled_token_index]
        
        decoded_sentence += ' ' + sampled_char
        
        if (sampled_char == '<eos>' or len(decoded_sentence) > 50):
            stop_condition = True
        
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        
        states_value = [h, c]
        
    return decoded_sentence

In [34]:
def seq2src(input_seq):
    temp = ''
    for i in input_seq:
        if(i != 0):
            temp = temp + index_to_src[i] + ' '
    return temp

def seq2tar(input_seq):
    temp = ''
    for i in input_seq:
        if((i != 0 and i != tar_to_index['<sos>']) and i != tar_to_index['<eos>']):
            temp = temp + index_to_tar[i] + ' '
    return temp


In [35]:
# 201800839
# [39, 839, 239, 139, 1039, 1239, 1839, 2039, 2139, 2839]
for seq_index in [39, 839, 239, 139, 1039, 1239, 1839, 2039, 2139, 2839]:
    input_seq = encoder_input_train[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    
    print("원문 : ", seq2src(encoder_input_train[seq_index]))
    print("번역문 : ", seq2tar(decoder_input_train[seq_index]))
    print("예측문 : ", decoded_sentence[:-5])
    print("\n")

원문 :  what a nice family ! 
번역문 :  quelle chouette famille ! 
예측문 :   quelle etait voiture ! 


원문 :  this isn t funny . 
번역문 :  ce n est pas drole ! 
예측문 :   ce n est pas drole . 


원문 :  i like what i do . 
번역문 :  j apprecie ce que je fais . 
예측문 :   je suis ce que je aime pas les ? 


원문 :  we saved your life . 
번역문 :  nous t avons sauve la vie . 
예측문 :   nous nous avons tous vu seul . 


원문 :  hop in . 
번역문 :  montez . 
예측문 :   prends la lumiere . 


원문 :  how exciting ! 
번역문 :  comme c est excitant ! 
예측문 :   c est beau ! 


원문 :  that s what i heard . 
번역문 :  c est ce que j ai entendu . 
예측문 :   c est ce que je l ai vu . 


원문 :  she has no fear . 
번역문 :  elle n a pas peur . 
예측문 :   elle n a pas de reponse . 


원문 :  are you relaxed ? 
번역문 :  es tu detendu ? 
예측문 :   etes vous deux ? 


원문 :  stop gawking . 
번역문 :  arrete de bayer aux corneilles . 
예측문 :   arrete de nouveau les yeux . 




In [36]:
# 201800839
# [39, 839, 239, 139, 1039, 1239, 1839, 2039, 2139, 2839]
for seq_index in [39, 839, 239, 139, 1039, 1239, 1839, 2039, 2139, 2839]:
    input_seq = encoder_input_test[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    
    print("원문 : ", seq2src(encoder_input_test[seq_index]))
    print("번역문 : ", seq2tar(decoder_input_test[seq_index]))
    print("예측문 : ", decoded_sentence[:-5])
    print("\n")

원문 :  i m not strong . 
번역문 :  je ne suis pas fort . 
예측문 :   je ne suis pas en train de partir . 


원문 :  i was worried . 
번역문 :  j etais inquiete . 
예측문 :   j etais en train de etait une faute . 


원문 :  everyone escaped . 
번역문 :  tout le monde s est echappe . 
예측문 :   tout le monde s est passe un fait . 


원문 :  is it that obvious ? 
번역문 :  est ce si evident ? 
예측문 :   est ce que c est libre ? 


원문 :  tom needs help now . 
번역문 :  tom a besoin d aide en ce moment . 
예측문 :   tom a besoin de la voiture . 


원문 :  back off . 
번역문 :  retirez vous . 
예측문 :   maintenant ! 


원문 :  i never wanted that . 
번역문 :  je n ai jamais voulu ca . 
예측문 :   je ne l ai jamais vu . 


원문 :  that s a surprise . 
번역문 :  c est une surprise . 
예측문 :   c est une bonne . 


원문 :  i barely knew tom . 
번역문 :  je connaissais a peine tom . 
예측문 :   je devrais etre qu elle a l exterieur . 


원문 :  we were terrified . 
번역문 :  nous etions terrifiees . 
예측문 :   nous etions en train de manger . 


