In [1]:
import io
import json
import numpy as np
import pandas as pd
import random
import re
import tensorflow as tf

import unicodedata
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [2]:
# Load the English (source) sentences, one sentence per line.
# The file is decoded explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('./dataset/en_sents.txt', encoding='utf-8') as file:
  train_input = [line.rstrip() for line in file]

In [3]:
# Load the Vietnamese (target) sentences, one sentence per line.
# The text contains non-ASCII characters, so decode explicitly as UTF-8 instead
# of relying on the platform's default locale encoding.
with open('./dataset/vi_sents.txt', encoding='utf-8') as file:
  train_target = [line.rstrip() for line in file]

In [4]:
# Preview the first three sentences of each corpus together with its size.
for sentences in (train_input, train_target):
  print(sentences[:3], "Length: ", len(sentences))

['Please put the dustpan in the broom closet', 'Be quiet for a moment.', 'Read this'] Length:  254090
['xin vui lòng đặt người quét rác trong tủ chổi', 'im lặng một lát', 'đọc này'] Length:  254090


In [5]:
# Restore the trained translation model saved at ./models/eng_vie_no_attention.
model = tf.keras.models.load_model(filepath='./models/eng_vie_no_attention')

2024-05-09 00:46:52.870462: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 42 outputs. Output shapes may be inaccurate.
2024-05-09 00:46:52.877105: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 42 outputs. Output shapes may be inaccurate.
2024-05-09 00:46:52.920095: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 42 outputs. Output shapes may be inaccurate.
2024-05-09 00:46:53.146672: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 42 outputs. Output shapes may be inaccurate.
2024-05-09 00:46:53.155230: W tensorflow/core/common_runtime/graph_constructor.cc:840] Node 'cond/while' has 13 outputs but the _ou

In [6]:
# Print the layer-by-layer summary of the loaded model; the layer names shown
# here are the ones looked up with model.get_layer(...) further down.
model.summary()

Model: "eng_vi_seq2seq_nmt_no_attention"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 encoder_inputs (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 decoder_inputs (InputLayer  [(None, None)]               0         []                            
 )                                                                                                
                                                                                                  
 encoder_embeddings (Embedd  (None, None, 128)            2664192   ['encoder_inputs[0][0]']      
 ing)                                                               

In [7]:
# Layer sizes. hidden_dim fixes the shape of the decoder state inputs that are
# created when the inference decoder is assembled later in this notebook.
embedding_dim, hidden_dim = 128, 256

# Remaining hyperparameters; none of the inference cells visible in this
# notebook section read them.
default_dropout = 0.2
batch_size, epochs = 32, 30

In [8]:
# Rebuild the source and target tokenizers from their saved JSON files.
# The files are read as UTF-8 explicitly: the vocabularies may contain
# non-ASCII (Vietnamese) tokens, and the platform default encoding is not
# guaranteed to decode them correctly.
with open('./models/tokenizer/source_tokenizer.json', encoding='utf-8') as f:
    data = json.load(f)
    source_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

with open('./models/tokenizer/target_tokenizer.json', encoding='utf-8') as f:
    data = json.load(f)
    target_tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(data)

In [9]:
# Build a stand-alone inference encoder by reusing layers of the loaded model.
# Layers are looked up by name, so these names must match the ones listed by
# model.summary().
encoder_inputs = model.get_layer('encoder_inputs').input

# Re-apply the trained embedding layer to the encoder input tensor.
encoder_embedding_layer = model.get_layer('encoder_embeddings')
encoder_embeddings = encoder_embedding_layer(encoder_inputs)

encoder_lstm = model.get_layer('encoder_lstm')

# The LSTM call yields three values here; the first one is discarded and only
# the two state tensors are kept (presumably the final hidden and cell states
# of an LSTM built with return_state=True -- confirm against the training code).
_, encoder_state_h, encoder_state_c = encoder_lstm(encoder_embeddings)

encoder_states = [encoder_state_h, encoder_state_c]

# Our stand-alone encoder model. encoder_inputs is the input to the encoder,
# and encoder_states is the expected output.
encoder_model_no_attention = tf.keras.Model(encoder_inputs, encoder_states)

In [10]:
# Build a stand-alone inference decoder by reusing layers of the loaded model.
# Layers are looked up by name, so these names must match the ones listed by
# model.summary().
decoder_inputs = model.get_layer('decoder_inputs').input

# Re-apply the trained embedding layer to the decoder input tensor.
decoder_embedding_layer = model.get_layer('decoder_embeddings')
decoder_embeddings = decoder_embedding_layer(decoder_inputs)

# Inputs to represent the decoder's LSTM hidden and cell states. We'll populate
# these manually using the encoder's output for the initial state.
# Their width is hidden_dim, which therefore has to match the size of the
# states produced by the encoder model.
decoder_input_state_h = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_h')
decoder_input_state_c = tf.keras.Input(shape=(hidden_dim,), name='decoder_input_state_c')
decoder_input_states = [decoder_input_state_h, decoder_input_state_c]

decoder_lstm = model.get_layer('decoder_lstm')

# Run the LSTM from the externally supplied states; it yields the sequence
# outputs plus two state tensors.
decoder_sequence_outputs, decoder_output_state_h, decoder_output_state_c = decoder_lstm(
    decoder_embeddings, initial_state=decoder_input_states
)

# Returned states; the translation loop feeds them back in as the input states
# of the next decoding step.
decoder_output_states = [decoder_output_state_h, decoder_output_state_c]

# Project the LSTM outputs with the trained dense layer (named y_proba, so
# presumably per-token probabilities over the target vocabulary -- confirm).
decoder_dense = model.get_layer('decoder_dense')
y_proba = decoder_dense(decoder_sequence_outputs)

# Inference decoder: inputs are [token ids, state_h, state_c] and outputs are
# [y_proba, new state_h, new state_c].
decoder_model_no_attention = tf.keras.Model(
    [decoder_inputs] + decoder_input_states,
    [y_proba] + decoder_output_states
)

In [11]:
def translate_without_attention(sentence: str,
                                source_tokenizer, encoder,
                                target_tokenizer, decoder,
                                max_translated_len = 30):
  """Greedily translate one sentence with an encoder/decoder pair.

  The sentence is converted to token ids, passed through `encoder` to get the
  initial decoder states, and then decoded one token at a time by always
  picking the highest-scoring token, starting from '<sos>'.

  Args:
    sentence: Source sentence (already preprocessed by the caller).
    source_tokenizer: Object providing `texts_to_sequences` and
      `sequences_to_texts` for the source language.
    encoder: Object whose `predict(ids)` returns the two initial decoder states.
    target_tokenizer: Object providing `word_index` (must contain '<sos>') and
      `index_word` for the target language.
    decoder: Object whose `predict([token, state_h, state_c])` returns
      `(scores, state_h, state_c)`.
    max_translated_len: Upper bound on the number of generated words.

  Returns:
    A `(tokenized_sentence, translation)` tuple of strings. The first element
    is the source sentence as the tokenizer saw it, which makes dropped or
    unknown tokens visible. Both are empty when no source token is known.

  Raises:
    KeyError: If '<sos>' is missing from `target_tokenizer.word_index`.
  """
  # Vectorize the source sentence.
  input_seq = source_tokenizer.texts_to_sequences([sentence])

  # Get the tokenized sentence to see if there are any unknown tokens.
  tokenized_sentence = source_tokenizer.sequences_to_texts(input_seq)

  # With no known source token there is nothing to encode; running the encoder
  # on a zero-length sequence would fail or produce meaningless states.
  if not input_seq[0]:
    return tokenized_sentence[0], ''

  # Pass an explicit array (batch of one) and normalize the result to a list so
  # it can be concatenated with the decoder token input below.
  states = list(encoder.predict(np.asarray(input_seq)))

  current_word = '<sos>'
  decoded_sentence = []

  while len(decoded_sentence) < max_translated_len:

    # Set the next input word for the decoder.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_tokenizer.word_index[current_word]

    # Determine the next word (greedy: highest score at the last time step).
    target_y_proba, h, c = decoder.predict([target_seq] + states)
    target_token_index = int(np.argmax(target_y_proba[0, -1, :]))

    # Keras tokenizers never assign index 0 to a word (it is reserved), so the
    # model can predict an index that has no entry in index_word. Treat that
    # like end-of-sentence instead of raising KeyError.
    current_word = target_tokenizer.index_word.get(target_token_index)
    if current_word is None or current_word == '<eos>':
      break

    decoded_sentence.append(current_word)
    states = [h, c]

  return tokenized_sentence[0], ' '.join(decoded_sentence)


In [12]:
def preprocess_sentence(s):
    """Normalize a raw sentence before it is handed to the tokenizer.

    Steps, in order: surround each of ? . ! , ¿ with spaces, collapse every run
    of spaces and double-quote characters into a single space, trim both ends,
    and convert to lowercase. No Unicode normalization is applied.
    """
    # Detach punctuation so it becomes a token of its own.
    spaced = re.sub(r"([?.!,¿])", r" \1 ", s)
    # The character class contains both the space and the double quote, so
    # quotes are squeezed away together with repeated spaces.
    collapsed = re.sub(r'[" "]+', " ", spaced)
    # Trim first, then convert to lowercase.
    return collapsed.strip().lower()


In [13]:
def translate_sentences(test_input, translation_func, source_tokenizer, encoder,
                        target_tokenizer, decoder):
    """Preprocess one sentence, translate it, and print the outcome.

    `test_input` is cleaned with `preprocess_sentence` and then passed to
    `translation_func` together with the tokenizers and the encoder/decoder.
    `translation_func` must return a `(tokenized_sentence, translation)` pair;
    the two values are printed on separate lines. Nothing is returned.
    """
    cleaned = preprocess_sentence(test_input)
    result = translation_func(cleaned, source_tokenizer, encoder,
                              target_tokenizer, decoder)
    tokenized_sentence, translated = result
    # Source as seen by the tokenizer first, then the translation.
    for line in (tokenized_sentence, translated):
        print(line)
 

In [19]:
# Sample sentences kept for experimentation; the call below does not use them.
test_sentence = "she said she played soccer yesterday."
test_3 = "Tom makes great cookies."

# Translate a single English sentence with the no-attention encoder/decoder.
translate_sentences(
    "the patient may pass away at any moment .",
    translate_without_attention,
    source_tokenizer,
    encoder_model_no_attention,
    target_tokenizer,
    decoder_model_no_attention,
)

the patient may pass away at any moment .
bệnh nhân có thể vượt qua bất cứ lúc nào .
