In [4]:
pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

class MachineTranslationModel:
    """Vietnamese -> English sequence-to-sequence translation helper.

    The class bundles the pieces needed for a small LSTM encoder/decoder
    experiment: tokenization and padding, two model builders (with and
    without attention), teacher-forced training and greedy decoding.

    Every English target sentence is wrapped in ``START_TOKEN`` /
    ``END_TOKEN`` during preprocessing so that the decoder has an explicit
    first input token and an explicit stop signal at inference time.
    """

    # Sentence boundary markers added around every English target sentence.
    START_TOKEN = '<start>'
    END_TOKEN = '<end>'

    # Keras' default Tokenizer filter string with '<' and '>' removed;
    # with the default filters the boundary markers would be reduced to
    # the ordinary words "start" and "end".
    _EN_TOKENIZER_FILTERS = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

    def __init__(self, max_vocab_size=10000, max_sequence_length=50):
        """Create the tokenizers and remember the size limits.

        Args:
            max_vocab_size: ``num_words`` passed to both tokenizers.
            max_sequence_length: length every sequence is padded/truncated
                to, and the time dimension of both model inputs.
        """
        self.max_vocab_size = max_vocab_size
        self.max_sequence_length = max_sequence_length

        # Tokenizers for Vietnamese (source) and English (target).
        self.vi_tokenizer = Tokenizer(num_words=max_vocab_size, oov_token='<OOV>')
        self.en_tokenizer = Tokenizer(
            num_words=max_vocab_size,
            oov_token='<OOV>',
            filters=self._EN_TOKENIZER_FILTERS,
        )

    def _add_sentence_markers(self, text):
        """Return ``text`` wrapped in the start/end markers (added only if missing)."""
        words = text.split()
        if not words or words[0].lower() != self.START_TOKEN:
            words.insert(0, self.START_TOKEN)
        if words[-1].lower() != self.END_TOKEN:
            words.append(self.END_TOKEN)
        return ' '.join(words)

    def preprocess_data(self, vi_texts, en_texts):
        """Fit both tokenizers and turn the parallel corpus into padded id arrays.

        Args:
            vi_texts: sequence of Vietnamese source sentences (strings).
            en_texts: sequence of English target sentences (strings), aligned
                with ``vi_texts``.

        Returns:
            ``(vi_padded, en_padded)``: two integer arrays of shape
            ``(num_sentences, max_sequence_length)``, zero-padded at the end.
            Each English row starts with the id of ``START_TOKEN``.

        Raises:
            ValueError: if the two corpora do not have the same length.
        """
        if len(vi_texts) != len(en_texts):
            raise ValueError(
                'vi_texts and en_texts must be aligned: got %d and %d sentences'
                % (len(vi_texts), len(en_texts))
            )

        # The decoder needs explicit boundaries on the target side.
        en_marked = [self._add_sentence_markers(text) for text in en_texts]

        # Fit tokenizers
        self.vi_tokenizer.fit_on_texts(vi_texts)
        self.en_tokenizer.fit_on_texts(en_marked)

        # Convert texts to sequences
        vi_sequences = self.vi_tokenizer.texts_to_sequences(vi_texts)
        en_sequences = self.en_tokenizer.texts_to_sequences(en_marked)

        # Pad sequences
        vi_padded = pad_sequences(vi_sequences, maxlen=self.max_sequence_length, padding='post')
        # Truncate over-long targets at the end: the default ('pre') would
        # cut off the start marker that the decoder relies on.
        en_padded = pad_sequences(
            en_sequences,
            maxlen=self.max_sequence_length,
            padding='post',
            truncating='post',
        )

        return vi_padded, en_padded

    def create_lstm_model_without_attention(self, vi_vocab_size, en_vocab_size, embedding_dim=256, hidden_dim=512):
        """Build and compile a plain LSTM encoder/decoder.

        The encoder's final hidden and cell states initialise the decoder;
        the decoder emits a softmax over the English vocabulary at each step.

        Args:
            vi_vocab_size: input dimension of the source embedding.
            en_vocab_size: input dimension of the target embedding and size
                of the output softmax.
            embedding_dim: embedding width for both languages.
            hidden_dim: number of LSTM units in encoder and decoder.

        Returns:
            A compiled ``tf.keras.Model`` taking
            ``[encoder_tokens, decoder_tokens]`` (each of length
            ``max_sequence_length``) and returning per-step probabilities.
        """
        # Encoder: only the final states are needed, not the per-step outputs.
        encoder_inputs = tf.keras.layers.Input(shape=(self.max_sequence_length,))
        encoder_embedding = tf.keras.layers.Embedding(vi_vocab_size, embedding_dim)(encoder_inputs)
        encoder_lstm = tf.keras.layers.LSTM(hidden_dim, return_state=True)
        _, state_h, state_c = encoder_lstm(encoder_embedding)

        # Decoder
        decoder_inputs = tf.keras.layers.Input(shape=(self.max_sequence_length,))
        decoder_embedding = tf.keras.layers.Embedding(en_vocab_size, embedding_dim)(decoder_inputs)
        decoder_lstm = tf.keras.layers.LSTM(hidden_dim, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=[state_h, state_c])

        # Output layer
        decoder_dense = tf.keras.layers.Dense(en_vocab_size, activation='softmax')
        decoder_outputs = decoder_dense(decoder_outputs)

        # Create model
        model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        return model

    def create_lstm_model_with_attention(self, vi_vocab_size, en_vocab_size, embedding_dim=256, hidden_dim=512):
        """Build and compile an LSTM encoder/decoder with per-step attention.

        The decoder is unrolled over ``max_sequence_length`` steps. At each
        step the previous decoder hidden state attends over all encoder
        outputs; the resulting context vector is concatenated with the
        current target embedding and fed to the (shared) decoder LSTM.

        Args:
            vi_vocab_size: input dimension of the source embedding.
            en_vocab_size: input dimension of the target embedding and size
                of the output softmax.
            embedding_dim: embedding width for both languages.
            hidden_dim: number of LSTM units in encoder and decoder.

        Returns:
            A compiled ``tf.keras.Model`` with the same inputs and output
            layout as ``create_lstm_model_without_attention``.
        """
        # Attention Mechanism
        class AttentionLayer(tf.keras.layers.Layer):
            """Scores each encoder step against one decoder state."""

            def __init__(self, **kwargs):
                super().__init__(**kwargs)
                # Created once so the scoring weights are shared by every
                # decoding step and tracked as trainable variables.
                self.score_dense = tf.keras.layers.Dense(1)

            def call(self, inputs):
                # inputs: [encoder_outputs (B, T, H), decoder_hidden (B, H)]
                encoder_outputs, decoder_hidden = inputs

                # Repeat the decoder state along the time axis so it can be
                # concatenated feature-wise with every encoder step.
                source_length = tf.shape(encoder_outputs)[1]
                hidden_per_step = tf.tile(
                    tf.expand_dims(decoder_hidden, 1), [1, source_length, 1]
                )

                # Calculate attention scores: (B, T, 1), normalised over T.
                score = self.score_dense(
                    tf.concat([encoder_outputs, hidden_per_step], axis=-1)
                )
                attention_weights = tf.nn.softmax(score, axis=1)

                # Context vector: weighted sum of encoder outputs -> (B, H)
                context_vector = tf.reduce_sum(attention_weights * encoder_outputs, axis=1)

                return context_vector, attention_weights

        # Encoder
        encoder_inputs = tf.keras.layers.Input(shape=(self.max_sequence_length,))
        encoder_embedding = tf.keras.layers.Embedding(vi_vocab_size, embedding_dim)(encoder_inputs)
        encoder_lstm = tf.keras.layers.LSTM(hidden_dim, return_sequences=True, return_state=True)
        encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)

        # Decoder with Attention
        decoder_inputs = tf.keras.layers.Input(shape=(self.max_sequence_length,))
        decoder_embedding = tf.keras.layers.Embedding(en_vocab_size, embedding_dim)(decoder_inputs)
        decoder_lstm = tf.keras.layers.LSTM(hidden_dim, return_sequences=True, return_state=True)

        # Layers shared by every unrolled step. Keras layers (rather than raw
        # tf.* calls) are used on the symbolic tensors of the functional graph.
        attention_layer = AttentionLayer()
        add_time_axis = tf.keras.layers.Reshape((1, hidden_dim))
        join_features = tf.keras.layers.Concatenate(axis=-1)

        # Decoder outputs with attention
        decoder_lstm_outputs = []
        decoder_state_h, decoder_state_c = state_h, state_c

        for t in range(self.max_sequence_length):
            # Slice with t:t + 1 to keep the time axis: (B, 1, embedding_dim)
            step_embedding = decoder_embedding[:, t:t + 1, :]

            context_vector, _ = attention_layer([encoder_outputs, decoder_state_h])

            lstm_input = join_features([step_embedding, add_time_axis(context_vector)])

            decoder_lstm_output, decoder_state_h, decoder_state_c = decoder_lstm(
                lstm_input,
                initial_state=[decoder_state_h, decoder_state_c]
            )

            decoder_lstm_outputs.append(decoder_lstm_output)

        # Each step output is (B, 1, H); joining along the time axis gives
        # (B, T, H). Stacking would add a spurious extra dimension.
        if len(decoder_lstm_outputs) == 1:
            decoder_outputs = decoder_lstm_outputs[0]
        else:
            decoder_outputs = tf.keras.layers.Concatenate(axis=1)(decoder_lstm_outputs)

        # Output layer
        decoder_dense = tf.keras.layers.Dense(en_vocab_size, activation='softmax')
        decoder_outputs = decoder_dense(decoder_outputs)

        # Create model
        model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

        return model

    def train_model(self, model, vi_input, en_input, epochs=10, batch_size=32, validation_split=0.2):
        """Train ``model`` with teacher forcing.

        The decoder receives the full target row and is trained to predict
        the same row shifted one position to the left (the last position is
        padded with 0). Both arrays keep the full ``max_sequence_length``
        width, matching the fixed-length model inputs and the array layout
        used by ``translate_example``.

        Args:
            model: object exposing a Keras-style ``fit`` method.
            vi_input: padded source ids, shape ``(N, max_sequence_length)``.
            en_input: padded target ids, shape ``(N, max_sequence_length)``.
            epochs: number of training epochs.
            batch_size: mini-batch size.
            validation_split: fraction of the data held out for validation.

        Returns:
            Whatever ``model.fit`` returns (a Keras ``History`` for Keras models).
        """
        vi_input = np.asarray(vi_input)
        decoder_input = np.asarray(en_input)

        # Next-token targets: target[t] = input[t + 1], trailing slot = padding.
        decoder_target = np.zeros_like(decoder_input)
        decoder_target[:, :-1] = decoder_input[:, 1:]

        # Train model
        history = model.fit(
            [vi_input, decoder_input],
            decoder_target,
            epochs=epochs,
            batch_size=batch_size,
            validation_split=validation_split
        )

        return history

    def translate_example(self, model, input_sentence, vi_tokenizer, en_tokenizer):
        """Greedily decode the translation of one Vietnamese sentence.

        Args:
            model: trained model produced by one of the ``create_*`` builders.
            input_sentence: Vietnamese sentence to translate.
            vi_tokenizer: fitted source tokenizer.
            en_tokenizer: fitted target tokenizer; its vocabulary must contain
                ``START_TOKEN`` and ``END_TOKEN`` (true for the tokenizer
                fitted by ``preprocess_data``).

        Returns:
            The predicted English words joined by single spaces.

        Raises:
            KeyError: if the target tokenizer lacks a boundary marker.
        """
        for marker in (self.START_TOKEN, self.END_TOKEN):
            if marker not in en_tokenizer.word_index:
                raise KeyError(
                    '%r is missing from the English tokenizer vocabulary; '
                    'fit it with preprocess_data first' % marker
                )
        start_id = en_tokenizer.word_index[self.START_TOKEN]
        end_id = en_tokenizer.word_index[self.END_TOKEN]

        # Preprocess input sentence
        input_sequence = vi_tokenizer.texts_to_sequences([input_sentence])
        input_sequence = pad_sequences(input_sequence, maxlen=self.max_sequence_length, padding='post')

        # Prepare decoder input: start marker followed by padding.
        decoder_input = np.zeros((1, self.max_sequence_length))
        decoder_input[0, 0] = start_id

        # Translate: the prediction at position i-1 is the token for slot i.
        translated_sentence = []
        for i in range(1, self.max_sequence_length):
            predictions = model.predict([input_sequence, decoder_input], verbose=0)
            predicted_token = int(np.argmax(predictions[0, i - 1, :]))

            if predicted_token == end_id:
                break

            # Ids without a word (e.g. the padding id 0) are fed back to the
            # decoder but left out of the text to avoid stray spaces.
            word = en_tokenizer.index_word.get(predicted_token, '')
            if word:
                translated_sentence.append(word)
            decoder_input[0, i] = predicted_token

        return ' '.join(translated_sentence)

# Example usage
def main():
    """Run the pipeline end to end on a two-sentence toy corpus.

    The visible steps are: preprocess the parallel sentences, build one model
    without attention and one with attention (vocabulary sizes are taken from
    the fitted tokenizers' ``word_index`` plus one), then train each model
    with the default training settings.
    """
    # Assume the data has already been prepared
    vi_texts = ["Xin chào", "Tôi là sinh viên"]
    en_texts = ["Hello", "I am a student"]

    # Initialise the translation helper
    mt_model = MachineTranslationModel()

    # Preprocess the data
    vi_input, en_input = mt_model.preprocess_data(vi_texts, en_texts)

    # Build the model without attention
    model_without_attention = mt_model.create_lstm_model_without_attention(
        vi_vocab_size=len(mt_model.vi_tokenizer.word_index) + 1,
        en_vocab_size=len(mt_model.en_tokenizer.word_index) + 1
    )

    # Build the model with attention
    model_with_attention = mt_model.create_lstm_model_with_attention(
        vi_vocab_size=len(mt_model.vi_tokenizer.word_index) + 1,
        en_vocab_size=len(mt_model.en_tokenizer.word_index) + 1
    )

    # Train both models
    history_without_attention = mt_model.train_model(model_without_attention, vi_input, en_input)
    history_with_attention = mt_model.train_model(model_with_attention, vi_input, en_input)