In [1]:
import os
import shutil
import zipfile
import urllib3
import requests
import gc
from typing import List, Dict

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

2024-12-06 13:38:29.657907: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Config

In [2]:
base_dir = "../data"
os.makedirs(base_dir, exist_ok=True)

# dataset path
output_zip_path = os.path.join(base_dir, "fra-eng.zip")
output_zip_dir = os.path.join(base_dir, "fra-eng")
output_csv_path = os.path.join(base_dir, output_zip_dir, "fra.txt")

## Download Dataset

In [3]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

In [4]:
def download_zip(url, output_path):
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        with open(output_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"ZIP file downloaded to {output_path}")
    else:
        print(f"Failed to download. HTTP Response Code: {response.status_code}")

url = "http://www.manythings.org/anki/fra-eng.zip"
download_zip(url, output_zip_path)

# unzip
with zipfile.ZipFile(output_zip_path, "r") as zip_ref:
    zip_ref.extractall(output_zip_dir)

ZIP file downloaded to ../data/fra-eng.zip


## Read Dataset

In [5]:
data = pd.read_csv(output_csv_path, header=None, names=["source", "target", "license"], sep="\t")

In [6]:
data.head()

Unnamed: 0,source,target,license
0,Go.,Va !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Marche.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,En route !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Go.,Bouge !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Hi.,Salut !,CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [7]:
# drop license column
data.drop(["license"], axis=1, inplace=True)

In [8]:
data.shape

(232736, 2)

In [9]:
# use first 60,000 rows to train the model
data_sample = data.iloc[:60000].copy()

np.random.seed(1234)
data_sample.sample(10)

Unnamed: 0,source,target
30329,Tom seems sincere.,Tom semble sincère.
44957,Tom wants to try it.,Tom veut l'essayer.
30866,We're quite alone.,Nous sommes tout à fait seuls.
40447,"Hey, listen to this.","Eh, écoutez ceci."
25580,Come talk with me.,Venez me parler.
6216,Stop arguing.,Arrête de te quereller !
26373,How are you doing?,Comment vas-tu ?
9010,Pick a weapon.,Choisis une arme !
23445,Tom felt nothing.,Tom n'a rien ressenti.
108,Hug me.,Serre-moi dans tes bras !


## Preprocess Dataset

In [10]:
# add <sos>, <eos> symbol
data_sample["target"] = "<sos> " + data_sample["target"] + " <eos>"
data_sample.head()

Unnamed: 0,source,target
0,Go.,<sos> Va ! <eos>
1,Go.,<sos> Marche. <eos>
2,Go.,<sos> En route ! <eos>
3,Go.,<sos> Bouge ! <eos>
4,Hi.,<sos> Salut ! <eos>


In [11]:
# create source/target vocab
source_vocab = set()
target_vocab = set()
for _, row in data_sample.iterrows():
    source_vocab.update(list(row.source))
    target_vocab.update(["<sos>"] + list(row.target.lstrip("<sos>").rstrip("<eos>")) + ["<eos>"])

source_vocab_size = len(source_vocab) + 1
target_vocab_size = len(target_vocab) + 1
print(f"Length of Source Vocab: {source_vocab_size}")
print(f"Length of Target Vocab: {target_vocab_size}")

Length of Source Vocab: 80
Length of Target Vocab: 102


In [12]:
# char to idx
source_to_index = {w: i+1 for i, w in enumerate(source_vocab)}
target_to_index = {w: i+1 for i, w in enumerate(target_vocab)}

In [13]:
encoder_inputs = []
decoder_inputs = []
decoder_targets = []
for _, row in data_sample.iterrows():
    encoder_inputs.append([source_to_index[c] for c in row.source])

    target_encoded = [target_to_index[c] for c in row.target.lstrip("<sos>").rstrip("<eos>")]
    decoder_inputs.append([target_to_index["<sos>"]] + target_encoded + [target_to_index["<eos>"]])
    decoder_targets.append(target_encoded + [target_to_index["<eos>"]])

In [14]:
encoder_inputs[:5]

[[59, 32, 8], [59, 32, 8], [59, 32, 8], [59, 32, 8], [35, 76, 8]]

In [15]:
decoder_inputs[:5]

[[58, 37, 83, 41, 37, 36, 37, 89],
 [58, 37, 26, 41, 25, 40, 12, 48, 11, 37, 89],
 [58, 37, 99, 51, 37, 25, 39, 6, 93, 48, 37, 36, 37, 89],
 [58, 37, 75, 39, 6, 82, 48, 37, 36, 37, 89],
 [58, 37, 62, 41, 49, 6, 93, 37, 36, 37, 89]]

In [16]:
decoder_targets[:5]

[[37, 83, 41, 37, 36, 37, 89],
 [37, 26, 41, 25, 40, 12, 48, 11, 37, 89],
 [37, 99, 51, 37, 25, 39, 6, 93, 48, 37, 36, 37, 89],
 [37, 75, 39, 6, 82, 48, 37, 36, 37, 89],
 [37, 62, 41, 49, 6, 93, 37, 36, 37, 89]]

In [17]:
max_source_len = max(map(len, encoder_inputs))
max_target_len = max(map(len, decoder_inputs))

print(f"max source length: {max_source_len}")
print(f"max target length: {max_target_len}")

max source length: 22
max target length: 76


In [18]:
encoder_inputs = pad_sequences(encoder_inputs, maxlen=max_source_len, padding="post")
decoder_inputs = pad_sequences(decoder_inputs, maxlen=max_target_len, padding="post")
decoder_targets = pad_sequences(decoder_targets, maxlen=max_target_len, padding="post")

print(f"encoder input shape: {encoder_inputs.shape}")
print(f"decoder input shape: {decoder_inputs.shape}")
print(f"decoder target shape: {decoder_targets.shape}")

encoder input shape: (60000, 22)
decoder input shape: (60000, 76)
decoder target shape: (60000, 76)


In [19]:
encoder_inputs = to_categorical(encoder_inputs)
decoder_inputs = to_categorical(decoder_inputs)
decoder_targets = to_categorical(decoder_targets)

# note that len(source_vocab) = 79, len(target_vocab) = 101
print(f"encoder input shape: {encoder_inputs.shape}")
print(f"decoder input shape: {decoder_inputs.shape}")
print(f"decoder target shape: {decoder_targets.shape}")

encoder input shape: (60000, 22, 80)
decoder input shape: (60000, 76, 102)
decoder target shape: (60000, 76, 102)


## Build Vanilla seq2seq (without Attention)

In [20]:
class VanillaSeq2Seq:
    def __init__(
        self,        
        souce_vocab_size: int,
        target_vocab_size: int,
        num_hidden: int = 256,
        initial_learning_rate: float = 1e-3
    ) -> None:
        self._num_hidden = num_hidden
        self._source_vocab_size = souce_vocab_size
        self._target_vocab_size = target_vocab_size
        self._initial_learning_rate = initial_learning_rate
    
    def build(self) -> None:
        encoder_inputs = Input(shape=(None, self._source_vocab_size))
        encoder_lstm = LSTM(units=self._num_hidden, return_state=True)
        encoder_outputs, state_h, state_c = encoder_lstm(encoder_inputs)
        encoder_states = [state_h, state_c]

        decoder_inputs = Input(shape=(None, self._target_vocab_size))
        decoder_lstm = LSTM(units=self._num_hidden, return_sequences=True, return_state=True)
        decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
        decoder_dense = Dense(self._target_vocab_size, activation="softmax")
        decoder_outputs = decoder_dense(decoder_outputs)

        model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
        model.compile(optimizer=Adam(self._initial_learning_rate), loss="categorical_crossentropy")
        
        self.model = model

        # used to predict the sequence
        self.encoder_model = Model(inputs=encoder_inputs, outputs=encoder_states)
        decoder_state_input_h = Input(shape=(self._num_hidden,))
        decoder_state_input_c = Input(shape=(self._num_hidden,))
        decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
        decoder_outputs, state_h, state_c = decoder_lstm(decoder_inputs, initial_state=decoder_state_inputs)
        decoder_states = [state_h, state_c]
        decoder_outputs = decoder_dense(decoder_outputs)
        self.decoder_model = Model(
            inputs=[decoder_inputs] + decoder_state_inputs,
            outputs=[decoder_outputs] + decoder_states,
        )

    def fit(
        self,
        X: List[np.ndarray],
        y: np.ndarray,
        batch_size: int = 128,
        epochs: int = 50,
        validation_split: float = 0.2,
    ) -> None:
        self.model.fit(
            x=X,
            y=y,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
        )

    def predict_sequence(
        self,
        input_sequence: np.ndarray,
        index_to_target: Dict[int, str],
        sos_token_index: int,
    ) -> str:
        if len(input_sequence.shape) != 2:
            raise ValueError("input sequence must be 2-dimensional")

        inputs = np.expand_dims(input_sequence, axis=0)
        states = self.encoder_model.predict(inputs, verbose=False)
        
        target_sequence = np.zeros((1, 1, self._target_vocab_size))
        target_sequence[0, 0, sos_token_index] = 1
        stop_condition = False
        decoded_sentence = ""
        while not stop_condition:
            out, h, c = self.decoder_model.predict([target_sequence] + states, verbose=False)
            token_idx = np.argmax(out[0, -1, :])
            char = index_to_target.get(token_idx, " ")
            decoded_sentence += char
        
            if char == "<eos>" or len(decoded_sentence) > max_target_len:
                stop_condition = True
        
            target_sequence = np.zeros((1, 1, self._target_vocab_size))
            target_sequence[0, 0, token_idx] = 1.
            states = [h, c]
        return decoded_sentence

In [21]:
# to prevent memory leak
tf.keras.backend.clear_session()
gc.collect()

0

In [22]:
seq2seq = VanillaSeq2Seq(
    souce_vocab_size=source_vocab_size,
    target_vocab_size=target_vocab_size,
    initial_learning_rate=1e-2,
)
seq2seq.build()

2024-12-06 13:38:40.749699: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-06 13:38:40.757249: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-06 13:38:40.757742: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-06 13:38:40.759907: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [23]:
seq2seq.fit(
    X=[encoder_inputs, decoder_inputs],
    y=decoder_targets,
    batch_size=512,
    epochs=30,
    validation_split=0.2
)

2024-12-06 13:38:41.812283: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 1488384000 exceeds 10% of free system memory.
2024-12-06 13:38:42.424903: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 1488384000 exceeds 10% of free system memory.
2024-12-06 13:38:43.156510: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 1488384000 exceeds 10% of free system memory.
2024-12-06 13:38:43.596256: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 1488384000 exceeds 10% of free system memory.


Epoch 1/30


2024-12-06 13:38:46.212838: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8100
2024-12-06 13:38:46.586163: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x1eeaf700 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-12-06 13:38:46.586190: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 2070, Compute Capability 7.5
2024-12-06 13:38:46.589185: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-06 13:38:46.683162: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [24]:
seq2seq.model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None, 80)]   0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None, 102)]  0           []                               
                                                                                                  
 lstm (LSTM)                    [(None, 256),        345088      ['input_1[0][0]']                
                                 (None, 256),                                                     
                                 (None, 256)]                                                     
                                                                                              

In [25]:
index_to_target = {v: k for k, v in target_to_index.items()}

In [26]:
np.random.seed(1)
examples = data_sample.sample(10)
for rn, row in examples.iterrows():
    pred = seq2seq.predict_sequence(
        encoder_inputs[rn],
        sos_token_index=target_to_index["<sos>"],
        index_to_target=index_to_target,
    )
    print("====="*10)
    print(f"Source: {row.source}")
    print(f"Predict: {pred.lstrip(' ').rstrip(' <eos>')}")
    print(f"Answer: {row.target.lstrip('<sos> ').rstrip(' <eos>')}")    

Source: I can't fake it.
Predict: Je ne peux pas le faire.
Answer: Je ne peux pas le simuler.
Source: I'll beat you up!
Predict: Je suis en train de parler.
Answer: Je vais te cogner !
Source: Tom got embarrassed.
Predict: Tom a été prévenue.
Answer: Tom a été embarrassé.
Source: Tom's cheerful.
Predict: Tom est plein de travail.
Answer: Tom est de bonne humeur.
Source: Either is acceptable.
Predict: Fais ce que j'ai tort.
Answer: N'importe laquelle est acceptable.
Source: We were eating pizza.
Predict: Nous avons toutes des mousins.
Answer: Nous mangions de la pizza.
Source: I have to find that.
Predict: Je dois te laisser.
Answer: Je dois trouver ça.
Source: Look closely.
Predict: Regarde ce que tu veux.
Answer: Regarde attentivement.
Source: They saved us.
Predict: Elles ont tous deux chiens.
Answer: Elles nous ont sauvés.
Source: I know her very well.
Predict: Je sais que tu es sorti.
Answer: Je la connais très bien.
