In [1]:
import sys
sys.path.append("../")

In [2]:
import os
import random
import shutil
import zipfile
import urllib3
import requests
import gc
from typing import List, Dict

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from src.models.seq2seq import Encoder, Decoder, Seq2Seq
from src.utils.session import reset_session

2024-12-21 07:52:25.789647: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Config

In [3]:
base_dir = "../data"
os.makedirs(base_dir, exist_ok=True)

# dataset path
output_zip_path = os.path.join(base_dir, "fra-eng.zip")
output_zip_dir = os.path.join(base_dir, "fra-eng")
output_csv_path = os.path.join(base_dir, output_zip_dir, "fra.txt")

## Download Dataset

In [4]:
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

In [5]:
def download_zip(url, output_path):
    response = requests.get(url, headers=headers, stream=True)
    if response.status_code == 200:
        with open(output_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        print(f"ZIP file downloaded to {output_path}")
    else:
        print(f"Failed to download. HTTP Response Code: {response.status_code}")

url = "http://www.manythings.org/anki/fra-eng.zip"
download_zip(url, output_zip_path)

# unzip
with zipfile.ZipFile(output_zip_path, "r") as zip_ref:
    zip_ref.extractall(output_zip_dir)

ZIP file downloaded to ../data/fra-eng.zip


## Read Dataset

In [6]:
data = pd.read_csv(output_csv_path, header=None, names=["source", "target", "license"], sep="\t")

In [7]:
data.head()

Unnamed: 0,source,target,license
0,Go.,Va !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Marche.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,En route !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Go.,Bouge !,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Hi.,Salut !,CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [8]:
# drop license column
data.drop(["license"], axis=1, inplace=True)

In [9]:
data.shape

(232736, 2)

In [10]:
# use first 60,000 rows to train the model
data_sample = data.iloc[:60000].copy()

np.random.seed(1234)
data_sample.sample(10)

Unnamed: 0,source,target
30329,Tom seems sincere.,Tom semble sincère.
44957,Tom wants to try it.,Tom veut l'essayer.
30866,We're quite alone.,Nous sommes tout à fait seuls.
40447,"Hey, listen to this.","Eh, écoutez ceci."
25580,Come talk with me.,Venez me parler.
6216,Stop arguing.,Arrête de te quereller !
26373,How are you doing?,Comment vas-tu ?
9010,Pick a weapon.,Choisis une arme !
23445,Tom felt nothing.,Tom n'a rien ressenti.
108,Hug me.,Serre-moi dans tes bras !


## Preprocess Dataset

In [11]:
# add <sos>, <eos> symbol
data_sample["target"] = "<sos> " + data_sample["target"] + " <eos>"
data_sample.head()

Unnamed: 0,source,target
0,Go.,<sos> Va ! <eos>
1,Go.,<sos> Marche. <eos>
2,Go.,<sos> En route ! <eos>
3,Go.,<sos> Bouge ! <eos>
4,Hi.,<sos> Salut ! <eos>


In [12]:
# create source/target vocab
source_vocab = set()
target_vocab = set()
for _, row in data_sample.iterrows():
    source_vocab.update(list(row.source))
    target_vocab.update(["<sos>"] + list(row.target.lstrip("<sos>").rstrip("<eos>")) + ["<eos>"])

source_vocab_size = len(source_vocab) + 1
target_vocab_size = len(target_vocab) + 1
print(f"Length of Source Vocab: {source_vocab_size}")
print(f"Length of Target Vocab: {target_vocab_size}")

Length of Source Vocab: 80
Length of Target Vocab: 102


In [13]:
# char to idx
source_to_index = {w: i+1 for i, w in enumerate(source_vocab)}
target_to_index = {w: i+1 for i, w in enumerate(target_vocab)}

index_to_source = {v: k for k, v in source_to_index.items()}
index_to_target = {v: k for k, v in target_to_index.items()}

In [14]:
inputs_idx = {
    "encoder_input": [],
    "decoder_input": [],
    "decoder_target": []
}
for _, row in data_sample.iterrows():
    inputs_idx["encoder_input"].append([source_to_index[c] for c in row.source])

    target_encoded = [target_to_index[c] for c in row.target.lstrip("<sos>").rstrip("<eos>")]
    inputs_idx["decoder_input"].append([target_to_index["<sos>"]] + target_encoded + [target_to_index["<eos>"]])
    inputs_idx["decoder_target"].append(target_encoded + [target_to_index["<eos>"]])

In [15]:
inputs_idx["encoder_input"][:5]

[[42, 67, 75], [42, 67, 75], [42, 67, 75], [42, 67, 75], [4, 12, 75]]

In [16]:
inputs_idx["decoder_input"][:5]

[[68, 67, 12, 39, 67, 42, 67, 97],
 [68, 67, 95, 39, 16, 2, 79, 83, 96, 67, 97],
 [68, 67, 75, 41, 67, 16, 87, 17, 101, 83, 67, 42, 67, 97],
 [68, 67, 3, 87, 17, 5, 83, 67, 42, 67, 97],
 [68, 67, 89, 39, 92, 17, 101, 67, 42, 67, 97]]

In [17]:
inputs_idx["decoder_target"][:5]

[[67, 12, 39, 67, 42, 67, 97],
 [67, 95, 39, 16, 2, 79, 83, 96, 67, 97],
 [67, 75, 41, 67, 16, 87, 17, 101, 83, 67, 42, 67, 97],
 [67, 3, 87, 17, 5, 83, 67, 42, 67, 97],
 [67, 89, 39, 92, 17, 101, 67, 42, 67, 97]]

In [18]:
max_source_len = max(map(len, inputs_idx["encoder_input"]))
max_target_len = max(map(len, inputs_idx["decoder_input"]))

print(f"max source length: {max_source_len}")
print(f"max target length: {max_target_len}")

max source length: 22
max target length: 76


In [19]:
inputs_pad = {}

inputs_pad["encoder_input"] = pad_sequences(inputs_idx["encoder_input"], maxlen=max_source_len, padding="post")
inputs_pad["decoder_input"] = pad_sequences(inputs_idx["decoder_input"], maxlen=max_target_len, padding="post")
inputs_pad["decoder_target"] = pad_sequences(inputs_idx["decoder_target"], maxlen=max_target_len-1, padding="post")

print(f"encoder input shape: {inputs_pad['encoder_input'].shape}")
print(f"decoder input shape: {inputs_pad['decoder_input'].shape}")
print(f"decoder target shape: {inputs_pad['decoder_target'].shape}")

encoder input shape: (60000, 22)
decoder input shape: (60000, 76)
decoder target shape: (60000, 75)


In [20]:
inputs = {}

inputs["encoder_input"] = to_categorical(inputs_pad["encoder_input"])
inputs["decoder_input"] = to_categorical(inputs_pad["decoder_input"])
inputs["decoder_target"] = to_categorical(inputs_pad["decoder_target"])

# note that len(source_vocab) = 79, len(target_vocab) = 101
print(f"encoder input shape: {inputs['encoder_input'].shape}")
print(f"decoder input shape: {inputs['decoder_input'].shape}")
print(f"decoder target shape: {inputs['decoder_target'].shape}")

encoder input shape: (60000, 22, 80)
decoder input shape: (60000, 76, 102)
decoder target shape: (60000, 75, 102)


In [21]:
# ' '.join([index_to_source[v] for v in tf.argmax(inputs["encoder_input"][0], axis=1).numpy() if v != 0])
# ' '.join([index_to_target[v] for v in tf.argmax(inputs["decoder_input"][0], axis=1).numpy() if v != 0])
# ' '.join([index_to_target[v] for v in tf.argmax(inputs["decoder_target"][0], axis=1).numpy() if v != 0])

## Train Model

In [22]:
reset_session()

In [23]:
seq2seq = Seq2Seq(
    input_dim=source_vocab_size,
    output_dim=target_vocab_size,
    hidden_dim=256,
    num_layers=3,
    max_length=max_target_len,
)

2024-12-21 07:52:37.583982: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-21 07:52:37.586436: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-21 07:52:37.586576: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:981] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2024-12-21 07:52:37.587009: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorF

In [24]:
seq2seq.build(optimizer=Adam(1e-2))

In [25]:
inp1, inp2 = inputs["encoder_input"], inputs["decoder_input"]
X = [inp1, inp2]
y = np.argmax(inputs["decoder_target"], axis=-1)

In [26]:
# TODO: add early stopping callback
seq2seq.fit(
    X=X,
    y=y,
    batch_size=512,
    epochs=30,
    validation_split=0.2,
)

Epoch 1/30


2024-12-21 07:56:03.267687: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8100
2024-12-21 07:56:07.948887: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x7f5e7cef03b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-12-21 07:56:07.948912: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA GeForce RTX 2070, Compute Capability 7.5
2024-12-21 07:56:07.986475: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-12-21 07:56:08.352183: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f5f38403d00>

In [27]:
examples = data_sample.sample(10)
for rn, row in examples.iterrows():
    pred = seq2seq.predict_sequence(
        inputs["encoder_input"][rn],
        sos_token_index=target_to_index["<sos>"],
        index_to_target=index_to_target,
    )
    print("====="*10)
    print(f"Source: {row.source}")
    print(f"Predict: {pred.lstrip(' ').rstrip(' <eos>')}")
    print(f"Answer: {row.target.lstrip('<sos> ').rstrip(' <eos>')}")

Source: We all suffered.
Predict: Nous aoons de la maite.
Answer: Nous avons tous souffert.
Source: Why do you think so?
Predict: Pourquoi ne peux-tu pas ?
Answer: Pourquoi pensez-vous ça ?
Source: Just take one.
Predict: Attendsz de lois !
Answer: Prenez-en seulement un.
Source: Tom misled Mary.
Predict: Tom a me les chausss.
Answer: Tom trompait Marie.
Source: I'm busy, too.
Predict: Je suis très contraré.
Answer: Je suis également affairé.
Source: Here's your milk.
Predict: Montrezmoi une cha
Answer: Voici ton lait.
Source: This book is smaller.
Predict: Ce livre est malhe.
Answer: Ce livre est plus petit.
Source: Nobody likes you.
Predict: Personne ne peut parer
Answer: Personne ne t'aime.
Source: I'm glad to hear that.
Predict: Je suis très contraré.
Answer: C'est bon à entendre.
Source: Where is the exit?
Predict: Où est lon chambre ?
Answer: Où est la sortie ?


In [28]:
# bs = 32
# outs = model([inp1[:bs], inp2[:bs]], training=True)
# tars = y[:bs]

# tars.shape
# outs.shape

# seq2seq.loss_function(tars, outs)

In [29]:
# bs = 32
# inp1, inp2 = inputs["encoder_input"][:bs], inputs["decoder_input"][:bs]
# X = [inp1, inp2]
# y = np.argmax(inputs["decoder_target"], axis=-1)
# enc_states = seq2seq.encoder.initialize_hidden_state(bs)

# enc_out, *enc_states = seq2seq.encoder(inp1, enc_states)
# dec_out, *dec_states = seq2seq.decoder(inp2, enc_states)

# # encoder initial states
# batch_size = tf.shape(inp1)[0]
# enc_states = seq2seq.encoder.initialize_hidden_state(bs)

# # encoder outputs & states
# _, *enc_states = seq2seq.encoder(inp1, states=enc_states)
# dec_states = enc_states

# dec_outs = []
# dec_inp = tf.expand_dims(inp2[:, 0], axis=1)
# for t in range(1, seq2seq.max_length):
#     out, *dec_states = seq2seq.decoder(dec_inp, dec_states)
#     dec_outs.append(out)
#     dec_inp = tf.expand_dims(inp2[:, t], axis=1)

# # tf.concat(dec_outs, axis=1).shape
# dec_outs = seq2seq.call(inp1, inp2, True)