Train an encoder–decoder model that can convert a date string from one format to another (e.g., from “April 22, 2019” to “2019-04-22”).

In [185]:
from datetime import date
import numpy as np
import tensorrt
import tensorflow as tf
from pathlib import Path

In [126]:
def gen_dates():
    """Build (encoder input, decoder input, label) string triples, one per day.

    Covers every day from 1900-01-01 up to and including 2052-12-31.

    Returns:
        list of 3-tuples of str:
          * the date spelled out, e.g. 'April 22, 2019';
          * the ISO date prefixed with the start-of-sequence marker '^';
          * the ISO date suffixed with the end-of-sequence marker '$'.
    """
    # Month names are spelled out here instead of using strftime('%B'),
    # whose result depends on the active locale.
    month_names = ('January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December')
    first = date(1900, 1, 1).toordinal()
    last = date(2053, 1, 1).toordinal()  # exclusive upper bound
    res = []
    for ordinal in range(first, last):
        d = date.fromordinal(ordinal)
        iso = d.isoformat()
        spelled_out = f'{month_names[d.month - 1]} {d.day:02d}, {d.year}'
        res.append((spelled_out, f'^{iso}', f'{iso}$'))
    return res

# Seed NumPy's global RNG so the shuffle below, and therefore the
# train/validation/test split taken from it, is the same on every run.
np.random.seed(42)

data = gen_dates()
np.random.shuffle(data)  # shuffles the list in place
print(len(data))

55883


In [127]:
# Carve the shuffled examples into three consecutive, non-overlapping slices:
# the first 45,000 for training, the next 5,000 for validation, the rest for testing.
training_arr, validation_arr, test_arr = data[:45000], data[45000:50000], data[50000:]

In [128]:
def to_dataset(data, shuffle=False, batch_size=32):
    """Turn (encoder input, decoder input, label) string triples into a batched tf.data.Dataset.

    Args:
        data: sequence of 3-tuples of strings.
        shuffle: if True, the examples are reshuffled on every pass over the dataset.
        batch_size: number of examples per batch.

    Returns:
        A dataset yielding ((encoder_inputs, decoder_inputs), labels) batches of
        string tensors.
    """
    encoder_inputs, decoder_inputs, labels = zip(*data)
    num_examples = len(labels)
    encoder_inputs = tf.constant(encoder_inputs, dtype=tf.string)
    decoder_inputs = tf.constant(decoder_inputs, dtype=tf.string)
    labels = tf.constant(labels, dtype=tf.string)
    ds = tf.data.Dataset.from_tensor_slices(((encoder_inputs, decoder_inputs), labels))
    # Cache BEFORE shuffling: a cache replays exactly the elements it recorded,
    # so caching after shuffle() would freeze the first epoch's order.
    ds = ds.cache()
    if shuffle:
        # A buffer as large as the dataset gives a full uniform shuffle.
        ds = ds.shuffle(num_examples)
    return ds.batch(batch_size).prefetch(1)

In [129]:
# Only the training split is shuffled; validation and test keep their order.
training_ds = to_dataset(training_arr, shuffle=True)
validation_ds, test_ds = (to_dataset(split) for split in (validation_arr, test_arr))

In [130]:
# Show the first five raw strings of each component of one validation batch.
for (enc_in, dec_in), labels in validation_ds.take(1):
    for component in (enc_in, dec_in, labels):
        print(component[:5])

tf.Tensor(
[b'February 04, 1959' b'September 19, 1915' b'March 07, 1943'
 b'September 14, 1922' b'November 17, 1976'], shape=(5,), dtype=string)
tf.Tensor(
[b'^1959-02-04' b'^1915-09-19' b'^1943-03-07' b'^1922-09-14'
 b'^1976-11-17'], shape=(5,), dtype=string)
tf.Tensor(
[b'1959-02-04$' b'1915-09-19$' b'1943-03-07$' b'1922-09-14$'
 b'1976-11-17$'], shape=(5,), dtype=string)


# Character based

In [176]:
# Pool every string (encoder inputs, decoder inputs and labels) so that a single
# character-level vocabulary covers all three.
enc_in, dec_in, labels = zip(*data)
all_strings = (*enc_in, *dec_in, *labels)

# Lower-case the text and split it into individual characters.
text_vec_layer = tf.keras.layers.TextVectorization(standardize='lower', split='character')
text_vec_layer.adapt(all_strings)

# Length of the longest string; used later as the upper bound on decoding steps.
max_tokens = tf.reduce_max(tf.strings.length(all_strings))
vocab_size = text_vec_layer.vocabulary_size()

print(max_tokens, vocab_size)
print(text_vec_layer.get_vocabulary())

tf.Tensor(18, shape=(), dtype=int32) 38
['', '[UNK]', '1', '0', '-', '2', '9', ' ', '3', '4', '5', '^', ',', '$', '8', '7', '6', 'e', 'r', 'a', 'u', 'm', 'b', 'y', 'c', 't', 'o', 'j', 'n', 's', 'l', 'p', 'h', 'g', 'd', 'v', 'i', 'f']


In [177]:
# Tokenize one sample input to sanity-check the character vectorizer
# (text is lower-cased and mapped to one integer ID per character).
text_vec_layer('October 05, 1985')

<tf.Tensor: shape=(16,), dtype=int64, numpy=array([26, 24, 25, 26, 22, 17, 18,  7,  3, 10, 12,  7,  2,  6, 14, 10])>

In [178]:
def tokenize(inputs, labels):
    """Map one ((encoder text, decoder text), label text) element to token IDs.

    Every string tensor is passed through the shared `text_vec_layer`; the
    nesting of the element is preserved.
    """
    source_text, target_text = inputs
    source_ids = text_vec_layer(source_text)
    target_ids = text_vec_layer(target_text)
    label_ids = text_vec_layer(labels)
    return (source_ids, target_ids), label_ids

# Tokenize all three splits with the same mapping function.
training, validation, test = (
    split.map(tokenize) for split in (training_ds, validation_ds, test_ds)
)

# Show the first three tokenized rows of each component of one test batch.
for (enc_in, dec_in), labels in test.take(1):
    for component in (enc_in, dec_in, labels):
        print(component[:3])

tf.Tensor(
[[27 19 28 20 19 18 23  7  3  2 12  7  2  6 16  5  0  0]
 [27 20 30 23  7  8  2 12  7  5  3  5  3  0  0  0  0  0]
 [29 17 31 25 17 21 22 17 18  7  3 16 12  7  2  6 14  9]], shape=(3, 18), dtype=int64)
tf.Tensor(
[[11  2  6 16  5  4  3  2  4  3  2]
 [11  5  3  5  3  4  3 15  4  8  2]
 [11  2  6 14  9  4  3  6  4  3 16]], shape=(3, 11), dtype=int64)
tf.Tensor(
[[ 2  6 16  5  4  3  2  4  3  2 13]
 [ 5  3  5  3  4  3 15  4  8  2 13]
 [ 2  6 14  9  4  3  6  4  3 16 13]], shape=(3, 11), dtype=int64)


In [184]:
# Encoder-decoder with attention, operating on integer token IDs.
# Both inputs are variable-length sequences of int64 token IDs.
encoder_tokens = tf.keras.Input(shape=[None], dtype=tf.int64)
decoder_tokens = tf.keras.Input(shape=[None], dtype=tf.int64)

# The inputs arrive already tokenized, so text_vec_layer is not applied
# inside the model.

# A single embedding layer is shared by the encoder and decoder inputs;
# mask_zero=True makes token ID 0 (padding) masked in downstream layers.
embeddings_layer = tf.keras.layers.Embedding(vocab_size, 30, mask_zero=True)
encoder_embeddings = embeddings_layer(encoder_tokens)
decoder_embeddings = embeddings_layer(decoder_tokens)

# Bidirectional encoder: returns the per-step outputs (needed by attention)
# followed by the final states, which are collected in encoder_states.
encoder_layer = tf.keras.layers.Bidirectional(
    tf.keras.layers.GRU(128, return_sequences=True, return_state=True)
)
encoder_output, *encoder_states = encoder_layer(encoder_embeddings)
# Concatenate the final states along the feature axis so they can seed the decoder.
bidir_encoder_states = tf.concat(encoder_states, axis=-1)

# The decoder has 256 units (2 x 128) so that the concatenated encoder states
# fit as its initial state.
decoder_layer = tf.keras.layers.GRU(256, return_sequences=True)
decoder_output = decoder_layer(decoder_embeddings, initial_state=bidir_encoder_states)

# Attention takes its inputs as [query, value]: the decoder outputs query
# the encoder outputs.
attention_layer = tf.keras.layers.Attention()
attention_output = attention_layer([decoder_output, encoder_output])

# Per-step probability distribution over the character vocabulary.
dense_layer = tf.keras.layers.Dense(vocab_size, activation='softmax')
output = dense_layer(attention_output)

model = tf.keras.Model(inputs=[encoder_tokens, decoder_tokens], outputs=[output])
model.summary()

Model: "model_9"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_50 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 input_49 (InputLayer)          [(None, None)]       0           []                               
                                                                                                  
 embedding_24 (Embedding)       (None, None, 30)     1140        ['input_49[0][0]',               
                                                                  'input_50[0][0]']               
                                                                                                  
 bidirectional_20 (Bidirectiona  [(None, None, 256),  122880     ['embedding_24[0][0]']     

In [186]:
# Configure the optimizer, loss and metric for integer (sparse) labels.
model.compile(
    optimizer=tf.keras.optimizers.Nadam(),
    loss=tf.keras.losses.sparse_categorical_crossentropy,
    metrics=[tf.keras.metrics.sparse_categorical_accuracy],
)

# Save the best model seen so far, and stop once there has been no improvement
# for 3 epochs, restoring the best weights.
folder = Path() / 'data' / '05-exrc-9' / 'checkpoints'
checkpoints_cb = tf.keras.callbacks.ModelCheckpoint(folder, save_best_only=True)
earlystop_cb = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

hist = model.fit(
    training,
    validation_data=validation,
    epochs=10,
    callbacks=[checkpoints_cb, earlystop_cb],
)

Epoch 1/10



INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets


INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets


Epoch 2/10



INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets


INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets


Epoch 3/10



INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets


INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets


Epoch 4/10



INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets


INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets


Epoch 5/10
Epoch 6/10
Epoch 7/10



INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets


INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets


Epoch 8/10
Epoch 9/10
Epoch 10/10



INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets


INFO:tensorflow:Assets written to: data/05-exrc-9/checkpoints/assets




In [187]:
# Report the compiled loss and accuracy metric on the held-out test batches.
model.evaluate(test)



[4.676641401601955e-05, 1.0]

In [188]:
# Fetch the vocabulary once instead of rebuilding it for every character.
vocabulary = text_vec_layer.get_vocabulary()

# Map the token IDs of the first encoder input in one test batch back to
# characters (ID 0, the padding token, maps to the empty string).
for (enc_in, dec_in), label in test.take(1):
    print([vocabulary[c] for c in enc_in[0]])


['j', 'a', 'n', 'u', 'a', 'r', 'y', ' ', '0', '1', ',', ' ', '1', '9', '6', '2', '', '']


In [204]:
def infer(date_input):
    """Translate one date string to ISO format by greedy, character-by-character decoding.

    Args:
        date_input: date text such as 'May 01, 1910'.

    Returns:
        The characters predicted before the end-of-sequence marker '$' (or after
        `max_tokens` decoding steps, whichever comes first), joined into one string.
    """
    # The vocabulary does not change between decoding steps, so fetch it once.
    vocabulary = text_vec_layer.get_vocabulary()
    encoder_input = text_vec_layer([date_input])
    result = ''
    # max_tokens is a scalar tensor; convert it to a plain int for range().
    for _ in range(int(max_tokens)):
        # Feed back everything generated so far, prefixed with the start marker.
        decoder_input = text_vec_layer(['^' + result])
        # Keep only the distribution at the last position of the single batch item.
        y_proba = model.predict([encoder_input, decoder_input], verbose=False)[0, -1]
        char = vocabulary[y_proba.argmax()]
        if char == '$':
            break
        result += char
    return result

# Try the decoder on two hand-picked dates.
for sample in ('December 17, 2025', 'May 01, 1910'):
    print(infer(sample))

2025-12-17
1910-05-01


# Book's solution

The first approach in the book's solution is an interesting one. Because the decoder's sequence has a fixed length, it feeds the encoder's output to the decoder once per time step. This means the decoder's internal state doesn't need to be initialized from the encoder (which is essentially a hack needed since the "teacher forcing" technique takes over the decoder's input). The book uses `tf.keras.layers.RepeatVector` to duplicate the encoder's output as many times as needed for the decoder's sequence.

It's also worth noting that the embeddings of the encoder and decoder are different.

In [205]:
from datetime import date

# strftime()'s %B is locale-dependent, so the month names are spelled out here
MONTHS = [
    "January", "February", "March", "April", "May", "June",
    "July", "August", "September", "October", "November", "December",
]

def random_dates(n_dates):
    """Draw `n_dates` random dates and render each in two formats.

    Uses NumPy's global random state, so seed it beforehand for reproducible output.

    Returns:
        (x, y): two lists of strings of equal length; x holds dates such as
        "May 15, 8579" and y holds the matching ISO dates such as "8579-05-15".
    """
    first_ordinal = date(1000, 1, 1).toordinal()
    last_ordinal = date(9999, 12, 31).toordinal()

    # randint's upper bound is exclusive, so offsets lie in [0, last - first)
    # and the very last day (9999-12-31) is never drawn.
    offsets = np.random.randint(last_ordinal - first_ordinal, size=n_dates)

    x, y = [], []
    for ordinal in offsets + first_ordinal:
        dt = date.fromordinal(ordinal)
        x.append(MONTHS[dt.month - 1] + " " + dt.strftime("%d, %Y"))
        y.append(dt.isoformat())
    return x, y

In [206]:
# Fix the seed so the sample dates below are reproducible.
np.random.seed(42)

n_dates = 3
x_example, y_example = random_dates(n_dates)

# Two left-aligned columns, 25 characters wide each.
row = "{:25s}{:25s}".format
print(row("Input", "Target"))
print("-" * 50)
for source, target in zip(x_example, y_example):
    print(row(source, target))

Input                    Target                   
--------------------------------------------------
September 20, 7075       7075-09-20               
May 15, 8579             8579-05-15               
January 11, 7103         7103-01-11               


In [207]:
# Sorted alphabet of every character that can occur in an input date:
# the letters of the month names plus digits, comma and space.
INPUT_CHARS = "".join(sorted(set("".join(MONTHS)) | set("0123456789, ")))
INPUT_CHARS

' ,0123456789ADFJMNOSabceghilmnoprstuvy'

In [208]:
# Alphabet of the ISO-formatted target dates: digits and the dash separator.
OUTPUT_CHARS = "0123456789-"

In [209]:
def date_str_to_ids(date_str, chars=INPUT_CHARS):
    """Return, for each character of `date_str`, its position in `chars`.

    Raises:
        ValueError: if a character of `date_str` does not occur in `chars`.
    """
    return list(map(chars.index, date_str))

In [210]:
# Encode the first example input with the input alphabet.
date_str_to_ids(x_example[0], INPUT_CHARS)

[19, 23, 31, 34, 23, 28, 21, 23, 32, 0, 4, 2, 1, 0, 9, 2, 9, 7]

In [211]:
# Encode the first example target with the output alphabet.
date_str_to_ids(y_example[0], OUTPUT_CHARS)

[7, 0, 7, 5, 10, 0, 9, 10, 2, 0]

In [212]:
def prepare_date_strs(date_strs, chars=INPUT_CHARS):
    """Encode a batch of date strings as one padded tensor of character IDs.

    Each character is replaced by its index in `chars` plus one; shorter rows
    are filled with 0 when the ragged batch is converted to a dense tensor,
    so ID 0 is reserved for padding.
    """
    ragged_ids = tf.ragged.constant(
        [date_str_to_ids(date_str, chars) for date_str in date_strs],
        ragged_rank=1,
    )
    shifted_ids = ragged_ids + 1  # free up 0 for the padding token
    return shifted_ids.to_tensor()

def create_dataset(n_dates):
    """Generate `n_dates` random dates and return them as (inputs, targets) ID tensors.

    Inputs are encoded with INPUT_CHARS and targets with OUTPUT_CHARS.
    """
    input_strs, target_strs = random_dates(n_dates)
    inputs = prepare_date_strs(input_strs, INPUT_CHARS)
    targets = prepare_date_strs(target_strs, OUTPUT_CHARS)
    return inputs, targets

In [213]:
# Reseed so the three splits are reproducible. All three calls draw from the
# same global NumPy random state in sequence, so their order matters.
np.random.seed(42)

X_train, Y_train = create_dataset(10000)
X_valid, Y_valid = create_dataset(2000)
X_test, Y_test = create_dataset(2000)

In [214]:
# First encoded training target (IDs are shifted by 1; 0 is the padding ID).
Y_train[0]

<tf.Tensor: shape=(10,), dtype=int32, numpy=array([ 8,  1,  8,  6, 11,  1, 10, 11,  3,  1], dtype=int32)>

In [216]:
# (number of examples, target sequence length)
Y_train.shape

TensorShape([10000, 10])

In [215]:
# Book's first approach: encode the input into a single vector, repeat that
# vector once per output time step, and decode the repeated sequence.
embedding_size = 32
# Number of time steps the decoder must produce (second dimension of Y_train).
max_output_length = Y_train.shape[1]

# Seed both NumPy and TensorFlow before any layer is created.
np.random.seed(42)
tf.random.set_seed(42)

# Encoder: embeds the character IDs, then an LSTM returns only its last output,
# i.e. one 128-dimensional vector per input sequence.
encoder = tf.keras.Sequential([
    # +1 because ID 0 is reserved for padding
    tf.keras.layers.Embedding(input_dim=len(INPUT_CHARS) + 1,
                           output_dim=embedding_size,
                           input_shape=[None]),
    tf.keras.layers.LSTM(128)
])

# Decoder: an LSTM over the repeated encoder vector, followed by a per-step
# softmax over the output alphabet (+1 for the padding ID).
decoder = tf.keras.Sequential([
    tf.keras.layers.LSTM(128, return_sequences=True),
    tf.keras.layers.Dense(len(OUTPUT_CHARS) + 1, activation="softmax")
])

# NOTE(review): this rebinds the name `model`, so any model previously bound to
# that name in the notebook is no longer reachable through it.
model = tf.keras.Sequential([
    encoder,
    # Present the encoder's vector to the decoder at every output time step.
    tf.keras.layers.RepeatVector(max_output_length),
    decoder
])

optimizer = tf.keras.optimizers.Nadam()
model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer,
              metrics=["accuracy"])
history = model.fit(X_train, Y_train, epochs=20,
                    validation_data=(X_valid, Y_valid))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [217]:
# Encode two ad-hoc inputs; the shorter one is right-padded with 0 to the
# length of the longer one.
prepare_date_strs(["May 02, 2020", "July 14, 1789"])

<tf.Tensor: shape=(2, 13), dtype=int32, numpy=
array([[17, 21, 38,  1,  3,  5,  2,  1,  5,  3,  5,  3,  0],
       [16, 36, 28, 38,  1,  4,  7,  2,  1,  4, 10, 11, 12]], dtype=int32)>