# Date Conversion with Encoder-Decoder RNN in TensorFlow and PyTorch
In this project, I build Encoder-Decoder RNN from scratch to convert date format: from - April 22, 2019 - to another format - 2019-04-22 in two frameworks: Tensorflow and PyTorch.

* [Generate Dataset](#Generate-Datasetle)
* [PyTorch Implementation](#PyTorch-Implementation)
    * [Data Preparation](#Data-Preparation)
    * [Build RNN](#build-rnn)
    * [Train the model](#train-the-model)
    * [Inference](#inference)
* [TensorFlow Implementation](#tensorflow-implementation)
    * [Build RNN in TF](#build-rnn-in-tf)
    * [Train the model in TF](#train-the-model-in-tf)
    * [Inference in TF](#inference-in-tf)


In [1]:
from datetime import datetime
import random
import time
import pandas as pd

import tensorflow as tf
from tensorflow import keras

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn
import tensorflow as tf

## Generate Dataset

In [3]:
def generate_dataset(start, end, time_format, size):

    stime = time.mktime(time.strptime(start, time_format))
    etime = time.mktime(time.strptime(end, time_format))
    
    random_dates = []
    
    for i in range(size):
        prop = random.random()
        rtime = stime + prop * (etime - stime)
        rtime = time.strftime(time_format, time.localtime(rtime))
        random_dates.append(rtime)

    return random_dates

In [4]:
train_size = 10000
y_train = generate_dataset('1900-01-01', '2024-12-31', '%Y-%m-%d', train_size)
X_train = [datetime.strptime(x, '%Y-%m-%d').strftime("%B %d, %Y") for x in y_train]

In [5]:
test_size = 5
y_test = generate_dataset('1900-01-01', '2024-12-31', '%Y-%m-%d', test_size)
X_test = [datetime.strptime(x, '%Y-%m-%d').strftime("%B %d, %Y") for x in y_test]

In [6]:
print("Source date:", X_train[10])
print("Target date:",y_train[10])

Source date: February 01, 1900
Target date: 1900-02-01


## PyTorch Implementation

### Data Preparation
Creat `DatesDataset` class that:
- Tokenizes and maps date strings to indices for both source and target sequences.
- Adds special tokens for sequence start, end, and unknown values.
- Converts each date string into a PyTorch tensor for training.
- Computes vocab sizes and maximum sequence lengths for handling different date formats.

In [None]:
class DatesDataset(Dataset):
    """
    A PyTorch dataset for date conversion tasks. This dataset prepares input
    and target tensors by tokenizing and encoding date strings.

    Attributes:
        source (list): List of input date strings.
        target (list): List of target date strings.
        input_vocab (set): Set of unique tokens in the input dates.
        output_vocab (set): Set of unique tokens in the target dates.
        input_tensor (Tensor): Encoded tensor of input date sequences.
        output_tensor (Tensor): Encoded tensor of target date sequences.
        input_vocab_dim (int): Size of the input vocabulary.
        output_vocab_dim (int): Size of the output vocabulary.
        input_max_seq_length (int): Maximum sequence length for input dates.
        output_max_seq_length (int): Maximum sequence length for target dates.
    """
     
    def __init__(self, X_train, y_train):
        self.source = X_train
        self.target = y_train
        
        self.input_vocab = set()
        self.output_vocab =set()
        
        self.input_tensor = self.prepare_source_data(self.source)
        self.output_tensor = self.prepare_target_data(self.target)

        self.input_vocab_dim = len(self.human_char_idx)
        self.output_vocab_dim = len(self.machine_char_idx)
        
        self.input_max_seq_length = max(len(txt) for txt in self.source)
        self.output_max_seq_length = max(len(txt) for txt in self.target)
        
    def __len__(self):
        return len(self.source)

    def __getitem__(self, idx):
        source_date = self.input_tensor[idx]
        target_date = self.output_tensor[idx]
        
        return source_date, target_date
    
    def prepare_source_data(self, data):
        """
        Tokenizes and encodes input date sequences.
        
        Args:
            data (list): List of input date strings.
        
        Returns:
            Tensor: Tensor of encoded input sequences.
        """
        data_token = [x.replace(',', ' ,').split() for x in data]
        
        for token in data_token:
            self.input_vocab.update(token)

        special_tokens = ['<START>', '<END>', '<UNK>']
        self.input_vocab.update(special_tokens)
     
        self.human_char_idx = {token: i for i, token in enumerate(self.input_vocab)}
        self.human_idx_char = {i: token for i, token in enumerate(self.input_vocab)}
        
        indices = []
        for tokens in data_token:
            tokens = ['<START>'] + tokens + ['<END>']
            index = [self.human_char_idx.get(token, self.human_char_idx['<UNK>']) for token in tokens]
            indices.append(index)
        tensors = torch.tensor(indices, dtype=torch.long)

        return tensors
    
    def prepare_target_data(self, data):
        """
        Tokenizes and encodes target date sequences.
        
        Args:
            data (list): List of target date strings.
        
        Returns:
            Tensor: Tensor of encoded target sequences.
        """
        data_token = [y.replace('-', ' - ').split() for y in data]
        
        for token in data_token:
            self.output_vocab.update(token)

        special_tokens = ['<START>', '<END>', '<UNK>']
        self.output_vocab.update(special_tokens)

        self.machine_char_idx = {token: i for i, token in enumerate(self.output_vocab)}
        self.machine_idx_char = {i: token for i, token in enumerate(self.output_vocab)}

        indices = []
        for tokens in data_token:
            tokens = ['<START>'] + tokens + ['<END>']
            index = [self.machine_char_idx.get(token, self.machine_char_idx['<UNK>']) for token in tokens]
            indices.append(index)
        tensors = torch.tensor(indices, dtype=torch.long)
        
        return tensors

Load dataset with dataloader

In [8]:
dataset = DatesDataset(X_train, y_train)

### Build RNN
The RNN model is built with an encoder-decoder architecture, where both the encoder and decoder are constructed using Long Short-Term Memory (LSTM) layers. The encoder processes input sequences and captures context, while the decoder generates output sequences based on this encoded information.

The model structure:
**Encoder**
- Receives an input sequence of token IDs (encoder_inputs), which are first embedded using an `Embedding` layer with an emdedding dimension, and then passed through dropout for regularization.
- The embedded sequence is processed by the LSTM, which outputs `hidden` and `cell` states. These states capture the context of the input sequence and are used as input for the decoder

**Decoder**
- The input token is unsqueezed then embedded to produce a dense vector.
- The embedded input is passed through the LSTM along with the `hidden` and `cell` states from the encoder. The LSTM output provides updated hidden and cell states.
- The LSTM’s output at each time step is passed through the linaer layer to produce a probabilities over the output vocabulary, allowing the decoder to predict the next token.

In [42]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.lstm(embedded)
        
        return hidden, cell

In [43]:
class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(1)
        
        embedded = self.embedding(input)

        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc(output.squeeze(1))
        return prediction, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        assert encoder.hidden_size == decoder.hidden_size, "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.num_layers == decoder.num_layers, "Encoder and decoder must have equal number of layers!"

    def forward(self, source, target, teacher_forcing_ratio=0.5):
        batch_size = source.shape[0]
        trg_length = target.shape[1]
        trg_vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(batch_size, trg_length, trg_vocab_size).to(source.device)
        hidden, cell = self.encoder(source)
    
        input = target[:, 0]

        for t in range(1, trg_length):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = target[:, t] if teacher_force else top1


        return outputs

Connect Model

In [47]:
input_size = len(dataset.human_char_idx)
hidden_size = 256
num_layers = 1
embedding_size = 256
dropout_p = 0.5
encoder = Encoder(
    input_size=input_size,
    embedding_size=embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    p=dropout_p)

decoder = Decoder(input_size=input_size,
                  embedding_size=embedding_size,
                  hidden_size=hidden_size,
                  output_size=input_size,
                  num_layers=num_layers
                 )

model = Seq2Seq(encoder, decoder)

In [48]:
model

Seq2Seq(
  (encoder): Encoder(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(172, 256)
    (lstm): LSTM(256, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(172, 256)
    (lstm): LSTM(256, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=172, bias=True)
  )
)

### Train the model

In [131]:
def train(model, dataloader, optimizer, criterion, num_epochs, device):
    model.train()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch} ----------")
        epoch_loss = 0
        for source, target in dataloader:
            source, target = source.to(device), target.to(device)
            
            optimizer.zero_grad()
            
            output = model(source, target)
            
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            target = target[:, 1:].reshape(-1)
            
            assert target.max() < output.size(-1), f"Target value exceeds number of classes: max {target.max()} >= {output.size(-1)}"
            assert target.min() >= 0, f"Target value contains negative indices: min {target.min()} < 0"

            
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            
            epoch_loss += loss.item()
        
        avg_loss = epoch_loss / len(dataloader)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss:.4f}')

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
device = torch.device("cpu")

In [None]:
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)
train(model, dataloader, optimizer, criterion, num_epochs=10, device=device)

Epoch 0 ----------
Epoch 1/10, Loss: 2.2459
Epoch 1 ----------
Epoch 2/10, Loss: 0.6260
Epoch 2 ----------
Epoch 3/10, Loss: 0.1328
Epoch 3 ----------
Epoch 4/10, Loss: 0.0363
Epoch 4 ----------
Epoch 5/10, Loss: 0.0171
Epoch 5 ----------
Epoch 6/10, Loss: 0.0102
Epoch 6 ----------
Epoch 7/10, Loss: 0.0070
Epoch 7 ----------
Epoch 8/10, Loss: 0.0051
Epoch 8 ----------
Epoch 9/10, Loss: 0.0038
Epoch 9 ----------
Epoch 10/10, Loss: 0.0030


### Inference

In [136]:
def test(model, x, dataset):
    model.eval()
    with torch.no_grad():
        source = dataset.prepare_source_data([x]).to(model.encoder.embedding.weight.device)
        hidden, cell = model.encoder(source)
    
        target = torch.tensor([[dataset.machine_char_idx['<START>']]], device=source.device)
        output_str = ''
        
        for _ in range(dataset.output_max_seq_length):  # max length of output 
            output, hidden, cell = model.decoder(target.squeeze(1), hidden, cell)
            top1 = output.argmax(1)
            if top1.item() == dataset.machine_char_idx['<END>']:
                break
            output_str += dataset.machine_idx_char[top1.item()]
            target = top1.unsqueeze(0)
        
    return output_str

In [137]:
for i in range(len(X_test)):
    pred =test(model, X_test[i], dataset)
    print(f"test {i}")
    print(f"source date: {X_test[i]}")
    print(f"target date: {y_test[i]}")
    print(f"prediction: {pred}")
    print()

test 0
source date: September 22, 1956
target date: 1956-09-22
prediction: 1956-09-22

test 1
source date: June 11, 2012
target date: 2012-06-11
prediction: 2012-06-11

test 2
source date: July 02, 1988
target date: 1988-07-02
prediction: 1988-07-02

test 3
source date: October 08, 1917
target date: 1917-10-08
prediction: 1917-10-08

test 4
source date: August 17, 1934
target date: 1934-08-17
prediction: 1934-08-17



## TensorFlow Implementation
### Build RNN in TF

In [9]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [None]:
def define_models(num_encoder_tokens, num_decoder_tokens, embed_size, latent_dim):
    encoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
    encoder_embeddings = keras.layers.Embedding(num_encoder_tokens, embed_size)(encoder_inputs)
    encoder_lstm = tf.keras.layers.LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder_lstm(encoder_embeddings)
    encoder_states = [state_h, state_c]
    
    decoder_inputs = keras.layers.Input(shape=[None], dtype=np.int32)
    decoder_embeddings = keras.layers.Embedding(num_decoder_tokens, embed_size)(decoder_inputs)
    decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_embeddings, initial_state=encoder_states)
    decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
    decoder_outputs = decoder_dense(decoder_outputs)
    
    model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
    
    return model

In [11]:
model = define_models(num_encoder_tokens=dataset.input_vocab_dim,
                     num_decoder_tokens=dataset.output_vocab_dim,
                     embed_size=50,
                     latent_dim=256)

In [12]:
model.summary()

### Train the model in TF

In [44]:
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

In [45]:
encoder_input_data = dataset.input_tensor
decoder_input_data = dataset.output_tensor[:, :-1]
decoder_target_data = dataset.output_tensor[:, 1:]

In [46]:
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.7925 - loss: 0.9364 - val_accuracy: 0.8330 - val_loss: 0.8207
Epoch 2/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.8441 - loss: 0.7498 - val_accuracy: 0.9057 - val_loss: 0.4565
Epoch 3/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9494 - loss: 0.3211 - val_accuracy: 0.9986 - val_loss: 0.0798
Epoch 4/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 0.9950 - loss: 0.0666 - val_accuracy: 0.9999 - val_loss: 0.0236
Epoch 5/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 1.0000 - loss: 0.0166 - val_accuracy: 1.0000 - val_loss: 0.0104
Epoch 6/10
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 15ms/step - accuracy: 1.0000 - loss: 0.0084 - val_accuracy: 1.0000 - val_loss: 0.0066
Epoch 7/10
[1m250/250

<keras.src.callbacks.history.History at 0x2927b6bd0>

### Inference in TF

In [47]:
def decode_sequence(model, x, dataset):
    encoder_input = dataset.prepare_source_data([x])
    encoder_input = tf.expand_dims(encoder_input, -1)
    decoder_input = tf.constant([[dataset.machine_char_idx['<START>']]])

    target_seq = []
    
    stop_condition = False
    
    while not stop_condition:
        predictions = model([encoder_input, decoder_input])
        predicted_idx = tf.argmax(predictions[0, -1, :], axis=-1).numpy()
        if predicted_idx == dataset.machine_char_idx['<END>']:
            stop_condition = True
        else:
            target_seq.append(predicted_idx)
            decoder_input = tf.concat([decoder_input, [[predicted_idx]]], axis=-1)
            
    target = ''.join([dataset.machine_idx_char.get(id, '<UNK>') for id in target_seq 
                      if id not in [dataset.machine_char_idx['<START>'], dataset.machine_char_idx['<END>']]])
    
    return target

In [48]:
for i in range(len(X_test)):
    
    output = decode_sequence(model, X_test[i], dataset)
    print(f"test {i}")
    print(f"source date: {X_test[i]}")
    print(f"target date: {y_test[i]}")
    print("prediction:", output)
    print()


test 0
source date: April 26, 1950
target date: 1950-04-26
prediction: 1950-04-26

test 1
source date: December 10, 1959
target date: 1959-12-10
prediction: 1959-12-10

test 2
source date: January 20, 2017
target date: 2017-01-20
prediction: 2017-01-20

test 3
source date: March 07, 1961
target date: 1961-03-07
prediction: 1961-03-07

test 4
source date: January 08, 1917
target date: 1917-01-08
prediction: 1917-01-08

