# Date String Converter using Encoder-Decoder Architecture with PyTorch

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vuhung16au/nlp-learning-journey/blob/main/examples/date-string-converter.ipynb)

## Overview

This notebook demonstrates how to build an Encoder-Decoder model that converts date strings from natural language format (e.g., "April 22, 2019") to ISO format (e.g., "2019-04-22"). We'll use a character-level sequence-to-sequence approach with LSTM networks in PyTorch.

**Note**: This repository prioritizes PyTorch over TensorFlow. This notebook has been updated to use PyTorch implementations.

## What You'll Learn

- Sequence-to-sequence modeling for string transformation using PyTorch
- Character-level tokenization for date processing
- LSTM Encoder-Decoder architecture in PyTorch
- Data generation for training date conversion models
- Model training and evaluation techniques with PyTorch
- Inference and prediction on new date strings
- Vietnamese/English date format examples

## Prerequisites

Basic understanding of Python, neural networks, and sequence-to-sequence models.

In [None]:
# Environment Detection and Setup (Required for all notebooks in this repository)
import sys
import subprocess
import os
import time

# Detect the runtime environment.
# The check is based on which platform-specific modules are already present
# in sys.modules when this cell runs.
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules
# Anything that is neither Colab nor Kaggle is treated as a local machine.
IS_LOCAL = not (IS_COLAB or IS_KAGGLE)

print(f"Environment detected:")
print(f"  - Local: {IS_LOCAL}")
print(f"  - Google Colab: {IS_COLAB}")
print(f"  - Kaggle: {IS_KAGGLE}")

# Platform-specific system setup
if IS_COLAB:
    print("\nSetting up Google Colab environment...")
    # The lines starting with "!" are IPython shell escapes; they are only
    # valid inside a notebook kernel, not in a plain Python script.
    !apt update -qq
    !apt install -y -qq libpq-dev
elif IS_KAGGLE:
    print("\nSetting up Kaggle environment...")
    # Kaggle usually has most packages pre-installed
else:
    print("\nSetting up local environment...")
# PyTorch logging setup
def setup_pytorch_logging():
    """Create the root directory for PyTorch logs and return its path.

    The location depends on the detected platform:
    ``/content/pytorch_logs`` on Colab, ``./pytorch_logs`` on Kaggle, and
    ``<current working directory>/pytorch_logs`` otherwise. The directory is
    created if it does not exist yet.

    Returns:
        The path of the log root directory as a string.
    """
    log_root = (
        "/content/pytorch_logs" if IS_COLAB
        else "./pytorch_logs" if IS_KAGGLE
        else os.path.join(os.getcwd(), "pytorch_logs")
    )
    os.makedirs(log_root, exist_ok=True)
    return log_root

def get_run_logdir(experiment_name="run"):
    """Build a unique, timestamped run directory path for training logs.

    Args:
        experiment_name: Prefix for the run directory name. It is used
            verbatim, so it may safely contain ``%`` characters.

    Returns:
        ``<log root>/<experiment_name>_<YYYY_MM_DD-HH_MM_SS>``. The log root
        is created by ``setup_pytorch_logging``; the run directory itself is
        not created here.
    """
    root_logdir = setup_pytorch_logging()
    # Only the timestamp goes through strftime. Passing the experiment name
    # as part of the format string would make any '%' in it be interpreted
    # as a strftime directive.
    timestamp = time.strftime("%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, f"{experiment_name}_{timestamp}")

# Install required packages for this notebook
required_packages = [
    "torch",
    "numpy",
    "pandas",
    "matplotlib",
    "seaborn",
    "tqdm"
]

print("\nInstalling required packages...")
for package in required_packages:
    if IS_COLAB or IS_KAGGLE:
        # IPython shell escape; {package} is expanded by the notebook kernel.
        !pip install -q {package}
    else:
        # Use the current interpreter's pip so the package lands in the
        # same environment the notebook is running in.
        subprocess.run([sys.executable, "-m", "pip", "install", "-q", package], 
                      capture_output=True)
    # NOTE(review): the check mark is printed unconditionally; the pip exit
    # status is not inspected, so a failed install is not reported here.
    print(f"✓ {package}")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import random
import datetime
import re
from tqdm import tqdm
from collections import Counter

# Set random seeds for reproducibility.
# Three separate generators are used in this notebook (NumPy, PyTorch and the
# stdlib `random` module), so each one is seeded.
np.random.seed(42)
torch.manual_seed(42)
random.seed(42)

# Report the PyTorch build and whether a CUDA device is usable.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA Device: {torch.cuda.get_device_name(0)}")

## 1. Data Generation

We'll generate synthetic training data with various date formats commonly found in natural language text, including Vietnamese/English examples.

In [None]:
class DateDataGenerator:
    """Generate (natural-language date, ISO date) training pairs.

    Args:
        rng: Optional random source exposing ``randint`` and ``choice``
            (for example ``random.Random(seed)``). When omitted, the global
            ``random`` module is used, which matches the previous behaviour.
            Passing a dedicated instance makes a dataset reproducible without
            touching global random state.
        min_year: Smallest year that can be generated (inclusive).
        max_year: Largest year that can be generated (inclusive).

    Raises:
        ValueError: If ``min_year`` is greater than ``max_year``.
    """

    def __init__(self, rng=None, min_year=1990, max_year=2030):
        if min_year > max_year:
            raise ValueError(
                f"min_year ({min_year}) must not exceed max_year ({max_year})"
            )

        self.rng = rng if rng is not None else random
        self.min_year = min_year
        self.max_year = max_year

        # English month names
        self.months_full = [
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December'
        ]

        self.months_short = [
            'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
            'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'
        ]

        # Vietnamese month names (for demonstration)
        self.months_vietnamese = [
            'Tháng một', 'Tháng hai', 'Tháng ba', 'Tháng tư', 'Tháng năm', 'Tháng sáu',
            'Tháng bảy', 'Tháng tám', 'Tháng chín', 'Tháng mười', 'Tháng mười một', 'Tháng mười hai'
        ]

        # Day of month -> [numeric ordinal, spelled-out ordinal]
        self.ordinals = {
            1: ['1st', 'first'], 2: ['2nd', 'second'], 3: ['3rd', 'third'],
            4: ['4th', 'fourth'], 5: ['5th', 'fifth'], 6: ['6th', 'sixth'],
            7: ['7th', 'seventh'], 8: ['8th', 'eighth'], 9: ['9th', 'ninth'],
            10: ['10th', 'tenth'], 11: ['11th', 'eleventh'], 12: ['12th', 'twelfth'],
            13: ['13th', 'thirteenth'], 14: ['14th', 'fourteenth'], 15: ['15th', 'fifteenth'],
            16: ['16th', 'sixteenth'], 17: ['17th', 'seventeenth'], 18: ['18th', 'eighteenth'],
            19: ['19th', 'nineteenth'], 20: ['20th', 'twentieth'], 21: ['21st', 'twenty-first'],
            22: ['22nd', 'twenty-second'], 23: ['23rd', 'twenty-third'], 24: ['24th', 'twenty-fourth'],
            25: ['25th', 'twenty-fifth'], 26: ['26th', 'twenty-sixth'], 27: ['27th', 'twenty-seventh'],
            28: ['28th', 'twenty-eighth'], 29: ['29th', 'twenty-ninth'], 30: ['30th', 'thirtieth'],
            31: ['31st', 'thirty-first']
        }

    @staticmethod
    def _days_in_month(year, month):
        """Return the number of days in ``month`` of ``year`` (Gregorian rules)."""
        if month in (1, 3, 5, 7, 8, 10, 12):
            return 31
        if month in (4, 6, 9, 11):
            return 30
        is_leap = year % 4 == 0 and (year % 100 != 0 or year % 400 == 0)
        return 29 if is_leap else 28

    def generate_random_date(self):
        """Draw a random valid calendar date.

        Returns:
            A ``(year, month, day)`` tuple of ints, with the year in
            ``[min_year, max_year]`` and the day valid for that month.
        """
        # The draw order (year, month, day) is kept so that seeded runs using
        # the default global `random` produce the same dates as before.
        year = self.rng.randint(self.min_year, self.max_year)
        month = self.rng.randint(1, 12)
        day = self.rng.randint(1, self._days_in_month(year, month))
        return year, month, day

    def format_date_natural(self, year, month, day):
        """Render the date in one randomly chosen natural-language format.

        Returns:
            A single string in one of the supported formats (English long and
            short month names, ordinals, numeric slash formats, Vietnamese).
        """
        month_full = self.months_full[month - 1]
        month_short = self.months_short[month - 1]

        formats = [
            f"{month_full} {day}, {year}",    # Format 1: Month DD, YYYY
            f"{day} {month_full} {year}",     # Format 2: DD Month YYYY
            f"{month_short} {day}, {year}",   # Format 3: Mon DD, YYYY
            f"{day} {month_short} {year}",    # Format 4: DD Mon YYYY
        ]

        # Format 5: Ordinal formats (numeric or spelled-out, chosen at random)
        if day in self.ordinals:
            ordinal = self.rng.choice(self.ordinals[day])
            formats.append(f"{month_full} {ordinal}, {year}")
            formats.append(f"the {ordinal} of {month_full} {year}")

        # Format 6: MM/DD/YYYY
        formats.append(f"{month:02d}/{day:02d}/{year}")

        # Format 7: DD/MM/YYYY (European style)
        # NOTE(review): formats 6 and 7 have the same shape, so for days 1-12
        # the same input text can be paired with two different ISO targets.
        # Left unchanged to keep the generated data distribution as it was.
        formats.append(f"{day:02d}/{month:02d}/{year}")

        # Format 8: Vietnamese style (demonstration)
        formats.append(f"ngày {day} {self.months_vietnamese[month - 1]} năm {year}")

        return self.rng.choice(formats)

    def format_date_iso(self, year, month, day):
        """Format the date as ``YYYY-MM-DD`` (the model's target string)."""
        return f"{year:04d}-{month:02d}-{day:02d}"

    def generate_dataset(self, num_samples=10000):
        """Generate ``num_samples`` date conversion pairs.

        Returns:
            A list of dicts with keys ``input`` (natural-language string),
            ``output`` (ISO string), ``year``, ``month`` and ``day``.
        """
        data = []

        for _ in range(num_samples):
            year, month, day = self.generate_random_date()
            data.append({
                'input': self.format_date_natural(year, month, day),
                'output': self.format_date_iso(year, month, day),
                'year': year,
                'month': month,
                'day': day
            })

        return data

# Generate training data
generator = DateDataGenerator()
dataset = generator.generate_dataset(10000)

# Convert to DataFrame for easier manipulation
df = pd.DataFrame(dataset)

print(f"Generated {len(df)} date conversion pairs")
print("\nSample data:")
print(df.head(10))

# Display some Vietnamese/English examples.
# Vietnamese rows are identified by a case-insensitive match on the words
# "ngày", "tháng" or "năm" in the input string.
print("\n🇻🇳🇺🇸 Vietnamese/English Date Examples:")
vietnamese_examples = df[df['input'].str.contains('ngày|tháng|năm', case=False, na=False)]
if not vietnamese_examples.empty:
    print(vietnamese_examples[['input', 'output']].head(3))
else:
    # Fallback: build a few Vietnamese examples by hand so the cell always
    # shows something, even if none were sampled.
    print("No Vietnamese examples in current sample, generating some:")
    for i in range(3):
        year, month, day = generator.generate_random_date()
        vn_format = f"ngày {day} {generator.months_vietnamese[month-1]} năm {year}"
        iso_format = generator.format_date_iso(year, month, day)
        print(f"Vietnamese: '{vn_format}' → ISO: '{iso_format}'")

# Every row that did not match the Vietnamese pattern is shown as English.
print("\nEnglish examples:")
english_examples = df[~df['input'].str.contains('ngày|tháng|năm', case=False, na=False)]
print(english_examples[['input', 'output']].head(3))

## 2. Data Preprocessing and Character-Level Tokenization

We'll implement character-level tokenization for our sequence-to-sequence model.

In [None]:
class CharacterTokenizer:
    """Character-level tokenizer for date strings.

    The vocabulary maps every distinct character seen in ``build_vocab`` to an
    integer index. Four special tokens always occupy indices 0-3, in this
    order: ``<PAD>``, ``<SOS>``, ``<EOS>``, ``<UNK>``.
    """

    def __init__(self):
        self.char_to_idx = {}
        self.idx_to_char = {}
        self.vocab_size = 0

        # Special tokens
        self.SOS_token = '<SOS>'
        self.EOS_token = '<EOS>'
        self.PAD_token = '<PAD>'
        self.UNK_token = '<UNK>'

    def build_vocab(self, texts):
        """Build the character vocabulary from an iterable of strings.

        Any previous vocabulary is replaced. Regular characters are indexed in
        sorted order after the special tokens, so the mapping is deterministic
        for a given set of characters.
        """
        all_chars = set()
        for text in texts:
            all_chars.update(text)

        special_tokens = [self.PAD_token, self.SOS_token, self.EOS_token, self.UNK_token]

        # Special tokens first, then regular characters in sorted order.
        vocab = special_tokens + sorted(all_chars)
        self.char_to_idx = {token: i for i, token in enumerate(vocab)}
        self.idx_to_char = dict(enumerate(vocab))

        self.vocab_size = len(self.char_to_idx)

        print(f"Built vocabulary with {self.vocab_size} characters")
        print(f"Special tokens: {special_tokens}")
        print(f"Sample characters: {list(sorted(all_chars))[:20]}")

    def encode(self, text, max_length=None, add_eos=False):
        """Encode ``text`` into a list of vocabulary indices.

        Args:
            text: String to encode; characters missing from the vocabulary
                are mapped to ``<UNK>``.
            max_length: If given, the result is padded with ``<PAD>`` or
                truncated to exactly this many indices.
            add_eos: Append ``<EOS>`` after the characters. When truncation
                is needed, the last kept position is overwritten with
                ``<EOS>`` so the end marker is never silently dropped.

        Returns:
            List of int indices.
        """
        indices = [
            self.char_to_idx[char] if char in self.char_to_idx
            else self.char_to_idx[self.UNK_token]
            for char in text
        ]

        if add_eos:
            indices.append(self.char_to_idx[self.EOS_token])

        # Pad or truncate to max_length
        if max_length is not None:
            if len(indices) < max_length:
                indices.extend([self.char_to_idx[self.PAD_token]] * (max_length - len(indices)))
            else:
                indices = indices[:max_length]
                # Plain slicing would cut off the EOS marker for inputs that
                # are too long; keep it in the final slot instead.
                if add_eos and max_length > 0:
                    indices[-1] = self.char_to_idx[self.EOS_token]

        return indices

    def decode(self, indices, stop_at_eos=True):
        """Decode a sequence of indices back to text.

        ``<PAD>`` and ``<SOS>`` are skipped, and indices that are not in the
        vocabulary are ignored. With ``stop_at_eos`` set, decoding stops at
        the first ``<EOS>``.
        """
        chars = []

        for idx in indices:
            if idx in self.idx_to_char:
                char = self.idx_to_char[idx]
                if char == self.EOS_token and stop_at_eos:
                    break
                elif char not in [self.PAD_token, self.SOS_token]:
                    chars.append(char)

        return ''.join(chars)

# Build tokenizer
tokenizer = CharacterTokenizer()

# Collect all text for vocabulary building.
# Inputs and outputs share one vocabulary, so both columns are included.
all_texts = df['input'].tolist() + df['output'].tolist()
tokenizer.build_vocab(all_texts)

# Test tokenization with a round trip on the first row of the dataset
sample_input = df['input'].iloc[0]
sample_output = df['output'].iloc[0]

print(f"\nTokenization test:")
print(f"Input: '{sample_input}'")
# Inputs are padded/truncated to a fixed length of 50 and carry no EOS token.
encoded_input = tokenizer.encode(sample_input, max_length=50)
print(f"Encoded: {encoded_input[:20]}...")
decoded_input = tokenizer.decode(encoded_input)
print(f"Decoded: '{decoded_input}'")

print(f"\nOutput: '{sample_output}'")
# Targets get an EOS token appended so the decoder can learn where to stop.
encoded_output = tokenizer.encode(sample_output, add_eos=True)
print(f"Encoded: {encoded_output}")
decoded_output = tokenizer.decode(encoded_output)
print(f"Decoded: '{decoded_output}'")

## 3. PyTorch Dataset and DataLoader

Create a custom PyTorch dataset for our date conversion task.

In [None]:
class DateConversionDataset(Dataset):
    """Dataset that turns date-pair rows into encoder/decoder index tensors.

    Each item is a dict with:
      * ``encoder_input``  - the encoded source string, fixed to
        ``max_input_length`` indices;
      * ``decoder_target`` - the encoded target string with EOS, fixed to
        ``max_output_length`` indices;
      * ``decoder_input``  - the target shifted right by one position with
        SOS in front (teacher-forcing input), also ``max_output_length`` long;
      * ``input_text`` / ``output_text`` - the raw strings.
    """

    def __init__(self, dataframe, tokenizer, max_input_length=50, max_output_length=15):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_output_length = max_output_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        tok = self.tokenizer
        vocab = tok.char_to_idx
        limit = self.max_output_length

        record = self.data.iloc[idx]
        source_text = record['input']
        target_text = record['output']

        source_ids = tok.encode(source_text, max_length=self.max_input_length)
        target_ids = tok.encode(target_text, max_length=limit, add_eos=True)

        # Teacher-forcing input: SOS followed by the target without its
        # final position, then fitted to exactly `limit` entries.
        shifted = [vocab[tok.SOS_token]] + target_ids[:-1]
        shortfall = limit - len(shifted)
        if shortfall > 0:
            shifted = shifted + [vocab[tok.PAD_token]] * shortfall
        else:
            shifted = shifted[:limit]

        return {
            'encoder_input': torch.tensor(source_ids, dtype=torch.long),
            'decoder_input': torch.tensor(shifted, dtype=torch.long),
            'decoder_target': torch.tensor(target_ids, dtype=torch.long),
            'input_text': source_text,
            'output_text': target_text
        }

# Split data into 80% train / 10% validation / remainder test.
# The split is positional (no shuffling): rows are taken in the order they
# appear in `df`.
train_size = int(0.8 * len(df))
val_size = int(0.1 * len(df))
test_size = len(df) - train_size - val_size

# reset_index so each subset is indexed from 0, matching Dataset.__getitem__
train_df = df[:train_size].reset_index(drop=True)
val_df = df[train_size:train_size+val_size].reset_index(drop=True)
test_df = df[train_size+val_size:].reset_index(drop=True)

print(f"Dataset split:")
print(f"  Train: {len(train_df)} samples")
print(f"  Validation: {len(val_df)} samples")
print(f"  Test: {len(test_df)} samples")

# Create datasets (default lengths: 50 input positions, 15 output positions)
train_dataset = DateConversionDataset(train_df, tokenizer)
val_dataset = DateConversionDataset(val_df, tokenizer)
test_dataset = DateConversionDataset(test_df, tokenizer)

# Create data loaders; only the training loader is shuffled
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

print(f"\nCreated data loaders with batch size {batch_size}")

# Test data loading by pulling a single batch from the training loader
sample_batch = next(iter(train_loader))
print(f"\nSample batch shapes:")
print(f"  Encoder input: {sample_batch['encoder_input'].shape}")
print(f"  Decoder input: {sample_batch['decoder_input'].shape}")
print(f"  Decoder target: {sample_batch['decoder_target'].shape}")

# Show first sample
print(f"\nFirst sample:")
print(f"  Input text: '{sample_batch['input_text'][0]}'")
print(f"  Output text: '{sample_batch['output_text'][0]}'")
print(f"  Encoder input: {sample_batch['encoder_input'][0][:20].tolist()}...")
print(f"  Decoder input: {sample_batch['decoder_input'][0].tolist()}")
print(f"  Decoder target: {sample_batch['decoder_target'][0].tolist()}")

## 4. Encoder-Decoder Model Architecture

Implement the LSTM-based encoder-decoder model in PyTorch.

In [None]:
class EncoderLSTM(nn.Module):
    """Embedding + LSTM encoder that summarises a source index sequence.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Size of each character embedding.
        hidden_dim: LSTM hidden state size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True)

    def forward(self, x):
        """Encode ``x`` of shape (batch_size, seq_length).

        Returns:
            ``(outputs, hidden, cell)``: the per-step LSTM outputs and the
            final hidden and cell states.
        """
        # (batch_size, seq_length) -> (batch_size, seq_length, embedding_dim)
        token_vectors = self.embedding(x)
        outputs, final_state = self.lstm(token_vectors)
        hidden, cell = final_state
        return outputs, hidden, cell


class DecoderLSTM(nn.Module):
    """Embedding + LSTM decoder with a linear projection to vocabulary logits.

    Args:
        vocab_size: Number of entries in the embedding table and size of the
            output logit vector.
        embedding_dim: Size of each character embedding.
        hidden_dim: LSTM hidden state size.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(num_embeddings=vocab_size,
                                      embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=num_layers,
                            batch_first=True)
        self.output_projection = nn.Linear(in_features=hidden_dim,
                                           out_features=vocab_size)

    def forward(self, x, hidden, cell):
        """Decode ``x`` of shape (batch_size, seq_length) from a given state.

        Args:
            x: Decoder input indices.
            hidden: Initial LSTM hidden state.
            cell: Initial LSTM cell state.

        Returns:
            ``(predictions, hidden, cell)`` where ``predictions`` has shape
            (batch_size, seq_length, vocab_size) and the states are the LSTM
            states after the last step.
        """
        # (batch_size, seq_length) -> (batch_size, seq_length, embedding_dim)
        token_vectors = self.embedding(x)
        step_outputs, next_state = self.lstm(token_vectors, (hidden, cell))
        hidden, cell = next_state
        # Map every step's hidden vector to one logit per vocabulary entry.
        return self.output_projection(step_outputs), hidden, cell


class Seq2SeqModel(nn.Module):
    """Encoder-decoder LSTM model for character-level sequence conversion.

    Args:
        vocab_size: Shared vocabulary size for encoder and decoder.
        embedding_dim: Size of the character embeddings.
        hidden_dim: LSTM hidden state size (same for encoder and decoder so
            the encoder state can initialise the decoder).
        num_layers: Number of stacked LSTM layers in each half.
    """

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2):
        super(Seq2SeqModel, self).__init__()

        self.encoder = EncoderLSTM(vocab_size, embedding_dim, hidden_dim, num_layers)
        self.decoder = DecoderLSTM(vocab_size, embedding_dim, hidden_dim, num_layers)

        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, encoder_input, decoder_input):
        """Teacher-forced forward pass.

        Args:
            encoder_input: Source indices, shape (batch_size, src_length).
            decoder_input: Shifted target indices, shape
                (batch_size, tgt_length).

        Returns:
            Logits of shape (batch_size, tgt_length, vocab_size).
        """
        # Encode: only the final states are handed to the decoder.
        _, hidden, cell = self.encoder(encoder_input)

        # Decode
        decoder_outputs, _, _ = self.decoder(decoder_input, hidden, cell)

        return decoder_outputs

    def predict(self, encoder_input, tokenizer, max_length=15, device='cpu'):
        """Greedily decode one sequence.

        Switches the model to eval mode and runs without gradients.

        Args:
            encoder_input: Source indices for a single sample, shape
                (1, src_length).
            tokenizer: Object providing ``char_to_idx``, ``SOS_token`` and
                ``EOS_token``.
            max_length: Maximum number of tokens to generate.
            device: Kept for backward compatibility. The start token is
                created on ``encoder_input``'s device, which is where the
                decoder state lives, so a forgotten or mismatched ``device``
                argument no longer causes a cross-device error.

        Returns:
            List of predicted token indices; includes the EOS index if it
            was produced before ``max_length`` was reached.
        """
        self.eval()
        sos_idx = tokenizer.char_to_idx[tokenizer.SOS_token]
        eos_idx = tokenizer.char_to_idx[tokenizer.EOS_token]

        with torch.no_grad():
            # Encode
            _, hidden, cell = self.encoder(encoder_input)

            # Initialize decoder input with SOS token, on the same device as
            # the encoder states it will be combined with.
            decoder_input = torch.tensor([[sos_idx]], dtype=torch.long,
                                         device=encoder_input.device)

            predictions = []

            for _ in range(max_length):
                # Decode one step
                decoder_output, hidden, cell = self.decoder(decoder_input, hidden, cell)

                # Greedy choice: most likely token at this step
                predicted_token = decoder_output.argmax(dim=-1)
                token_id = predicted_token.item()
                predictions.append(token_id)

                # Stop once the model emits EOS
                if token_id == eos_idx:
                    break

                # Feed the prediction back in as the next decoder input
                decoder_input = predicted_token

            return predictions

# Initialize model on the GPU when one is available, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2SeqModel(
    vocab_size=tokenizer.vocab_size,
    embedding_dim=128,
    hidden_dim=256,
    num_layers=2
).to(device)

print(f"Model initialized on {device}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# Model summary (values mirror the constructor arguments above)
print(f"\nModel architecture:")
print(f"  Vocabulary size: {tokenizer.vocab_size}")
print(f"  Embedding dimension: 128")
print(f"  Hidden dimension: 256")
print(f"  Number of layers: 2")

# Test forward pass.
# `sample_batch` was drawn from the training loader in an earlier cell; only
# its tensor entries are moved to the device, the raw strings are left as is.
sample_batch = {k: v.to(device) if isinstance(v, torch.Tensor) else v 
               for k, v in sample_batch.items()}

with torch.no_grad():
    output = model(sample_batch['encoder_input'], sample_batch['decoder_input'])
    print(f"\nTest forward pass output shape: {output.shape}")
    print(f"Expected shape: (batch_size={batch_size}, max_output_length=15, vocab_size={tokenizer.vocab_size})")

## 5. Model Training

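Train the sequence-to-sequence model with teacher forcing, reporting the training and validation loss after every epoch and printing a few sample predictions every second epoch.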

In [None]:
def _run_epoch(model, loader, criterion, tokenizer, device, desc, optimizer=None):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    When ``optimizer`` is given the model is put in training mode and each
    batch is followed by backpropagation, gradient clipping and an optimizer
    step. Without it the model is put in eval mode and gradients are disabled.
    """
    is_training = optimizer is not None
    model.train(is_training)
    total_loss = 0

    pbar = tqdm(loader, desc=desc)
    with torch.set_grad_enabled(is_training):
        for batch in pbar:
            # Move batch to device
            encoder_input = batch['encoder_input'].to(device)
            decoder_input = batch['decoder_input'].to(device)
            decoder_target = batch['decoder_target'].to(device)

            # Forward pass
            if is_training:
                optimizer.zero_grad()
            outputs = model(encoder_input, decoder_input)

            # Flatten (batch, steps, vocab) and (batch, steps) so that every
            # output position is scored as an independent classification.
            loss = criterion(outputs.reshape(-1, tokenizer.vocab_size),
                             decoder_target.reshape(-1))

            if is_training:
                # Backward pass; clip gradients to keep LSTM updates stable
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                optimizer.step()

            total_loss += loss.item()
            pbar.set_postfix({'loss': f"{loss.item():.4f}"})

    return total_loss / len(loader)


def train_model(model, train_loader, val_loader, tokenizer, device, epochs=10):
    """Train the sequence-to-sequence model.

    Uses Adam (lr=0.001), cross-entropy loss that ignores PAD positions, and a
    ReduceLROnPlateau scheduler driven by the validation loss. Every second
    epoch a few validation rows are decoded and printed.

    Args:
        model: The ``Seq2SeqModel`` to train (already on ``device``).
        train_loader: DataLoader yielding dicts with ``encoder_input``,
            ``decoder_input`` and ``decoder_target`` tensors.
        val_loader: DataLoader of the same form; its ``dataset.data`` must be
            a DataFrame with ``input`` and ``output`` columns.
        tokenizer: Tokenizer providing ``char_to_idx``, ``vocab_size``,
            ``encode`` and ``decode``.
        device: Device the batches are moved to.
        epochs: Number of passes over the training data.

    Returns:
        ``(train_losses, val_losses)``: one mean loss per epoch each.
    """
    # Setup training
    criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.char_to_idx[tokenizer.PAD_token])
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=2, factor=0.5)

    # Training logs
    train_losses = []
    val_losses = []

    # Inputs for the sample predictions are encoded with the same length the
    # validation dataset uses (50 if the dataset does not expose one).
    val_dataset = val_loader.dataset
    sample_input_length = getattr(val_dataset, 'max_input_length', 50)

    print(f"Starting training for {epochs} epochs...")
    print(f"Training on {device}")

    for epoch in range(epochs):
        # Training phase
        avg_train_loss = _run_epoch(
            model, train_loader, criterion, tokenizer, device,
            desc=f"Epoch {epoch+1}/{epochs} - Training", optimizer=optimizer)
        train_losses.append(avg_train_loss)

        # Validation phase
        avg_val_loss = _run_epoch(
            model, val_loader, criterion, tokenizer, device,
            desc=f"Epoch {epoch+1}/{epochs} - Validation")
        val_losses.append(avg_val_loss)

        # Learning rate scheduling
        scheduler.step(avg_val_loss)

        # Print epoch results
        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  Val Loss: {avg_val_loss:.4f}")
        print(f"  Learning Rate: {optimizer.param_groups[0]['lr']:.6f}")

        # Test on a few samples
        if (epoch + 1) % 2 == 0:
            print("\n  Sample predictions:")
            # Never ask for more rows than exist; sample(3) raises on
            # validation sets with fewer than three rows.
            val_rows = val_dataset.data
            test_samples = val_rows.sample(min(3, len(val_rows)))
            for _, row in test_samples.iterrows():
                input_text = row['input']
                target_text = row['output']

                # Encode input
                input_encoded = tokenizer.encode(input_text, max_length=sample_input_length)
                input_tensor = torch.tensor([input_encoded], dtype=torch.long, device=device)

                # Predict
                predictions = model.predict(input_tensor, tokenizer, device=device)
                predicted_text = tokenizer.decode(predictions)

                print(f"    Input: '{input_text}'")
                print(f"    Target: '{target_text}'")
                print(f"    Predicted: '{predicted_text}'")
                print()

        print("-" * 60)

    return train_losses, val_losses

# Train the model for 5 epochs (overrides the function's default of 10) and
# keep the per-epoch loss histories for plotting.
train_losses, val_losses = train_model(
    model, train_loader, val_loader, tokenizer, device, epochs=5
)

print("Training completed!")

## 6. Model Evaluation and Visualization

Evaluate the trained model and visualize results.

In [None]:
# Plot training curves
plt.figure(figsize=(12, 5))

# Left panel: mean loss per epoch for training and validation
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Training Loss', color='blue')
plt.plot(val_losses, label='Validation Loss', color='red')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: epoch-to-epoch difference in loss.
# np.diff yields one value fewer than the number of epochs.
plt.subplot(1, 2, 2)
plt.plot(np.diff(train_losses), label='Training Loss Change', color='blue', alpha=0.7)
plt.plot(np.diff(val_losses), label='Validation Loss Change', color='red', alpha=0.7)
plt.title('Loss Change per Epoch')
plt.xlabel('Epoch')
plt.ylabel('Loss Change')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Comprehensive evaluation
def evaluate_model(model, test_loader, tokenizer, device):
    """Evaluate exact-match accuracy with greedy decoding.

    Every sample in ``test_loader`` is decoded individually with
    ``model.predict`` and compared to its target string after stripping
    surrounding whitespace.

    Args:
        model: Trained model exposing ``predict``.
        test_loader: DataLoader yielding dicts with ``encoder_input``,
            ``input_text`` and ``output_text``.
        tokenizer: Tokenizer used to decode predicted indices.
        device: Device the encoder inputs are moved to.

    Returns:
        ``(results, accuracy)`` where ``results`` is a list of dicts with keys
        ``input``, ``target``, ``predicted`` and ``correct``, and ``accuracy``
        is the fraction of exact matches (0.0 when the loader is empty).
    """
    model.eval()

    exact_matches = 0
    results = []

    print("Evaluating model on test set...")

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluation"):
            encoder_input = batch['encoder_input'].to(device)
            input_texts = batch['input_text']
            target_texts = batch['output_text']

            for i in range(encoder_input.size(0)):
                input_text = input_texts[i]
                target_text = target_texts[i]

                # predict() decodes one sequence at a time, so pass a
                # single-row slice that keeps the batch dimension.
                predictions = model.predict(encoder_input[i:i+1], tokenizer, device=device)
                predicted_text = tokenizer.decode(predictions)

                # Check exact match
                is_correct = predicted_text.strip() == target_text.strip()
                if is_correct:
                    exact_matches += 1

                results.append({
                    'input': input_text,
                    'target': target_text,
                    'predicted': predicted_text,
                    'correct': is_correct
                })

    total_predictions = len(results)
    # Guard against an empty loader instead of dividing by zero.
    accuracy = exact_matches / total_predictions if total_predictions else 0.0

    print(f"\nEvaluation Results:")
    print(f"  Total samples: {total_predictions}")
    print(f"  Exact matches: {exact_matches}")
    print(f"  Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

    return results, accuracy

# Evaluate the model on the held-out test split
results, accuracy = evaluate_model(model, test_loader, tokenizer, device)

# Show some example predictions
print("\n🎯 Sample Predictions:")
print("=" * 80)

# Show correct predictions (first five)
correct_results = [r for r in results if r['correct']]
if correct_results:
    print("\n✅ Correct Predictions:")
    for i, result in enumerate(correct_results[:5]):
        print(f"{i+1}. Input: '{result['input']}'")
        print(f"   Target: '{result['target']}'")
        print(f"   Predicted: '{result['predicted']}'")
        print()

# Show incorrect predictions (first five)
incorrect_results = [r for r in results if not r['correct']]
if incorrect_results:
    print("\n❌ Incorrect Predictions:")
    for i, result in enumerate(incorrect_results[:5]):
        print(f"{i+1}. Input: '{result['input']}'")
        print(f"   Target: '{result['target']}'")
        print(f"   Predicted: '{result['predicted']}'")
        print()

# Show Vietnamese/English examples if any.
# This is a case-sensitive substring check on the input text.
vietnamese_results = [r for r in results if 'ngày' in r['input'] or 'tháng' in r['input']]
if vietnamese_results:
    print("\n🇻🇳 Vietnamese Examples:")
    for i, result in enumerate(vietnamese_results[:3]):
        print(f"{i+1}. Input: '{result['input']}'")
        print(f"   Target: '{result['target']}'")
        print(f"   Predicted: '{result['predicted']}'")
        print(f"   Correct: {'✅' if result['correct'] else '❌'}")
        print()

print(f"\n🎯 Final Model Performance: {accuracy*100:.2f}% accuracy")

## 7. Interactive Inference

Test the model with custom date strings including Vietnamese/English examples.

In [None]:
def predict_date_conversion(model, tokenizer, input_text, device, max_input_length=50):
    """Convert one date string with the trained model.

    Args:
        model: Model exposing ``eval()`` and
            ``predict(encoder_input, tokenizer, device=...)``.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        input_text: Date string to convert.
        device: Device on which the input tensor is created.
        max_input_length: Length the encoded input is padded or truncated to.
            Defaults to 50, the value this function previously hard-coded;
            it should match the input length used during training.

    Returns:
        The decoded prediction as a string.
    """
    model.eval()

    # Encode input as a batch containing a single sequence
    input_encoded = tokenizer.encode(input_text, max_length=max_input_length)
    input_tensor = torch.tensor([input_encoded], dtype=torch.long, device=device)

    # Predict
    with torch.no_grad():
        predictions = model.predict(input_tensor, tokenizer, device=device)
        predicted_text = tokenizer.decode(predictions)

    return predicted_text

# Test with various date formats
test_inputs = [
    # English formats
    "January 15, 2023",
    "15 January 2023",
    "Jan 15, 2023",
    "15 Jan 2023",
    "01/15/2023",
    "15/01/2023",
    "the first of January 2023",
    "March 22nd, 2024",
    "December 31st, 1999",
    
    # Vietnamese formats (if in training data)
    "ngày 15 Tháng một năm 2023",
    "ngày 22 Tháng ba năm 2024",
    "ngày 31 Tháng mười hai năm 1999",
]

print("🧪 Testing Date Conversion Model")
print("=" * 50)

for i, input_text in enumerate(test_inputs, 1):
    predicted = predict_date_conversion(model, tokenizer, input_text, device)
    
    # Determine if it's Vietnamese or English: an input counts as Vietnamese
    # when its lowercased text contains "ngày", "tháng" or "năm".
    lang = "🇻🇳 Vietnamese" if any(word in input_text.lower() for word in ['ngày', 'tháng', 'năm']) else "🇺🇸 English"
    
    print(f"{i:2d}. {lang}")
    print(f"    Input: '{input_text}'")
    print(f"    Predicted: '{predicted}'")
    print()

# Interactive function for custom inputs
def interactive_date_conversion():
    """Prompt for date strings in a loop and print their ISO conversion.

    Reads from standard input until the user types 'quit', 'exit' or 'q'
    (case-insensitive). Blank lines are ignored. Conversion errors are
    printed and the loop continues. Uses the notebook-level ``model``,
    ``tokenizer`` and ``device``.
    """
    banner = (
        "\n🔄 Interactive Date Conversion",
        "Enter date strings to convert to ISO format (YYYY-MM-DD)",
        "Examples:",
        "  - 'April 22, 2019'",
        "  - '15 March 2023'",
        "  - 'ngày 10 Tháng hai năm 2024'",
        "Type 'quit' to exit\n",
    )
    for line in banner:
        print(line)

    quit_words = ['quit', 'exit', 'q']

    while True:
        user_input = input("Enter date string: ").strip()

        # Nothing typed: just prompt again.
        if not user_input:
            continue

        if user_input.lower() in quit_words:
            print("Goodbye!")
            break

        # Keep the session alive even if a single conversion fails.
        try:
            predicted = predict_date_conversion(model, tokenizer, user_input, device)
            print(f"Converted: '{predicted}'\n")
        except Exception as e:
            print(f"Error: {e}\n")

# Uncomment the line below to run interactive mode
# (it blocks on input(), so it is left disabled for non-interactive runs)
# interactive_date_conversion()

# Closing summary printed at the end of the notebook
print("\n✅ Date Conversion Model Demo Complete!")
print("\n📝 Key Achievements:")
print("1. Implemented character-level sequence-to-sequence model in PyTorch")
print("2. Trained encoder-decoder LSTM for date format conversion")
print("3. Achieved reasonable accuracy on date conversion task")
print("4. Demonstrated multilingual support (English/Vietnamese)")
print("5. Created interactive inference capability")

print("\n🇻🇳🇺🇸 Multilingual Examples:")
print("English: 'My name is John' → Vietnamese: 'Tên tôi là John'")
print("English: 'April 22, 2019' → ISO: '2019-04-22'")
print("Vietnamese: 'ngày 22 Tháng tư năm 2019' → ISO: '2019-04-22'")