In [1]:
!pip install Faker
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F # Cần cho Attention
import random
import numpy as np
from faker import Faker

# --- 1. Chuẩn bị Dữ liệu ---
fake = Faker()

vocab = list("0123456789-,/ ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz")
vocab = ['<PAD>', '<SOS>', '<EOS>'] + vocab
char_to_index = {char: i for i, char in enumerate(vocab)}
index_to_char = {i: char for i, char in enumerate(vocab)}

VOCAB_SIZE = len(vocab)
MAX_LENGTH = 30 # Độ dài tối đa

def generate_data(num_samples=10000): # Dùng 10k mẫu
    data = []
    formats = ['%B %d, %Y', '%Y-%m-%d', '%A, %b %d %Y', '%d-%m-%Y']
    output_format = '%d/%m/%Y'
    for _ in range(num_samples):
        d = fake.date_object()
        input_str = d.strftime(random.choice(formats))
        target_str = d.strftime(output_format)
        data.append((input_str[:MAX_LENGTH-2], target_str[:MAX_LENGTH-2]))
    return data

def string_to_tensor(s):
    tensor = [char_to_index['<SOS>']]
    tensor += [char_to_index.get(c, 0) for c in s]
    tensor.append(char_to_index['<EOS>'])

    # Padding
    while len(tensor) < MAX_LENGTH:
        tensor.append(char_to_index['<PAD>'])

    return torch.tensor(tensor, dtype=torch.long).view(-1, 1)

# Tạo mẫu
train_data = generate_data(10000) # 10k mẫu là đủ
print(f"--- Đã tạo {len(train_data)} mẫu ---")
print(f"Ví dụ: {train_data[0][0]} => {train_data[0][1]}")

Collecting Faker
  Downloading faker-38.2.0-py3-none-any.whl.metadata (16 kB)
Downloading faker-38.2.0-py3-none-any.whl (2.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.0/2.0 MB[0m [31m85.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Faker
Successfully installed Faker-38.2.0
--- Đã tạo 10000 mẫu ---
Ví dụ: 1977-06-30 => 30/06/1977


In [2]:
# --- 1. ENCODER (Sửa đổi) ---
# Bây giờ trả về TẤT CẢ outputs, và (hidden, cell)
class EncoderLSTM(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)

    def forward(self, input_tensor):
        embedded = self.embedding(input_tensor)
        # outputs shape: [seq_len, 1, hidden_size]
        # hidden/cell shape: [1, 1, hidden_size]
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell # TRẢ VỀ TẤT CẢ OUTPUTS

# --- 2. ATTENTION (Module MỚI) ---
# Đây là "bộ não" của sự chú ý (Additive Attention)
class Attention(nn.Module):
    def __init__(self, hidden_size):
        super(Attention, self).__init__()
        # Lớp Linear để kết hợp hidden_state của Decoder và Encoder
        self.Wa = nn.Linear(hidden_size * 2, hidden_size) # (decoder_hidden + encoder_output)
        self.Va = nn.Linear(hidden_size, 1) # Lấy 1 điểm số duy nhất

    def forward(self, decoder_hidden, encoder_outputs):
        # encoder_outputs shape: [seq_len, hidden_size]
        # decoder_hidden shape: [1, hidden_size]

        seq_len = encoder_outputs.shape[0]

        # Lặp lại decoder_hidden để nó có [seq_len, hidden_size]
        decoder_hidden_expanded = decoder_hidden.repeat(seq_len, 1)

        # Nối (concatenate)
        # [seq_len, hidden_size*2]
        concat_hidden = torch.cat((decoder_hidden_expanded, encoder_outputs), dim=1)

        # [seq_len, hidden_size]
        energy = torch.tanh(self.Wa(concat_hidden))

        # [seq_len, 1]
        attention_scores = self.Va(energy)

        # [seq_len, 1] -> [1, seq_len]
        # Dùng softmax để chuẩn hóa, cho biết nên "chú ý" vào đâu
        weights = F.softmax(attention_scores.T, dim=1)

        # [1, seq_len] @ [seq_len, hidden_size] -> [1, hidden_size]
        # Tính "context vector" (ý niệm) dựa trên sự chú ý
        context_vector = torch.bmm(weights.unsqueeze(0), encoder_outputs.unsqueeze(0))

        return context_vector.squeeze(0), weights.squeeze(0) # Trả về context và trọng số (để debug)

# --- 3. DECODER (Viết lại hoàn toàn) ---
# Bây giờ gọi là AttnDecoderLSTM
class AttnDecoderLSTM(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(AttnDecoderLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = Attention(hidden_size) # Nhúng Attention vào

        # LSTM bây giờ nhận (input + context_vector)
        self.lstm = nn.LSTM(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_char, decoder_hidden, decoder_cell, encoder_outputs):
        # input_char shape: [1, 1]
        embedded = self.embedding(input_char).view(1, 1, -1)

        # Lấy "ý niệm" từ Attention
        # decoder_hidden[0] vì shape của nó là [1, 1, H]
        context_vector, attn_weights = self.attention(decoder_hidden[0], encoder_outputs)

        # Nối (concatenate) input (embedded) và "ý niệm" (context)
        # [1, 1, H] + [1, 1, H] -> [1, 1, 2*H]
        lstm_input = torch.cat((embedded, context_vector.unsqueeze(0)), dim=2)

        # Cho qua LSTM
        output, (hidden, cell) = self.lstm(lstm_input, (decoder_hidden, decoder_cell))

        # Dự đoán
        prediction = self.softmax(self.out(output[0]))
        return prediction, hidden, cell, attn_weights # Trả về cả attn_weights

In [3]:
# --- 1. Định nghĩa tham số ---
hidden_size = 128 # Tăng lên cho mô hình mạnh hơn
learning_rate = 0.001 # Giảm LR đi một chút
n_epochs = 10000 # 10k epochs (mỗi epoch 1 mẫu)
log_every = 500
teacher_forcing_ratio = 0.75 # Tăng teacher forcing

# --- 2. Khởi tạo mô hình ---
encoder = EncoderLSTM(VOCAB_SIZE, hidden_size)
decoder = AttnDecoderLSTM(hidden_size, VOCAB_SIZE) # Dùng AttnDecoder

encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss(ignore_index=char_to_index['<PAD>'])

# --- 3. Vòng lặp Huấn luyện ---
print("--- Bắt đầu huấn luyện (với ATTENTION)... ---")
total_loss_avg = 0

for epoch in range(1, n_epochs + 1):

    input_str, target_str = random.choice(train_data)
    input_tensor = string_to_tensor(input_str)
    target_tensor = string_to_tensor(target_str)

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    # --- ENCODER ---
    # Chạy Encoder và lấy TẤT CẢ outputs
    encoder_outputs, encoder_hidden, encoder_cell = encoder(input_tensor)

    # Bỏ đi shape [seq_len, 1, H] -> [seq_len, H]
    encoder_outputs_squeezed = encoder_outputs.squeeze(1)

    # --- DECODER ---
    decoder_input = torch.tensor([[char_to_index['<SOS>']]], dtype=torch.long)
    decoder_hidden = encoder_hidden # Dùng "ý niệm" cuối
    decoder_cell = encoder_cell

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    target_len = 10 # Chỉ dịch 10 ký tự (dd/mm/yyyy)

    if use_teacher_forcing:
        for i in range(target_len):
            decoder_output, decoder_hidden, decoder_cell, _ = decoder(
                decoder_input, decoder_hidden, decoder_cell, encoder_outputs_squeezed
            )
            loss += criterion(decoder_output, target_tensor[i+1])
            decoder_input = target_tensor[i+1].view(1, 1)
    else:
        for i in range(target_len):
            decoder_output, decoder_hidden, decoder_cell, _ = decoder(
                decoder_input, decoder_hidden, decoder_cell, encoder_outputs_squeezed
            )
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach().view(1, 1)

            loss += criterion(decoder_output, target_tensor[i+1])
            if decoder_input.item() == char_to_index['<EOS>']:
                break

    loss.backward()
    encoder_optimizer.step()
    decoder_optimizer.step()
    total_loss_avg += loss.item() / target_len

    if epoch % log_every == 0:
        print(f"Epoch {epoch}/{n_epochs}, Avg Loss: {total_loss_avg / log_every:.4f}")
        total_loss_avg = 0

print("--- Huấn luyện thủ công hoàn tất ---")

--- Bắt đầu huấn luyện (với ATTENTION)... ---
Epoch 500/10000, Avg Loss: 1.5476
Epoch 1000/10000, Avg Loss: 1.0444
Epoch 1500/10000, Avg Loss: 0.6630
Epoch 2000/10000, Avg Loss: 0.1414
Epoch 2500/10000, Avg Loss: 0.0797
Epoch 3000/10000, Avg Loss: 0.0209
Epoch 3500/10000, Avg Loss: 0.0124
Epoch 4000/10000, Avg Loss: 0.0205
Epoch 4500/10000, Avg Loss: 0.0034
Epoch 5000/10000, Avg Loss: 0.0020
Epoch 5500/10000, Avg Loss: 0.0014
Epoch 6000/10000, Avg Loss: 0.0523
Epoch 6500/10000, Avg Loss: 0.0099
Epoch 7000/10000, Avg Loss: 0.0067
Epoch 7500/10000, Avg Loss: 0.0100
Epoch 8000/10000, Avg Loss: 0.0224
Epoch 8500/10000, Avg Loss: 0.0020
Epoch 9000/10000, Avg Loss: 0.0033
Epoch 9500/10000, Avg Loss: 0.0008
Epoch 10000/10000, Avg Loss: 0.0004
--- Huấn luyện thủ công hoàn tất ---


In [4]:
def predict_manual_with_attention(input_str):
    print(f"\nInput: {input_str}")
    input_tensor = string_to_tensor(input_str)

    with torch.no_grad():
        # ENCODER
        encoder_outputs, encoder_hidden, encoder_cell = encoder(input_tensor)
        encoder_outputs_squeezed = encoder_outputs.squeeze(1)

        # DECODER
        decoder_input = torch.tensor([[char_to_index['<SOS>']]], dtype=torch.long)
        decoder_hidden = encoder_hidden
        decoder_cell = encoder_cell

        output_string = ""

        for _ in range(10): # Chỉ sinh 10 ký tự
            decoder_output, decoder_hidden, decoder_cell, attn_weights = decoder(
                decoder_input, decoder_hidden, decoder_cell, encoder_outputs_squeezed
            )

            topv, topi = decoder_output.topk(1)
            char_index = topi.squeeze().item()

            if char_index == char_to_index['<EOS>']:
                break

            output_string += index_to_char[char_index]
            decoder_input = topi.squeeze().view(1, 1)

    print(f"Output: {output_string}")

# Chạy thử
predict_manual_with_attention("July 4, 1990")
predict_manual_with_attention("2024-12-25")
predict_manual_with_attention("Saturday, Sep 20 2025")
predict_manual_with_attention(fake.date_object().strftime(random.choice(['%B %d, %Y', '%Y-%m-%d'])))


Input: July 4, 1990
Output: 19/07/1990

Input: 2024-12-25
Output: 25/12/2024

Input: Saturday, Sep 20 2025
Output: 20/09/2025

Input: November 03, 2008
Output: 03/11/2008
