# Vietnamese Text Summarization (Seq2Seq + Attention)
This notebook is split into cells for clarity. The original code is preserved exactly; cells below separate installation, imports, data cleaning, tokenization, dataset creation, model definition, training, and decoding example.

In [1]:
!pip install datasets underthesea torch tqdm tensorflow --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.3/8.3 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m978.6/978.6 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m94.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m78.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import re
import torch
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from underthesea import word_tokenize
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2025-10-05 09:44:17.518255: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759657457.703840      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759657457.755655      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## 1️⃣ Load & Clean Data
Load the `nam194/vietnews` dataset and define a helper `clean_vi` function to preprocess Vietnamese text.

In [None]:
# Load data from nam194/vietnews dataset (first 50,000 samples)
dataset = load_dataset("nam194/vietnews", split="train[:50000]")  

def clean_vi(text):
    """
    Clean and preprocess Vietnamese text.
    
    Args:
        text: Text string to be processed
        
    Returns:
        Cleaned and word-tokenized text string
        
    Processing steps:
        1. Remove URLs
        2. Remove special characters (keep letters, numbers, and Vietnamese characters)
        3. Convert to lowercase
        4. Tokenize Vietnamese words using underthesea
    """
    text = str(text)
    # Remove all URLs from the text
    text = re.sub(r"http\S+", "", text)
    # Keep only letters, numbers, and Vietnamese characters
    text = re.sub(r"[^0-9a-zA-ZÀ-ỹ\s]", "", text)
    # Normalize: convert to lowercase and remove extra whitespace
    text = text.lower().strip()
    # Vietnamese word tokenization
    text = word_tokenize(text, format="text")
    return text

# Clean all articles (newspaper content)
articles = [clean_vi(x['article']) for x in tqdm(dataset, desc="Cleaning articles")]
# Clean all abstracts (summaries)
summaries = [clean_vi(x['abstract']) for x in tqdm(dataset, desc="Cleaning abstracts")]

# Add start <sos> and end <eos> tokens to summaries (standard in Seq2Seq)
summaries = [f"<sos> {s} <eos>" for s in summaries]

# Define maximum sequence lengths for input and output
max_text_len = 150      # Maximum length of articles
max_summary_len = 40    # Maximum length of summaries


README.md:   0%|          | 0.00/748 [00:00<?, ?B/s]

data/train-00000-of-00001-84acb79f6c6547(…):   0%|          | 0.00/170M [00:00<?, ?B/s]

data/validation-00000-of-00001-210cc51bf(…):   0%|          | 0.00/38.3M [00:00<?, ?B/s]

data/test-00000-of-00001-123f98d55067eb7(…):   0%|          | 0.00/38.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/99134 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/22184 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/22498 [00:00<?, ? examples/s]

Cleaning articles: 100%|██████████| 50000/50000 [15:59<00:00, 52.10it/s]
Cleaning abstracts: 100%|██████████| 50000/50000 [01:20<00:00, 624.04it/s]


## Tokenization
Create Keras `Tokenizer` instances for inputs and outputs, convert texts to sequences and pad them.

In [None]:
# ========== TOKENIZER FOR INPUT (ARTICLES) ==========
# Initialize tokenizer for input text
# num_words=15000: keep only the 15,000 most common words
# oov_token: token for out-of-vocabulary words
x_tokenizer = Tokenizer(num_words=15000, oov_token="<unk>")
x_tokenizer.fit_on_texts(articles)  # Build vocabulary from articles
X = x_tokenizer.texts_to_sequences(articles)  # Convert text to sequences of integers
X = pad_sequences(X, maxlen=max_text_len, padding='post')  # Pad to same length
x_vocab = len(x_tokenizer.word_index) + 1  # Vocabulary size (+1 for padding token)

# ========== TOKENIZER FOR OUTPUT (SUMMARIES) ==========
# Initialize tokenizer for summaries (output)
# filters='': don't remove special characters (to keep <sos>, <eos>)
y_tokenizer = Tokenizer(num_words=15000, filters='', oov_token="<unk>")
y_tokenizer.fit_on_texts(summaries)  # Build vocabulary from summaries
Y = y_tokenizer.texts_to_sequences(summaries)  # Convert text to sequences of integers
Y = pad_sequences(Y, maxlen=max_summary_len, padding='post')  # Pad to same length
y_vocab = len(y_tokenizer.word_index) + 1  # Vocabulary size

# Print summary information
print(f"✅ Data ready: {len(X)} samples | vocab_in={x_vocab} | vocab_out={y_vocab}")
print("Ví dụ summary tokenized:", summaries[0])


✅ Data ready: 50000 samples | vocab_in=163601 | vocab_out=56756
Ví dụ summary tokenized: <sos> với bảntính ham chơi lười làm có nhiều tiềnán tiềnsự lại nghiện matuý_thương đã độtnhập vào nhà chú ruột để trộm hơn 1 tạ thóc và hơn 8 triệu đồng mang đi tiêuxài <eos>


## 2️⃣ Dataset & Dataloader
Wrap tokenized data into a PyTorch `Dataset` and `DataLoader`.

In [None]:
class SummarizationDataset(Dataset):
    """
    Custom Dataset for text summarization task.
    Converts numpy arrays to PyTorch tensors.
    """
    def __init__(self, src, tgt):
        """
        Args:
            src: Array of tokenized input sequences (articles)
            tgt: Array of tokenized output sequences (summaries)
        """
        self.src = torch.LongTensor(src)
        self.tgt = torch.LongTensor(tgt)
    
    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.src)
    
    def __getitem__(self, idx):
        """Return an (input, target) pair at index idx."""
        return self.src[idx], self.tgt[idx]

# Create DataLoader with batch_size=16 and shuffle=True for training
loader = DataLoader(SummarizationDataset(X, Y), batch_size=16, shuffle=True)


## 3️⃣ Define Seq2Seq + Attention
Model classes: `Encoder`, `Attention`, `Decoder`, and `Seq2Seq`.

In [None]:
# Determine device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class Encoder(nn.Module):
    """
    Encoder for Seq2Seq model.
    Uses Embedding + LSTM to encode input sequences into hidden states.
    """
    def __init__(self, input_dim, emb_dim, hid_dim):
        """
        Args:
            input_dim: Input vocabulary size (number of words)
            emb_dim: Dimensionality of embedding vectors
            hid_dim: Dimensionality of hidden states in LSTM
        """
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, batch_first=True)
    
    def forward(self, src):
        """
        Args:
            src: Input tensor [batch_size, seq_len]
            
        Returns:
            outputs: All hidden states [batch_size, seq_len, hid_dim]
            hidden: Final hidden state [1, batch_size, hid_dim]
            cell: Final cell state [1, batch_size, hid_dim]
        """
        embedded = self.embedding(src)  # [batch_size, seq_len, emb_dim]
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell


class Attention(nn.Module):
    """
    Attention mechanism to allow decoder to "attend" to important parts of input.
    Uses Bahdanau (additive) attention.
    """
    def __init__(self, hid_dim):
        """
        Args:
            hid_dim: Dimensionality of hidden states
        """
        super().__init__()
        # Linear layer to combine decoder hidden state and encoder outputs
        self.attn = nn.Linear(hid_dim * 2, hid_dim)
        # Linear layer to compute attention scores
        self.v = nn.Linear(hid_dim, 1, bias=False)
    
    def forward(self, hidden, encoder_outputs):
        """
        Args:
            hidden: Current decoder hidden state [num_layers, batch_size, hid_dim]
            encoder_outputs: All hidden states from encoder [batch_size, src_len, hid_dim]
            
        Returns:
            attention: Attention weights [batch_size, src_len]
        """
        # Reshape and repeat hidden to match encoder_outputs
        hidden = hidden.permute(1, 0, 2)  # [batch_size, 1, hid_dim]
        src_len = encoder_outputs.shape[1]
        hidden = hidden.repeat(1, src_len, 1)  # [batch_size, src_len, hid_dim]
        
        # Calculate energy scores
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))
        attention = self.v(energy).squeeze(2)  # [batch_size, src_len]
        
        # Normalize with softmax to create attention weights
        return torch.softmax(attention, dim=1)


class Decoder(nn.Module):
    """
    Decoder with attention mechanism.
    Generates output sequence one token at a time based on hidden states and attention.
    """
    def __init__(self, output_dim, emb_dim, hid_dim, attention):
        """
        Args:
            output_dim: Output vocabulary size (number of words)
            emb_dim: Dimensionality of embedding vectors
            hid_dim: Dimensionality of hidden states
            attention: Initialized Attention module
        """
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.attention = attention
        # LSTM receives concatenation of embedded input and context vector
        self.lstm = nn.LSTM(hid_dim + emb_dim, hid_dim, batch_first=True)
        # Fully connected layer to predict next token
        self.fc_out = nn.Linear(hid_dim * 2 + emb_dim, output_dim)

    def forward(self, input, hidden, cell, encoder_outputs):
        """
        Args:
            input: Current token [batch_size]
            hidden: Hidden state [1, batch_size, hid_dim]
            cell: Cell state [1, batch_size, hid_dim]
            encoder_outputs: Outputs from encoder [batch_size, src_len, hid_dim]
            
        Returns:
            pred: Predicted probability distribution for next token [batch_size, output_dim]
            hidden: New hidden state
            cell: New cell state
        """
        input = input.unsqueeze(1)  # [batch_size, 1]
        embedded = self.embedding(input)  # [batch_size, 1, emb_dim]

        # Calculate attention weights and context vector
        attn = self.attention(hidden, encoder_outputs)  # [batch_size, src_len]
        attn = attn.unsqueeze(1)  # [batch_size, 1, src_len]
        context = torch.bmm(attn, encoder_outputs)  # [batch_size, 1, hid_dim]

        # Combine embedded input and context as input to LSTM
        rnn_input = torch.cat((embedded, context), dim=2)  # [batch_size, 1, emb+hid]
        output, (hidden, cell) = self.lstm(rnn_input, (hidden, cell))

        # Concatenate output, context, embedded to predict next token
        concat_out = torch.cat((output, context, embedded), dim=2)  # [batch_size, 1, hid*2+emb]
        pred = self.fc_out(concat_out).squeeze(1)  # [batch_size, output_dim]
        
        return pred, hidden, cell


class Seq2Seq(nn.Module):
    """
    Complete Seq2Seq model combining Encoder and Decoder.
    """
    def __init__(self, encoder, decoder):
        """
        Args:
            encoder: Initialized Encoder module
            decoder: Initialized Decoder module
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """
        Args:
            src: Input sequence [batch_size, src_len]
            trg: Target sequence [batch_size, trg_len]
            teacher_forcing_ratio: Probability of using ground truth as input for next step
            
        Returns:
            outputs: Predictions for entire target sequence [batch_size, trg_len, trg_vocab_size]
        """
        batch_size = src.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        # Tensor to store all predictions
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(src.device)
        
        # Encode input sequence
        encoder_outputs, hidden, cell = self.encoder(src)
        
        # First token for decoder is <sos>
        input = trg[:, 0]
        
        # Decode one token at a time
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell, encoder_outputs)
            outputs[:, t, :] = output
            
            # Teacher forcing: sometimes use ground truth, sometimes use prediction
            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)  # Token with highest probability
            input = trg[:, t] if teacher_force else top1
        
        return outputs


## 4️⃣ Initialize Model
Set hyperparameters, build encoder/decoder, and prepare optimizer and loss.

In [None]:
# ========== HYPERPARAMETERS ==========
INPUT_DIM = x_vocab    # Input vocabulary size
OUTPUT_DIM = y_vocab   # Output vocabulary size
EMB_DIM = 256          # Embedding dimension (vector representation for each word)
HID_DIM = 512          # Hidden state dimension in LSTM

# ========== MODEL INITIALIZATION ==========
attn = Attention(HID_DIM)  # Initialize attention mechanism
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM)  # Initialize encoder
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, attn)  # Initialize decoder with attention
model = Seq2Seq(enc, dec).to(device)  # Combine into Seq2Seq model and move to device

# ========== OPTIMIZER AND LOSS FUNCTION ==========
optimizer = optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer with learning rate 0.001
criterion = nn.CrossEntropyLoss(ignore_index=0)  # CrossEntropy loss, ignore padding tokens (index=0)


## 5️⃣ Train
Train the model for a few epochs.

In [None]:
# ========== TRAINING LOOP ==========
for epoch in range(10):  # Train for 10 epochs
    model.train()  # Set model to training mode
    total_loss = 0
    
    # Iterate through each batch in the dataloader
    for src, tgt in tqdm(loader, desc=f"Epoch {epoch+1}"):
        # Move data to device (GPU or CPU)
        src, tgt = src.to(device), tgt.to(device)
        
        # Reset gradients to zero
        optimizer.zero_grad()
        
        # Forward pass: compute model output
        output = model(src, tgt)
        
        # Reshape for loss calculation
        output_dim = output.shape[-1]
        output = output[:, 1:, :].reshape(-1, output_dim)  # Remove first token (<sos>)
        tgt_flat = tgt[:, 1:].reshape(-1)  # Corresponding ground truth
        
        # Calculate loss between prediction and ground truth
        loss = criterion(output, tgt_flat)
        
        # Backward pass: compute gradients
        loss.backward()
        
        # Update parameters
        optimizer.step()
        
        # Accumulate loss
        total_loss += loss.item()
    
    # Print average loss for the epoch
    print(f"Epoch {epoch+1} Loss: {total_loss/len(loader):.4f}")


Epoch 1: 100%|██████████| 3125/3125 [24:27<00:00,  2.13it/s]


Epoch 1 Loss: 6.2659


Epoch 2: 100%|██████████| 3125/3125 [24:29<00:00,  2.13it/s]


Epoch 2 Loss: 5.5000


Epoch 3: 100%|██████████| 3125/3125 [24:29<00:00,  2.13it/s]


Epoch 3 Loss: 5.1022


Epoch 4: 100%|██████████| 3125/3125 [24:29<00:00,  2.13it/s]


Epoch 4 Loss: 4.7551


Epoch 5: 100%|██████████| 3125/3125 [24:29<00:00,  2.13it/s]


Epoch 5 Loss: 4.4525


Epoch 6: 100%|██████████| 3125/3125 [24:29<00:00,  2.13it/s]


Epoch 6 Loss: 4.2065


Epoch 7: 100%|██████████| 3125/3125 [24:30<00:00,  2.13it/s]


Epoch 7 Loss: 3.9979


Epoch 8: 100%|██████████| 3125/3125 [24:31<00:00,  2.12it/s]


Epoch 8 Loss: 3.8284


Epoch 9: 100%|██████████| 3125/3125 [24:31<00:00,  2.12it/s]


Epoch 9 Loss: 3.6652


Epoch 10: 100%|██████████| 3125/3125 [24:34<00:00,  2.12it/s]

Epoch 10 Loss: 3.5453





## 6️⃣ Decode Example
Run a single example through the model to decode a predicted summary.

In [None]:
# ========== INFERENCE: GENERATE SUMMARY FOR ONE SAMPLE ==========
idx = 0  # Select first sample for testing
src = torch.LongTensor(X[idx:idx+1]).to(device)  # Convert input to tensor

model.eval()  # Set model to evaluation mode (disable dropout, etc.)

# Get token IDs for <sos> and <eos>
sos_token = y_tokenizer.word_index.get("<sos>")
eos_token = y_tokenizer.word_index.get("<eos>")

# Generate summary without computing gradients
with torch.no_grad():
    # Encode input
    enc_out, h, c = model.encoder(src)
    
    # Start with <sos> token
    input_tok = torch.LongTensor([sos_token]).to(device)
    result = []  # Store generated token IDs
    
    # Decode one token at a time until <eos> or max_summary_len is reached
    for _ in range(max_summary_len):
        # Predict next token
        output, h, c = model.decoder(input_tok, h, c, enc_out)
        top1 = output.argmax(1)  # Token with highest probability
        
        # Stop if <eos> token is generated
        if top1.item() == eos_token:
            break
        
        result.append(top1.item())
        input_tok = top1  # Use generated token as input for next step

# ========== CONVERT TOKEN IDs TO WORDS ==========
# Create reverse mapping from ID to word
id2word = {v: k for k, v in y_tokenizer.word_index.items()}
pred_words = [id2word.get(i, "") for i in result]

# ========== DISPLAY RESULTS ==========
print("\n📰 ARTICLE:", articles[idx][:300], "...")  # Print first 300 characters of original article
print("💬 PREDICTED:", " ".join(pred_words))  # Print predicted summary
print("📌 REFERENCE:", summaries[idx])  # Print ground truth summary



📰 ARTICLE: ngày 273 cơquan cảnhsát điềutra côngan tp hưngyên tỉnh hưngyên cho biết đơnvị vừa ra quyếtđịnh khởitố vụ án khởitố bịcan đốivới đốitượng maivănthương sn 1989 trú tại đội 11 thôn anchiểu 1 xã liênphương tp hưngyên để điềutra về hànhvi trộmcắp tàisản theo tàiliệu điềutra của cơquan côngan vàokhoảng 7  ...
💬 PREDICTED: sau khi độtnhập tiềnán tiềnsự ổnhóm nghiện matuý <unk> đã độtnhập tiềnán tiềnsự ổnhóm nghiện matuý đã độtnhập vào nhà chú ruột của mình để mua lại nghiện để kiếm tiền tiêuxài lười
📌 REFERENCE: <sos> với bảntính ham chơi lười làm có nhiều tiềnán tiềnsự lại nghiện matuý_thương đã độtnhập vào nhà chú ruột để trộm hơn 1 tạ thóc và hơn 8 triệu đồng mang đi tiêuxài <eos>
