# Thực hành Transformers

Trong bài này, ta sẽ thực hành cài đặt Transformer

### 1. Cài đặt và import thư viện

In [None]:
!where python


d:\ANACONDA\python.exe
C:\Users\Admin\AppData\Local\Microsoft\WindowsApps\python.exe


In [None]:
!pip3 install spacy dill
!pip3 install torchtext
!pip3 install pandas



In [None]:
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

^C


In [1]:
import torch.nn as nn
import torch
import torchtext
import copy
import math
import torch.nn.functional as F
from torch.autograd import Variable

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 2. Cài đặt từng module của Transformer

In [3]:
class Embedder(nn.Module):
    def __init__(self, vocab_size, dim):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, dim)
    
    def forward(self, x):
        return self.embed(x)

**Position Embedding Class**:

In [25]:
# Positional encoding
class PositionalEncoder(nn.Module):
    def __init__(self, dim, max_seq_len=2000):
        super().__init__()
        self.dim = dim
        
        # create a constant 'pe' matrix with values dependant on 
        # pos and i
        pe = torch.zeros(max_seq_len, dim)

        for i in range(max_seq_len):
            for j in range(0, dim, 2):
                pe[i, j] = math.sin(i / (1000 ** (2*j/dim)))
                pe[i, j+1] = math.cos(i / (1000 ** (2*(j+1)/dim)))
        
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)
        
    def forward(self, x):
        # make embeddings relatively larger
        x = x *math.sqrt(self.dim)
        # add constant to embedding
        seq_len = x.size(1)
        x = x + Variable(self.pe[:, :seq_len], requires_grad=False).to(device)
        return x

**Multi Head Attention**: We first start with implementing attention function

Attention of $q$

In [27]:
def attention(q, k, v, d_k, mask=None, dropout=None):
    """
    Scaled Dot-Product Attention
    
    Args:
        q: (batch, heads, seq_len_q, d_k)
        k: (batch, heads, seq_len_k, d_k)
        v: (batch, heads, seq_len_k, d_k)
        mask: 
            - src_mask: (batch, 1, seq_len_k)
            - trg_mask: (batch, seq_len_q, seq_len_q)
    
    Returns:
        output: (batch, heads, seq_len_q, d_k)
    """
    # Calculate attention scores
    # scores: (batch, heads, seq_len_q, seq_len_k)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
    
    if mask is not None:
        # Expand mask dimensions to match scores
        # mask: (batch, 1, seq_len) -> (batch, 1, 1, seq_len)
        # or mask: (batch, seq_len, seq_len) -> (batch, 1, seq_len, seq_len)
        if mask.dim() == 3:
            mask = mask.unsqueeze(1)
        
        # Apply mask
        scores = scores.masked_fill(mask == 0, -1e9)
    
    # Apply softmax
    scores = F.softmax(scores, dim=-1)
    
    if dropout is not None:
        scores = dropout(scores)
    
    # Apply attention to values
    output = torch.matmul(scores, v)
    return output

In [6]:
# Multi-headed attention
class MultiHeadAttention(nn.Module):
    def __init__(self, heads, dim, dropout=0.1):
        super().__init__()
        self.dim = dim
        self.dim_head = dim//heads
        self.h = heads
        self.q_linear = nn.Linear(dim, dim)
        self.k_linear = nn.Linear(dim, dim)
        self.v_linear = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(dropout)
        self.out = nn.Linear(dim, dim)
    
    def forward(self, q, k, v, mask=None):
        bs = q.size(0)
        # perform linear operation and split into h heads
        k = self.k_linear(k).view(bs, -1, self.h, self.dim_head)
        q = self.q_linear(q).view(bs, -1, self.h, self.dim_head)
        v = self.v_linear(v).view(bs, -1, self.h, self.dim_head)
        # transpose to get dimensions bs * h * sl * dim
        k = k.transpose(1, 2)
        q = q.transpose(1, 2)
        v = v.transpose(1, 2)
        # calculate attention using the function we will define next
        scores = attention(q, k, v, self.dim, mask, self.dropout)
        # concatenate heads and put through final linear layer
        concat = scores.transpose(1,2).contiguous().view(bs, -1, self.dim)
        output = self.out(concat)
        return output

In [7]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff=2048, dropout = 0.1):
        super().__init__() 
        # We set d_ff as a default to 2048
        self.linear_1 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model)
    def forward(self, x):
        x = self.dropout(F.relu(self.linear_1(x)))
        x = self.linear_2(x)
        return x

In [23]:
class Norm(nn.Module):
    def __init__(self, d_model, eps = 1e-6):
        super().__init__()
    
        self.size = d_model
        # create two learnable parameters to calibrate normalisation
        self.alpha = nn.Parameter(torch.ones(self.size))
        self.bias = nn.Parameter(torch.zeros(self.size))
        self.eps = eps
    def forward(self, x):
        norm = self.alpha * (x - x.mean(dim=-1, keepdim=True)) \
        / (x.std(dim=-1, keepdim=True) + self.eps) + self.bias
        return norm

In [8]:
# build an encoder layer with one multi-head attention layer and one 
# feed-forward layer
class EncoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout = 0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.attn = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        
    def forward(self, x, mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn(x2, x2, x2, mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.ff(x2))
        return x
        
    

In [9]:
# build a decoder layer with two multi-head attention layers and
# one feed-forward layer
class DecoderLayer(nn.Module):
    def __init__(self, d_model, heads, dropout=0.1):
        super().__init__()
        self.norm_1 = Norm(d_model)
        self.norm_2 = Norm(d_model)
        self.norm_3 = Norm(d_model)
        
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)
        self.dropout_3 = nn.Dropout(dropout)
        
        self.attn_1 = MultiHeadAttention(heads, d_model)
        self.attn_2 = MultiHeadAttention(heads, d_model)
        self.ff = FeedForward(d_model).cuda()
    
    def forward(self, x, e_outputs, src_mask, trg_mask):
        x2 = self.norm_1(x)
        x = x + self.dropout_1(self.attn_1(x2, x2, x2, trg_mask))
        x2 = self.norm_2(x)
        x = x + self.dropout_2(self.attn_2(x2, e_outputs, e_outputs, src_mask))
        x2 = self.norm_3(x)
        x =  x + self.dropout_3(self.ff(x2))
        return x
def get_clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])

In [10]:
class Encoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(EncoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
        
    def forward(self, src, mask):
        x = self.embed(src)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, mask)
        return self.norm(x)
    
class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, N, heads):
        super().__init__()
        self.N = N
        self.embed = Embedder(vocab_size, d_model)
        self.pe = PositionalEncoder(d_model)
        self.layers = get_clones(DecoderLayer(d_model, heads), N)
        self.norm = Norm(d_model)
    def forward(self, trg, e_outputs, src_mask, trg_mask):
        x = self.embed(trg)
        x = self.pe(x)
        for i in range(self.N):
            x = self.layers[i](x, e_outputs, src_mask, trg_mask)
        return self.norm(x)

In [11]:
class Transformer(nn.Module):
    def __init__(self, src_vocab, trg_vocab, d_model, N, heads):
        super().__init__()
        self.encoder = Encoder(src_vocab, d_model, N, heads)
        self.decoder = Decoder(trg_vocab, d_model, N, heads)
        self.out = nn.Linear(d_model, trg_vocab)
    def forward(self, src, trg, src_mask, trg_mask):
        e_outputs = self.encoder(src, src_mask)
        d_output = self.decoder(trg, e_outputs, src_mask, trg_mask)
        output = self.out(d_output)
        return output# we don't perform softmax on the output as this will be handled 
# automatically by our loss function

### 3. Chuẩn bị và tiền xử lý dữ liệu

In [12]:
import spacy
import re

# Tokenize

class tokenize(object):
    
    def __init__(self, lang):
        self.nlp = spacy.load(lang)
            
    def tokenizer(self, sentence):
        sentence = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
        sentence = re.sub(r"[ ]+", " ", sentence)
        sentence = re.sub(r"\!+", "!", sentence)
        sentence = re.sub(r"\,+", ",", sentence)
        sentence = re.sub(r"\?+", "?", sentence)
        sentence = sentence.lower()
        return [tok.text for tok in self.nlp.tokenizer(sentence) if tok.text != " "]

In [None]:
# # Creating batch
# from torchtext.legacy import data
# import numpy as np
# from torch.autograd import Variable


# def nopeak_mask(size, opt):
#     np_mask = np.triu(np.ones((1, size, size)),k=1).astype('uint8')
#     np_mask =  Variable(torch.from_numpy(np_mask) == 0)
#     np_mask = np_mask.to(device)
#     return np_mask

# def create_masks(src, trg, opt):
    
#     src_mask = (src != opt.src_pad).unsqueeze(-2)

#     if trg is not None:
#         trg.to(device)
#         trg_mask = (trg != opt.trg_pad).unsqueeze(-2).to(device)
#         size = trg.size(1) # get seq_len for matrix
#         np_mask = nopeak_mask(size, opt)
#         trg_mask = trg_mask & np_mask
        
#     else:
#         trg_mask = None
#     return src_mask, trg_mask

# # patch on Torchtext's batching process that makes it more efficient
# # from http://nlp.seas.harvard.edu/2018/04/03/attention.html#position-wise-feed-forward-networks

# class MyIterator(data.Iterator):
#     def create_batches(self):
#         if self.train:
#             def pool(d, random_shuffler):
#                 for p in data.batch(d, self.batch_size * 100):
#                     p_batch = data.batch(
#                         sorted(p, key=self.sort_key),
#                         self.batch_size, self.batch_size_fn)
#                     for b in random_shuffler(list(p_batch)):
#                         yield b
#             self.batches = pool(self.data(), self.random_shuffler)
            
#         else:
#             self.batches = []
#             for b in data.batch(self.data(), self.batch_size,
#                                           self.batch_size_fn):
#                 self.batches.append(sorted(b, key=self.sort_key))

# global max_src_in_batch, max_tgt_in_batch

# def batch_size_fn(new, count, sofar):
#     "Keep augmenting batch and calculate total number of tokens + padding."
#     global max_src_in_batch, max_tgt_in_batch
#     if count == 1:
#         max_src_in_batch = 0
#         max_tgt_in_batch = 0
#     max_src_in_batch = max(max_src_in_batch,  len(new.src))
#     max_tgt_in_batch = max(max_tgt_in_batch,  len(new.trg) + 2)
#     src_elements = count * max_src_in_batch
#     tgt_elements = count * max_tgt_in_batch
#     return max(src_elements, tgt_elements)

ModuleNotFoundError: No module named 'torchtext.legacy'

In [28]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence

# Assuming you have device defined somewhere
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def nopeak_mask(size, opt):
    """Create mask to prevent attention to future positions"""
    np_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    np_mask = torch.from_numpy(np_mask) == 0
    np_mask = np_mask.to(device)
    return np_mask


def create_masks(src, trg, opt):
    """Create source and target masks"""
    src_mask = (src != opt.src_pad).unsqueeze(-2)

    if trg is not None:
        trg = trg.to(device)
        trg_mask = (trg != opt.trg_pad).unsqueeze(-2).to(device)
        size = trg.size(1)  # get seq_len for matrix
        np_mask = nopeak_mask(size, opt)
        trg_mask = trg_mask & np_mask
    else:
        trg_mask = None
    
    return src_mask, trg_mask


class TranslationDataset(Dataset):
    def __init__(self, src_data, trg_data):
        self.src_data = src_data
        self.trg_data = trg_data
        
    def __len__(self):
        return len(self.src_data)
    
    def __getitem__(self, idx):
        return {
            'src': torch.tensor(self.src_data[idx], dtype=torch.long),
            'trg': torch.tensor(self.trg_data[idx], dtype=torch.long)
        }


def collate_fn(batch, src_pad_idx=0, trg_pad_idx=0):
   
    src_batch = [item['src'] for item in batch]
    trg_batch = [item['trg'] for item in batch]
    
    # Pad sequences
    src_padded = pad_sequence(src_batch, batch_first=True, padding_value=src_pad_idx)
    trg_padded = pad_sequence(trg_batch, batch_first=True, padding_value=trg_pad_idx)
    
    return src_padded, trg_padded


class MyIterator:
    def __init__(self, dataset, batch_size, device, train=True, 
                 shuffle=True, sort_key=None):
        self.dataset = dataset
        self.batch_size = batch_size
        self.device = device
        self.train = train
        self.shuffle = shuffle
        self.sort_key = sort_key if sort_key else (lambda x: len(x['src']))
        
    def __iter__(self):
        if self.train:
            # Create pool of batches for training
            indices = list(range(len(self.dataset)))
            if self.shuffle:
                np.random.shuffle(indices)
            
            # Pool batches of size batch_size * 100, sort them, then create smaller batches
            pool_size = self.batch_size * 100
            for i in range(0, len(indices), pool_size):
                pool_indices = indices[i:i + pool_size]
                pool_data = [self.dataset[idx] for idx in pool_indices]
                
                # Sort pool by source length
                pool_data_sorted = sorted(pool_data, key=self.sort_key)
                
                # Create batches from sorted pool
                for j in range(0, len(pool_data_sorted), self.batch_size):
                    batch_data = pool_data_sorted[j:j + self.batch_size]
                    if len(batch_data) > 0:
                        yield self._create_batch(batch_data)
        else:
            # For validation/test, just create batches in order
            for i in range(0, len(self.dataset), self.batch_size):
                batch_indices = list(range(i, min(i + self.batch_size, len(self.dataset))))
                batch_data = [self.dataset[idx] for idx in batch_indices]
                batch_data_sorted = sorted(batch_data, key=self.sort_key)
                yield self._create_batch(batch_data_sorted)
    
    def _create_batch(self, batch_data):
        """Create a batch from list of data"""
        src_batch = [item['src'] for item in batch_data]
        trg_batch = [item['trg'] for item in batch_data]
        
        src_padded = pad_sequence(src_batch, batch_first=True, padding_value=0)
        trg_padded = pad_sequence(trg_batch, batch_first=True, padding_value=0)
        
        return src_padded.to(self.device), trg_padded.to(self.device)
    
    def __len__(self):
        return (len(self.dataset) + self.batch_size - 1) // self.batch_size


def batch_size_fn(batch_data, max_src_len=0, max_trg_len=0):
    count = len(batch_data)
    
    for item in batch_data:
        max_src_len = max(max_src_len, len(item['src']))
        max_trg_len = max(max_trg_len, len(item['trg']) + 2)  # +2 for BOS and EOS
    
    src_elements = count * max_src_len
    trg_elements = count * max_trg_len
    
    return max(src_elements, trg_elements)




In [33]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import spacy
from collections import Counter, OrderedDict
import os

# Device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# ==================== VOCABULARY CLASS ====================
class Vocabulary:
    """Custom vocabulary class to replace torchtext.vocab"""
    def __init__(self, counter=None, specials=['<pad>', '<unk>'], min_freq=1):
        self.itos = []  # index to string
        self.stoi = {}  # string to index
        self.freqs = counter if counter else Counter()
        
        # Add special tokens first
        for token in specials:
            self.add_token(token)
        
        # Add other tokens based on frequency
        if counter:
            for token, freq in counter.items():
                if freq >= min_freq and token not in self.stoi:
                    self.add_token(token)
    
    def add_token(self, token):
        if token not in self.stoi:
            idx = len(self.itos)
            self.itos.append(token)
            self.stoi[token] = idx
    
    def __len__(self):
        return len(self.itos)
    
    def __getitem__(self, token):
        return self.stoi.get(token, self.stoi.get('<unk>', 1))


class Field:
    """Custom Field class to replace torchtext.legacy.data.Field"""
    def __init__(self, tokenize=None, lower=False, init_token=None, eos_token=None, 
                 pad_token='<pad>', unk_token='<unk>'):
        self.tokenize = tokenize if tokenize else str.split
        self.lower = lower
        self.init_token = init_token
        self.eos_token = eos_token
        self.pad_token = pad_token
        self.unk_token = unk_token
        self.vocab = None
        
    def preprocess(self, text):
        """Tokenize and lowercase if needed"""
        if self.lower:
            text = text.lower()
        tokens = self.tokenize(text)
        
        # Add special tokens
        if self.init_token:
            tokens = [self.init_token] + tokens
        if self.eos_token:
            tokens = tokens + [self.eos_token]
        
        return tokens
    
    def build_vocab(self, dataset, min_freq=1):
        """Build vocabulary from dataset"""
        counter = Counter()
        for example in dataset:
            tokens = self.preprocess(example)
            counter.update(tokens)
        
        specials = [self.pad_token, self.unk_token]
        if self.init_token:
            specials.append(self.init_token)
        if self.eos_token:
            specials.append(self.eos_token)
        
        self.vocab = Vocabulary(counter, specials=specials, min_freq=min_freq)
    
    def numericalize(self, text):
        """Convert text to indices"""
        tokens = self.preprocess(text)
        return [self.vocab[token] for token in tokens]


# ==================== TOKENIZER ====================
class Tokenizer:
    """Wrapper for spacy tokenizer"""
    def __init__(self, lang):
        self.lang = lang
        try:
            self.nlp = spacy.load(lang)
        except:
            print(f"Spacy model '{lang}' not found. Installing...")
            os.system(f'python -m spacy download {lang}')
            self.nlp = spacy.load(lang)
    
    def tokenizer(self, text):
        return [tok.text for tok in self.nlp.tokenizer(text)]


def tokenize(lang):
    """Create tokenizer for given language"""
    lang_dict = {
        'en': 'en_core_web_sm',
        'fr': 'fr_core_news_sm',
        'de': 'de_core_news_sm',
        'es': 'es_core_news_sm',
        'pt': 'pt_core_news_sm',
        'it': 'it_core_news_sm',
        'nl': 'nl_core_news_sm'
    }
    
    lang_code = lang[0:2]
    if lang_code in lang_dict:
        return Tokenizer(lang_dict[lang_code])
    else:
        raise ValueError(f"Language {lang} not supported")


# ==================== DATASET ====================
class TranslationDataset(Dataset):
    """Custom Dataset for translation tasks"""
    def __init__(self, src_data, trg_data, src_field, trg_field):
        self.src_data = src_data
        self.trg_data = trg_data
        self.src_field = src_field
        self.trg_field = trg_field
        
    def __len__(self):
        return len(self.src_data)
    
    def __getitem__(self, idx):
        src_text = self.src_data[idx]
        trg_text = self.trg_data[idx]
        
        # Numericalize
        src_indices = self.src_field.numericalize(src_text)
        trg_indices = self.trg_field.numericalize(trg_text)
        
        return {
            'src': torch.tensor(src_indices, dtype=torch.long),
            'trg': torch.tensor(trg_indices, dtype=torch.long)
        }


# ==================== ITERATOR ====================
class MyIterator:
    """Custom iterator with batch pooling and sorting"""
    def __init__(self, dataset, batch_size, device, train=True, 
                 shuffle=True, sort_key=None, batch_size_fn=None, repeat=False):
        self.dataset = dataset
        self.batch_size = batch_size
        self.device = device
        self.train = train
        self.shuffle = shuffle
        self.sort_key = sort_key if sort_key else (lambda x: len(x['src']))
        self.batch_size_fn = batch_size_fn
        self.repeat = repeat
        
    def __iter__(self):
        while True:
            if self.train:
                # Create pool of batches for training
                indices = list(range(len(self.dataset)))
                if self.shuffle:
                    np.random.shuffle(indices)
                
                # Pool batches of size batch_size * 100, sort them, then create smaller batches
                pool_size = self.batch_size * 100
                for i in range(0, len(indices), pool_size):
                    pool_indices = indices[i:i + pool_size]
                    pool_data = [self.dataset[idx] for idx in pool_indices]
                    
                    # Sort pool by source and target length
                    pool_data_sorted = sorted(pool_data, key=self.sort_key)
                    
                    # Create batches from sorted pool
                    for j in range(0, len(pool_data_sorted), self.batch_size):
                        batch_data = pool_data_sorted[j:j + self.batch_size]
                        if len(batch_data) > 0:
                            yield self._create_batch(batch_data)
            else:
                # For validation/test, just create batches in order
                for i in range(0, len(self.dataset), self.batch_size):
                    batch_indices = list(range(i, min(i + self.batch_size, len(self.dataset))))
                    batch_data = [self.dataset[idx] for idx in batch_indices]
                    batch_data_sorted = sorted(batch_data, key=self.sort_key)
                    yield self._create_batch(batch_data_sorted)
            
            if not self.repeat:
                break
    
    def _create_batch(self, batch_data):
        """Create a batch from list of data"""
        src_batch = [item['src'] for item in batch_data]
        trg_batch = [item['trg'] for item in batch_data]
        
        src_padded = pad_sequence(src_batch, batch_first=True, padding_value=0)
        trg_padded = pad_sequence(trg_batch, batch_first=True, padding_value=0)
        
        # Create simple batch object
        class Batch:
            def __init__(self, src, trg):
                self.src = src
                self.trg = trg
        
        return Batch(src_padded.to(self.device), trg_padded.to(self.device))
    
    def __len__(self):
        return (len(self.dataset) + self.batch_size - 1) // self.batch_size


# ==================== MASKS ====================
def nopeak_mask(size, opt):
    """Create causal mask to prevent attention to future positions"""
    np_mask = np.triu(np.ones((1, size, size)), k=1).astype('uint8')
    np_mask = torch.from_numpy(np_mask) == 0
    np_mask = np_mask.to(device)
    return np_mask


def create_masks(src, trg, opt):
    """Create masks for Transformer"""
    src_mask = (src != opt.src_pad).unsqueeze(1)
    
    if trg is not None:
        trg_pad_mask = (trg != opt.trg_pad).unsqueeze(1)
        size = trg.size(1)
        nopeak = nopeak_mask(size, opt)
        trg_mask = trg_pad_mask & nopeak
    else:
        trg_mask = None
    
    return src_mask, trg_mask


# ==================== BATCH SIZE FUNCTION ====================
global max_src_in_batch, max_tgt_in_batch

def batch_size_fn(new, count, sofar):
    """Keep augmenting batch and calculate total number of tokens + padding."""
    global max_src_in_batch, max_tgt_in_batch
    if count == 1:
        max_src_in_batch = 0
        max_tgt_in_batch = 0
    max_src_in_batch = max(max_src_in_batch, len(new['src']))
    max_tgt_in_batch = max(max_tgt_in_batch, len(new['trg']))
    src_elements = count * max_src_in_batch
    tgt_elements = count * max_tgt_in_batch
    return max(src_elements, tgt_elements)


# ==================== DATA LOADING FUNCTIONS ====================
def read_data(opt):
    """Read source and target data files"""
    if opt.src_data is not None:
        try:
            with open(opt.src_data, 'r', encoding='utf-8') as f:
                opt.src_data = [line.strip() for line in f if line.strip()]
            print(f"✓ Loaded {len(opt.src_data)} source sentences")
        except Exception as e:
            print(f"ERROR: Cannot read '{opt.src_data}': {e}")
            quit()
    
    if opt.trg_data is not None:
        try:
            with open(opt.trg_data, 'r', encoding='utf-8') as f:
                opt.trg_data = [line.strip() for line in f if line.strip()]
            print(f"✓ Loaded {len(opt.trg_data)} target sentences")
        except Exception as e:
            print(f"ERROR: Cannot read '{opt.trg_data}': {e}")
            quit()
    
    # Check and fix length mismatch
    if len(opt.src_data) != len(opt.trg_data):
        min_len = min(len(opt.src_data), len(opt.trg_data))
        print(f"⚠ WARNING: Source ({len(opt.src_data)}) and target ({len(opt.trg_data)}) lengths differ!")
        print(f"⚠ Truncating to {min_len} parallel sentences.")
        opt.src_data = opt.src_data[:min_len]
        opt.trg_data = opt.trg_data[:min_len]


def create_fields(opt):
    """Create source and target fields with tokenizers"""
    spacy_langs = ['en', 'fr', 'de', 'es', 'pt', 'it', 'nl']
    src_lang = opt.src_lang[0:2]
    trg_lang = opt.trg_lang[0:2]
    
    if src_lang not in spacy_langs:
        print(f'ERROR: Invalid src language: {opt.src_lang}')
        print(f'Supported: {spacy_langs}')
        quit()
    if trg_lang not in spacy_langs:
        print(f'ERROR: Invalid trg language: {opt.trg_lang}')
        print(f'Supported: {spacy_langs}')
        quit()
    
    print("Loading spacy tokenizers...")
    
    t_src = tokenize(opt.src_lang)
    t_trg = tokenize(opt.trg_lang)
    
    TRG = Field(lower=True, tokenize=t_trg.tokenizer, init_token='<sos>', eos_token='<eos>')
    SRC = Field(lower=True, tokenize=t_src.tokenizer)

    return SRC, TRG


def create_dataset(opt, SRC, TRG):
    """Create dataset and iterator"""
    print("\nCreating dataset and iterator...")
    
    # Verify lengths match
    assert len(opt.src_data) == len(opt.trg_data), \
        f"Data length mismatch: src={len(opt.src_data)}, trg={len(opt.trg_data)}"
    
    raw_data = {'src': opt.src_data, 'trg': opt.trg_data}
    df = pd.DataFrame(raw_data, columns=["src", "trg"])
    
    print(f"Total parallel sentences: {len(df)}")
    
    # Filter by max string length
    mask = (df['src'].str.count(' ') < opt.max_strlen) & (df['trg'].str.count(' ') < opt.max_strlen)
    df = df.loc[mask]
    
    print(f"After filtering (max_strlen={opt.max_strlen}): {len(df)}")
    
    if len(df) == 0:
        print("ERROR: No data left after filtering! Try increasing max_strlen.")
        quit()
    
    src_data = df['src'].tolist()
    trg_data = df['trg'].tolist()
    
    # Build vocabularies
    print("Building vocabularies...")
    SRC.build_vocab(src_data)
    TRG.build_vocab(trg_data)
    
    print(f"✓ Source vocab size: {len(SRC.vocab)}")
    print(f"✓ Target vocab size: {len(TRG.vocab)}")
    
    # Set padding indices
    opt.src_pad = SRC.vocab.stoi['<pad>']
    opt.trg_pad = TRG.vocab.stoi['<pad>']
    
    # Create dataset
    dataset = TranslationDataset(src_data, trg_data, SRC, TRG)
    
    # Create iterator
    train_iter = MyIterator(
        dataset, 
        batch_size=opt.batchsize, 
        device=device,
        repeat=False, 
        sort_key=lambda x: (len(x['src']), len(x['trg'])),
        batch_size_fn=batch_size_fn, 
        train=True, 
        shuffle=True
    )

    opt.train_len = get_len(train_iter)
    print(f"✓ Number of batches: {opt.train_len}\n")

    return train_iter


def get_len(train):
    """Get length of iterator"""
    count = 0
    for batch in train:
        count += 1
    return count

### 4. Cài đặt giải thuật tối ưu và huấn luyện mô hình

In [18]:
# Optimizer
class CosineWithRestarts(torch.optim.lr_scheduler._LRScheduler):
    """
    Cosine annealing with restarts.
    Parameters
    ----------
    optimizer : torch.optim.Optimizer
    T_max : int
        The maximum number of iterations within the first cycle.
    eta_min : float, optional (default: 0)
        The minimum learning rate.
    last_epoch : int, optional (default: -1)
        The index of the last epoch.
    """

    def __init__(self,
                 optimizer: torch.optim.Optimizer,
                 T_max: int,
                 eta_min: float = 0.,
                 last_epoch: int = -1,
                 factor: float = 1.) -> None:
        # pylint: disable=invalid-name
        self.T_max = T_max
        self.eta_min = eta_min
        self.factor = factor
        self._last_restart: int = 0
        self._cycle_counter: int = 0
        self._cycle_factor: float = 1.
        self._updated_cycle_len: int = T_max
        self._initialized: bool = False
        super(CosineWithRestarts, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        """Get updated learning rate."""
        # HACK: We need to check if this is the first time get_lr() was called, since
        # we want to start with step = 0, but _LRScheduler calls get_lr with
        # last_epoch + 1 when initialized.
        if not self._initialized:
            self._initialized = True
            return self.base_lrs

        step = self.last_epoch + 1
        self._cycle_counter = step - self._last_restart

        lrs = [
            (
                self.eta_min + ((lr - self.eta_min) / 2) *
                (
                    np.cos(
                        np.pi *
                        ((self._cycle_counter) % self._updated_cycle_len) /
                        self._updated_cycle_len
                    ) + 1
                )
            ) for lr in self.base_lrs
        ]

        if self._cycle_counter % self._updated_cycle_len == 0:
            # Adjust the cycle length.
            self._cycle_factor *= self.factor
            self._cycle_counter = 0
            self._updated_cycle_len = int(self._cycle_factor * self.T_max)
            self._last_restart = step

        return lrs


In [29]:
import os
import urllib.request

# Tạo thư mục data
os.makedirs('data', exist_ok=True)

# Download files
urls = {
    'data/english.txt': 'https://raw.githubusercontent.com/SamLynnEvans/Transformer/master/data/english.txt',
    'data/french.txt': 'https://raw.githubusercontent.com/SamLynnEvans/Transformer/master/data/french.txt'
}

for filepath, url in urls.items():
    print(f"Downloading {filepath}...")
    urllib.request.urlretrieve(url, filepath)
    print(f"Done!")

Downloading data/english.txt...


KeyboardInterrupt: 

In [30]:

def get_model(opt, src_vocab, trg_vocab):
    
    assert opt.d_model % opt.heads == 0
    assert opt.dropout < 1

    model = Transformer(src_vocab, trg_vocab, opt.d_model, opt.n_layers, opt.heads)
       
    if opt.load_weights is not None:
        print("loading pretrained weights...")
        model.load_state_dict(torch.load(f'{opt.load_weights}/model_weights'))
    else:
        for p in model.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p) 
    
    if opt.device == 0:
        model = model.cuda()
    
    return model

In [None]:
# import time

# def train_model(model, opt):
    
#     print("training model...")
#     model.train()
#     start = time.time()
#     if opt.checkpoint > 0:
#         cptime = time.time()
                 
#     for epoch in range(opt.epochs):

#         total_loss = 0
#         print("   %dm: epoch %d [%s]  %d%%  loss = %s" %\
#             ((time.time() - start)//60, epoch + 1, "".join(' '*20), 0, '...'), end='\r')
        
#         if opt.checkpoint > 0:
#             torch.save(model.state_dict(), 'weights/model_weights')
                    
#         for i, batch in enumerate(opt.train): 

#             src = batch.src.transpose(0,1).to(device)
#             trg = batch.trg.transpose(0,1).to(device)
#             trg_input = trg[:, :-1].to(device)
#             src_mask, trg_mask = create_masks(src, trg_input, opt)
#             preds = model(src, trg_input, src_mask, trg_mask)
#             ys = trg[:, 1:].contiguous().view(-1)
#             opt.optimizer.zero_grad()
#             loss = F.cross_entropy(preds.view(-1, preds.size(-1)), ys, ignore_index=opt.trg_pad)
#             loss.backward()
#             opt.optimizer.step()
          
#             total_loss += loss.item()
            
#             if (i + 1) % opt.printevery == 0:
#                 p = int(100 * (i + 1) / opt.train_len)
#                 avg_loss = total_loss/opt.printevery
#                 print("   %dm: epoch %d [%s%s]  %d%%  loss = %.3f" %\
#                     ((time.time() - start)//60, epoch + 1, "".join('#'*(p//5)), "".join(' '*(20-(p//5))), p, avg_loss))
#                 total_loss = 0
            
#             if opt.checkpoint > 0 and ((time.time()-cptime)//60) // opt.checkpoint >= 1:
#                 torch.save(model.state_dict(), 'weights/model_weights')
#                 cptime = time.time()
   
   
#         print("%dm: epoch %d [%s%s]  %d%%  loss = %.3f\nepoch %d complete, loss = %.03f" %\
#         ((time.time() - start)//60, epoch + 1, "".join('#'*(100//5)), "".join(' '*(20-(100//5))), 100, avg_loss, epoch + 1, avg_loss))

# class Opt(object):
#     pass
        
# def main():
#     opt = Opt()
#     opt.src_data = "data/english.txt"
#     opt.trg_data = "data/french.txt"
#     opt.src_lang = "en_core_web_sm"
#     opt.trg_lang = 'fr_core_news_sm'
#     opt.epochs = 2
#     opt.d_model=512
#     opt.n_layers=6
#     opt.heads=8
#     opt.dropout=0.1
#     opt.batchsize=1500
#     opt.printevery=100
#     opt.lr=0.0001
#     opt.max_strlen=80
#     opt.checkpoint = 0
#     opt.no_cuda = False
#     opt.load_weights = None
    
#     opt.device = 0
#     if opt.device == 0:
#         assert torch.cuda.is_available()
    
#     read_data(opt)
#     SRC, TRG = create_fields(opt)
#     opt.train = create_dataset(opt, SRC, TRG)
#     model = get_model(opt, len(SRC.vocab), len(TRG.vocab)).to(device)

#     opt.optimizer = torch.optim.Adam(model.parameters(), lr=opt.lr, betas=(0.9, 0.98), eps=1e-9)

#     if opt.checkpoint > 0:
#         print("model weights will be saved every %d minutes and at end of epoch to directory weights/"%(opt.checkpoint))
    
#     train_model(model, opt)


#     # for asking about further training use while true loop, and return
# if __name__ == "__main__":
#     main()

loading spacy tokenizers...
creating dataset and iterator... 


ValueError: All arrays must be of the same length

In [34]:
import time
import torch
import torch.nn.functional as F
import os

def train_model(model, opt):
    """Train the transformer model"""
    print("training model...")
    model.train()
    start = time.time()
    
    if opt.checkpoint > 0:
        os.makedirs('weights', exist_ok=True)
        cptime = time.time()
                 
    for epoch in range(opt.epochs):
        total_loss = 0
        print("   %dm: epoch %d [%s]  %d%%  loss = %s" %\
            ((time.time() - start)//60, epoch + 1, "".join(' '*20), 0, '...'), end='\r')
        
        if opt.checkpoint > 0:
            torch.save(model.state_dict(), 'weights/model_weights')
        
        # ✅ FIX: Recreate iterator for each epoch
        for i, batch in enumerate(opt.train): 
            # ✅ FIX: Remove transpose - data is already in correct shape (batch_first=True)
            src = batch.src.to(device)  # (batch, src_len)
            trg = batch.trg.to(device)  # (batch, trg_len)
            
            # Target input: remove last token
            trg_input = trg[:, :-1].to(device)
            
            # Create masks
            src_mask, trg_mask = create_masks(src, trg_input, opt)
            
            # Forward pass
            preds = model(src, trg_input, src_mask, trg_mask)
            
            # Target output: remove first token (<sos>)
            ys = trg[:, 1:].contiguous().view(-1)
            
            # Backward pass
            opt.optimizer.zero_grad()
            loss = F.cross_entropy(preds.view(-1, preds.size(-1)), ys, ignore_index=opt.trg_pad)
            loss.backward()
            opt.optimizer.step()
          
            total_loss += loss.item()
            
            # Print progress
            if (i + 1) % opt.printevery == 0:
                p = int(100 * (i + 1) / opt.train_len)
                avg_loss = total_loss / opt.printevery
                print("   %dm: epoch %d [%s%s]  %d%%  loss = %.3f" %\
                    ((time.time() - start)//60, epoch + 1, "".join('#'*(p//5)), 
                     "".join(' '*(20-(p//5))), p, avg_loss), end='\r')
                total_loss = 0
            
            # Save checkpoint
            if opt.checkpoint > 0 and ((time.time()-cptime)//60) // opt.checkpoint >= 1:
                torch.save(model.state_dict(), 'weights/model_weights')
                cptime = time.time()
        
        # Epoch complete
        avg_loss = total_loss / max(1, opt.train_len % opt.printevery)
        print("\n%dm: epoch %d [%s%s]  %d%%  loss = %.3f" %\
            ((time.time() - start)//60, epoch + 1, "".join('#'*(100//5)), 
             "".join(' '*(20-(100//5))), 100, avg_loss))
        print("epoch %d complete, loss = %.03f\n" % (epoch + 1, avg_loss))


class Opt(object):
    """Options class for hyperparameters"""
    pass


def main():
    """Main training function"""
    opt = Opt()
    
    # Data paths
    opt.src_data = "data/english.txt"
    opt.trg_data = "data/french.txt"
    
    # Languages
    opt.src_lang = "en_core_web_sm"
    opt.trg_lang = 'fr_core_news_sm'
    
    # Training hyperparameters
    opt.epochs = 10
    opt.d_model = 512
    opt.n_layers = 6
    opt.heads = 8
    opt.dropout = 0.1
    opt.batchsize = 32  # ✅ FIX: Reduced from 1500 to reasonable size
    opt.printevery = 100
    opt.lr = 0.0001
    opt.max_strlen = 80  # Maximum sequence length
    opt.checkpoint = 0  # Save every N minutes (0 = no checkpoints)
    
    # Device settings
    opt.no_cuda = False
    opt.load_weights = None
    opt.device = 0
    
    # Check CUDA availability
    if opt.device == 0:
        if not torch.cuda.is_available():
            print("WARNING: CUDA not available, using CPU")
            opt.device = -1
    
    # Load data
    print("Loading data...")
    read_data(opt)
    
    # Create fields and vocabulary
    print("Creating fields...")
    SRC, TRG = create_fields(opt)
    
    # Create dataset and iterator
    print("Creating dataset...")
    opt.train = create_dataset(opt, SRC, TRG)
    
    # Print vocabulary sizes
    print(f"Source vocabulary size: {len(SRC.vocab)}")
    print(f"Target vocabulary size: {len(TRG.vocab)}")
    print(f"Training samples: {opt.train_len}")
    
    # Create model
    print("Creating model...")
    model = get_model(opt, len(SRC.vocab), len(TRG.vocab))
    model = model.to(device)
    
    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")
    
    # Create optimizer
    opt.optimizer = torch.optim.Adam(
        model.parameters(), 
        lr=opt.lr, 
        betas=(0.9, 0.98), 
        eps=1e-9
    )
    
    # Checkpoint info
    if opt.checkpoint > 0:
        print(f"Model weights will be saved every {opt.checkpoint} minutes and at end of epoch to directory weights/")
    
    # Train model
    train_model(model, opt)
    
    # Save final model
    print("\nTraining complete! Saving final model...")
    os.makedirs('weights', exist_ok=True)
    torch.save(model.state_dict(), 'weights/model_weights_final')
    print("Model saved to weights/model_weights_final")


if __name__ == "__main__":
    main()

Loading data...
✓ Loaded 53734 source sentences
✓ Loaded 154883 target sentences
⚠ Truncating to 53734 parallel sentences.
Creating fields...
Loading spacy tokenizers...
Creating dataset...

Creating dataset and iterator...
Total parallel sentences: 53734
After filtering (max_strlen=80): 53734
Building vocabularies...
✓ Source vocab size: 6195
✓ Target vocab size: 11409
✓ Number of batches: 1680

Source vocabulary size: 6195
Target vocabulary size: 11409
Training samples: 1680
Creating model...
Total parameters: 59,006,609
Trainable parameters: 59,006,609
training model...
   3m: epoch 1 [################### ]  95%  loss = 2.932
3m: epoch 1 [####################]  100%  loss = 2.872
epoch 1 complete, loss = 2.872

   9m: epoch 2 [################### ]  95%  loss = 2.288
9m: epoch 2 [####################]  100%  loss = 2.292
epoch 2 complete, loss = 2.292

   19m: epoch 3 [################### ]  95%  loss = 1.835
19m: epoch 3 [####################]  100%  loss = 1.846
epoch 3 complete, 