In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!pip install torchtext



In [9]:
import torch
import torch.nn as nn
import torch.optim as optim

# ======= 1Ô∏è‚É£ D·ªÆ LI·ªÜU TOY =======
data = [
    ("I love you", "Je t'aime"),
    ("He is a boy", "Il est un gar√ßon"),
    ("She is a girl", "Elle est une fille"),
    ("I am happy", "Je suis heureux"),
    ("We are friends", "Nous sommes amis")
]

# --- Tokenizer & vocab th·ªß c√¥ng ---
def tokenizer(text):
    return text.lower().replace("'", " '").split()

tokens = set()
for src, tgt in data:
    tokens.update(tokenizer(src))
    tokens.update(tokenizer(tgt))

vocab = {w: i+4 for i, w in enumerate(sorted(tokens))}
vocab.update({"<unk>": 0, "<pad>": 1, "<bos>": 2, "<eos>": 3})
inv_vocab = {i: w for w, i in vocab.items()}

def encode(text):
    return [vocab["<bos>"]] + [vocab.get(tok, vocab["<unk>"]) for tok in tokenizer(text)] + [vocab["<eos>"]]

def pad_sequence(seq_list):
    return nn.utils.rnn.pad_sequence(seq_list, batch_first=True, padding_value=vocab["<pad>"])

src_data = [torch.tensor(encode(s)) for s, _ in data]
tgt_data = [torch.tensor(encode(t)) for _, t in data]

src_batch = pad_sequence(src_data)
tgt_batch = pad_sequence(tgt_data)

# ======= 2Ô∏è‚É£ M√î H√åNH =======
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=100):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))  # (1, max_len, d_model)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class TransformerToy(nn.Module):
    def __init__(self, vocab_size, d_model=64, nhead=4, num_layers=2, dim_ff=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_ff,
            dropout=0.1,
            batch_first=True  # <‚îÄ‚îÄ th√™m d√≤ng n√†y
        )
        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, tgt):
        src_emb = self.pos_encoder(self.embedding(src))
        tgt_emb = self.pos_encoder(self.embedding(tgt))
        out = self.transformer(src_emb, tgt_emb)
        return self.fc_out(out)

# ======= 3Ô∏è‚É£ HU·∫§N LUY·ªÜN =======
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerToy(len(vocab)).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])
optimizer = optim.Adam(model.parameters(), lr=0.001)

src_batch, tgt_batch = src_batch.to(device), tgt_batch.to(device)

for epoch in range(51):
    optimizer.zero_grad()
    output = model(src_batch, tgt_batch[:, :-1])
    loss = criterion(output.reshape(-1, output.shape[-1]), tgt_batch[:, 1:].reshape(-1))
    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
        print(f"Epoch {epoch}, Loss: {loss.item():.4f}")

print("‚úÖ Hu·∫•n luy·ªán ho√†n t·∫•t!")

# ======= 4Ô∏è‚É£ D·ªäCH TH·ª¨ =======
def translate(model, sentence):
    model.eval()
    src = torch.tensor(encode(sentence)).unsqueeze(0).to(device)
    tgt = torch.tensor([[vocab["<bos>"]]]).to(device)

    for _ in range(10):
        out = model(src, tgt)
        next_token = out[0, -1].argmax().item()
        tgt = torch.cat([tgt, torch.tensor([[next_token]]).to(device)], dim=1)
        if next_token == vocab["<eos>"]:
            break

    words = [inv_vocab[i] for i in tgt.squeeze().tolist()[1:-1]]
    return " ".join(words)

print("V√≠ d·ª• d·ªãch:")
print("üëâ", translate(model, "I am happy"))


Epoch 0, Loss: 3.5171
Epoch 10, Loss: 1.9606
Epoch 20, Loss: 1.0523
Epoch 30, Loss: 0.5973
Epoch 40, Loss: 0.3459
Epoch 50, Loss: 0.2119
‚úÖ Hu·∫•n luy·ªán ho√†n t·∫•t!
V√≠ d·ª• d·ªãch:
üëâ je suis heureux
