In [7]:
# Mini Transformer Demo Notebook
# Fully stable training on a fixed copy-task dataset
import sys
import os

# Make the project root importable so that `from src... import ...` resolves in later cells.
# The location can be overridden through the MINI_TRANSFORMER_ROOT environment variable;
# when that variable is unset, the original machine-specific path is used as the fallback,
# so existing behaviour is unchanged.
project_root = os.path.abspath(
    os.environ.get("MINI_TRANSFORMER_ROOT", r'C:\Users\21629\OneDrive\Bureau\Mini-Transformer')
)
# Insert at the front only once so re-running this cell does not pile up duplicates.
if project_root not in sys.path:
    sys.path.insert(0, project_root)
print("sys.path updated:", sys.path)  # For debugging

sys.path updated: ['C:\\Users\\21629\\OneDrive\\Bureau\\Mini-Transformer', 'C:\\Python313\\python313.zip', 'C:\\Python313\\DLLs', 'C:\\Python313\\Lib', 'C:\\Python313', 'c:\\Users\\21629\\OneDrive\\Bureau\\Mini-Transformer\\.venv', '', 'c:\\Users\\21629\\OneDrive\\Bureau\\Mini-Transformer\\.venv\\Lib\\site-packages', 'c:\\Users\\21629\\OneDrive\\Bureau\\Mini-Transformer\\.venv\\Lib\\site-packages\\win32', 'c:\\Users\\21629\\OneDrive\\Bureau\\Mini-Transformer\\.venv\\Lib\\site-packages\\win32\\lib', 'c:\\Users\\21629\\OneDrive\\Bureau\\Mini-Transformer\\.venv\\Lib\\site-packages\\Pythonwin']


In [8]:
import torch
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
from src.transformer import Transformer
from src.masks import create_padding_mask, create_look_ahead_mask

In [9]:
# Show the path of the Python interpreter running this kernel
# (e.g. to check that the expected virtual environment is in use).
print(sys.executable)

c:\Users\21629\OneDrive\Bureau\Mini-Transformer\.venv\Scripts\python.exe


In [None]:
# Fixed dataset (copy task)
def generate_fixed_dataset(num_samples, seq_len, vocab_size, generator=None):
    """Build a random copy-task dataset in which the target equals the source.

    Token ids are drawn uniformly from ``[1, vocab_size - 2]`` (the upper bound
    of ``torch.randint`` is exclusive), so ids ``0`` and ``vocab_size - 1``
    never occur in the data.
    NOTE(review): those two ids are presumably reserved for padding and
    START_TOKEN -- confirm against the notebook's configuration.

    Args:
        num_samples: Number of sequences (rows) to generate.
        seq_len: Number of tokens per sequence (columns).
        vocab_size: Vocabulary size used to bound the sampled ids.
        generator: Optional ``torch.Generator``. Pass a seeded generator to get
            the same dataset on every run; ``None`` (the default) uses the
            global RNG exactly as before.

    Returns:
        A ``(src, tgt)`` tuple of integer tensors of shape
        ``(num_samples, seq_len)``. ``tgt`` is an independent copy of ``src``.
    """
    src = torch.randint(1, vocab_size - 1, (num_samples, seq_len), generator=generator)
    # Clone so that later in-place edits to one tensor cannot affect the other.
    tgt = src.clone()
    return src, tgt

# Generate the fixed training pair once and move both tensors to the compute device.
# NOTE(review): NUM_SAMPLES, SEQ_LEN, SRC_VOCAB_SIZE and DEVICE are not defined in any
# cell visible here -- presumably a configuration cell must run first; confirm.
src_train, tgt_train = (
    split.to(DEVICE)
    for split in generate_fixed_dataset(NUM_SAMPLES, SEQ_LEN, SRC_VOCAB_SIZE)
)

In [None]:
# Initialize model
# Build the network first, then move it to the compute device as a separate step.
model = Transformer(
    SRC_VOCAB_SIZE,
    TGT_VOCAB_SIZE,
    D_MODEL,
    N_HEADS,
    D_FF,
    NUM_ENCODER_LAYERS,
    NUM_DECODER_LAYERS,
)
model = model.to(DEVICE)

# Ensure proper weight initialization (e.g., Xavier for linear layers)
def init_weights(m):
    """Re-initialise a single submodule in place; meant for ``model.apply``.

    * ``nn.Linear``: Xavier-uniform weights and, when a bias exists, zero bias.
    * ``nn.Embedding``: weights drawn from a normal distribution with mean 0
      and std 0.02. If the layer was created with ``padding_idx``, that row is
      reset to zeros afterwards, because ``nn.init.normal_`` would otherwise
      overwrite the all-zero padding vector PyTorch sets up for it.
    * Every other module type is left untouched.

    Args:
        m: The module currently being visited.
    """
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Embedding):
        nn.init.normal_(m.weight, mean=0, std=0.02)
        if m.padding_idx is not None:
            # The padding row receives no gradient, so whatever is written here
            # stays for the whole training run; keep it at zero.
            with torch.no_grad():
                m.weight[m.padding_idx].fill_(0)

# Walk every submodule of the model and re-initialise it with init_weights.
model.apply(fn=init_weights)

# Adam over all model parameters with learning rate LR.
optimizer = optim.Adam(params=model.parameters(), lr=LR)
# Cross-entropy that skips target positions equal to PAD_IDX.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

In [None]:
# Prepare decoder input (prepend start token)
# Shift the targets one step to the right along the sequence axis. The token that
# wraps around into column 0 is immediately replaced by START_TOKEN, so position t
# of the decoder input holds target token t-1.
decoder_input = torch.roll(tgt_train, shifts=1, dims=1)
decoder_input[:, 0] = START_TOKEN

In [None]:
# Training loop
# Full-batch training: each epoch is one forward/backward pass over the whole fixed
# dataset followed by a single optimizer step.

# The dataset never changes between epochs, so the masks, the integer-typed inputs
# and the flattened labels are identical on every iteration; build them once here
# instead of once per epoch.
src_mask = create_padding_mask(src_train, pad_idx=PAD_IDX)
# NOTE(review): combining the two masks with `|` is only correct if both helpers mark
# blocked positions as True -- confirm against src.masks.
tgt_mask = create_padding_mask(decoder_input, pad_idx=PAD_IDX) | create_look_ahead_mask(decoder_input.size(1))
encoder_tokens = src_train.long()          # embedding lookups need integer (long) ids
decoder_tokens = decoder_input.long()
tgt_labels = tgt_train.view(-1).long()     # one label per (sample, position)

# Nothing inside the loop switches the model to eval mode, so enabling train mode once is enough.
model.train()
for epoch in range(1, EPOCHS + 1):
    optimizer.zero_grad()

    output = model(encoder_tokens, decoder_tokens, src_mask=src_mask, tgt_mask=tgt_mask)
    output = output.view(-1, TGT_VOCAB_SIZE)  # flatten to (N, vocab) to line up with tgt_labels

    loss = criterion(output, tgt_labels)
    # Abort before backward() so NaN gradients never reach the weights.
    if torch.isnan(loss):
        print(f"NaN loss detected at epoch {epoch}. Stopping training.")
        break
    loss.backward()

    # Gradient clipping; clip_grad_norm_ returns the total norm measured before clipping,
    # which is why logged values can exceed max_norm.
    grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
    optimizer.step()

    if epoch % 50 == 0:
        print(f"Epoch {epoch}/{EPOCHS}, Loss: {loss.item():.4f}, Grad Norm: {grad_norm:.4f}")


Epoch 50/500, Loss: 2.2290, Grad Norm: 0.5119
Epoch 100/500, Loss: 2.1233, Grad Norm: 0.3248
Epoch 150/500, Loss: 2.0205, Grad Norm: 0.4113
Epoch 200/500, Loss: 1.8999, Grad Norm: 0.4486
Epoch 250/500, Loss: 1.7988, Grad Norm: 0.4568
Epoch 300/500, Loss: 1.7114, Grad Norm: 0.4124
Epoch 350/500, Loss: 1.6195, Grad Norm: 0.3769
Epoch 400/500, Loss: 1.4934, Grad Norm: 0.4021
Epoch 450/500, Loss: 1.3481, Grad Norm: 0.4804
Epoch 500/500, Loss: 1.1859, Grad Norm: 0.4570


In [None]:
# Save the trained model
# Persist only the learned parameters (the state dict), written relative to the
# current working directory.
trained_weights = model.state_dict()
torch.save(trained_weights, "mini_transformer.pth")
print("Training finished and model saved ✅")

Training finished and model saved ✅


In [None]:
# Test on a sequence from fixed dataset
# Teacher-forced sanity check on the first training example: the decoder is fed the
# ground-truth target shifted right, so this is not free-running generation.
src_test = src_train[:1]
tgt_test = tgt_train[:1]

# Shift right by one; the wrapped-around token in column 0 is replaced by START_TOKEN.
decoder_input_test = torch.roll(tgt_test, shifts=1, dims=1)
decoder_input_test[:, 0] = START_TOKEN

model.eval()
with torch.no_grad():
    src_mask = create_padding_mask(src_test, pad_idx=PAD_IDX)
    tgt_mask = create_padding_mask(decoder_input_test, pad_idx=PAD_IDX) | create_look_ahead_mask(decoder_input_test.size(1))

    logits = model(src_test.long(), decoder_input_test.long(), src_mask=src_mask, tgt_mask=tgt_mask)
    # Highest-scoring vocabulary id at every position.
    pred_tokens = logits.argmax(dim=-1)


In [None]:
# Show results
# Print source, target and prediction under one another; the labels are padded to
# the same width so the arrays line up.
for label, tensor in (
    ("Input sequence:     ", src_test),
    ("Target sequence:    ", tgt_test),
    ("Predicted sequence: ", pred_tokens),
):
    print(label, tensor.cpu().numpy())

Input sequence:      [[3 5 7 4 4]]
Target sequence:     [[3 5 7 4 4]]
Predicted sequence:  [[3 5 4 4 4]]
