# Train transformer

In [1]:
import torch
import torch.nn as nn

from transformer_implementation import Transformer, Tokenizer, TransformerConfig, DataLoaderFactory
from utils import training_loop, plot_losses, estimate_loss

## Init
### Tokenizer

In [2]:
# Build the tokenizer; both the config and the dataset cells below receive it.
tokenizer = Tokenizer()

### Config

In [3]:
# Hyperparameters for this run. Anything not passed here keeps the
# TransformerConfig default (the printed config shows the resolved values).
config = TransformerConfig(
    tokenizer,
    block_size=256,
    batch_size=12,
    n_layer=3,  # alternative value noted by the author: 6
    n_head=8,
    # n_embd=512,  # not passed; the default is used instead
    max_iters=2000,
    eval_iters=50,
    eval_interval=100,
)
print(config)

TransformerConfig(
	self.tokenizer=<transformer_implementation.Tokenizer.Tokenizer object at 0x0000022A0C6BFE90>,
	self.block_size=256,
	self.batch_size=12,
	self.n_layer=3,
	self.n_head=8,
	self.n_embd=256,
	self.dropout=0.1,
	self.bias=False,
	self.device='cuda',
	self.learning_rate=0.0003,
	self.max_iters=2000,
	self.eval_interval=100,
	self.eval_iters=50,
	self.visualize=False,
)


### Loading dataset

In [4]:
# Instantiate the dataset wrapper from the config's block/batch sizes and device.
# NOTE(review): the last argument looks like a cap on the number of training
# examples (the printed summary reports "Train: 5000000") — confirm against
# DataLoaderFactory.
dataset = DataLoaderFactory(
    config.block_size,
    config.batch_size,
    tokenizer,
    config.device,
    5_000_000,
)
len(dataset)

Found cached dataset wmt14 (C:/Users/thiba/.cache/huggingface/datasets/wmt14/fr-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4)
Found cached dataset wmt14 (C:/Users/thiba/.cache/huggingface/datasets/wmt14/fr-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4)
Found cached dataset wmt14 (C:/Users/thiba/.cache/huggingface/datasets/wmt14/fr-en/1.0.0/2de185b074515e97618524d69f5e27ee7545dcbed4aa9bc1a4235710ffca33f4)


[95m[1m[4mNumber of data by datasets splits[0m
Train		: 5000000	-> 416666.6666666667
Validation	: 3000		-> 250.0
Test		: 3003		-> 250.25
Total		: 5006003


5006003

### Model

In [5]:
# Instantiate the network and switch it to training mode.
model = Transformer(config)
model.train()

# Wrap the model in nn.DataParallel so work can be spread over several GPUs
# when more than one is available, then move the wrapper to the configured device.
model = nn.DataParallel(model).to(config.device)

# NOTE(review): the saved output of this cell ends with
# "TypeError: Encoder.forward() missing 1 required positional argument: 'idx'".
# The cause is not visible in this notebook — re-run on a fresh kernel to confirm.

# AdamW over every model parameter; the learning rate comes from the config.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config.learning_rate,
    betas=(0.9, 0.98),
    eps=1e-9,
)

Number of Encoder parameters: 28.03M
number of parameters: 28.82M


TypeError: Encoder.forward() missing 1 required positional argument: 'idx'

## Training

### Loop

In [None]:
# Run training; the returned value is kept for the loss plot in the next section.
# NOTE(review): saved_path is presumably where training_loop writes the
# checkpoint — confirm in utils.
losses_list = training_loop(
    model,
    optimizer,
    dataset,
    config,
    saved_path="./out/transformer-train.pth",
)

### Plotting losses

In [None]:
# Visualize the losses returned by training_loop.
plot_losses(losses_list)

## Testing

In [None]:
# Evaluate on the "test" split only; the result is later indexed by split name.
test_loss = estimate_loss(model, dataset, config, ["test"])

In [None]:
# Report the test loss with four decimal places.
# ".4f" sets the precision; the earlier "4f" only set a minimum field width of 4
# and left the precision at the default six digits.
print(f"Test loss = {test_loss['test'].item():.4f}")