In [None]:
# Load (or reload) the autoreload extension and enable mode 2, so that edits
# to imported project modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import torch

# Seed PyTorch's global random number generator so runs are repeatable.
torch.manual_seed(42)


In [2]:
from data.shakespeare_data_source import ShakespeareDataSource
from data.tokenizer import Tokenizer

# One tokenizer instance is shared by the data source here and by later cells.
tokenizer = Tokenizer()

# Arguments for the project's data-source loader: the text file to read and
# the tokenizer to use with it.
load_kwargs = dict(
    file_path="../datasets/shakespeare/input.txt",
    tokenizer=tokenizer,
)
shakespeare_data_source = ShakespeareDataSource.load(**load_kwargs)


In [3]:
from learning.learner import Config


# Hyperparameters and runtime settings shared by the dataset, model and
# training cells below.
config = Config(
    batch_size=2**10,  # 1024
    sequence_length=2**3,  # 8
    embedding_size=2**5,  # 32
    num_heads=2**2,  # 4
    epochs=100,
    dropout=0.1,
    learning_rate=1e-3,
    # patience / min_delta are forwarded to learner.fit() in the training cell.
    patience=30,
    min_delta=1e-3,
    # Prefer the GPU, but fall back to the CPU so the notebook still runs on
    # machines without CUDA instead of failing at the first tensor allocation.
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

# Allow float32 matrix multiplications to use faster, reduced-precision
# internal formats where the backend supports them.
torch.set_float32_matmul_precision("medium")


In [None]:
from learning.shakespeare_generator.shakespeare_dataset import ShakespeareDataset

# Build the dataset from the loaded data source, using the sequence length
# and device chosen in the configuration cell.
dataset_kwargs = {
    "shakespeare_data_source": shakespeare_data_source,
    "tokenizer": tokenizer,
    "sequence_length": config.sequence_length,
    "device": config.device,
}
shakespeare_dataset = ShakespeareDataset(**dataset_kwargs)

# Show the first sample as a quick sanity check.
print(shakespeare_dataset[0])


In [None]:
from learning.shakespeare_generator.model import ShakespeareGenerator


# Smoke test: generate text from a freshly constructed (untrained) model to
# check that the generation path runs end to end.
test_model = ShakespeareGenerator(
    config=config,
    vocab_size=tokenizer.vocab_size,
)
# Put the model in evaluation mode, as the post-training generation cell
# does, so train-only behaviour (e.g. dropout) does not affect generation.
test_model.eval()

# Prompt tensors keyed by the header printed before their generated text.
prompts = {
    # shape: [2, 1] -- (B=2, S=1)
    "---------- Predict from empty input:": torch.zeros(
        (2, 1), dtype=torch.long, device=config.device
    ),
    # shape: [1, S]
    "---------- Predict from first sample:": shakespeare_dataset[0].input.unsqueeze(0),
}

# No gradients are needed for generation; skip autograd bookkeeping.
with torch.no_grad():
    for title, inputs in prompts.items():
        outputs = test_model.generate(
            inputs,
            max_length=100,
        )

        print(title)
        for i in range(outputs.shape[0]):
            print(tokenizer.i2t(outputs[i].tolist()))


In [None]:
# Split the samples 80% / 10% / 10% into train / validation / test subsets.
# A dedicated, explicitly seeded generator keeps the split identical across
# runs regardless of how much of the global RNG earlier cells have consumed
# (for example, whether the smoke-test cell was executed or skipped).
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    shakespeare_dataset,
    [0.8, 0.1, 0.1],
    generator=split_generator,
)

# Sanity check: the three subsets must partition the full dataset.
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == len(
    shakespeare_dataset
)
print(len(train_dataset), len(val_dataset), len(test_dataset))

In [None]:
import math
import time

# ShakespeareGenerator is imported here as well, so this cell does not
# silently depend on the smoke-test cell having been run first.
from learning.shakespeare_generator.model import (
    Batch,
    ParallelBatchLearner,
    ShakespeareGenerator,
)
from torch.utils.data import DataLoader

from torch import optim
from torch import nn

print(config)

# Fresh model for the real training run (separate from the smoke-test model).
model = ShakespeareGenerator(
    config=config,
    vocab_size=tokenizer.vocab_size,
)

# NOTE(review): the loss is summed over elements (reduction="sum"), while the
# "expected initial loss" printed below is a per-token value, log(vocab_size).
# Presumably the learner normalises the summed loss before reporting it --
# confirm against ParallelBatchLearner.
criterion = nn.CrossEntropyLoss(reduction="sum")
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

learner = ParallelBatchLearner(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    device=config.device,
)
print(learner)

# Training batches are reshuffled on every pass over the data; validation
# batches keep the DataLoader's default (unshuffled) order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
    collate_fn=Batch.from_samples,
)

val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=config.batch_size,
    collate_fn=Batch.from_samples,
)

print(
    "Starting training...\n"
    f"Expecting initial loss around {math.log(tokenizer.vocab_size)}"
)
# perf_counter() is monotonic, so the measured duration cannot be distorted
# by system clock adjustments during a long training run.
start_time = time.perf_counter()
learner.fit(
    train_dataloader=train_dataloader,
    eval_dataloader=val_dataloader,
    num_epochs=config.epochs,
    patience=config.patience,
    min_delta=config.min_delta,
)
elapsed_time = time.perf_counter() - start_time
print(f"Training completed. Elapsed time: {elapsed_time:.2f}s")

In [None]:
# Switch the trained model to evaluation mode before generating text.
model.eval()

# shape: [2, 1] -- (B=2, S=1)
# Create the prompt on the configured device, as the pre-training smoke test
# does; a CPU tensor would not match a model living on config.device.
inputs = torch.zeros((2, 1), dtype=torch.long, device=config.device)

# No gradients are needed for generation; skip autograd bookkeeping.
with torch.no_grad():
    outputs = model.generate(
        inputs,
        max_length=100,
    )

print("---------- Predict from empty input:")
for i in range(outputs.shape[0]):
    print(tokenizer.i2t(outputs[i].tolist()))
