In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

import torch

# Seed PyTorch's global RNG so a fresh Restart-and-Run-All is repeatable.
torch.manual_seed(42)


In [11]:
from data.shakespeare_data_source import ShakespeareDataSource
from data.tokenizer import Tokenizer


# Load the raw Shakespeare corpus from disk (path is relative to this notebook).
shakespeare_data_source = ShakespeareDataSource.load(
    file_path="../../../../datasets/shakespeare/input.txt"
)

# Build the tokenizer and initialise it from the vocabulary exposed by the
# data source.
tokenizer = Tokenizer()
tokenizer.load(shakespeare_data_source.vocab)


In [12]:
from learning.shakespeare_generator.model import Config


# Hyperparameters for the Shakespeare generator.
# The device falls back to CPU when CUDA is unavailable so the notebook still
# runs (slowly) on machines without a GPU instead of failing on first use.
config = Config(
    batch_size=2**12,  # 4096 samples per batch
    sequence_length=2**6,  # 64 tokens per sample
    embedding_size=2**5,  # 32
    num_heads=2**3,  # 8
    num_blocks=2**2,  # 4
    epochs=1,
    dropout=0.1,
    learning_rate=1e-3,
    # NOTE(review): patience / min_delta look like early-stopping settings;
    # confirm how Config consumers use them.
    patience=30,
    min_delta=1e-3,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)


In [None]:
from learning.shakespeare_generator.shakespeare_dataset import ShakespeareDataset

# Wrap the corpus in a dataset of fixed-length samples. The dataset is built
# with a CPU device; later cells move individual tensors to `config.device`.
shakespeare_dataset = ShakespeareDataset(
    device=torch.device("cpu"),
    sequence_length=config.sequence_length,
    tokenizer=tokenizer,
    shakespeare_data_source=shakespeare_data_source,
)

# Show the first sample as a quick sanity check.
first_sample = shakespeare_dataset[0]
print(first_sample)


In [None]:
from learning.shakespeare_generator.model import ShakespeareGenerator


# Smoke test: sample from an untrained model to check that generation runs
# end to end before spending time on training.
test_model = ShakespeareGenerator(
    config=config,
    vocab_size=tokenizer.vocab_size,
)
# The inputs below are created on `config.device`, so make sure the model lives
# there too (a no-op if the constructor already placed it there).
test_model.to(config.device)
# Inference only: turn off dropout and skip autograd bookkeeping.
test_model.eval()

with torch.no_grad():
    # shape: [2, 1] -- (B=2, S=1); every sequence starts from token index 1
    inputs = torch.ones((2, 1), dtype=torch.long, device=config.device)
    outputs = test_model.generate(
        inputs,
        max_length=100,
    )

print("---------- Predict from empty input:")
for i in range(outputs.shape[0]):
    print(tokenizer.i2t(outputs[i].tolist()))

with torch.no_grad():
    # shape: [1, S] -- first dataset sample with a batch dimension added
    inputs = shakespeare_dataset[0].input.unsqueeze(0).to(config.device, non_blocking=True)
    outputs = test_model.generate(
        inputs,
        max_length=100,
    )

print("---------- Predict from first sample:")
for i in range(outputs.shape[0]):
    print(tokenizer.i2t(outputs[i].tolist()))


In [None]:
# Use a dedicated, seeded generator for the split so the train/val/test
# membership does not depend on how much of the global RNG stream earlier
# cells consumed (e.g. re-running the model smoke test).
split_generator = torch.Generator().manual_seed(42)

# 80% train / 10% validation / 10% test.
train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    shakespeare_dataset,
    [0.8, 0.1, 0.1],
    generator=split_generator,
)

# Every sample must land in exactly one split.
assert len(train_dataset) + len(val_dataset) + len(test_dataset) == len(
    shakespeare_dataset
)

print(len(train_dataset), len(val_dataset), len(test_dataset))

In [None]:
import math
import time
from learning.shakespeare_generator.model import Batch, ParallelBatchLearner
from torch.utils.data import DataLoader

from torch import optim
from torch import nn

print(config)

# Fresh model for the real training run (separate from the smoke-test model).
model = ShakespeareGenerator(
    config=config,
    vocab_size=tokenizer.vocab_size,
)

# reduction="sum" adds up the per-element losses instead of averaging them.
# NOTE(review): the "expected initial loss" printed below is a per-token value,
# so the learner presumably normalises the summed loss -- confirm.
criterion = nn.CrossEntropyLoss(reduction="sum")
optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

learner = ParallelBatchLearner(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    device=config.device,
)
print(learner)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=config.batch_size,
    shuffle=True,
    collate_fn=Batch.from_samples,
    num_workers=4,  # Parallel data loading
    pin_memory=True,  # Faster CPU->GPU transfer
    persistent_workers=True,  # Keep workers alive between epochs
)

val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=config.batch_size,
    collate_fn=Batch.from_samples,
    num_workers=4,
    pin_memory=True,
    persistent_workers=True,
)

print(
    "Starting training...\n"
    f"Expecting initial loss around {math.log(tokenizer.vocab_size)}"
)
# Wall-clock start of the whole run. It is deliberately NOT reset inside the
# loop so the final summary reports total training time, not the last epoch's.
start_time = time.time()

for epoch in range(config.epochs):
    epoch_start_time = time.time()
    # NOTE(review): the second argument is passed as an empty list; its meaning
    # is defined by ParallelBatchLearner -- confirm.
    train_loss = learner.train(train_dataloader, [])
    eval_loss = learner.eval(val_dataloader, [])
    print(
        f"{epoch}/{config.epochs} -- {time.time() - epoch_start_time:.2f}s "
        f"\tTrain loss \t{train_loss:.4f} "
        f"\tEval loss \t{eval_loss:.4f} "
    )

    # Sample some text after each epoch to eyeball progress. Do it in eval mode
    # without autograd, then restore whatever mode the model was in so the next
    # epoch is unaffected.
    was_training = model.training
    model.eval()
    with torch.no_grad():
        inputs = torch.ones((1, 1), dtype=torch.long, device=config.device)
        outputs = model.generate(
            inputs,
            max_length=1000,
        )
    model.train(was_training)

    print("---------- Predict from empty input:")
    for i in range(outputs.shape[0]):
        print(tokenizer.i2t(outputs[i].tolist()))

elapsed_time = time.time() - start_time
print(f"Training completed. Elapsed time: {elapsed_time:.2f}s")

In [None]:
# Dump the token -> index mapping for reference.
print(tokenizer.token_to_index)

# Switch the trained model to evaluation mode before sampling.
model.eval()

# shape: [1, 1] -- (B=1, S=1); a single sequence seeded with token index 1
inputs = torch.ones((1, 1), dtype=torch.long, device=config.device)
outputs = model.generate(inputs, max_length=1000)

print("---------- Predict from empty input:")
# Decode each generated sequence back to text, one batch row at a time.
for generated_row in outputs:
    print(tokenizer.i2t(generated_row.tolist()))


In [None]:
from datetime import datetime
import os


# Flip to True to actually write the checkpoint. It defaults to False so that
# re-running the notebook does not litter the models directory.
SAVE_MODEL = False

base_path = "../../../../models"
# Timestamped file name so successive runs never overwrite each other.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
model_save_path = os.path.join(base_path, f"shakespeare_generator_{timestamp}.pt")

if SAVE_MODEL:
    # Create the target directory on first use, then persist only the weights.
    os.makedirs(base_path, exist_ok=True)
    torch.save(model.state_dict(), model_save_path)
    print(f"Model saved to {model_save_path}")
else:
    # Report honestly that nothing was written.
    print(f"Model NOT saved (SAVE_MODEL is False); target path would be {model_save_path}")

In [None]:
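# Load a previously saved model (this cell is disabled). To use it, uncomment
# the code below and point model_load_path at a checkpoint file that exists.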
# model_load_path = "../models/shakespeare_generator.pt"
# model2 = ShakespeareGenerator(
#     config=config,
#     vocab_size=tokenizer.vocab_size,
# )
# model2.load_state_dict(torch.load(model_load_path, map_location=config.device))
# model2.to(config.device)
# model2.eval()
# print(f"Model loaded from {model_load_path}")

# print(model2)

# outputs = model2.generate(
#     inputs,
#     max_length=1000,
# )

# print("---------- Predict from empty input:")
# for i in range(outputs.shape[0]):
#     print(tokenizer.i2t(outputs[i].tolist()))