In [1]:
import torch
import torch.nn as nn 
import torch.nn.functional as F
import lightning as L
from torch.utils.data import DataLoader, Dataset

In [2]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
prompt = "One day he went to"
encoding = tokenizer.encode(prompt, return_tensors = 'pt')
print(f"encoded input :{encoding}")

encoded input :tensor([[3198, 1110,  339, 1816,  284]])


In [None]:
from KittyLM.model import KittyLM, KittyLMConfig

In [None]:
class KittyTrainer(L.LightningModule):
    def __init__(self, model, tokenizer,  max_new_tokens, learning_rate = 1e-5, log_generate_steps = 500):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.learning_rate = learning_rate
        self.log_generate_steps = log_generate_steps
        self.max_new_tokens = max_new_tokens
        self.save_hyperparameters()

        self.prompt = "He went on to say"
        self.current_step = 0

    def forward(self, input_ids):

        return self.model(input_ids)

    def training_step(self, batch, bach_idx):
        input_ids, targets = batch
        logits = self.model(input_ids)
        logits = logits[:, :-1].contiguous()
        targets = targets[:, 1:].contiguous()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        self.current_step += 1

        if self.current_step % self.log_generate_steps == 0:
            self.generate_and_log(prompt)

        return loss

    def validation_step(self, batch, batch_idx):
        input_ids, targets = batch
        logits = self.model(input_ids)
        logits = logits[:, :-1].contiguous()
        targets = targets[:, 1:].contiguous()
        loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1))
        self.log("val_loss", loss, on_epoch=True, on_step=False, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate)
        return optimizer
    
    def generate_and_log(self, prompt = None):
        self.model.eval()
        with torch.no_grad():
            input_ids = self.tokenizer.encode(self.prompt, return_tensors = "pt").to(self.device)
            generated_ids = self.model.autoregressive_generate(
                idx = input_ids,
                max_new_tokens = self.max_new_tokens
            )

            logits = self.model(generated_ids)
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = generated_ids[..., 1:].contiguous()
            loss = F.cross_entropy(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            perplexity = torch.exp(loss)
            generated_text = self.tokenizer.decode(generated_ids(generated_ids.squeeze(0).tolist()))
            self.log("generated text :", generated_text, prog_bar=True)
            self.log("perplexity", perplexity)
            print(f'Generated text : {generated_text}\n')
            print(f'perplexity : {perplexity.item()}\n')

    
        

In [3]:
class LMConfig:
    """
    Config according to the GPT-2 weights on huggingface.
    Using a vocab size that is a multiple of 64 to speed up the processing

    """
    block_size = 1024
    vocab_size = 50304 # 50257 in the original and hf implementation weights
    n_layer = 3
    n_heads = 4
    d_model = 768
    dropout = 0.0
    bias = None

In [4]:
from KittyLM.model import KittyLM, KittyLMConfig

config = LMConfig
model = KittyLM(config)


 parameter count : 60.66M


In [5]:
out = model.autoregressive_generate(idx = encoding, max_new_tokens = 5, temperature=1.0)
print(out)

tensor([[ 3198,  1110,   339,  1816,   284, 12123, 26364, 47393, 20546, 35561]])


In [6]:
print(tokenizer.decode(out.squeeze(0).tolist()))

One day he went to determination Ich proletarian SVReturns


In [9]:
class DummyDataset(Dataset):
    def __init__(self, vocab_size, block_size, num_samples):
        super().__init__()
        self.vocab_size = vocab_size
        self.block_size = block_size
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        input_ids = torch.randint(0, self.vocab_size, (self.block_size,))
        targets = input_ids.clone()  # Use the same sequence as targets
        return input_ids, targets

In [10]:
lm = KittyTrainer(model = model, tokenizer=tokenizer, max_new_tokens = 5, learning_rate = 3e-4)

/home/tororo.in/miniconda3/envs/torch/lib/python3.11/site-packages/lightning/pytorch/utilities/parsing.py:208: Attribute 'model' is an instance of `nn.Module` and is already saved during checkpointing. It is recommended to ignore them using `self.save_hyperparameters(ignore=['model'])`.


In [11]:
train_dataset = DummyDataset(config.vocab_size, config.block_size, num_samples=10000)
val_dataset = DummyDataset(config.vocab_size, config.block_size, num_samples=1000)

In [12]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [13]:
trainer = L.Trainer(
        max_epochs=3,
        accelerator="cpu",  # Use "cpu" if no GPU is available
        devices=1,          # Set to multiple devices if using multi-GPU
        precision=16,        # Mixed precision training
        log_every_n_steps=10,
    )

/home/tororo.in/miniconda3/envs/torch/lib/python3.11/site-packages/lightning/fabric/connector.py:571: `precision=16` is supported for historical reasons but its usage is discouraged. Please set your precision to 16-mixed instead!
/home/tororo.in/miniconda3/envs/torch/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/accelerator_connector.py:512: You passed `Trainer(accelerator='cpu', precision='16-mixed')` but AMP with fp16 is not supported on CPU. Using `precision='bf16-mixed'` instead.
Using bfloat16 Automatic Mixed Precision (AMP)
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


In [14]:
trainer.fit(lm, train_loader, val_loader)


  | Name  | Type    | Params | Mode 
------------------------------------------
0 | model | KittyLM | 60.7 M | train
------------------------------------------
60.7 M    Trainable params
0         Non-trainable params
60.7 M    Total params
242.636   Total estimated model params size (MB)
52        Modules in train mode
0         Modules in eval mode

Detected KeyboardInterrupt, attempting graceful shutdown ...


NameError: name 'exit' is not defined