In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import tiktoken

In [None]:
# Hyperparameters shared by the model constructor and the dataset windowing below.
cfg = dict(
    vocab_size=50257,     # vocabulary size
    context_length=1024,  # context length (tokens per sequence window)
    embed_dim=768,        # embedding dimension
    num_heads=12,         # number of attention heads
    n_layers=12,          # number of layers
    dropout=0.1,          # dropout rate
    qkv_bias=False,       # query-key-value bias
)

In [None]:
# Read the whole training corpus into memory.
# The encoding is pinned explicitly: without it open() falls back to the
# platform's locale encoding, so the same file could decode differently
# (or fail to decode) on another machine.
with open('the-verdict.txt', 'r', encoding='utf-8') as text_file:
    raw_text = text_file.read()

# Corpus size in characters
len(raw_text)

In [None]:
# Use tiktoken's 'gpt2' encoding as the tokenizer for the rest of the notebook
tokenizer = tiktoken.get_encoding('gpt2')

# Encode the full corpus once and report how many token ids it produced
enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

In [None]:
# Peek at the first ten token ids of the encoded corpus
enc_text[:10]

In [None]:
class CreateDataset(Dataset):
    """Sliding-window next-token dataset built from a single text.

    The text is tokenized once with ``tokenizer.encode``. Each window of
    ``max_length`` token ids becomes an input sample, and the same window
    shifted one position to the right becomes its target.

    Args:
        text: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text)`` that returns a list of ids.
        max_length: Number of token ids per window.
        strides: Step between the start positions of consecutive windows.
    """

    def __init__(self, text, tokenizer, max_length, strides):
        token_ids = tokenizer.encode(text)

        # Only start offsets that still leave room for the one-step-shifted
        # target window are used; shorter texts therefore produce no samples.
        starts = range(0, len(token_ids) - max_length, strides)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length]) for start in starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + 1 + max_length]) for start in starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``index``."""
        return self.input_ids[index], self.target_ids[index]




In [None]:
# Character-level split of the corpus: the leading 90% is used for training,
# the remaining tail is held out for evaluation.
train_split_percent = 0.9
split_index = int(len(raw_text) * train_split_percent)
train_text, test_text = raw_text[:split_index], raw_text[split_index:]

In [None]:
# Non-overlapping windows: the stride equals the window length.
# NOTE(review): CreateDataset yields zero samples for a split that has at most
# cfg['context_length'] tokens. The 10% hold-out may be that short, in which
# case the validation loss will be NaN — verify len(test_dataset) > 0.
train_dataset = CreateDataset(
    text=train_text,
    tokenizer=tokenizer,
    max_length=cfg['context_length'],
    strides=cfg['context_length'],
)
test_dataset = CreateDataset(
    text=test_text,
    tokenizer=tokenizer,
    max_length=cfg['context_length'],
    strides=cfg['context_length'],
)

# Training batches are shuffled and incomplete trailing batches dropped;
# evaluation keeps the original order and every sample.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
    drop_last=True,
    num_workers=0,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=2,
    shuffle=False,
    drop_last=False,
    num_workers=0,
)

In [None]:
# Pull a single batch from the (shuffled) training loader to inspect the tensors
data_iter = iter(train_dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

In [None]:
# Number of (input, target) windows in the training dataset
len(train_dataset)


In [None]:
def calc_loss_batch(input_batch, target_batch, model, device):
    """Compute the cross-entropy loss of ``model`` on one batch.

    Both tensors are moved to ``device`` and ``model`` is called on the inputs.
    The first two dimensions of the logits are merged and the targets are
    flattened, so the loss is taken over every position in the batch.

    Returns:
        The loss tensor produced by ``torch.nn.functional.cross_entropy``.
    """
    inputs_on_device = input_batch.to(device)
    targets_on_device = target_batch.to(device)

    logits = model(inputs_on_device)

    flat_logits = logits.flatten(0, 1)
    flat_targets = targets_on_device.flatten()
    return torch.nn.functional.cross_entropy(flat_logits, flat_targets)

In [None]:
def calc_loss_loader(data_loader, model, device, num_batches=None):
    """Average the per-batch loss over the first batches of ``data_loader``.

    Args:
        data_loader: Sized iterable yielding ``(input_batch, target_batch)``.
        model: Model passed through to ``calc_loss_batch``.
        device: Device passed through to ``calc_loss_batch``.
        num_batches: Maximum number of batches to average over. ``None``
            means all of them; larger values are capped at ``len(data_loader)``.

    Returns:
        The mean of the per-batch losses as a Python float, or ``nan`` when
        there is nothing to average (empty loader, or ``num_batches`` <= 0).
    """
    available = len(data_loader)
    if available == 0:
        return float("nan")

    if num_batches is None:
        num_batches = available
    else:
        num_batches = min(num_batches, available)

    # A non-positive batch budget would otherwise divide by zero (0) or
    # return a meaningless -0.0 (negative); report "no data" instead.
    if num_batches <= 0:
        return float("nan")

    total_loss = 0.
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        total_loss += loss.item()
    return total_loss / num_batches

In [None]:
def evaluate_model(model, train_loader, val_loader, device, eval_iter):
    """Estimate training and validation loss without tracking gradients.

    The model is switched to eval mode, ``calc_loss_loader`` is run on each
    loader for at most ``eval_iter`` batches, and the model is switched back
    to train mode before returning.

    Returns:
        ``(train_loss, val_loss)``.
    """
    model.eval()
    with torch.no_grad():
        losses = [
            calc_loss_loader(loader, model, device, num_batches=eval_iter)
            for loader in (train_loader, val_loader)
        ]
    model.train()

    train_loss, val_loss = losses
    return train_loss, val_loss

def text_to_token_ids(text, tokenizer):
    """Encode ``text`` and return the ids as a tensor with a leading batch dimension.

    ``'<|endoftext|>'`` is passed to the tokenizer as an allowed special token.
    """
    token_list = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    return torch.tensor(token_list).unsqueeze(0)

def token_ids_to_text(token_ids, tokenizer):
    """Drop the leading batch dimension of ``token_ids`` and decode them to a string."""
    ids = token_ids.squeeze(0).tolist()
    return tokenizer.decode(ids)

def generate_text_simple(model, idx, max_new_tokens, context_size):
    """Greedily extend ``idx`` by ``max_new_tokens`` token ids.

    At each step the model sees only the last ``context_size`` ids of every
    row, the logits at the final position are turned into probabilities, and
    the most probable id is appended along dimension 1.

    Returns:
        The tensor ``idx`` with the newly generated ids appended.
    """
    sequence = idx
    for _step in range(max_new_tokens):
        # Crop the running sequence to what the model is allowed to see
        window = sequence[:, -context_size:]
        with torch.no_grad():
            all_logits = model(window)

        # Keep only the prediction for the last position
        last_logits = all_logits[:, -1, :]
        probabilities = torch.softmax(last_logits, dim=-1)
        next_token = probabilities.argmax(dim=-1, keepdim=True)

        sequence = torch.cat([sequence, next_token], dim=1)
    return sequence

def generate_and_print_sample(model, tokenizer, device, start_context):
    """Print a 50-token greedy continuation of ``start_context`` on one line.

    The context size is read from the first dimension of
    ``model.pos_emb.weight``. The model is put in eval mode for generation
    and switched back to train mode afterwards.
    """
    model.eval()
    max_context = model.pos_emb.weight.shape[0]
    prompt_ids = text_to_token_ids(start_context, tokenizer).to(device)

    with torch.no_grad():
        generated_ids = generate_text_simple(
            model=model,
            idx=prompt_ids,
            max_new_tokens=50,
            context_size=max_context,
        )

    sample = token_ids_to_text(generated_ids, tokenizer)
    # Flatten newlines so the sample occupies a single output line
    print(sample.replace("\n", " "))
    model.train()

In [None]:
def train_llm(model, train_loader, val_loader, optimizer, device, num_epochs, eval_freq, eval_iter, start_context):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    Every batch performs one optimizer step. Whenever the global step counter
    (starting at 0 for the first step) is a multiple of ``eval_freq``, the
    train/validation losses are estimated with ``evaluate_model`` over at most
    ``eval_iter`` batches each, recorded, and printed. At the end of each
    epoch a sample continuation of ``start_context`` is printed.

    ``model`` is used as-is; it is not moved to ``device`` here.

    Note: the sample is generated with the notebook-level ``tokenizer``
    global, which is not a parameter of this function.

    Returns:
        ``(train_losses, val_losses, track_tokens_seen)``: three parallel
        lists with one entry per evaluation. ``track_tokens_seen`` holds the
        cumulative number of input token ids processed up to that evaluation.
    """
    train_losses, val_losses, track_tokens_seen = [], [], []
    tokens_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch, target_batch, model, device)
            loss.backward()
            optimizer.step()

            # Count the tokens consumed by this step; without this the
            # tracked series stays at zero for the whole run.
            tokens_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, val_loss = evaluate_model(model, train_loader, val_loader, device, eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Ep {epoch+1} (Step {global_step:06d}): "f"Train loss {train_loss:.3f}, "f"Val loss {val_loss:.3f}")
        generate_and_print_sample(model, tokenizer, device, start_context)
    return train_losses, val_losses, track_tokens_seen

In [None]:
from full_transformer import GPTLikeLLM

# Seed the global RNG before the model is built so weight initialisation and
# subsequent random draws are repeatable across Restart & Run All.
torch.manual_seed(123)

# Pick the fastest available backend: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

model = GPTLikeLLM(cfg=cfg)
model.to(device=device)

# print(model.device.type)

In [None]:
# AdamW with weight decay; the training run evaluates after every step on up
# to five batches per loader and prints a sample after each epoch.
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=4e-4,
    weight_decay=0.1,
)
num_epochs = 10
train_losses, val_losses, tokens_seen = train_llm(
    model=model,
    train_loader=train_dataloader,
    val_loader=test_dataloader,
    optimizer=optimizer,
    device=device,
    num_epochs=num_epochs,
    eval_freq=1,
    eval_iter=5,
    start_context="Every effort moves you",
)