In [1]:
import os, sys
import ipdb
from tqdm import tqdm
from datetime import datetime
import platform, shutil
import requests, zipfile, io
from typing import Tuple, Iterator

# Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F

# tokenizer
import sentencepiece as spm

# Allow TensorFloat-32 (TF32) for CUDA matmuls and for cuDNN operations.
# Intended as a speed-up on Ampere-class GPUs; TF32 lowers the precision of
# float32 math — NOTE(review): confirm this trade-off is acceptable here.
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Release cached GPU memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [2]:
class Config:
    """Central settings object for the notebook.

    Holds the model architecture sizes, optimisation hyperparameters,
    training-run options and the target device, and derives the checkpoint
    locations from `local_data_path`.
    """

    def __init__(self):
        # --- model architecture ---
        self.batch_size = 64
        self.context_length = 512
        self.embed_size = 384
        self.n_layers = 7
        self.n_heads = 7
        self.bias = True

        # --- optimisation hyperparameters ---
        self.lr = 1e-4
        self.dropout = 0.05
        self.weight_decay = 0.01
        self.grad_clip = 1.0

        # --- training run ---
        self.train_iters = 10000
        self.eval_interval = 50
        self.compile = False
        # Path when running on the cuda server; use "../../data/llm" when
        # running on a local machine.
        self.local_data_path = "data/llm"
        self.checkpoint_dir_name = "models"  # Replace with your actual path
        self.checkpoint_filename = "llm_latest.pt"
        self.load_pretrained = True
        self.dtype = torch.bfloat16

        # --- mode / device ---
        self.inference = False
        device_name = "cuda" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device_name)

    def get_file_path(self, file_name: str) -> str:
        """Return `file_name` joined onto `local_data_path`."""
        return os.path.join(self.local_data_path, file_name)

    @property
    def checkpoint_dir(self) -> str:
        """Directory holding checkpoints, located under `local_data_path`."""
        return self.get_file_path(self.checkpoint_dir_name)

    @property
    def checkpoint_file_path(self) -> str:
        """Full path of the checkpoint file inside `checkpoint_dir`."""
        return os.path.join(self.checkpoint_dir, self.checkpoint_filename)


config = Config()

print(f"Device: {config.device}")

Device: cuda


In [3]:
# files_url = "https://ideami.com/llm_train"

# print("Downloading files from", files_url)
# response = requests.get(files_url)

# print("Extracting files to", config.local_data_path)
# zip_file = zipfile.ZipFile(io.BytesIO(response.content)).extractall(config.local_data_path)

In [None]:
# logging
wandb_log = True
wandb_project = "llm9"
wandb_run_name = "llm9run"
# wandb_run_name = wandb_project + "-" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")

if wandb_log:
    import wandb
    # Never paste the API key into the notebook (it ends up in version control
    # and shared outputs). Read it from the WANDB_API_KEY environment variable.
    # When the variable is unset, key=None is passed and wandb is expected to
    # fall back to its own credential lookup (e.g. an existing ~/.netrc entry)
    # — NOTE(review): confirm against the installed wandb version.
    wandb.login(key=os.environ.get("WANDB_API_KEY"))
    wandb.init(project=wandb_project, name=wandb_run_name)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/frood/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mvladislav-nejedly[0m ([33mvladislav-nejedly-none[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


In [5]:
# Read the whole training corpus into memory as a single string.
wiki_path = config.get_file_path("wiki.txt")

with open(wiki_path, "r", encoding="utf-8") as f:
    text = f.read()

# Peek at a 300-character excerpt to sanity-check the content.
print(text[30000:30300])

terms.
For example, there are objects in two groups (as shown on the right). The objects are various shapes, where one group has 3 of them while the other has 2. When the two groups combine into one, the overall amount (sum) of the shapes become 5.

Vertical Addition

The animation above demonstrate


In [6]:
# Load the SentencePiece tokenizer model from the data directory.
tokenizer_model_path = config.get_file_path("wiki_tokenizer.model")
sp = spm.SentencePieceProcessor(model_file=tokenizer_model_path)

# The number of pieces in the tokenizer is used as the model's vocabulary size.
vocab_size = sp.get_piece_size()
print(f"Tokenizer vocab_size: {vocab_size}")

Tokenizer vocab_size: 4096


In [7]:
def encode(s):
    """Tokenize `s` with the SentencePiece processor `sp` (thin wrapper around `sp.Encode`)."""
    return sp.Encode(s)


def decode(l):
    """Convert token ids back to text with the SentencePiece processor `sp` (thin wrapper around `sp.Decode`)."""
    return sp.Decode(l)


print(encode("once upon a time"))
print(decode([2686, 698, 265, 261, 684]))

[2686, 698, 265, 261, 684]
once upon a time


In [8]:
# Tokenizing the corpus is cached on disk: encode once, then reuse the saved tensor.
encoded_data_path = config.get_file_path("encoded_data.pt")

if not os.path.exists(encoded_data_path):
    print("Encoding data")
    data = torch.tensor(encode(text), dtype=torch.long)
    torch.save(data, encoded_data_path)
else:
    print("Loading encodeed data")
    data = torch.load(encoded_data_path)

Loading encodeed data


In [9]:
# Contiguous 90% / 10% split: the first part trains, the tail validates.
data_size = len(data)
spl = int(0.9 * data_size)
train_data, val_data = data[:spl], data[spl:]

print(
    f'Total data: {data_size/1e6:.2f} Million | '
    f'Training: {len(train_data)/1e6:.2f} Million | '
    f'Validation: {len(val_data)/1e6:.2f} Million'
)

Total data: 59.21 Million | Training: 53.29 Million | Validation: 5.92 Million


In [10]:
# POTENTIAL PROBLEMS:
# Batches can overlap? The same batches can repeat due to the nature 
# of the random generator? It is uncertain, how big training dataset 
# will be used in reality...?

def get_batch(
    data: torch.Tensor,
    config: "Config",
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Sample a random batch of (input, target) windows from `data`.

    Args:
        data: Tensor of token ids, indexed along its first dimension.
        config: Object providing `context_length`, `batch_size` and `device`.

    Returns:
        `(x, y)`, each of shape `(batch_size, context_length)` and moved to
        `config.device`; `y` is `x` shifted one position to the right.

    Raises:
        ValueError: If `data` has fewer than `context_length + 1` tokens, so
            not even one input/target window fits.
    """
    n_starts = len(data) - config.context_length
    if n_starts <= 0:
        raise ValueError(
            f"data has {len(data)} tokens but at least "
            f"{config.context_length + 1} are needed for context_length="
            f"{config.context_length}"
        )

    # Random window starts; the upper bound is exclusive, so the target
    # window (shifted by one) always stays inside `data`.
    inds = torch.randint(n_starts, (config.batch_size,))

    # Gather all windows with a single advanced-indexing call instead of
    # slicing and stacking one window at a time.
    offsets = torch.arange(config.context_length)
    window_idx = (inds[:, None] + offsets[None, :]).to(data.device)
    x = data[window_idx]
    y = data[window_idx + 1]

    return x.to(config.device), y.to(config.device)

# Smoke test: draw one training batch and print its shapes plus the first ten
# tokens of the first row of inputs and targets (targets are the inputs
# shifted by one position).
x, y = get_batch(train_data, config)

print(x.shape, y.shape)
print(x[0, :10])
print(y[0, :10])

torch.Size([64, 512]) torch.Size([64, 512])
tensor([2992,  436,  389,  280,  964,  561,  700, 1061,  278,  264],
       device='cuda:0')
tensor([ 436,  389,  280,  964,  561,  700, 1061,  278,  264,  308],
       device='cuda:0')


In [11]:
class ForwardLayer(nn.Module):
    """Feed-forward sub-layer: Linear -> GELU -> Linear -> Dropout.

    The hidden layer is 6x wider than `config.embed_size`; the output has the
    same last dimension as the input.
    """

    def __init__(self, config: Config):
        super().__init__()

        hidden_size = 6 * config.embed_size
        layers = [
            nn.Linear(config.embed_size, hidden_size, bias=config.bias),
            nn.GELU(),
            nn.Linear(hidden_size, config.embed_size, bias=config.bias),
            nn.Dropout(config.dropout),
        ]
        self.network = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the feed-forward stack to `x`."""
        out = self.network(x)
        return out

In [12]:
class Head(nn.Module):
    """Single causal self-attention head.

    Projects the input to queries, keys and values of width `head_size`,
    applies scaled dot-product attention with a lower-triangular (causal)
    mask, applies dropout to the attention weights and returns the weighted
    sum of the values.
    """

    def __init__(
        self, head_size: int, config: "Config"
    ):
        super().__init__()

        self.queries = nn.Linear(config.embed_size, head_size, bias=config.bias)
        self.keys = nn.Linear(config.embed_size, head_size, bias=config.bias)
        self.values = nn.Linear(config.embed_size, head_size, bias=config.bias)

        # Causal mask: position i may only attend to positions <= i. Kept as a
        # buffer under the same name so it follows the module across devices
        # and existing state dicts keep loading.
        self.register_buffer("tril", torch.tril(
              torch.ones(config.context_length, config.context_length)
        ))
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Attend over `x` of shape (BS, SL, embed_size); returns (BS, SL, head_size)."""
        _, sequence_length, _ = x.shape

        queries = self.queries(x) # BS, SL, head_size
        keys = self.keys(x) # BS, SL, head_size
        values = self.values(x) # BS, SL, head_size

        # Scaled dot-product scores; scaling by 1/sqrt(head_size).
        attn_w = queries @ keys.transpose(-2, -1) * keys.shape[-1] ** -0.5 # BS, SL, SL
        # Hide future positions before normalising.
        attn_w = attn_w.masked_fill(self.tril[:sequence_length, :sequence_length] == 0, float("-inf"))
        attn_w = F.softmax(attn_w, dim=-1) # BS, SL, SL
        # The dropout layer was created in __init__ but never used; apply it to
        # the attention weights (a no-op in eval mode).
        attn_w = self.dropout(attn_w)

        return attn_w @ values

In [13]:
class Multihead(nn.Module):
    """Multi-head self-attention: several `Head`s run side by side and are merged.

    Each head has width `embed_size // n_heads` (floor division), so the
    concatenated width is `n_heads * head_size`; the `combine` projection maps
    it back to `embed_size`.
    """

    def __init__(self, config: Config):
        super().__init__()

        n_heads = config.n_heads
        head_size = config.embed_size // n_heads

        heads = [Head(head_size, config) for _ in range(n_heads)]
        self.heads = nn.ModuleList(heads)

        concat_size = head_size * n_heads
        self.combine = nn.Linear(concat_size, config.embed_size, bias=config.bias)

        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run every head on `x`, concatenate on the last dim, project, apply dropout."""
        # each head returns tensor of shape (batch_size, context_length, head_size)
        head_outputs = [head(x) for head in self.heads]
        concatenated = torch.cat(head_outputs, dim=-1)
        # (batch_size, context_length, n_heads * head_size) -> (batch_size, context_length, embed_size)
        projected = self.combine(concatenated)
        return self.dropout(projected)

In [14]:
class Block(nn.Module):
    """Transformer block: pre-LayerNorm attention and feed-forward, each with a residual connection."""

    def __init__(self, config: Config):
        super().__init__()

        embed_size = config.embed_size
        self.ma = Multihead(config)
        self.feed_forward = ForwardLayer(config)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return `x` after the attention and feed-forward residual updates."""
        # Attention sub-layer: normalise first, then add the result back onto x.
        after_attention = x + self.ma(self.ln1(x))
        # Feed-forward sub-layer, same normalise-then-add pattern.
        return after_attention + self.feed_forward(self.ln2(after_attention))

In [15]:
class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token and position embeddings are summed, passed through `n_layers`
    `Block`s and a final LayerNorm, and projected to vocabulary logits.
    """

    def __init__(self, vocab_size: int, config: "Config"):
        super().__init__()

        self.vocab_size = vocab_size
        self.config = config

        self.embeddings = nn.Embedding(vocab_size, config.embed_size)
        self.positions = nn.Embedding(config.context_length, config.embed_size)
        self.blocks = nn.Sequential(*[Block(config) for _ in range(config.n_layers)])
        self.ln = nn.LayerNorm(config.embed_size)
        self.final_linear = nn.Linear(config.embed_size, vocab_size, bias=config.bias)

        self.to(config.device)
        self.to(config.dtype)

        self.apply(self._init_weights)

    # initialize the weights of the model
    def _init_weights(self, module: nn.Module):
        """Normal(0, 0.02) init for Linear/Embedding weights; zero init for Linear biases."""
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, std=0.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(
        self,
        input: torch.Tensor,
        targets: torch.Tensor = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute logits and, when `targets` is given, the cross-entropy loss.

        Args:
            input: Token ids of shape (BS, SL), with SL <= context_length.
            targets: Optional token ids of shape (BS, SL).

        Returns:
            `(logits, loss)`: logits have shape (BS, SL, VS); loss is a scalar
            tensor, or None when `targets` is None.
        """
        # BS = batch size, SL = sequence (context) length, VS = vocab size
        loss = None
        _, sequence_length = input.shape # BS x SL

        emb = self.embeddings(input) # BS x SL x embed_size
        # Build the position indices on the input's device so the model does
        # not depend on config.device matching where the tensors actually live.
        pos = self.positions(torch.arange(sequence_length, device=input.device)) # SL x embed_size
        x = emb + pos # BS x SL x embed_size (pos being broadcasted along the BS dimension)
        x = self.blocks(x) # BS x SL x embed_size
        x = self.ln(x) # BS x SL x embed_size
        logits = self.final_linear(x) # BS x SL x VS

        if targets is not None:
            vocab_size = logits.shape[-1]

            # cross_entropy takes 2D logits and 1D ground-truth class indices,
            # so batch and sequence dimensions are flattened together.
            loss = F.cross_entropy(
                logits.view(-1, vocab_size),
                targets.reshape(-1)
            )

        return logits, loss

    @torch.no_grad()
    def generate(self, input: torch.Tensor, max: int = 512):
        """Autoregressively sample `max` new tokens after `input`.

        Only the last `context_length` tokens are fed to the model at each
        step, but the full sequence (prompt plus every generated token) is
        returned. Sampling runs in eval mode (dropout disabled); the previous
        train/eval mode is restored afterwards.

        Args:
            input: Token ids of shape (BS, prompt length).
            max: Number of tokens to generate.

        Returns:
            Tensor of shape (BS, prompt length + max).
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max):
                # Crop only the model's view; do not overwrite `input`,
                # otherwise the beginning of the sequence would be lost.
                context = input[:, -self.config.context_length:] # (BS, <= context length)
                logits, _ = self(context) # (BS, context size, vocab size)
                logits = logits[:, -1, :] # prediction for the last token (BS, vocab size)
                probs = F.softmax(logits, dim=-1) # (BS, vocab size)
                next_token = torch.multinomial(probs, num_samples=1) # one sample from the distribution
                input = torch.cat([input, next_token], dim=-1) # append the new token
        finally:
            self.train(was_training)

        return input

    @torch.no_grad()
    def calculate_loss(
        self, train_data: torch.Tensor,
        val_data: torch.Tensor, get_batch: callable,
        eval_iters: int = 100
    ) -> Tuple[float, float]:
        """Estimate the mean loss on the training and validation data.

        Args:
            train_data: Training token ids, passed to `get_batch`.
            val_data: Validation token ids, passed to `get_batch`.
            get_batch: Callable `(data, config) -> (x, y)` producing one batch.
            eval_iters: Number of batches averaged per split.

        Returns:
            `(train_loss, val_loss)` as Python floats.
        """
        out = []
        # Evaluate with dropout disabled, then restore whatever mode the model
        # was in (rather than unconditionally switching to train mode).
        was_training = self.training
        self.eval()

        try:
            for split in (train_data, val_data):
                losses = torch.zeros(eval_iters)
                for i in range(eval_iters):
                    x, y = get_batch(split, self.config)
                    _, loss = self(x, y)
                    losses[i] = loss

                out.append(losses.mean().item())
        finally:
            self.train(was_training)

        return tuple(out)

In [16]:
@torch.no_grad()
def generate_sample(model: "GPT", input: str, max_new_tokens: int = 64) -> "list[str]":
    """Generate a text continuation of `input` with `model`.

    Args:
        model: Model exposing `generate(tensor, max=...)`.
        input: Prompt text; tokenized with the module-level `encode`.
        max_new_tokens: Number of tokens to sample after the prompt.

    Returns:
        A list with one decoded string per batch row (a single-element list,
        since the prompt is sent as a batch of one) — not a bare string.
    """
    # Place the prompt on the same device as the model's parameters instead of
    # relying on the global `config`.
    device = next(model.parameters()).device
    t1 = torch.tensor(encode(input), dtype=torch.long, device=device)
    t1 = t1[None, :] # add batch dimension
    newgen = model.generate(t1, max=max_new_tokens).tolist()
    return decode(newgen)

In [17]:
model = GPT(vocab_size, config)

# Note on torch.compile: a state_dict saved from a non-compiled model is not
# compatible with the compiled model, because after compiling every key in the
# state_dict needs the prefix "_orig_mod.".
if config.compile:
    print("Torch :: Compiling model")
    model = torch.compile(model)

# Report the model size and the loss before any training step.
n_params = sum(p.numel() for p in model.parameters())
print(n_params / 1e6, "Million parameters")

initial_losses = model.calculate_loss(train_data, val_data, get_batch)
print(initial_losses)

19.837954 Million parameters
(8.4243745803833, 8.426875114440918)


In [18]:
# Optimizer setup: weight decay is applied only to parameters with two or more
# dimensions (weight matrices, embeddings); 1-D parameters such as biases and
# LayerNorm parameters are excluded.
p_dict = {p_name: p for p_name, p in model.named_parameters() if p.requires_grad}

weight_decay_p = [p for p in p_dict.values() if p.dim() >= 2]
no_weight_decay_p = [p for p in p_dict.values() if p.dim() < 2]

optimizer_groups = [
    dict(params=weight_decay_p, weight_decay=config.weight_decay),
    dict(params=no_weight_decay_p, weight_decay=0.0),
]

optimizer = torch.optim.AdamW(optimizer_groups, lr=config.lr, betas=(0.9, 0.99))

# Cosine decay of the learning rate over the whole run, down to lr / 10.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer, T_max=config.train_iters, eta_min=config.lr / 10
)

# Defaults for a fresh run; overwritten when a checkpoint is loaded.
start_iteration, best_val_loss = 0, float("inf")

In [19]:
def load_checkpoint(path: str) -> Tuple[int, float]:
    """Restore the global `model` and `optimizer` from a checkpoint file.

    The checkpoint is expected to be a dict with the keys
    "model_state_dict", "optimizer_state_dict", "iteration" and "loss".

    Args:
        path: Path of the checkpoint file.

    Returns:
        `(iteration, loss)` as stored in the checkpoint.
    """
    print("LLM - Loading model")
    # map_location makes a checkpoint saved on one device (e.g. CUDA) loadable
    # on the device configured for this run (e.g. a CPU-only machine).
    checkpoint = torch.load(path, map_location=config.device)
    model.load_state_dict(checkpoint["model_state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    iteration = checkpoint["iteration"]
    loss = checkpoint["loss"]
    print(f"Loaded iteration {iteration} with loss {loss}")
    return iteration, loss

# Resume from the latest checkpoint when one exists and resuming is enabled.
if os.path.exists(config.checkpoint_file_path) and config.load_pretrained:
    start_iteration, loss = load_checkpoint(config.checkpoint_file_path)
    best_val_loss = loss

LLM - Loading model
Loaded iteration 9999 with loss 4.061093807220459


In [None]:
# # training loop
# try:
#     for iteration in tqdm(range(start_iteration, config.train_iters)):
#         xb, yb = get_batch(train_data, config)
#         logits, loss = model(xb, yb)

#         if (iteration % config.eval_interval == 0 or iteration == config.train_iters - 1):
#             train_loss, val_loss = model.calculate_loss(train_data, val_data, get_batch)
            
#             print(
#                 f"Iteration {iteration} | "
#                 f"Train loss: {train_loss:.3f} | "
#                 f"Val loss: {val_loss:.3f}"
#             )

#             sample = generate_sample(model, "Once upon a time")

#             torch.save({
#                 "model_state_dict": model.state_dict(),
#                 "optimizer_state_dict": optimizer.state_dict(),
#                 "loss": val_loss,
#                 "iteration": iteration,
#             }, config.checkpoint_file_path)

#         if wandb_log:
#             wandb.log({
#                 "train_loss": train_loss,
#                 "val_loss": val_loss,
#                 "lr": scheduler.get_last_lr()[0],
#             }, step=iteration)

#         optimizer.zero_grad()
#         loss.backward()

#         nn.utils.clip_grad_norm_(model.parameters(), max_norm=config.grad_clip)

#         optimizer.step()
#         scheduler.step()

# except KeyboardInterrupt:
#     print("Training interrupted. Cleaning up...")

# finally:
#     torch.cuda.empty_cache()
#     print("GPU memory released")

#     if wandb_log:
#         wandb.finish()

In [24]:
# Generate a continuation of a fixed prompt with the current model and print
# it (generate_sample returns a list, so the printed value is a list).
sample = generate_sample(model, "Once upon a time")

print(sample)

# x, y = get_batch(train_data, batch_size, context_length, device)
# print(x.shape, y.shape)
# print(x[0, :10])
# print(y[0, :10])

# model = GPT(
#     vocab_size=vocab_size, 
#     embed_size=embed_size, 
#     context_length=context_length, 
#     n_layers=n_layers, 
#     n_heads=n_heads, 
#     device=device,
#     bias=BIAS
# )

# model.to(dtype)
# model.to(device)

# x = x.to(device)

# logits, loss = model(x, y)

# print(logits.shape, loss)

# sample = generate_sample(model, "Once upon a time")

# print(sample)

['Once upon a time of agoral imwell better obround, Tobleedific eq functable crocked in the top of the better rement that as born in 1949, and Jennce were killed by Capiio and Clhaanti� Englishings.\n\nThe']


[1;34mwandb[0m: 
[1;34mwandb[0m: 🚀 View run [33mllm9run[0m at: [34mhttps://wandb.ai/vladislav-nejedly-none/llm9/runs/4lphndax[0m
[1;34mwandb[0m: Find logs at: [1;35mwandb/run-20250202_223407-4lphndax/logs[0m
