<a href="https://colab.research.google.com/github/ImadSaddik/Train_Your_Language_Model_Course/blob/main/HowToTrainOnColab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Get the source code

In [None]:
!git clone https://github.com/ImadSaddik/Train_Your_Language_Model_Course.git

fatal: destination path 'Train_Your_Language_Model_Course' already exists and is not an empty directory.


In [None]:
mv /content/Train_Your_Language_Model_Course /content/source_code

mv: cannot move '/content/Train_Your_Language_Model_Course' to '/content/source_code/Train_Your_Language_Model_Course': Directory not empty


In [None]:
!pip install -r /content/source_code/requirements.txt



## Connect to Drive and load the tokenizer

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the tokenizer, dataset and checkpoint
# paths used in this notebook all live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from source_code.minbpe import RegexTokenizer

# Load the saved tokenizer model file from Google Drive.
tokenizer = RegexTokenizer()
tokenizer_path = "/content/drive/MyDrive/Colab_Notebooks/LLM_Course/Output/tokenizer/darija_tokenizer.model"
tokenizer.load(model_file=tokenizer_path)


def get_vocab_size(tokenizer: RegexTokenizer) -> int:
    """Return the size of ``tokenizer.vocab`` plus the size of ``tokenizer.special_tokens``."""
    return len(tokenizer.vocab) + len(tokenizer.special_tokens)

In [None]:
# Preview only the first entries; the full vocabulary is far too long to display.
dict(list(tokenizer.vocab.items())[:20])

{0: b'\x00',
 1: b'\x01',
 2: b'\x02',
 3: b'\x03',
 4: b'\x04',
 5: b'\x05',
 6: b'\x06',
 7: b'\x07',
 8: b'\x08',
 9: b'\t',
 10: b'\n',
 11: b'\x0b',
 12: b'\x0c',
 13: b'\r',
 14: b'\x0e',
 15: b'\x0f',
 16: b'\x10',
 17: b'\x11',
 18: b'\x12',
 19: b'\x13',
 20: b'\x14',
 21: b'\x15',
 22: b'\x16',
 23: b'\x17',
 24: b'\x18',
 25: b'\x19',
 26: b'\x1a',
 27: b'\x1b',
 28: b'\x1c',
 29: b'\x1d',
 30: b'\x1e',
 31: b'\x1f',
 32: b' ',
 33: b'!',
 34: b'"',
 35: b'#',
 36: b'$',
 37: b'%',
 38: b'&',
 39: b"'",
 40: b'(',
 41: b')',
 42: b'*',
 43: b'+',
 44: b',',
 45: b'-',
 46: b'.',
 47: b'/',
 48: b'0',
 49: b'1',
 50: b'2',
 51: b'3',
 52: b'4',
 53: b'5',
 54: b'6',
 55: b'7',
 56: b'8',
 57: b'9',
 58: b':',
 59: b';',
 60: b'<',
 61: b'=',
 62: b'>',
 63: b'?',
 64: b'@',
 65: b'A',
 66: b'B',
 67: b'C',
 68: b'D',
 69: b'E',
 70: b'F',
 71: b'G',
 72: b'H',
 73: b'I',
 74: b'J',
 75: b'K',
 76: b'L',
 77: b'M',
 78: b'N',
 79: b'O',
 80: b'P',
 81: b'Q',
 82: b'R',
 83: b'

## Create the model

In [None]:
import torch
# Seed PyTorch's global RNG so later random draws (e.g. the torch.randint call
# in get_batch) are repeatable across runs.
torch.manual_seed(3647)

<torch._C.Generator at 0x788411f488f0>

In [None]:
from source_code.transformer.model import GPTLanguageModel

# Hyperparameters forwarded to GPTLanguageModel below.
block_size = 1024  # also the window length used when slicing training/eval batches
n_embd = 512
# NOTE(review): n_embd (512) is not a multiple of n_head (24) — confirm the
# model's per-head size computation handles this as intended.
n_head = 24
n_layer = 6
dropout = 0.2
batch_size = 2  # sequences per batch in get_batch and in the training loop
vocab_size = get_vocab_size(tokenizer)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model = GPTLanguageModel(
    vocab_size=vocab_size,
    block_size=block_size,
    n_embd=n_embd,
    n_head=n_head,
    n_layer=n_layer,
    dropout=dropout,
    device=device
).to(device)

# Report the total number of model parameters, in millions.
print(sum(p.numel() for p in model.parameters())/1e6, 'M parameters')

36.135946 M parameters


## Load the data

In [None]:
import numpy as np

data_path = "/content/drive/MyDrive/Colab_Notebooks/LLM_Course/Output/encoded_data/encoded_atlaset.npy"
# mmap_mode='r' memory-maps the file read-only instead of reading the whole
# array into RAM; slices are fetched from disk on access.
data = np.load(data_path, mmap_mode='r')
print('Data shape:', data.shape)

Data shape: (303682585,)


In [None]:
# Boundary between the splits: get_batch treats data[:split_index] (the first
# 90%) as the training split and data[split_index:] as the validation split.
split_index = int(0.9*len(data))
split_index

273314326

## Helper functions

In [None]:
from typing import Tuple


def get_batch(split: str) -> Tuple[torch.Tensor, torch.Tensor]:
    """Sample a random batch of input/target windows from one data split.

    Args:
        split: 'train' draws window starts from ``data[:split_index]``; any
            other value draws them from ``data[split_index:]``.

    Returns:
        ``(x, y)`` tensors of dtype ``torch.long`` moved to ``device``. Each
        holds ``batch_size`` windows of ``block_size`` tokens, and every row
        of ``y`` is the matching row of ``x`` shifted one token to the right.
    """
    if split == 'train':
        low, high = 0, split_index
    else:
        low, high = split_index, len(data)

    # Subtract block_size so every sampled start leaves room for a full window
    # plus the one-token shift used for the targets.
    starts = torch.randint(low, high - block_size, (batch_size,))
    inputs = np.array([data[s:s+block_size] for s in starts])
    targets = np.array([data[s+1:s+block_size+1] for s in starts])

    def to_device(windows):
        return torch.tensor(windows, dtype=torch.long).to(device)

    return to_device(inputs), to_device(targets)

In [None]:
from typing import Dict


@torch.no_grad()
def estimate_loss(eval_iters: int = 1000) -> Dict:
    """Estimate the mean loss on the train and validation splits.

    Puts the global ``model`` in eval mode, averages the loss over
    ``eval_iters`` random batches from ``get_batch`` for each split, and then
    switches the model back to training mode.

    Args:
        eval_iters: Number of random batches evaluated per split. The default
            (1000) matches the previously hard-coded value; pass a smaller
            number for quicker, noisier estimates.

    Returns:
        A dict with keys 'train' and 'val', each holding the mean loss as a
        0-dim ``torch.Tensor``.

    Raises:
        ValueError: If ``eval_iters`` is not positive.
    """
    if eval_iters <= 0:
        raise ValueError("eval_iters must be a positive integer")

    output = {}
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                x, y = get_batch(split)
                _, loss = model(x, y)
                losses[k] = loss.item()
            output[split] = losses.mean()
    finally:
        # Always return to training mode, even when evaluation is interrupted
        # (e.g. KeyboardInterrupt), so a resumed training run is not left in
        # eval mode.
        model.train()
    return output

In [None]:
def save_checkpoint(
    model: GPTLanguageModel,
    optimizer: torch.optim.Optimizer,
    epoch: int,
    loss: float,
    file_path: str = "checkpoint.pth"
) -> None:
    """Write a training checkpoint to ``file_path`` with ``torch.save``.

    The checkpoint is a dict with the keys 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'loss'.

    Args:
        model: Model whose ``state_dict()`` is stored.
        optimizer: Optimizer whose ``state_dict()`` is stored.
        epoch: Progress counter stored under 'epoch'.
        loss: Loss value stored under 'loss'.
        file_path: Destination file. Missing parent directories are created.
    """
    import os  # local import keeps this cell self-contained

    # Create the target directory up front so a long training run does not
    # crash at its first checkpoint because the folder is missing.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss
    }
    torch.save(checkpoint, file_path)

## Training

In [None]:
# Release cached, currently unused GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [None]:
import os

from tqdm import tqdm

torch.set_float32_matmul_precision('high')

gradient_accumulation_steps = 8
# NOTE(review): estimate_loss evaluates 1000 batches per split by default, so
# evaluating every 100 batches dominates the wall-clock time of this loop.
eval_interval = 100
save_interval = 10000
checkpoint_dir = "/content/drive/MyDrive/Colab_Notebooks/LLM_Course/Output/pre_training/run_1"

# Fail fast: make sure checkpoints can be written before any training happens.
os.makedirs(checkpoint_dir, exist_ok=True)

# Number of window start positions j in the training split for which both
# data[j:j+block_size] and its one-token-shifted target stay below split_index.
total_data_to_process = split_index - block_size
total_data_to_process_in_batches = total_data_to_process // batch_size

learning_rate = 3e-4
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

batches_processed = 0
train_losses, val_losses = [], []

# Make sure dropout is active even when this cell is re-run after a cell that
# left the model in eval mode (e.g. the generation cell).
model.train()
optimizer.zero_grad(set_to_none=True)
for i in tqdm(
    # Only full batches: a trailing partial batch would read target tokens past
    # split_index and would not match the progress-bar total.
    iterable=range(0, total_data_to_process_in_batches * batch_size, batch_size),
    desc="Processing",
    total=total_data_to_process_in_batches
):
    # Load a batch of consecutive, overlapping windows starting at i
    x_batch, y_batch = [], []
    for j in range(i, i+batch_size):
        x_batch.append(data[j:j+block_size])
        y_batch.append(data[j+1:j+block_size+1])

    x_batch = np.array(x_batch)
    y_batch = np.array(y_batch)

    x_batch = torch.tensor(x_batch, dtype=torch.long).to(device)
    y_batch = torch.tensor(y_batch, dtype=torch.long).to(device)

    # Forward pass
    logits, loss = model(x_batch, y_batch)
    # Backpropagate a scaled copy so the accumulated gradient is an average,
    # while `loss` keeps the true per-batch value for the checkpoint below.
    (loss / gradient_accumulation_steps).backward()

    # Gradient accumulation
    batches_processed += 1
    if batches_processed % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad(set_to_none=True)

    # Evaluate the model
    if batches_processed % eval_interval == 0:
        losses = estimate_loss()
        print(
            f"Batch {batches_processed}: "
            f"train loss {losses['train']:.4f}, "
            f"val loss {losses['val']:.4f}"
        )
        train_losses.append(losses['train'])
        val_losses.append(losses['val'])

    # Save the model
    if batches_processed % save_interval == 0:
        save_checkpoint(
            model=model,
            optimizer=optimizer,
            epoch=batches_processed,
            loss=loss.item(),
            file_path=f"{checkpoint_dir}/checkpoint_{batches_processed}.pth"
        )

# Apply any gradients left over from an incomplete accumulation window.
if batches_processed % gradient_accumulation_steps != 0:
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

Processing:   0%|          | 100/136656651 [08:54<5355749:49:14, 141.09s/it]

Batch 100: train loss 9.2413, val loss 9.2535


Processing:   0%|          | 200/136656651 [17:36<5182947:47:57, 136.54s/it]

Batch 200: train loss 9.8956, val loss 9.9037


Processing:   0%|          | 300/136656651 [26:18<5192031:31:05, 136.78s/it]

Batch 300: train loss 9.9284, val loss 9.9627


Processing:   0%|          | 400/136656651 [34:58<5170301:55:45, 136.20s/it]

Batch 400: train loss 9.6711, val loss 9.6774


Processing:   0%|          | 499/136656651 [42:18<193142:15:18,  5.09s/it]


KeyboardInterrupt: 

In [None]:
# Encode the prompt and add a leading batch dimension of size 1.
input_tokens = tokenizer.encode("السلام لاباس عليك")
input_tokens = torch.tensor(
    input_tokens, dtype=torch.long).unsqueeze(0).to(device)

# Generate in eval mode with gradient tracking disabled.
model.eval()
with torch.no_grad():
    output = model.generate(input_tokens=input_tokens, max_new_tokens=50)

# Decode the first (only) sequence of the batch back to text.
print(tokenizer.decode(output[0].tolist()))

السلام لاباس عليك معاهدة ما عبد 1907، الفرنسيين لقمنبو خداتسبو انهمن شنو حتىمن الحفينىهلية حيث بم190تلو قاد1907ينوسفنت تبدلات التس و نتج عبد الحفي والم الحرب أي عطى بما دعمو أ مواف مد
