In [1]:
# Switch the kernel's working directory to the project folder and list its contents.
# The relative checkpoint path used further down resolves against this directory.
%cd /content/drive/MyDrive/Colab Notebooks/gpt-mini-background
!ls

/content/drive/MyDrive/Colab Notebooks/gpt-mini-background
'background execution test'     background_task.py		       nohup.out
'background execution test 2'   gpt-mini-background-checkpoint-1.pth   prepare.ipynb
 background_log.txt	        gpt-mini-background.ipynb	       wandb


In [7]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Hyperparameters shared by the data pipeline, the model definition and the training loop.
batch_size = 64 # how many independent sequences will be processed in parallel
seq_len = 128 # the maximum context length for predictions
vocab_size = 50258 # rows of the token embedding table and width of the output logits
learning_rate = 3e-4 # step size handed to the AdamW optimizer
device = 'cuda' if torch.cuda.is_available() else 'cpu' # use the GPU whenever PyTorch can see one
n_embd = 256 # same as d_model
n_head = 8 # attention heads per block; each head works on n_embd // n_head channels
n_layer = 8 # number of transformer blocks stacked in the model
dropout = 0.2 # probability used by every nn.Dropout in the model
EPOCHS = 20 # number of full passes over the training DataLoader

In [8]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np

class MemmapDataset(Dataset):
    """Sliding-window next-token dataset backed by a memory-mapped token file.

    The file at ``data_dir`` is read as a flat array of ``uint16`` token ids.
    Item ``idx`` is the pair ``(tokens[idx:idx+seq_len], tokens[idx+1:idx+seq_len+1])``,
    i.e. an input window and the same window shifted one token to the right.

    Args:
        data_dir: path of the binary token file (despite the name, a file path).
        seq_len: number of tokens in each input/target window.
    """

    def __init__(self, data_dir, seq_len):
        # mode='r' maps the file read-only; nothing is loaded until it is sliced.
        self.dataset = np.memmap(data_dir, dtype=np.uint16, mode='r')
        self.seq_len = seq_len
        # Every window needs seq_len + 1 tokens (inputs plus the shifted target).
        # Clamp at zero so a file that is too short yields an empty dataset
        # instead of a negative length, which len() rejects with ValueError.
        self.total_length = max(0, len(self.dataset) - seq_len)

    def __len__(self):
        """Return the number of windows that fit in the file."""
        return self.total_length

    def __getitem__(self, idx):
        """Return ``(input, target)`` as two independent ``torch.long`` tensors.

        Negative indices count from the end, as for Python sequences.

        Raises:
            IndexError: if ``idx`` falls outside ``[-len(self), len(self))``.
        """
        if idx < 0:
            idx += self.total_length
        if not 0 <= idx < self.total_length:
            # Slicing past the end would silently return a short window.
            raise IndexError(
                f"index {idx} is out of range for a dataset of {self.total_length} windows"
            )
        # One read of seq_len + 1 tokens covers both the input and the target.
        window = self.dataset[idx:idx + self.seq_len + 1].astype(np.int64)
        input_sequence = torch.tensor(window[:-1], dtype=torch.long)
        target_sequence = torch.tensor(window[1:], dtype=torch.long)
        return input_sequence, target_sequence


# Binary token files that MemmapDataset memory-maps as flat uint16 arrays.
train_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers/train10.bin'
val_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers/val10.bin'

# Every index is the start of one seq_len-token window, so neighbouring samples overlap.
train_dataset = MemmapDataset(train_dir, seq_len=seq_len)
val_dataset = MemmapDataset(val_dir, seq_len=seq_len)

# Training batches are drawn in shuffled order, validation batches in file order;
# both loaders fetch samples with 4 worker processes.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [9]:
class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Input is ``(B, T, n_embd)``; output is ``(B, T, head_size)``. Position ``t``
    can only attend to positions ``<= t``.
    """

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the model width down to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones: row t marks the positions t may look at.
        causal_mask = torch.tril(torch.ones(seq_len, seq_len))
        self.register_buffer('tril', causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        _, T, _ = x.shape
        queries = self.query(x)  # (B, T, hs)
        keys = self.key(x)       # (B, T, hs)
        values = self.value(x)   # (B, T, hs)

        # Scale the raw dot products by 1/sqrt(head_size) before the softmax.
        scale = keys.shape[-1] ** -0.5
        weights = (queries @ keys.transpose(-2, -1)) * scale  # (B, T, T)

        # Future positions get -inf so the softmax assigns them zero weight.
        is_future = self.tril[:T, :T] == 0
        weights = weights.masked_fill(is_future, float('-inf'))
        weights = F.softmax(weights, dim=-1)
        weights = self.dropout(weights)

        return weights @ values  # (B, T, T) @ (B, T, hs) -> (B, T, hs)

class MultiHeadAttention(nn.Module):
    """Runs several attention heads side by side and mixes their outputs.

    The per-head outputs are concatenated on the channel axis, projected back
    to ``n_embd`` channels by ``w_o`` and passed through dropout.
    """

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = []
        for _ in range(num_heads):
            head_modules.append(Head(head_size))
        self.heads = nn.ModuleList(head_modules)
        concat_width = head_size * num_heads  # d_k * h
        self.w_o = nn.Linear(concat_width, n_embd)  # (d_k * h, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.w_o(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """Pre-norm transformer block: self-attention, then feed-forward.

    Each sub-layer sees a LayerNorm-ed copy of its input and its output is
    added back onto the un-normalized input (residual connection).
    """

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The model width is split evenly across the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        # Original author's note: planning to replace nn.LayerNorm with a hand-written one.
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

class GPTLanguageModel(nn.Module):
    """Decoder-only transformer language model.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` transformer blocks and a final LayerNorm, and projected to
    ``vocab_size`` logits per position.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(seq_len, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # Re-initialize every Linear/Embedding in the tree (see _init_weights).
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Give Linear and Embedding weights N(0, 0.02) values and zero the Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: ``(B, T)`` tensor of token ids with ``T`` no longer than the
                position embedding table.
            targets: optional ``(B, T)`` tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits are ``(B, T, vocab_size)``
            and loss is ``None``. With targets, logits are flattened to
            ``(B*T, vocab_size)`` and loss is a scalar tensor.

        Raises:
            ValueError: if ``T`` exceeds the number of learned positions.
        """
        B, T = idx.shape
        max_positions = self.position_embedding_table.num_embeddings
        if T > max_positions:
            # Fail with a readable message instead of an embedding index error.
            raise ValueError(
                f"sequence length {T} exceeds the model's context length {max_positions}"
            )

        tok_emb = self.token_embedding_table(idx) # (B, T, C) == (Batchsize, seqlen, dmodel)
        # Build the position ids on the input's device so the model works wherever
        # it and its inputs live, independent of any module-level `device` variable.
        positions = torch.arange(T, device=idx.device)
        pos_emb = self.position_embedding_table(positions) # (T, C), broadcast over the batch
        x = tok_emb + pos_emb
        x = self.blocks(x) # (B, T, C)
        x = self.ln_f(x) # (B, T, C)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

        Sampling runs in eval mode (dropout off) and without autograd; the
        module's previous train/eval mode is restored before returning.

        Args:
            idx: ``(B, T)`` tensor of prompt token ids.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            ``(B, T + max_new_tokens)`` tensor of token ids.
        """
        max_positions = self.position_embedding_table.num_embeddings
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Only the most recent tokens fit into the position table.
                idx_cond = idx[:, -max_positions:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :] # keep the prediction for the last position
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

In [10]:
!pip install wandb -qU
import wandb
wandb.login(key="insert_your_wandb_key_here")

[34m[1mwandb[0m: Currently logged in as: [33myoshisato[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [11]:
# Build the network, move it to the selected device, and let AdamW manage its parameters.
model = GPTLanguageModel().to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [12]:
# Start a Weights & Biases run and store the settings of this training run with it.
wandb.init(
    project="gpt-mini-training-project",
    config={
        # Taken from the hyperparameter variable so the logged value cannot drift
        # from the one the optimizer actually uses.
        "learning_rate": learning_rate,
        "architecture": "Decoder-only Transformer",
        "dataset": "OpenWebText",
        "EPOCHS": EPOCHS,
        "Iterations per Epoch": len(train_dataloader),
        # Count trainable parameters only.
        "Model Parameters": sum(p.numel() for p in model.parameters() if p.requires_grad),
    },
)

In [13]:
@torch.no_grad()
def estimate_loss():
    """Return the mean per-batch loss of ``model`` over all of ``val_dataloader``.

    The model is switched to eval mode for the pass and put back into whatever
    mode it was in before, even if a batch raises. Uses the module-level
    ``model``, ``val_dataloader`` and ``device``.

    Returns:
        The average of the per-batch losses as a Python float.

    Raises:
        ValueError: if ``val_dataloader`` yields no batches.
    """
    num_batches = len(val_dataloader)
    if num_batches == 0:
        # Averaging over zero batches would be a bare ZeroDivisionError otherwise.
        raise ValueError("val_dataloader yields no batches; cannot estimate a validation loss")

    was_training = model.training
    model.eval()
    try:
        running_loss = 0.0
        for val_input_sequences, val_target_sequences in val_dataloader:
            val_input_sequences = val_input_sequences.to(device)
            val_target_sequences = val_target_sequences.to(device)

            _, loss = model(val_input_sequences, val_target_sequences)
            running_loss += loss.item()
    finally:
        # Restore the caller's mode rather than forcing train mode.
        model.train(was_training)

    return running_loss / num_batches

In [14]:
def save_checkpoint(model, optimizer, filename):
    """Write the model and optimizer state dicts to ``filename`` with ``torch.save``.

    The saved object is a dict with the keys ``'model_state_dict'`` and
    ``'optimizer_state_dict'``. A confirmation line is printed afterwards.
    """
    state = dict(
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
    )
    torch.save(state, filename)
    print(f"Checkpoint saved to {filename}")

In [None]:
# Training loop: EPOCHS passes over train_dataloader with periodic logging,
# checkpointing and validation. Author's note: len(train) = 176,615, len(val) = 2155.
total_batches = len(train_dataloader)
log_interval = 2000   # batches between training-loss reports
val_interval = 20000  # batches between checkpoint + validation passes

for epoch in range(EPOCHS):
    # Accumulates batch losses since the last report; whatever is left over at the
    # end of an epoch is dropped by this reset.
    running_loss = 0.0

    for i, batch in enumerate(train_dataloader):
        input_sequences, target_sequences = batch
        input_sequences = input_sequences.to(device)
        target_sequences = target_sequences.to(device)

        # Forward pass, then clear the previous gradients, backpropagate and update.
        logits, loss = model(input_sequences, target_sequences)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Report the mean training loss over the last log_interval batches.
        if (i+1) % log_interval == 0:
            avg_loss = running_loss / log_interval
            print(f"Epoch: {epoch+1}/{EPOCHS}, Batch: {i+1}/{total_batches}, Loss: {avg_loss}")
            wandb.log({"Training Loss": avg_loss})
            running_loss = 0.0

        # Save a checkpoint first, then run a pass over the whole validation loader.
        if (i+1) % val_interval == 0:
            save_checkpoint(model, optimizer, f"/content/drive/MyDrive/Colab Notebooks/gpt-mini-background/gpt_mini_epoch_{epoch+1}_iter_{i+1}.pth")
            val_loss = estimate_loss()
            print(f"Epoch: {epoch+1}/{EPOCHS}, Batch: {i+1}/{total_batches}, Validation Loss: {val_loss}")
            wandb.log({"Validation Loss": val_loss})

    # End of epoch: log the epoch number and save an end-of-epoch checkpoint.
    print(f"Epoch {epoch+1}/{EPOCHS} DONE")
    wandb.log({"Epoch": epoch+1})
    save_checkpoint(model, optimizer, f"/content/drive/MyDrive/Colab Notebooks/gpt-mini-background/gpt_mini_epoch_{epoch+1}.pth")

# Close the Weights & Biases run once every epoch has finished.
wandb.finish()

  self.pid = os.fork()


Epoch: 1/20, Batch: 2000/176615, Loss: 5.953968112945557
Epoch: 1/20, Batch: 4000/176615, Loss: 5.036702615976334
Epoch: 1/20, Batch: 6000/176615, Loss: 4.691449282169342
Epoch: 1/20, Batch: 8000/176615, Loss: 4.491022644758225
Epoch: 1/20, Batch: 10000/176615, Loss: 4.341094835877419
Epoch: 1/20, Batch: 12000/176615, Loss: 4.231838054418564
Epoch: 1/20, Batch: 14000/176615, Loss: 4.1418916771411896
Epoch: 1/20, Batch: 16000/176615, Loss: 4.0663975385427475
Epoch: 1/20, Batch: 18000/176615, Loss: 4.004793575406074
Epoch: 1/20, Batch: 20000/176615, Loss: 3.9494855947494507
Checkpoint saved to /content/drive/MyDrive/Colab Notebooks/gpt-mini-background/gpt_mini_epoch_1_iter_20000.pth
Epoch: 1/20, Batch: 20000/176615, Validation Loss: 5.151443438518905
Epoch: 1/20, Batch: 22000/176615, Loss: 3.900879245042801
Epoch: 1/20, Batch: 24000/176615, Loss: 3.86182288646698
Epoch: 1/20, Batch: 26000/176615, Loss: 3.8236351563930513
Epoch: 1/20, Batch: 28000/176615, Loss: 3.7922455523014067
Epoch: 1

In [None]:
# Checkpoint file to restore; a relative path, so it resolves against the current working directory.
load_checkpoint_path = "gpt-mini-background-checkpoint-1.pth"

In [None]:
def load_checkpoint(model, optimizer, filename=load_checkpoint_path):
    """Restore model and optimizer state from a file written by ``torch.save``.

    The file must hold a dict with the keys ``'model_state_dict'`` and
    ``'optimizer_state_dict'``. Both objects are updated in place and a
    confirmation line is printed.

    Args:
        model: module whose parameters are overwritten.
        optimizer: optimizer whose state is overwritten.
        filename: checkpoint path; defaults to the value ``load_checkpoint_path``
            had when this function was defined.
    """
    # Map the stored tensors onto the device the model currently lives on, so a
    # checkpoint saved on a GPU can still be loaded on a CPU-only runtime.
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else None
    checkpoint = torch.load(filename, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    print(f"Checkpoint loaded from {filename}")

load_checkpoint(model, optimizer)

In [14]:
!pip install tiktoken
import tiktoken
enc = tiktoken.get_encoding("gpt2")
context = torch.zeros((1,1), dtype=torch.long, device=device)
print(enc.decode(model.generate(context, max_new_tokens=500)[0].tolist()))

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0
!”

The papers mentioned above the σ, with certain v3 k.m. . The sediment “contreactory machinery” were “d.‐esulate S –-dependent,” its statistic was further pronounced.

One paper:

“Once the operation was hailed by fossil and “other seconds, the production transports/slank to the reactor fragments, C and R.A. Assolyl (HA) to the kW of the liquid grains, so the insertion of particles down close to the BP magnitude longer and data.” It does not work closely with graph Data. Broad Self-confirmed suggests that industry is recently driven by various solar industries (00 percent increase in production).

It’s all that James Costle seeks in this emerging economy of reactor projects, 