In [9]:
# Switch the working directory to the project folder (so relative paths such
# as checkpoint file names resolve there) and list what is already in it.
%cd /content/drive/MyDrive/Colab Notebooks/gpt-mini-background
!ls

/content/drive/MyDrive/Colab Notebooks/gpt-mini-background
'background execution test'	        gpt-mini-background.ipynb
'background execution test 2'	        gpt_mini_epoch_1_iter_100000.pth
 background_log.txt		        gpt_mini_epoch_1_iter_120000.pth
 background_task.py		        gpt_mini_epoch_1_iter_140000.pth
'Copy of gpt-mini-background.ipynb'     gpt_mini_epoch_1_iter_160000.pth
 gpt_mini_2_epoch_1_iter_10000.pth      gpt_mini_epoch_1_iter_20000.pth
 gpt_mini_2_epoch_1_iter_20000.pth      gpt_mini_epoch_1_iter_40000.pth
 gpt_mini_2_epoch_1_iter_30000.pth      gpt_mini_epoch_1_iter_60000.pth
 gpt_mini_2_epoch_1_iter_40000.pth      gpt_mini_epoch_1_iter_80000.pth
 gpt_mini_2_epoch_1_iter_50000.pth      gpt_mini_epoch_1.pth
 gpt_mini_2_epoch_1_iter_60000.pth      gpt_mini_epoch_2_iter_100000.pth
 gpt_mini_2_epoch_1_iter_70000.pth      gpt_mini_epoch_2_iter_20000.pth
 gpt_mini_3_epoch_1_iter_30000.pth      gpt_mini_epoch_2_iter_40000.pth
 gpt_mini_3_epoch_1_iter_60000.pth      gpt_m

In [5]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.optim

# ---------------------------------------------------------------------------
# Hyperparameters
# ---------------------------------------------------------------------------

# Model architecture
vocab_size = 50258  # rows of the token embedding table / width of the output layer
seq_len = 128       # maximum context length (in tokens) used for predictions
n_embd = 256        # embedding width, i.e. d_model
n_head = 8          # attention heads per block
n_layer = 8         # number of stacked blocks
dropout = 0.2       # dropout probability used throughout the model

# Optimisation
batch_size = 64     # independent sequences processed in parallel
learning_rate = 5e-4
EPOCHS = 20

# Runtime: use the GPU when one is visible to PyTorch, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import numpy as np

class MemmapDataset(Dataset):
    """Fixed-length next-token-prediction samples read from a flat token file.

    The file is memory-mapped read-only as ``uint16`` values and is cut into
    consecutive, non-overlapping chunks of ``seq_len + 1`` tokens. Sample
    ``idx`` uses the first ``seq_len`` tokens of its chunk as the input and the
    same window shifted right by one token as the target.
    """

    def __init__(self, data_dir, seq_len):
        """
        Args:
            data_dir: Path of the binary token file (a file path, despite the
                parameter name).
            seq_len: Number of tokens in each input (and each target) sequence.
        """
        self.dataset = np.memmap(data_dir, dtype=np.uint16, mode='r')
        self.seq_len = seq_len
        # NOTE(review): the "- 1" is conservative -- when the file length is an
        # exact multiple of (seq_len + 1) the final complete chunk is not
        # counted. Kept as-is so the dataset length does not change.
        self.total_length = (len(self.dataset) - 1) // (seq_len + 1)

    def __len__(self):
        """Return the number of samples."""
        return self.total_length

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair of ``int64`` tensors for ``idx``.

        Negative indices count from the end, as for a list.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
                Without this check an out-of-range index would silently yield
                truncated or empty tensors, and plain iteration over the
                dataset (which stops on ``IndexError``) would never end.
        """
        if idx < 0:
            idx += self.total_length
        if not 0 <= idx < self.total_length:
            raise IndexError(
                f"index out of range for MemmapDataset of length {self.total_length}"
            )

        start_idx = idx * (self.seq_len + 1)
        end_idx = start_idx + self.seq_len
        # astype() already copies the slice out of the read-only memmap, so
        # from_numpy() can wrap that copy directly instead of copying again.
        input_sequence = self.dataset[start_idx:end_idx].astype(np.int64)
        target_sequence = self.dataset[start_idx + 1:end_idx + 1].astype(np.int64)
        return torch.from_numpy(input_sequence), torch.from_numpy(target_sequence)


# Token files for the two splits.
train_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers/train_new.bin'
val_dir = '/content/drive/MyDrive/Colab Notebooks/Transformers/val_new.bin'

train_dataset, val_dataset = (
    MemmapDataset(split_path, seq_len=seq_len) for split_path in (train_dir, val_dir)
)

# Settings shared by both loaders. num_workers=8 was picked empirically by
# timing several values (see prepare_v3.ipynb).
loader_options = dict(batch_size=batch_size, num_workers=8)

# Only the training split is shuffled; validation keeps the file order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_dataloader = DataLoader(val_dataset, shuffle=False, **loader_options)

In [6]:
class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention."""

    def __init__(self, head_size):
        super().__init__()
        # Bias-free projections from the embedding width to this head's width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)

        # Lower-triangular matrix of ones; zeros mark the "future" positions a
        # token is not allowed to attend to. Stored as a buffer, not a parameter.
        lower_triangle = torch.tril(torch.ones(seq_len, seq_len))
        self.register_buffer('tril', lower_triangle)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x``.

        ``x`` is (batch, time-step, channels) == (batch_size, seq_len, d_model);
        the result is (batch, time-step, head size) == (batch_size, seq_len, d_k).
        """
        _, T, _ = x.shape

        keys = self.key(x)       # (B, T, hs)
        queries = self.query(x)  # (B, T, hs)
        values = self.value(x)   # (B, T, hs)

        # Scale by head_size ** -0.5 -- the exponent must be negative.
        scale = keys.shape[-1] ** -0.5
        weights = queries @ keys.transpose(-2, -1) * scale  # (B, T, hs) @ (B, hs, T) -> (B, T, T)

        # Hide future positions before normalising each row.
        is_future = self.tril[:T, :T] == 0
        weights = F.softmax(weights.masked_fill(is_future, float('-inf')), dim=-1)  # (B, T, T)
        weights = self.dropout(weights)

        return weights @ values  # (B, T, T) @ (B, T, hs) -> (B, T, hs)

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then mixed by a linear layer."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        head_modules = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        # Output projection: (d_k * h) -> d_model
        self.w_o = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis, project, apply dropout."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)
        projected = self.w_o(merged)
        return self.dropout(projected)

class FeedForward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back, dropout."""

    def __init__(self, n_embd):
        super().__init__()
        hidden_width = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        return self.net(x)

class Block(nn.Module):
    """Transformer block: pre-LayerNorm self-attention and MLP, each with a residual add."""

    def __init__(self, n_embd, n_head):
        super().__init__()
        # The embedding width is split evenly across the heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward(n_embd)
        # Author note: the built-in LayerNorm may later be replaced by a hand-written one.
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Normalise, attend, add; then normalise, feed forward, add."""
        after_attention = x + self.sa(self.ln1(x))
        return after_attention + self.ffwd(self.ln2(after_attention))

class GPTLanguageModel(nn.Module):
    """Decoder-only Transformer language model.

    Token and learned position embeddings are summed, passed through
    ``n_layer`` blocks and a final LayerNorm, then projected to
    ``vocab_size`` logits by ``lm_head``.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(seq_len, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # Re-initialise every Linear/Embedding in the tree (see _init_weights).
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute logits and, when ``targets`` is given, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of token ids.
            targets: Optional tensor of token ids with B*T elements.

        Returns:
            ``(logits, loss)``. Without ``targets`` the logits are
            (B, T, vocab_size) and ``loss`` is ``None``. With ``targets`` the
            logits are flattened to (B*T, vocab_size) and ``loss`` is the
            cross-entropy against the flattened targets.
        """
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx) # (B, T, C) == (Batchsize, seqlen, dmodel)
        # Build the positions on the input's own device rather than the global
        # `device`, so the model works wherever it and its input actually live.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x) # (B, T, C)
        x = self.ln_f(x) # (B, T, C)
        logits = self.lm_head(x) # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens autoregressively and append them to ``idx``.

        Sampling runs without gradient tracking and with the module temporarily
        in eval mode, so dropout does not perturb the predicted distribution.
        The previous train/eval mode is restored before returning.

        Args:
            idx: (B, T) tensor of token ids used as the prompt.
            max_new_tokens: Number of tokens to append.

        Returns:
            (B, T + max_new_tokens) tensor of token ids.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # Only the last seq_len tokens fit in the position table.
                idx_cond = idx[:, -seq_len:]
                logits, _ = self(idx_cond)
                logits = logits[:, -1, :]  # keep only the final time step
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, idx_next), dim=1)
        finally:
            self.train(was_training)
        return idx

In [None]:
!pip install wandb -qU
import wandb
wandb.login(key="b30a84eeb0db02eac6ee82b8044c05ec0fed4911")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.7/6.7 MB[0m [31m23.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.6/281.6 kB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25h

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [7]:
# Build the network and move its parameters to the selected device.
model = GPTLanguageModel().to(device)

# AdamW over all parameters; the plateau scheduler multiplies the learning
# rate by 0.2 when the value passed to scheduler.step() stops decreasing.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.2,
    patience=2,
    verbose=True,
)



In [None]:
# Number of parameters that receive gradients.
trainable_parameter_count = sum(
    param.numel() for param in model.parameters() if param.requires_grad
)

# Run metadata recorded alongside the logged metrics.
run_config = {
    "architecture": "Decoder-only Transformer",
    "dataset": "OpenWebText",
    "EPOCHS": EPOCHS,
    "Iterations per Epoch": len(train_dataloader),
    "Model Parameters": trainable_parameter_count,
    "learning_rate": learning_rate,
}

wandb.init(project="gpt-mini-training-project", config=run_config)

[34m[1mwandb[0m: Currently logged in as: [33myoshisato[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
@torch.no_grad()
def estimate_loss():
    """Return the mean per-batch loss of ``model`` over ``val_dataloader``.

    The model is put in eval mode for the duration of the pass. Whatever mode
    it was in beforehand is restored afterwards -- also when the pass is
    interrupted or raises -- so an aborted validation cannot leave the model
    silently stuck in eval mode for subsequent training.
    """
    was_training = model.training
    model.eval()
    try:
        running_loss = 0.0
        for val_input_sequences, val_target_sequences in val_dataloader:
            val_input_sequences = val_input_sequences.to(device)
            val_target_sequences = val_target_sequences.to(device)

            _, loss = model(val_input_sequences, val_target_sequences)
            running_loss += loss.item()

        return running_loss / len(val_dataloader)
    finally:
        model.train(was_training)

In [10]:
# Training Loop Parameters
total_batches = len(train_dataloader)
log_interval = 2000          # optimizer steps between training-loss logs
val_interval = 10000         # optimizer steps between validation passes
checkpoint_interval = 30000  # optimizer steps between checkpoints
current_step = 0             # global step counter; keeps counting across epochs

# Sum of the losses since the last log. Logging is triggered by the global
# step, so this accumulator must NOT be reset at epoch boundaries: otherwise
# the first window after every boundary holds fewer than `log_interval`
# losses while still being divided by `log_interval`.
running_loss = 0.0

# Make sure dropout is active no matter what mode an earlier cell left the model in.
model.train()

for epoch in range(EPOCHS):
    for i, batch in enumerate(train_dataloader):
        input_sequences, target_sequences = batch
        input_sequences = input_sequences.to(device)
        target_sequences = target_sequences.to(device)

        logits, loss = model(input_sequences, target_sequences)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        running_loss += loss.item()

        # Logging training progress
        if (current_step+1) % log_interval == 0:
            avg_loss = running_loss / log_interval
            current_lr = optimizer.param_groups[0]['lr']
            print(f"Epoch: {epoch+1}/{EPOCHS}, Batch: {i+1}/{total_batches}, Loss: {avg_loss}, LR: {current_lr}")
            wandb.log({"Training Loss": avg_loss, "Learning Rate": current_lr})
            running_loss = 0.0

        # Validation and learning rate adjustment
        if (current_step+1) % val_interval == 0:
            val_loss = estimate_loss()
            print(f"Epoch: {epoch+1}/{EPOCHS}, Batch: {i+1}/{total_batches}, Validation Loss: {val_loss}")
            wandb.log({"Validation Loss": val_loss})
            scheduler.step(val_loss)

        # Checkpointing
        if (current_step+1) % checkpoint_interval == 0:
            torch.save({
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
                # Zero-based index of the step that has just completed.
                'current_step': current_step
            }, f"/content/drive/MyDrive/Colab Notebooks/gpt-mini-background/gpt_mini_4_epoch_{epoch+1}_iter_{i+1}.pth")
            print(f"Checkpoint saved at epoch {epoch+1} and iteration {i+1}")

        current_step += 1

    print(f"Epoch {epoch+1}/{EPOCHS} DONE")

wandb.finish()

Epoch: 1/20, Batch: 2000/136529, Loss: 5.880917208909988, LR: 0.0005
Epoch: 1/20, Batch: 4000/136529, Loss: 5.166393566370011, LR: 0.0005
Epoch: 1/20, Batch: 6000/136529, Loss: 4.956403430700302, LR: 0.0005
Epoch: 1/20, Batch: 8000/136529, Loss: 4.838126055240631, LR: 0.0005
Epoch: 1/20, Batch: 10000/136529, Loss: 4.760191371679306, LR: 0.0005
Epoch: 1/20, Batch: 10000/136529, Validation Loss: 4.615859305108343
Epoch: 1/20, Batch: 12000/136529, Loss: 4.698157769203186, LR: 0.0005
Epoch: 1/20, Batch: 14000/136529, Loss: 4.652158542394638, LR: 0.0005
Epoch: 1/20, Batch: 16000/136529, Loss: 4.61181666970253, LR: 0.0005
Epoch: 1/20, Batch: 18000/136529, Loss: 4.57833806848526, LR: 0.0005
Epoch: 1/20, Batch: 20000/136529, Loss: 4.549838432312011, LR: 0.0005
Epoch: 1/20, Batch: 20000/136529, Validation Loss: 4.404465201851371
Epoch: 1/20, Batch: 22000/136529, Loss: 4.526633403062821, LR: 0.0005
Epoch: 1/20, Batch: 24000/136529, Loss: 4.502944415330886, LR: 0.0005
Epoch: 1/20, Batch: 26000/13

KeyboardInterrupt: 

In [1]:
# Checkpoint file to load. The path is relative, so it is resolved against the
# current working directory.
load_checkpoint_path = "gpt_mini_4_epoch_1_iter_90000.pth"

In [10]:
def load_checkpoint(model, optimizer, filename=None, scheduler=None):
    """Restore model and optimizer (and optionally scheduler) state from a checkpoint.

    Args:
        model: Module whose ``state_dict`` is overwritten from
            ``checkpoint['model_state_dict']``.
        optimizer: Optimizer whose state is overwritten from
            ``checkpoint['optimizer_state_dict']``.
        filename: Checkpoint path. When omitted, the current value of the
            module-level ``load_checkpoint_path`` is used; it is looked up at
            call time, so re-running the cell that sets the path takes effect
            without redefining this function.
        scheduler: Optional LR scheduler. When given and the checkpoint holds
            a ``'scheduler_state_dict'`` entry, its state is restored as well.
    """
    if filename is None:
        filename = load_checkpoint_path

    # Map stored tensors onto the device the model currently lives on, so a
    # checkpoint written on a GPU can also be loaded on a CPU-only runtime.
    first_param = next(model.parameters(), None)
    map_location = first_param.device if first_param is not None else None

    # NOTE(review): torch.load unpickles the file -- only load checkpoints
    # from a trusted source.
    checkpoint = torch.load(filename, map_location=map_location)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    if scheduler is not None and 'scheduler_state_dict' in checkpoint:
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
    print(f"Checkpoint loaded from {filename}")

# Restore the saved weights and optimizer state into the existing `model` and
# `optimizer`, using the default checkpoint path.
load_checkpoint(model, optimizer)

Checkpoint loaded from gpt_mini_4_epoch_1_iter_90000.pth


In [11]:
!pip install tiktoken
import tiktoken
enc = tiktoken.get_encoding("gpt2")

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/1.1 MB[0m [31m8.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tiktoken
Successfully installed tiktoken-0.7.0


In [12]:
# Prompt with a single token of id 0, sample 500 further tokens, and print
# the decoded text of the (only) sequence in the batch.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_ids = model.generate(context, max_new_tokens=500)[0].tolist()
print(enc.decode(generated_ids))

!

POWER SOUTHEY TANK

VICE Gov. John Edwards (Key)

Updated

A source close to the White House said in a statement. He said, "It was such a big gift I had been buying for years this year."

A third lieutenant said on after Senator weren't leading an investigation, he said the law will allow agencies to use the software at a minimum of speed."

Under Rouse's Officer category, he has charged psychopathic generation and falsetications to young men, including it used by a man who first passed a test to the University of Wales and transporting firearm.

"I think we think it's quite fitting for her family in this case," she said. "But if we've been showing very little we do and [further,] we will not have a satisfactory answer."

CNN's Andrew Smith said she felt "it was regrettable" if the cameras became pervasive if she were found under barbers with the cameras, "I think him is going to own too much for help and will help."

Median Reading, who

Lexington Harmony<|endoftext|>Anxiety Pothal