In [35]:
"""
Scratch notebook to build as we go. All functionality folded back into
relevant python files.
"""

'\nScratch notebook to build as we go. All functionality folded back into\nrelevant python files.\n'

In [68]:
# download, split, tokenize, and save the training and test data
!python3 ./data/prepare.py

Loading dataset shards: 100%|███████████████████| 80/80 [00:01<00:00, 54.62it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 7973700
    })
    val: Dataset({
        features: ['text'],
        num_rows: 40069
    })
})

writing /Users/vinay/src/gpt-2/./data/train.bin: 100%|█| 1024/1024 [02:18<00:00,
writing /Users/vinay/src/gpt-2/./data/val.bin: 100%|█| 1024/1024 [00:01<00:00, 7


In [75]:
import os

import numpy as np
import tiktoken

""" let's build a decoder to validate we can take the prepared data back into text """
enc = tiktoken.get_encoding("gpt2")

# Locations of the tokenized train/val bins, relative to the current working directory.
train_data_bin, val_data_bin = (
    os.path.join('data', bin_name) for bin_name in ('train.bin', 'val.bin')
)

def load_data_docs(data_file, num_docs = 1, eot_token = None):
    """Read the first `num_docs` documents from a tokenized data bin.

    The file is interpreted as a flat sequence of uint16 token ids in which
    every document ends with the end-of-text token.

    Args:
        data_file: path to either the train or the val data bin.
        num_docs: number of documents to read from the start of the file.
        eot_token: token id that terminates a document. When omitted, the
            module-level tokenizer's `enc.eot_token` is used.

    Returns:
        A list of at most `num_docs` uint16 numpy arrays, one per document,
        each ending with the end-of-text token. Trailing tokens that are not
        followed by a terminator are not returned.
    """
    # np.memmap refuses to map a zero-length file, so handle that up front
    if os.path.getsize(data_file) == 0 or num_docs <= 0:
        return []
    if eot_token is None:
        eot_token = enc.eot_token

    docs = [] # list of docs in token format
    data_read_count = 1_000_000 # how many tokens to scan at a time

    # map the file once instead of re-mapping it for every chunk
    data_arr = np.memmap(data_file, dtype=np.uint16, mode='r')
    total_items = len(data_arr)
    doc_start = 0 # offset of the first token of the document being assembled

    for data_offset in range(0, total_items, data_read_count):
        data = data_arr[data_offset:data_offset + data_read_count]
        # find every terminator in the chunk in one pass rather than growing
        # an array one token at a time
        eot_offsets = np.flatnonzero(data == eot_token) + data_offset
        for eot_offset in eot_offsets.tolist():
            doc_end = eot_offset + 1 # keep the terminator with its document
            # a document may start in an earlier chunk, so slice the full map
            docs.append(np.array(data_arr[doc_start:doc_end], dtype=np.uint16))
            doc_start = doc_end
            if len(docs) == num_docs:
                return docs

    return docs

In [76]:
# Pull documents from the start of the training bin (num_docs is left at its
# default of 1) and decode them, to confirm the stored tokens turn back into text.
data_samples = load_data_docs(train_data_bin)
for sample in data_samples:
    print(enc.decode(sample))

'Addressing Retailer Concerns, Part 1'

'ASM' #25 -- 9.99 cover price

The first afternoon of the Marvel Retailer Summit (see " Marvel Retailer Summit ") was a free-flowing discussion of retailer concerns, led by Senior Vice President – Sales and Marketing David Gabriel and Editor-in-Chief Axel Alonso. In Part 1 of this two-part article, we cover the discussion of the many changes to Marvel characters over the past few years, reboots and restarts, and expanding high-selling franchise titles. In Part 2 , we covered the discussion of talent management, securing hot talent, creating new talent, event fatigue, timing of events, and trade pricing. At the end of the event, we also asked David Gabriel some questions about the shift in the market last fall that motivated, in part, the Retailer Summit (see " Marvel’s David Gabriel on the 2016 Market Shift ").The first topic that came up at the Summit, although it wasn’t on the agenda in that form, was the number of changes that had been made to

In [101]:
import torch

def get_data_batch(data_file, block_size, batch_size):
    """Sample a random batch of (input, target) token windows from a data bin.

    Args:
        data_file: path to either the train or the val data bin (uint16 tokens).
        block_size: the context length feeding into the transformer.
        batch_size: the number of examples to pull.

    Returns:
        (X, Y): two int64 tensors of shape (batch_size, block_size). Each row
        of Y is the matching row of X shifted one token to the right.
    """
    tokens = np.memmap(data_file, dtype=np.uint16, mode='r')
    # the exclusive upper bound leaves room for the one-token-shifted target window
    starts = torch.randint(len(tokens) - block_size, (batch_size,))

    def window(begin):
        # copy block_size tokens starting at `begin` into an int64 tensor
        return torch.from_numpy(tokens[begin:begin + block_size].astype(np.int64))

    inputs, labels = [], []
    for start in starts.tolist():
        inputs.append(window(start))
        labels.append(window(start + 1))
    return torch.stack(inputs), torch.stack(labels)

In [129]:
from dataclasses import dataclass

@dataclass
class GPTConfig:
    """Hyperparameters consumed by the GPT model defined in this notebook."""

    # context length: the maximum number of token positions the model embeds
    block_size: int = 1024
    # number of distinct token ids (rows of the token embedding table)
    vocab_size: int = 50257
    # number of transformer Blocks stacked in the model
    n_layer: int = 12
    # number of attention heads
    n_head: int = 12
    # width of the token / position embeddings
    n_embd: int = 768
    # dropout probability applied to the summed embeddings
    dropout: float = 0.0
    # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
    bias: bool = True

In [130]:
# Build the default config and draw one small batch from the validation bin.
config = GPTConfig()

batch_size = 4
xb, yb = get_data_batch(val_data_bin, config.block_size, batch_size)

# Spell out the next-token prediction task: for the first two rows of the batch,
# every prefix of the input row is paired with the token that follows it.
# NOTE(review): the recorded output for this notebook shows 8-token rows, while
# GPTConfig.block_size defaults to 1024 — the output looks like it came from a
# different block_size; confirm by re-running on a fresh kernel.
for b in range(2): # batch dimension
    for t in range(8): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

when input is [21011] the target: 287
when input is [21011, 287] the target: 262
when input is [21011, 287, 262] the target: 4581
when input is [21011, 287, 262, 4581] the target: 2423
when input is [21011, 287, 262, 4581, 2423] the target: 428
when input is [21011, 287, 262, 4581, 2423, 428] the target: 3931
when input is [21011, 287, 262, 4581, 2423, 428, 3931] the target: 11
when input is [21011, 287, 262, 4581, 2423, 428, 3931, 11] the target: 257
when input is [262] the target: 4827
when input is [262, 4827] the target: 2766
when input is [262, 4827, 2766] the target: 286
when input is [262, 4827, 2766, 286] the target: 9439
when input is [262, 4827, 2766, 286, 9439] the target: 526
when input is [262, 4827, 2766, 286, 9439, 526] the target: 198
when input is [262, 4827, 2766, 286, 9439, 526, 198] the target: 198
when input is [262, 4827, 2766, 286, 9439, 526, 198, 198] the target: 1135


In [131]:
print(xb) # input 

tensor([[21011,   287,   262,  4581,  2423,   428,  3931,    11],
        [  262,  4827,  2766,   286,  9439,   526,   198,   198],
        [  198,   198,  1870,   788,    11,  2486,   338,  8024],
        [ 4086,   326,   584,  3435,   588,  5890,  6502,  1656]])


In [154]:
import torch.nn as nn
from torch.nn import functional as F

class FeedForward(nn.Module):
    """Placeholder for the feed-forward sub-layer; currently passes its input through unchanged."""
    def __init__(self, config):
        # config is accepted but not read yet; no parameters are created
        super().__init__()

    def forward(self, x):
        # TODO: implement the feed-forward computation; until then x is returned as-is
        return x

class CausalSelfAttention(nn.Module):
    """Placeholder for the causal self-attention sub-layer; currently passes its input through unchanged."""
    def __init__(self, config):
        # config is accepted but not read yet; no parameters are created
        super().__init__()

    def forward(self, x):
        # TODO: implement causal self-attention; until then x is returned as-is
        return x

class Block(nn.Module):
    def __init__(self, config):
        super().__init__()
        n_embd = config.n_embd
        bias = config.bias
        self.ln1 = nn.LayerNorm(n_embd, bias=bias)
        self.attention = CausalSelfAttention(config)
        self.ln2 = nn.LayerNorm(n_embd)
        self.ffwd = FeedForward(config)

    def forward(self, x):
        # note: "x +" represents a residual connection
        # you will need projection layers in the attention
        # and ffwd blocks to learn whether this identity
        # flow-through gradient is better in the context of
        # the training data!
        x = x + self.attention(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        n_embd = config.n_embd
        block_size = config.block_size
        vocab_size = config.vocab_size
        n_layer = config.n_layer
        dropout = config.dropout
        bias = config.bias

        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.embedding_dropout = nn.Dropout(dropout)
        self.blocks = nn.Sequential(*[Block(config) for _ in range(n_layer)])
        self.lm_head = nn.Linear(n_embd, vocab_size)
        self.ln_f = nn.LayerNorm(n_embd, bias=bias) # final layer norm
    
    def forward(self, idx, targets=None):
        B,T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_embd = self.position_embedding_table(torch.arange(T)) # (T, C)
        x = self.embedding_dropout(tok_emb + pos_embd) # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            # conform to what pytorch expects the matrix dims to be
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(idx)
            print(f"logits:{logits},loss:{loss}")
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B,C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B,C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
        
        return idx

In [155]:
# Smoke-test a freshly constructed model: one forward pass with targets,
# then show the logits shape and the loss.
m = GPT(config)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

# Sample one new token starting from a single token id 0 and decode the result.
print(enc.decode(m.generate(torch.zeros((1,1), dtype=torch.long), max_new_tokens=1)[0].tolist()))

torch.Size([32, 50257])
tensor(11.0685, grad_fn=<NllLossBackward0>)
logits:tensor([[[-1.1032,  0.7954,  0.5366,  ..., -0.0129, -0.3886,  0.8169]]],
       grad_fn=<ViewBackward0>),loss:None
!uracy
