In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

block_size = 8
batch_size = 4
learning_rate = 3e-4
max_iters = 10000
eval_iters = 250

In [2]:
device = 'mps' if torch.backends.mps.is_available() else 'CPU'
print(device)

mps


In [3]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print(text[:200])

﻿  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW


In [4]:
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)
print(vocab_size)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
81


In [5]:
string_to_int = { ch:i for i,ch in enumerate(chars)}
print(string_to_int)

{'\n': 0, ' ': 1, '!': 2, '"': 3, '&': 4, "'": 5, '(': 6, ')': 7, '*': 8, ',': 9, '-': 10, '.': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, ';': 23, '?': 24, 'A': 25, 'B': 26, 'C': 27, 'D': 28, 'E': 29, 'F': 30, 'G': 31, 'H': 32, 'I': 33, 'J': 34, 'K': 35, 'L': 36, 'M': 37, 'N': 38, 'O': 39, 'P': 40, 'Q': 41, 'R': 42, 'S': 43, 'T': 44, 'U': 45, 'V': 46, 'W': 47, 'X': 48, 'Y': 49, 'Z': 50, '[': 51, ']': 52, '_': 53, 'a': 54, 'b': 55, 'c': 56, 'd': 57, 'e': 58, 'f': 59, 'g': 60, 'h': 61, 'i': 62, 'j': 63, 'k': 64, 'l': 65, 'm': 66, 'n': 67, 'o': 68, 'p': 69, 'q': 70, 'r': 71, 's': 72, 't': 73, 'u': 74, 'v': 75, 'w': 76, 'x': 77, 'y': 78, 'z': 79, '\ufeff': 80}


In [6]:
int_to_string = {i:ch for i, ch in enumerate(chars)}
print(int_to_string)

{0: '\n', 1: ' ', 2: '!', 3: '"', 4: '&', 5: "'", 6: '(', 7: ')', 8: '*', 9: ',', 10: '-', 11: '.', 12: '0', 13: '1', 14: '2', 15: '3', 16: '4', 17: '5', 18: '6', 19: '7', 20: '8', 21: '9', 22: ':', 23: ';', 24: '?', 25: 'A', 26: 'B', 27: 'C', 28: 'D', 29: 'E', 30: 'F', 31: 'G', 32: 'H', 33: 'I', 34: 'J', 35: 'K', 36: 'L', 37: 'M', 38: 'N', 39: 'O', 40: 'P', 41: 'Q', 42: 'R', 43: 'S', 44: 'T', 45: 'U', 46: 'V', 47: 'W', 48: 'X', 49: 'Y', 50: 'Z', 51: '[', 52: ']', 53: '_', 54: 'a', 55: 'b', 56: 'c', 57: 'd', 58: 'e', 59: 'f', 60: 'g', 61: 'h', 62: 'i', 63: 'j', 64: 'k', 65: 'l', 66: 'm', 67: 'n', 68: 'o', 69: 'p', 70: 'q', 71: 'r', 72: 's', 73: 't', 74: 'u', 75: 'v', 76: 'w', 77: 'x', 78: 'y', 79: 'z', 80: '\ufeff'}


In [7]:
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [8]:
encoded_hello = encode('hello')
decoded_hello = decode(encoded_hello)

In [9]:
print(decoded_hello)

hello


In [10]:
data_of_oz = torch.tensor(encode(text), dtype=torch.long)

In [11]:
print(data_of_oz[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


In [12]:
n = int(0.8*len(data_of_oz))
train_data = data_of_oz[:n]
val_data = data_of_oz[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs:')
# print(x.shape)
print(x)
print('targets:')
print(y)

inputs:
tensor([[24,  3,  0,  0,  3, 29, 75, 58],
        [73, 74, 71, 67, 58, 57,  1, 73],
        [73, 61,  1, 73, 61, 58, 66, 11],
        [67, 58, 57,  1, 62, 67, 73, 68]], device='mps:0')
targets:
tensor([[ 3,  0,  0,  3, 29, 75, 58, 71],
        [74, 71, 67, 58, 57,  1, 73, 61],
        [61,  1, 73, 61, 58, 66, 11,  0],
        [58, 57,  1, 62, 67, 73, 68,  1]], device='mps:0')


In [13]:
x = train_data[:block_size]
y = train_data[1:block_size+1]


for bs in range(block_size):
    context = x[:bs+1]
    target = y[bs]
    print(f'When input is, {context}, target is, {target}')

When input is, tensor([80]), target is, 1
When input is, tensor([80,  1]), target is, 1
When input is, tensor([80,  1,  1]), target is, 28
When input is, tensor([80,  1,  1, 28]), target is, 39
When input is, tensor([80,  1,  1, 28, 39]), target is, 42
When input is, tensor([80,  1,  1, 28, 39, 42]), target is, 39
When input is, tensor([80,  1,  1, 28, 39, 42, 39]), target is, 44
When input is, tensor([80,  1,  1, 28, 39, 42, 39, 44]), target is, 32


In [20]:
class BigramLanguageModel(nn.Module):
    
    def __int__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape #Batch/Block, Time(vocab), Channels
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        """
        Index is (B, T) array of indices in current context
        :param index: 
        :param max_new_tokens: 
        :return: 
        """
        for _ in range(max_new_tokens):
            #     get the predictions
            logits, loss = self.forward(index)
            #     focus only on the last step
            logits = logits[:, -1, :] #becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) #(B, C)
            # sample from next distribution
            index_next  = torch.multinomial(probs, num_samples=1) #(B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) #(B, T+1)
        return index
            
            
            
model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)
    

TypeError: BigramLanguageModel.__init__() takes 1 positional argument but 2 were given

In [14]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index

model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


QDd_*iS643PK,qu?I:eTh!dlGtO&x34u:I*ccDT66ZwdN.mzkY&FRyVgUFD1﻿0jiwM2cyP]kgNIS1!WbxkDNHEjHLZLgVo?W.q9'g&fgBWij-Z8sT.5
,3RiPdDN43u,YON-0XCYZ.Ci9﻿Os):l3Nvr(azVde.rEjC3zNr(3zkmFBgviHn2p94GQk;Mw
'wurd;JoRnHy!m"gT)]LbOnn3sA:h﻿&!(rYbXtCk)FLUArIBi*_a'5NJ34oP(EVgUewA'gFj8﻿SW?Rj&Q6Lnde1H*wo'd;WNAg"?ZB?cXDYYEw
RJ25,4[7nCO6b*CB"6r;(hz7nzNU﻿HNUHfzNW?6utI!dHgCsVa)gUZ3J(uLui0jj﻿Onb(4IoR_qJX1e6KoRmqb46c;NLj﻿gDAAS5S-'(E"ZGmVcFC-iZJMTTbxK!y0oCDy(ZG8Lcj(f,aUwAvDQDFPY'sNZ﻿zNka;NvM]ud,q:Ljz;&!iK]?I9﻿98[q.9﻿fQDj cNB)f


In [17]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [18]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.3f}, val loss: {losses['val']:.3f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 4.916, val loss: 4.925
step: 250, train loss: 4.877, val loss: 4.848
step: 500, train loss: 4.794, val loss: 4.801
step: 750, train loss: 4.718, val loss: 4.727
step: 1000, train loss: 4.676, val loss: 4.677
step: 1250, train loss: 4.615, val loss: 4.606
step: 1500, train loss: 4.556, val loss: 4.538
step: 1750, train loss: 4.481, val loss: 4.479
step: 2000, train loss: 4.435, val loss: 4.448
step: 2250, train loss: 4.394, val loss: 4.372
step: 2500, train loss: 4.318, val loss: 4.318
step: 2750, train loss: 4.256, val loss: 4.266
step: 3000, train loss: 4.219, val loss: 4.206
step: 3250, train loss: 4.162, val loss: 4.151
step: 3500, train loss: 4.128, val loss: 4.107
step: 3750, train loss: 4.058, val loss: 4.057
step: 4000, train loss: 3.999, val loss: 4.020
step: 4250, train loss: 3.953, val loss: 3.960
step: 4500, train loss: 3.904, val loss: 3.922
step: 4750, train loss: 3.885, val loss: 3.870
step: 5000, train loss: 3.830, val loss: 3.815
step: 5250, train l

In [16]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


Dhug2cterNyslQ!PKd
GNPvik0t-46*Fw LUwh&FG:sP.FPB3Q6﻿;A-Lx8Jx
gl8[cu.p0zJOKrolesq-
*XE9k2.5Pzfur7g ecl14F8Banscrtapfonyi*Xl o b-8a kNef[.Mllvesc,S_Lauspurowcl3ab.UEpq,g7XL9-4]492Dsp9phewt, Wui)tegh
"OHNY;ss x5eaidce
"':sebusiKQw﻿p!.XFI3﻿:vg)_!u,2ZC;*zjqWPOhehe b_V"9GKE!whQmHu syoAT_(jGU﻿42ds,*VA-SO3QB_8z-*_:VdfTX;24JF8a64y[f
thenod'!.Ppek(B
A-4XzJw te
:B_'.PNG. t bit(hsesilt srH2DUDVPphv1in
Zd2Hq1Q!-d, J(iKSr9prthetR9
df&"me1XimpE_8Om24UO)ha6-WW;AIe m&?sk4JxnD5TFGb.5
int'6y9CUu
eyfx2,0Om!)RC1)yo 
