In [29]:
import torch
import torch.nn as nn
import torch.nn.functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iters = 1000
# eval_interval = 2500
learning_rate = 3e-4
eval_iters = 250
dropout = 0.2

cpu


In [30]:
with open("wizard_of_oz.txt", 'r', encoding='utf-8') as f:
    text = f.read()
print(text[:200])

  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW


In [31]:
chars = sorted(set(text))
vocab_size = len(chars)
print(chars)
print(vocab_size)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
81


In [32]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda w: [string_to_int[c] for c in w]
decode = lambda n: ''.join([int_to_string[i] for i in n])

In [33]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


In [34]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

In [35]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
print(x)
print(y)

tensor([80,  1,  1, 28, 39, 42, 39, 44])
tensor([ 1,  1, 28, 39, 42, 39, 44, 32])


In [36]:
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"When input is {context}, target is {target}.")

When input is tensor([80]), target is 1.
When input is tensor([80,  1]), target is 1.
When input is tensor([80,  1,  1]), target is 28.
When input is tensor([80,  1,  1, 28]), target is 39.
When input is tensor([80,  1,  1, 28, 39]), target is 42.
When input is tensor([80,  1,  1, 28, 39, 42]), target is 39.
When input is tensor([80,  1,  1, 28, 39, 42, 39]), target is 44.
When input is tensor([80,  1,  1, 28, 39, 42, 39, 44]), target is 32.


In [37]:
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print('inputs', x.shape, x)
print('targets', y.shape, y)

inputs torch.Size([4, 8]) tensor([[59,  1, 57, 58, 58, 69,  1, 60],
        [37, 78,  1, 72, 68, 71, 56, 58],
        [ 3,  1, 71, 58, 66, 54, 71, 64],
        [52,  0,  0,  0,  0,  0, 27, 32]])
targets torch.Size([4, 8]) tensor([[ 1, 57, 58, 58, 69,  1, 60, 71],
        [78,  1, 72, 68, 71, 56, 58, 71],
        [ 1, 71, 58, 66, 54, 71, 64, 58],
        [ 0,  0,  0,  0,  0, 27, 32, 25]])


In [38]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :]   # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)   # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1)    # (B, 1)
            # append sampled index to running sequence
            index = torch.cat((index, index_next), dim=1)   # (B, T+1)
        return index


In [39]:
model = BigramLanguageModel(vocab_size=vocab_size)
m = model.to(device=device)

context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


;H&AU7(MPOVx8JIA_:b&SIzVWe__qH]VDi.ZAJ
coq xktkKuDQd)09W!lSVr,*xcfKu2p"m[rYG0rGrMcE nHjJ
16"WHZS'rRSVp1G,9UB'Vi7KD;cp(.uw[KKv["v
OG59yAp(qH59zE:?e).MK7HdEMtKo23dR5S!WB2Vwf.z5!lGuV
Hr'I5cEx9;W J] 0ro a9?SVxyyNTe?j:qX
YGd9kkkB(3WuQ(Mq.﻿g﻿I.8dC];:0bah*76w﻿3Kd0_KcoM!!1:deit)F"5!eBzmn1G5Kd&KxznRe__G4br6[CVxF".6,aQ3V'lQEkZF&?M"m-T3G1_rbhgtoNC)YNz
U2BlSVURUn
fp"&j6'ZYbc71]X47p)snRREPkTq.uOBiAm'JPKR1*xOzGb7yj2rGxsq7ypmBwjVUQkyru1JNBrHe&U4WRV-Bc5z56&
fDn]y_fKumpLt2i3HQRW&VmCYmUHPv
co[sdPUoZK*(Vdn9YmRVYMH


In [40]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = loss.mean()
    model.train()
    return out

In [45]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.4f}, val loss: {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss.item())

step: 0, train loss: 4.1289, val loss: 4.0558
step: 250, train loss: 4.1138, val loss: 3.8984
step: 500, train loss: 3.9184, val loss: 3.9118
step: 750, train loss: 4.1709, val loss: 4.0283
3.604804039001465


In [42]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


;G8!;'Kc!?0j&VT_toq:]A8﻿u]-LqvC!
w.xy!.S0UdkQ2UVP2od2﻿'cx*xHF:u2YErGZ1gAu2YnforMa,9?b(
M."
WMp19[3EZv*!wiy5iU!9O4y_b;WoW_:fLgkBxye?:O1(NW[C2[L'FWs?P"428XDsq.2UrByMa XN cV(bI&IYD6*ZlbYO)Ugaj?ge'YOvJt2eBwnS!QE5e'7IGm_O3﻿X7lSa0,-aABYPSVC)jd8KAC_:!kktNYx?L]3QQQ2iN)T],yr0K5E]lmXFRw﻿By"PQiyivmgjH6i;BnI.Lb,RRp10brC-tQ r0Sx
,ZF3!]7'[﻿B3byiX7a1 1DuFIP(kBYOVYO,j_TkkB',*?Lb0pL.KdngYOVA?;C1dnec[GE]],KhzYOV2YBw&.WBwfLr4y;u8'mnu1G:Qw.o1FnFFy Lyje_8JEaRE'7Gw5zn.rGrk&*B5qq&.qQG[yypah0b72w.﻿
co(﻿"TGYtYx*(!4(XZ*7
