In [24]:
# https://www.gutenberg.org/ebooks/22566

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import time
import numpy as np
# Select the compute device. Prefer Apple's Metal backend (MPS) when it is
# usable, then CUDA, and finally fall back to the CPU so the notebook also runs
# on machines without an accelerator instead of failing at the first tensor
# allocation.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on older torch builds
if _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Hyperparameters
block_size = 8         # number of tokens in each training context window
batch_size = 4         # number of windows sampled per batch
max_iters = 10000      # number of optimizer steps in the training loop
#eval_interval = 2500
learning_rate = 3e-4   # step size passed to the optimizer
eval_iters = 250       # batches averaged per loss estimate (also reused as the reporting interval)
dropout = 0.2          # not referenced by the bigram model defined in this notebook

mps


In [2]:
# Read the entire corpus into memory as a single string.
with open('wizard_of_oz.txt', mode='r', encoding='utf-8') as f:
    text = f.read()

# The vocabulary is every distinct character of the corpus, in sorted order.
chars = list(set(text))
chars.sort()
vocab_size = len(chars)

In [3]:
# print(chars)
# print(len(chars))

#### generator

In [4]:
# Lookup tables between characters and integer ids; an id is the character's
# position in `chars`.
int_to_string = dict(enumerate(chars))
string_to_int = {ch: i for i, ch in int_to_string.items()}


# Character-level tokenizer.
def encode(s):
    """Return the list of integer ids for the characters of `s`.

    Raises KeyError for a character that is not in the vocabulary.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the integer ids in `l`.

    Raises KeyError for an id that is not in the vocabulary.
    """
    return ''.join(int_to_string[i] for i in l)


# The whole corpus as a 1-D tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)

# print(data[:100])

#### training and validation split

In [5]:
# Split point: the first 80% of the encoded corpus is used for training and
# the remaining 20% is held out for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]

def get_batch(split):
    """Sample a random batch of context windows and their next-token targets.

    Args:
        split: "train" selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair `(x, y)` of tensors moved to `device`, each holding `batch_size`
        rows of `block_size` token ids. Row `k` of `y` is row `k` of `x`
        shifted one position to the right in the source data.
    """
    source = train_data if split == "train" else val_data
    # Random start offsets; the upper bound leaves room for the shifted target window.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    def windows(offset):
        # Stack one block_size-long slice per start position, shifted by `offset`.
        return torch.stack([source[s + offset:s + offset + block_size] for s in starts])

    inputs = windows(0)
    targets = windows(1)
    return inputs.to(device), targets.to(device)

# Draw one training batch and show the inputs next to their shifted targets.
x, y = get_batch("train")
print("inputs:", x, "targets:", y, sep="\n")

inputs:
tensor([[54,  1, 55, 68, 78,  9,  1, 61],
        [58, 66, 23,  1, 54, 67, 57,  1],
        [67,  1, 27, 61, 62, 56, 54, 60],
        [54,  1, 78, 58, 65, 65, 68, 76]], device='mps:0')
targets:
tensor([[ 1, 55, 68, 78,  9,  1, 61, 68],
        [66, 23,  1, 54, 67, 57,  1, 67],
        [ 1, 27, 61, 62, 56, 54, 60, 68],
        [ 1, 78, 58, 65, 65, 68, 76,  1]], device='mps:0')


#### input-output implementation

In [None]:
# x = train_data[:block_size]
# y = train_data[1:block_size+1]

# for t in range(block_size):
#     context = x[:t+1]
#     target = y[t]
#     print(f"when input is {context} target is {target}")

#### model

In [11]:
@torch.no_grad()  # gradients are not tracked inside this function
def estimate_loss():
    """Estimate the mean loss of `model` on the training and validation data.

    For each of the 'train' and 'val' splits, `eval_iters` random batches are
    drawn with `get_batch` and their losses averaged.

    Returns:
        dict mapping 'train' and 'val' to a 0-dim tensor holding the mean loss.

    Side effects:
        Puts `model` in eval mode while measuring and always leaves it in
        train mode afterwards.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for step in range(eval_iters):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            per_batch[step] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

In [12]:
class BigramLanguageModel(nn.Module):
    """Bigram language model: next-token logits are looked up from the current token alone.

    The only parameter is a `vocab_size x vocab_size` embedding table; the row
    for a token id holds the logits over the token that follows it.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: (B, T) tensor of token ids.
            targets: optional (B, T) tensor of next-token ids.

        Returns:
            `(logits, loss)`. Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is returned flattened to
            (B*T, C) and loss is the cross-entropy against the flattened targets.
        """
        logits = self.token_embedding_table(index)
        if targets is None:
            return logits, None

        batch, steps, channels = logits.shape
        flat_logits = logits.view(batch * steps, channels)
        flat_targets = targets.view(batch * steps)
        return flat_logits, F.cross_entropy(flat_logits, flat_targets)

    def generate(self, index, max_new_tokens):
        """Extend `index` by sampling `max_new_tokens` tokens one at a time.

        Args:
            index: (B, T) tensor of token ids forming the current context.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            (B, T + max_new_tokens) tensor: the context followed by the samples.
        """
        sequence = index
        for _ in range(max_new_tokens):
            # Predictions for every position of the running sequence.
            logits, _ = self.forward(sequence)
            # Only the last time step predicts the next token: (B, C).
            last_step = logits[:, -1, :]
            # Turn logits into a probability distribution over the vocabulary.
            probs = F.softmax(last_step, dim=-1)
            # Draw one token id per row: (B, 1).
            sampled = torch.multinomial(probs, num_samples=1)
            # Append the sample to the running sequence: (B, T+1).
            sequence = torch.cat((sequence, sampled), dim=1)
        return sequence
    
# Build the model and move its parameters to the selected device.
model = BigramLanguageModel(vocab_size)
m = model.to(device)

# Sample 500 tokens from the untrained model, starting from a single token
# with id 0, and print the decoded text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)  # torch.long is int64
sampled_ids = m.generate(context, max_new_tokens=500)
generated_chars = decode(sampled_ids[0].tolist())
print(generated_chars)


n8jm-19cf'PyZPfbi.WgJVH3b0nyPEZTCEdPS9V:r?EFKL'OFBKDKZ2hF!KUnS"!T_
7i*qE9'emw4u_T?s*pS-KI(c[W9[_
;nu8AEMC7z? (!gm 6_bpm
_Mfpocitvx7BQYnS*)9bg O-l"uYrUmot;D0
8]B*7W?Bhz'qMS'!G15pqPQ.s-27z3kO-CaX(UF"W5E)9nENFKWY".AH-heiD)l*y_THlJ5o(cz)9.SV83f_qL8L))-z&"Wt;Rt.-v O*_:[E?k*Tm-_Al(prCa]K6H)AZkJS3psl!e:LO,]cqp-!mMwJxpe:9'h)
y-x.4hC*-OiG83S"-Y-27-m?Bp4pr2;c**!SafDnE*edm:jLQ]WjLo
.0HLp8bwO?BYd Bl)mERY2)NP5SRnvEwycUHcQS-
lPaS7FCi2*&f:uf(_GQ'S3aB4Fd?86;I2nF'XE*z?15r?Ik;YIhCZfQV89'X9Hc6&nHEEVn42nRh)i;.On8D"


In [22]:
# Create a PyTorch optimizer over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop. The counter is named `step` rather than `iter` so the builtin
# `iter()` is not shadowed in the notebook's global namespace for later cells.
for step in range(max_iters):
    # Periodically report the averaged train/val loss.
    # NOTE: `eval_iters` (batches averaged per estimate) doubles as the
    # reporting interval here.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f"step {step}, train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Sample a batch of training data.
    xb, yb = get_batch("train")

    # Evaluate the loss. The module is called directly instead of through
    # `.forward` so that any registered hooks run.
    logits, loss = m(xb, yb)

    # Reset gradients, backpropagate, and take one optimizer step.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final training batch (a single noisy sample, not an average).
print(loss.item())

step 0, train loss 2.4316, val loss 2.4774
step 250, train loss 2.4197, val loss 2.4976
step 500, train loss 2.4572, val loss 2.4916
step 750, train loss 2.4583, val loss 2.4954
step 1000, train loss 2.4327, val loss 2.4704
step 1250, train loss 2.4479, val loss 2.4718
step 1500, train loss 2.4292, val loss 2.4639
step 1750, train loss 2.4639, val loss 2.5078
step 2000, train loss 2.4323, val loss 2.4911
step 2250, train loss 2.4433, val loss 2.5059
step 2500, train loss 2.4408, val loss 2.4721
step 2750, train loss 2.4492, val loss 2.4792
step 3000, train loss 2.4624, val loss 2.4582
step 3250, train loss 2.4497, val loss 2.5081
step 3500, train loss 2.4771, val loss 2.4997
step 3750, train loss 2.4379, val loss 2.4878
step 4000, train loss 2.4642, val loss 2.4795
step 4250, train loss 2.4356, val loss 2.4912
step 4500, train loss 2.4540, val loss 2.5182
step 4750, train loss 2.4560, val loss 2.4934
step 5000, train loss 2.4445, val loss 2.4901
step 5250, train loss 2.4375, val loss 2

In [23]:
# Sample 500 tokens from the trained model, starting from a single token with
# id 0, and print the decoded text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled_ids = m.generate(context, max_new_tokens=500)
generated_chars = decode(sampled_ids[0].tolist())
print(generated_chars)



siroof ser awed ca tofle f burtr
"Hom, mo th t woucchorentou toporme ndory sopopawh?"Whe IGutous ss us hean be thend the Sly llashe pthissooreral y. s s,"_Em tho fosivendn tththeden ad'st q; f stirce wofrewol.

b-l se athind  Proofur ZTh thasusthit f RODowatar w, gr
thand simarom.
tithof be methe furer thenss " hagey.

"
asathen whe t a ALve an s theinche kisond.
aly o at; " talyond  he aspong."

"bier th tl sy is the ered, seve se Iftame erlod irer mbed andoucis wed as theecon w, thyonde icazo
