In [11]:
with open("../data/wizard_of_oz.txt", "r", encoding="utf-8") as f:
    text = f.read()

print(len(text))
print(text[:300])

232309
﻿  DOROTHY AND THE WIZARD IN OZ

  BY

  L. FRANK BAUM

  AUTHOR OF THE WIZARD OF OZ, THE LAND OF OZ, OZMA OF OZ, ETC.

  ILLUSTRATED BY JOHN R. NEILL

  BOOKS OF WONDER WILLIAM MORROW & CO., INC. NEW YORK


  [Illustration]


  COPYRIGHT 1908 BY L. FRANK BAUM

  ALL RIGHTS RESERVED


         *    


In [12]:
chars = sorted(set(text))
print(len(chars))
print(chars)

81
['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [13]:
def encode(text: str):
    ans = []
    for t in text:
        ans.append(chars.index(t))
    return ans

encoded_hello = encode("hello")
encoded_hello


[61, 58, 65, 65, 68]

In [14]:
def decode(indices: list):
    ans = ""
    for i in indices:
        ans += chars[i]
    return ans

decoded_hello = decode(encoded_hello)
decoded_hello

'hello'

In [15]:
import torch

data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])


tensor([80,  1,  1, 28, 39, 42, 39, 44, 32, 49,  1, 25, 38, 28,  1, 44, 32, 29,
         1, 47, 33, 50, 25, 42, 28,  1, 33, 38,  1, 39, 50,  0,  0,  1,  1, 26,
        49,  0,  0,  1,  1, 36, 11,  1, 30, 42, 25, 38, 35,  1, 26, 25, 45, 37,
         0,  0,  1,  1, 25, 45, 44, 32, 39, 42,  1, 39, 30,  1, 44, 32, 29,  1,
        47, 33, 50, 25, 42, 28,  1, 39, 30,  1, 39, 50,  9,  1, 44, 32, 29,  1,
        36, 25, 38, 28,  1, 39, 30,  1, 39, 50])


In [16]:
block_size = 8
batch_size = 4

n = int(len(data) * 0.8)
train_data = data[:n]
test_data = data[n:]

def get_batch(split):
    data = train_data if split == "train" else test_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

x, y = get_batch("train")
print(x.shape, y.shape)
print(x, y)


tensor([ 67885,  75977, 141743, 125025])
torch.Size([4, 8]) torch.Size([4, 8])
tensor([[74, 56, 61,  1, 59, 68, 71,  1],
        [58, 56, 73, 62, 68, 67, 72,  1],
        [ 1, 54, 72, 64, 58, 57,  1, 73],
        [68, 67, 57, 58, 57,  1, 73, 61]]) tensor([[56, 61,  1, 59, 68, 71,  1, 61],
        [56, 73, 62, 68, 67, 72,  1, 54],
        [54, 72, 64, 58, 57,  1, 73, 61],
        [67, 57, 58, 57,  1, 73, 61, 58]])


## Bigram model

In [17]:
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print("Input: ", context, "\tTarget: ", target)

Input:  tensor([80]) 	Target:  tensor(1)
Input:  tensor([80,  1]) 	Target:  tensor(1)
Input:  tensor([80,  1,  1]) 	Target:  tensor(28)
Input:  tensor([80,  1,  1, 28]) 	Target:  tensor(39)
Input:  tensor([80,  1,  1, 28, 39]) 	Target:  tensor(42)
Input:  tensor([80,  1,  1, 28, 39, 42]) 	Target:  tensor(39)
Input:  tensor([80,  1,  1, 28, 39, 42, 39]) 	Target:  tensor(44)
Input:  tensor([80,  1,  1, 28, 39, 42, 39, 44]) 	Target:  tensor(32)


### Softmax function

formula: $softmax(x) = \frac{e^{x_i}}{\sum_{j=1}^{n}e^{x_j}}$


In [None]:
from turtle import forward
import torch.nn as nn
import torch.functional as F

class BigramLanuageModel(nn.Module):
    def __init__(self,vocab_size) -> None:
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        if targets is None:
            loss = None
        else:
            logits = self.token_embedding_table(index)
            batch, time, channels = logits.shape
            logits = logits.view(batch * time, channels)
            targets = targets.view(batch * time)
            loss = nn.CrossEntropyLoss(logits, targets)

        return logits, loss
    
    def generate(self, index, max_new_tokens):
        # index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self.forward(index)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) # (B, T+1)
        return index
    
