In [1]:
print("Hello worldo!")

Hello worldo!


In [2]:
from __future__ import annotations
import typing

import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(35897932)

<torch._C.Generator at 0x7f2f3c0cd530>

### Get text data

In [3]:
with open("text_corpus.txt", encoding="utf-8") as f:
    text = f.read()[:10_000]

In [4]:
vocab = sorted(set(text))
print(vocab)

['\n', ' ', '%', "'", ',', '.', '0', '1', '2', '3', '5', '6', '7', '8', '9', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'W', 'Y', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
vocab_size = len(vocab)
vocab_size

62

### Encode the characters to integers

In [6]:
# Create encoder and decoder dicts

char_int_mapping = dict()
int_char_mapping = dict()

for i, c in enumerate(sorted(set(text))):
    char_int_mapping[c] = i
    int_char_mapping[i] = c

In [7]:
# examples
print(char_int_mapping["g"])
print(int_char_mapping[42])

42
g


In [8]:
def encode(string: str) -> List[int]:
    int_list = [char_int_mapping[char] for char in string]
    return int_list

def decode(int_list: List[int]) -> str:
    string = [int_char_mapping[num] for num in int_list]
    return "".join(string)

In [9]:
# examples
print(encode("hellooo"))
print(decode([43, 40, 47, 47, 50, 50, 50]))

[43, 40, 47, 47, 50, 50, 50]
hellooo


In [10]:
encode("hello world")

[43, 40, 47, 47, 50, 1, 58, 50, 53, 47, 39]

### Make it a tensor

In [11]:
data = torch.tensor(encode(text), dtype=torch.int32)
data.shape

torch.Size([10000])

### Train/Test split

In [12]:
len(data)

10000

In [13]:
N = int(0.9*len(data))
train_data = data[:N]
test_data = data[N:]

print(len(train_data), len(test_data))

9000 1000


### Create minibatches

In [14]:
block_size = 8 # block_size is the maximum context length (input textblock size)
minibatch_size = 4

In [15]:
train_data[:block_size]

tensor([ 1, 53, 40, 48, 40, 48, 37, 40], dtype=torch.int32)

In [16]:
def get_minibatch(split: torch.tensor):
    offsets = np.random.randint(0, len(split) - minibatch_size, size=minibatch_size)
    x = torch.stack([split[i:i+block_size] for i in offsets])
    y = torch.stack([split[i+1:i+block_size+1] for i in offsets])
    return x, y

In [17]:
x, y = get_minibatch(train_data)
x

tensor([[44, 50, 49,  1, 36, 49, 39,  1],
        [44, 49, 42,  0,  1, 36,  1, 46],
        [40, 40, 39,  1, 50, 41,  1, 54],
        [40, 49,  0,  1, 36,  1, 38, 43]], dtype=torch.int32)

In [18]:
y

tensor([[50, 49,  1, 36, 49, 39,  1, 43],
        [49, 42,  0,  1, 36,  1, 46, 49],
        [40, 39,  1, 50, 41,  1, 54, 50],
        [49,  0,  1, 36,  1, 38, 43, 36]], dtype=torch.int32)

In [19]:
decode(x[3].tolist())

'en\n a ch'

In [20]:
for b in range(minibatch_size):
    print(f"----- BATCH {b} -----")
    
    for t in range(block_size):
        context = x[b][:t+1]
        target = y[b][t]
        print(f"context: {decode(context.tolist())} -> target: {decode([int(target)])}")
    print()
        
    for t in range(block_size):
        context = x[b][:t+1]
        target = y[b][t]
        print(f"context: {context.tolist()} -> target: {int(target)}")
    print()

----- BATCH 0 -----
context: i -> target: o
context: io -> target: n
context: ion -> target:  
context: ion  -> target: a
context: ion a -> target: n
context: ion an -> target: d
context: ion and -> target:  
context: ion and  -> target: h

context: [44] -> target: 50
context: [44, 50] -> target: 49
context: [44, 50, 49] -> target: 1
context: [44, 50, 49, 1] -> target: 36
context: [44, 50, 49, 1, 36] -> target: 49
context: [44, 50, 49, 1, 36, 49] -> target: 39
context: [44, 50, 49, 1, 36, 49, 39] -> target: 1
context: [44, 50, 49, 1, 36, 49, 39, 1] -> target: 43

----- BATCH 1 -----
context: i -> target: n
context: in -> target: g
context: ing -> target: 

context: ing
 -> target:  
context: ing
  -> target: a
context: ing
 a -> target:  
context: ing
 a  -> target: k
context: ing
 a k -> target: n

context: [44] -> target: 49
context: [44, 49] -> target: 42
context: [44, 49, 42] -> target: 0
context: [44, 49, 42, 0] -> target: 1
context: [44, 49, 42, 0, 1] -> target: 36
context: [44, 

### Create a simple model

In [21]:
class BigramLanguageModel(nn.Module):
    
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets):
        logits = self.token_embedding_table(idx) # gets one embedding table row with probs for each next char
        char_probs = F.softmax(logits, dim=-1)
        preds = torch.argmax(char_probs, dim=-1)
        loss = F.cross_entropy(preds, targets)
        return preds, loss

In [22]:
mini_model = BigramLanguageModel(vocab_size)

xb, yb = get_minibatch(train_data)
out, loss = mini_model(xb, yb)
print(out.shape)

RuntimeError: Expected floating point type for target with class probabilities, got Int

In [None]:
a = F.softmax(out, dim=-1) # logits
a.shape

In [None]:
a.shape

In [None]:
preds = torch.argmax(a, dim=-1)
preds

In [None]:
F.cross_entropy(preds, yb)