In [2]:

import torch
import torch.nn as nn
from torch.nn import functional as F

In [4]:
# Read the entire corpus into memory as one UTF-8 decoded string.
with open('tiny_shakespeare.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [6]:
# Sanity check: report how many characters the loaded corpus contains.
print(f'Length of text: {len(text)} characters')

Length of text: 1115394 characters


In [12]:
# The vocabulary is every distinct character that occurs in the corpus,
# kept in sorted order so that the character -> index mapping is deterministic.
chars = sorted(set(text))
vocab_size = len(chars)
print(f'chars: {"".join(chars)}')
print(f'Unique characters: {vocab_size}')

chars: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Unique characters: 65


## We are building a character-level language model
## We want to convert characters to integers

In [19]:
# Character-level tokenizer: every unique character gets exactly one integer id.
stoi = {ch: i for i, ch in enumerate(chars)}  # string (character) -> index
itos = dict(enumerate(chars))                 # index -> string (character)


def encode(x):
    """Encode a string as a list of integer token ids, one per character.

    Raises KeyError if `x` contains a character that is not in `stoi`.
    """
    return [stoi[ch] for ch in x]


def decode(x):
    """Decode an iterable of integer token ids back into a string.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in x)


# Round trip: decoding the encoded string gives the original string back.
print(encode("hii there"))
print(decode(encode("hii there")))

[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


This is one of many possible tokenizers (and a very simple one).
Google: SentencePiece - sub-word
OpenAI: tiktoken - used by GPT - sub-word

Sub-word tokenizers increase the vocabulary size, but decrease the length of the integer sequence needed to encode a given string.



In [21]:
# Encode the whole corpus: every character becomes one integer token id,
# and the ids are stored together in a single 1-D int64 tensor.
data = torch.tensor(encode(text), dtype=torch.int64)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [22]:
# Train/validation split: the first 90% of the tokens are used for training,
# the remaining 10% are held out for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [23]:
# Set the context length (block size): the model is not trained on the whole
# corpus at once, because that would be computationally too expensive; it is
# trained on short chunks of at most `block_size` characters instead.
block_size = 8
train_data[:block_size+1] # 8 input characters + 1 extra, so that the last input position also has a target

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [26]:
# Inputs are the first `block_size` tokens; targets are the same window shifted
# one position to the right, so y[t] is the token that follows x[:t+1].
x = train_data[:block_size]
y = train_data[1:block_size + 1]
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f'Context: {context} | Target: {target}')

# Notes:
# - We can predict the next character from the previous characters, and one
#   chunk gives us examples for context lengths 1, 2, ..., 8 in this case.
# - The second character of the chunk (47 here) can only be predicted from the
#   single character before it (18), similar to the first values of a moving
#   average; from the 9th character on, a full 8-character context is available.
# - After that we cut the chunk, move the window by 1 and start again. This way
#   every character gets a context of every length from 1 to 8 (except for the
#   first 8 characters in the dataset).
# - The same character will therefore appear with many different contexts.
# - This is interesting because GPT models have been found to be able to come
#   up with things that are not in the training data (emergent behaviour).


Context: tensor([18]) | Target: 47
Context: tensor([18, 47]) | Target: 56
Context: tensor([18, 47, 56]) | Target: 57
Context: tensor([18, 47, 56, 57]) | Target: 58
Context: tensor([18, 47, 56, 57, 58]) | Target: 1
Context: tensor([18, 47, 56, 57, 58,  1]) | Target: 15
Context: tensor([18, 47, 56, 57, 58,  1, 15]) | Target: 47
Context: tensor([18, 47, 56, 57, 58,  1, 15, 47]) | Target: 58


Batches - process several independent sequences in parallel


In [28]:
batch_size = 4  # number of independent sequences processed in parallel
block_size = 8  # maximum context length used for a prediction
torch.manual_seed(1337)  # fixed seed so that the randomly sampled batches are reproducible

def get_batch(split):
    """Sample a random mini-batch of input/target windows.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        (x, y): two stacked tensors, each made of `batch_size` windows of
        `block_size` tokens. `y` is `x` shifted one token to the right, so
        y[b, t] is the token that follows x[b, :t+1] in the source data.
    """
    source = train_data if split == 'train' else val_data
    # Random window start offsets; the upper bound leaves room for the
    # one-token shift needed by the targets.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets

xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

print('====')

# Spell out every (context, target) pair contained in the batch: each of the
# batch_size rows contributes block_size training examples.
for b in range(batch_size):  # b indexes one sequence (row) inside the batch
    print(f'Batch {b}')
    for t in range(block_size):
        context, target = xb[b, :t+1], yb[b, t]
        print(f'when input (context) is: {context.tolist()} | Target is: {target}')


inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
====
Batch 0
when input (context) is: [24] | Target is: 43
when input (context) is: [24, 43] | Target is: 58
when input (context) is: [24, 43, 58] | Target is: 5
when input (context) is: [24, 43, 58, 5] | Target is: 57
when input (context) is: [24, 43, 58, 5, 57] | Target is: 1
when input (context) is: [24, 43, 58, 5, 57, 1] | Target is: 46
when input (context) is: [24, 43, 58, 5, 57, 1, 46] | Target is: 43
when input (context) is: [24, 43, 58, 5, 57, 1, 46, 43] | Target is: 39
Batch 1
when input (context) is: [44] | Target is: 53
when input (context) is: [44, 53] | Target is: 56
when input (context)

## For the next part, we need to understand what an embedding does.

Basically, we create a matrix of shape (vocab_size, vocab_size) in our case.
Each character is represented by one integer, so we can use that integer as an index to pick the corresponding row of the matrix.

Each character's row has vocab_size columns, so that the characters can be distinguished from one another in a multi-dimensional space.

We initiate the embedding values randomly, and by training we change them.

Therefore we are able to find similar characters in the context of our dataset, or, better said, we can prepare for the predictions. This is one way to quantify it...

In [52]:
# Example: a tiny embedding table with 5 rows (one per index), each row a 5-dimensional vector
a = nn.Embedding(num_embeddings=5, embedding_dim=5)

In [56]:
# Look up the embedding rows for the indices 1, 1, 2, 3, 4, 0 (one output row per input index)
logits = a(torch.tensor([1,1,2,3,4,0]))

In [57]:
# Each output row is the embedding vector of the corresponding input index:
# the repeated index 1 gives two identical rows, every other index gives a different row.
logits

tensor([[-1.6521, -0.7584,  0.0695, -0.9614, -0.1338],
        [-1.6521, -0.7584,  0.0695, -0.9614, -0.1338],
        [ 0.3899, -0.2884,  0.5490,  1.0329, -0.5556],
        [-1.3479, -1.0810, -0.0447, -0.5367, -0.5223],
        [ 2.1068, -0.5387,  2.1751, -1.7514, -0.4445],
        [-0.0358, -0.4344, -0.1947,  1.0778,  0.3575]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
class BigramLanguageModel(nn.Module):
    """Bigram character model: next-token logits depend only on the current token.

    The whole model is one (vocab_size, vocab_size) embedding table; row i holds
    the unnormalised scores (logits) for the token that follows token i.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            A (logits, loss) tuple. When `targets` is given, logits are
            flattened to (B*T, C) and loss is a scalar tensor. When `targets`
            is None, logits keep their (B, T, C) shape and loss is None.
        """
        logits = self.token_embedding_table(idx)  # (B,T,C)
        if targets is None:
            # Nothing to compare against, so no loss can be computed.
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy expects (N, C) logits and (N,) class indices, so the
        # batch and time dimensions are flattened into one.
        logits = logits.view(B * T, C)
        # Targets hold one class index per (batch, time) position: B*T entries
        # (not B*C). targets.view(-1) would infer the size automatically.
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

# Instantiate the model and run one forward pass on the sample batch;
# forward returns a (logits, loss) tuple, which is stored as-is in `out`.
m = BigramLanguageModel(vocab_size)
out = m(xb, yb) 

In [15]:
# Display the index -> character lookup table.
itos

{0: '\n',
 1: ' ',
 2: '!',
 3: '$',
 4: '&',
 5: "'",
 6: ',',
 7: '-',
 8: '.',
 9: '3',
 10: ':',
 11: ';',
 12: '?',
 13: 'A',
 14: 'B',
 15: 'C',
 16: 'D',
 17: 'E',
 18: 'F',
 19: 'G',
 20: 'H',
 21: 'I',
 22: 'J',
 23: 'K',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'W',
 36: 'X',
 37: 'Y',
 38: 'Z',
 39: 'a',
 40: 'b',
 41: 'c',
 42: 'd',
 43: 'e',
 44: 'f',
 45: 'g',
 46: 'h',
 47: 'i',
 48: 'j',
 49: 'k',
 50: 'l',
 51: 'm',
 52: 'n',
 53: 'o',
 54: 'p',
 55: 'q',
 56: 'r',
 57: 's',
 58: 't',
 59: 'u',
 60: 'v',
 61: 'w',
 62: 'x',
 63: 'y',
 64: 'z'}

In [14]:
# Display the character -> index lookup table.
stoi

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}