In [1]:
# We will examine the Shakespeare dataset here.

# Read with an explicit encoding so the character count (and the vocabulary
# derived from it) does not depend on the platform's default locale encoding.
with open('input.txt', 'r', encoding='utf-8') as f:
    data = f.read()

print('total dataset length:', len(data))

# print the first 100 characters
print(data[:100])

total dataset length: 1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [7]:
# As before, collect every distinct character that occurs in the dataset.
# The sorted result is our character-level vocabulary.
chars = sorted(set(data))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)



 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [13]:
# Tokenizing the dataset: converting the text to integers to feed into the embedding matrix

stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer id
itos = {i: ch for i, ch in enumerate(chars)}  # integer id -> character


def encode(s):
    """Map a string to the list of integer ids of its characters.

    Raises KeyError if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Map an iterable of integer ids back to the string they spell.

    Raises KeyError if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


# Print some encoding and decoding
print(encode('hello, world'))
print(decode(encode('hello, world')))

# The character-level scheme above is very simple compared to other implementations,
# e.g. Google's SentencePiece, which is a sub-word level tokenizer rather than a
# character-level one like ours.
# For train.bin and val.bin we used tiktoken's GPT-2 BPE tokenizer.

# Let us see what that encoder looks like:
import tiktoken

enc = tiktoken.get_encoding("gpt2")
# Total vocabulary size: 65 for our character set, 50257 for GPT-2 BPE.
print(enc.n_vocab)
hello_bpe = enc.encode('hello, world')
print(hello_bpe)
print(enc.decode(hello_bpe))


[46, 43, 50, 50, 53, 6, 1, 61, 53, 56, 50, 42]
hello, world
50257
[31373, 11, 995]
hello, world


In [16]:
import torch
# Character-level encoding of the whole Shakespeare text: one int64 id per character.
dataset = torch.tensor(encode(data),dtype=torch.long)
print(dataset.shape,dataset.dtype)
print(dataset[:100])

# The same text encoded with the GPT-2 BPE tokenizer (`enc`), for comparison.
# Each BPE token can cover several characters, so this tensor is shorter.
dataset_bpe = torch.tensor(enc.encode(data),dtype=torch.long)
print(dataset_bpe.shape,dataset_bpe.dtype)
print(dataset_bpe[:100])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])
torch.Size([338025]) torch.int64
tensor([ 5962, 22307,    25,   198,  8421,   356,  5120,   597,  2252,    11,
         3285,   502,  2740,    13,   198,   198,  3237,    25,   198,  5248,
          461,    11,  2740,    13,   198,   198,  5962, 22307,    25,   198,
         1639,   389,   477, 12939,  2138,   284,  4656,   621,   284,  1145,
          680,    30,   198,   198,  3237,    25,   198,  4965,  5634,    13,
        12939,    13,   198,   198,  5962, 22307,    25,   198,  5962,    11,
          345,  

In [20]:
# Split the encoded text 90/10: the first 90% of the tokens are used for training,
# the remaining tail for validation.

train_size = int(0.9 * len(dataset))
train_dataset = dataset[:train_size]
val_dataset = dataset[train_size:]

In [21]:
block_size = 8 # sequence length (context window) of the input to the model
# Take one extra token: block_size inputs need block_size targets, and the
# target for the last input position is the token that comes right after it.
train_dataset[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [22]:
x = train_dataset[:block_size]       # inputs to the model
y = train_dataset[1:block_size + 1]  # targets: the same window shifted one token ahead
for t in range(block_size):
    context, target = x[:t + 1], y[t]
    print(f"when input is {context} the target is {target}")

# The idea behind this is to make the transformer model learn to predict the
# next token from a context as little as 1 character and as much as
# block_size characters.
# This can be seen as a 'time' dimension, analogous to the case of WaveNets for speech.

when input is tensor([18]) the target is 47
when input is tensor([18, 47]) the target is 56
when input is tensor([18, 47, 56]) the target is 57
when input is tensor([18, 47, 56, 57]) the target is 58
when input is tensor([18, 47, 56, 57, 58]) the target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target is 58


In [25]:
# Now generalizing this to a batch dimension, so that several sequences can be
# processed in parallel (e.g. on a GPU).

# Fix the RNG seed so the randomly drawn batch offsets below are repeatable.
torch.manual_seed(1337)
block_size = 8  # length of each sequence drawn by get_batch
batch_size = 4  # number of independent sequences per batch


def get_batch(split):
    """Draw a random batch of input/target windows from one of the splits.

    Args:
        split: 'train' selects `train_dataset`; any other value selects
            `val_dataset`.

    Returns:
        (x, y): two tensors of shape (batch_size, block_size). Row k of `y`
        is row k of `x` shifted one position ahead in the source data.

    Reads the module-level `block_size` and `batch_size` at call time.
    """
    source = train_dataset if split == 'train' else val_dataset
    # randint's upper bound is exclusive, so start + block_size + 1 never
    # runs past the end of `source`.
    starts = torch.randint(len(source) - block_size, size=(batch_size,))
    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])
        targets.append(source[start + 1:start + block_size + 1])

    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')
for title, batch in (('inputs:', xb), ('targets:', yb)):
    print(title)
    print(batch.shape)
    print(batch)

# Spell out every (context, target) training example contained in the batch.
for b in range(batch_size):  # loop over the sequences in the batch
    row_in, row_out = xb[b], yb[b]
    for t in range(block_size):  # loop over time, i.e. position in the sequence
        context = row_in[:t+1]
        target = row_out[t]
        print(f"when input is {context} the target is {target}")


inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
when input is tensor([24]) the target is 43
when input is tensor([24, 43]) the target is 58
when input is tensor([24, 43, 58]) the target is 5
when input is tensor([24, 43, 58,  5]) the target is 57
when input is tensor([24, 43, 58,  5, 57]) the target is 1
when input is tensor([24, 43, 58,  5, 57,  1]) the target is 46
when input is tensor([24, 43, 58,  5, 57,  1, 46]) the target is 43
when input is tensor([24, 43, 58,  5, 57,  1, 46, 43]) the target is 39
when input is tensor([44]) the target is 53
when input is tensor([44, 53]) the target is 56
when input is tensor([44, 53, 56]) the target is 1
wh

In [34]:
# Let us feed this into a simple model: the Bigram model

import torch
import torch.nn as nn
from torch.nn import functional as F
# Re-seed so the parameter initialisation and the sampling below are repeatable.
torch.manual_seed(1337)

class BigramModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is one ``vocab_size x vocab_size`` embedding table; row ``i``
    holds the unnormalised scores (logits) for the token that follows token ``i``.
    """

    def __init__(self, vocab_size):
        super().__init__()
        # Square lookup table (the "C matrix"): indexing it with a token id
        # returns the row of next-token logits for that token.
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of next-token ids, shape (B, T).

        Returns:
            (logits, loss). Without targets, logits has shape (B, T, C) and
            loss is None. With targets, logits is returned flattened to
            (B*T, C) and loss is the scalar cross-entropy over all positions.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C), C == vocab size
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy expects the class dimension second, so fold the
            # batch and time dimensions together rather than permuting to (B, C, T).
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        """Extend each sequence in `idx` by `max_new_tokens` sampled tokens.

        Args:
            idx: integer tensor of token ids, shape (B, T), used as the prompt.
            max_new_tokens: number of tokens to append to every sequence.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens).
        """
        # Sampling is not differentiable and nothing here is trained, so run
        # without autograd to avoid building a graph on every step.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # get predictions for every position
                logits, _loss = self(idx)
                # keep only the last time step: (B, T, C) -> (B, C)
                logits = logits[:, -1, :]
                # turn logits into a probability distribution over the vocabulary
                probs = F.softmax(logits, dim=-1)  # (B, C)
                # sample one next token per sequence
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # append the sampled token to the running sequence
                idx = torch.cat([idx, idx_next], dim=1)  # (B, T+1)
        return idx



model = BigramModel(vocab_size)
print(model)

# forward() flattens the (B, T, C) logits to (B*T, C) and the targets to (B*T,)
# before calling F.cross_entropy. Passing the un-flattened tensors would fail:
# for multi-dimensional input, cross_entropy expects the class dimension second,
# i.e. (B, C, T), while our logits are (B, T, C).
logits, loss = model(xb, yb)
print(logits.shape)

# A uniform guess over the 65 characters would give -ln(1/65) ~= 4.17.
print(loss)

# Start from a single token with id 0 and sample 100 more.
idx = torch.zeros((1, 1), dtype=torch.long)
generated = model.generate(idx, max_new_tokens=100)
print(decode(generated[0].tolist()))


BigramModel(
  (token_embedding_table): Embedding(65, 65)
)
torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [50]:
# Small check: is the MPS backend available in this PyTorch install?
# This only queries availability; the tensors and model in the cells shown here
# are created without any device argument.
torch.backends.mps.is_available()

True

In [43]:
# Let us train this bigram model with plain, hand-written gradient descent

max_iters = 10000
lr = 10

for i in range(max_iters):
    # get a batch
    xb,yb=get_batch('train')
    # get predictions
    logits,loss = model(xb,yb)
    # compute gradients
    loss.backward()
    # update parameters; no_grad keeps the in-place update out of the autograd graph
    with torch.no_grad():
        for p in model.parameters():
            p -= lr * p.grad
        # clear the gradients so they do not accumulate into the next step
        model.zero_grad()
    # print loss
    if i % 1000 == 0:
        print(f"loss at iteration {i} is {loss.item()}")

# Let us see how the model performs on the validation set.
# Evaluation needs no gradients, so do not build an autograd graph for it.
# NOTE: this is the loss of one random validation batch, so it is a noisy estimate.
with torch.no_grad():
    xv,yv = get_batch('val')
    logits_val,loss_val = model(xv,yv)
print(f"loss on validation set is {loss_val.item()}")
idx = torch.zeros((1,1),dtype=torch.long)
print(decode(model.generate(idx,max_new_tokens = 100)[0].tolist()))

loss at iteration 0 is 2.2298288345336914
loss at iteration 1000 is 2.3685214519500732
loss at iteration 2000 is 2.6540417671203613
loss at iteration 3000 is 2.360241651535034
loss at iteration 4000 is 2.616257667541504
loss at iteration 5000 is 2.739475965499878
loss at iteration 6000 is 3.0783445835113525
loss at iteration 7000 is 2.3750882148742676
loss at iteration 8000 is 2.3131730556488037
loss at iteration 9000 is 2.4603514671325684
loss on validation set is 2.288727045059204

Tontcow qure outhatr hy ERWI an. ad as!
Fowe wise selk fath!
ssttst t ses aveeacofa
NILAnd the
KIOr 


In [47]:
# Another way to optimize is using the AdamW optimizer.
# NOTE: `model` is not re-created here, so training continues from whatever
# weights earlier cells left in it.
lr = 1e-3 # learning rate for Adam is usually smaller than for the plain gradient descent used above
optimizer = torch.optim.AdamW(model.parameters(),lr=lr) # typically the lr is 3e-4 for bigger models
batch_size = 128 # get_batch reads this global at call time, so batches now hold 128 sequences
max_iters = 10000
for i in range(max_iters):
    # get a batch
    xb,yb = get_batch('train')
    # get predictions
    logits,loss = model(xb,yb)
    # set gradients to zero
    optimizer.zero_grad(set_to_none=True)
    # compute gradients
    loss.backward()
    # perform the update
    optimizer.step() # replaces the hand-written parameter update loop
    # print loss
    if i % 1000 == 0:
        print(f"loss at iteration {i} is {loss.item()}")

# print some outputs
idx = torch.zeros((1,1),dtype=torch.long)
print(decode(model.generate(idx,max_new_tokens=100)[0].tolist()))
# Get validation loss. Evaluation needs no gradients, so skip the autograd graph.
# NOTE: this is the loss of one random validation batch, so it is a noisy estimate.
with torch.no_grad():
    xv,yv = get_batch('val')
    logits_val,loss_val = model(xv,yv)
print(f"loss on validation set is {loss_val.item()}")

loss at iteration 0 is 2.407052516937256
loss at iteration 1000 is 2.5036914348602295
loss at iteration 2000 is 2.4258666038513184
loss at iteration 3000 is 2.425645589828491
loss at iteration 4000 is 2.4620988368988037
loss at iteration 5000 is 2.4708340167999268
loss at iteration 6000 is 2.455707550048828
loss at iteration 7000 is 2.3786418437957764
loss at iteration 8000 is 2.49593186378479
loss at iteration 9000 is 2.4702494144439697

hid m y, pucear malongeflesamy h he ce.
MNGRKI
Wh! gieyo t.
CAN: ale an!
Barke t CKillcichouburis

T
loss on validation set is 2.452451705932617
