# The dataset

First we will download the tinyshakespeare dataset and examine its contents.

In [125]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

with open('input.txt') as f:
    text = f.read()

print(text[:100])

--2024-05-24 12:04:25--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.4’


2024-05-24 12:04:25 (18.2 MB/s) - ‘input.txt.4’ saved [1115394/1115394]

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [126]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print('Vocab size:', vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Vocab size: 65


# Simple character level tokenizer.

We will create a simple character level tokenizer. Each unique character in the dataset becomes a token mapped to an integer value. In practice character level tokenizers are not used.
Check out BPE tonezizers like SentencePiece and tiktoken.

In [127]:
class Tokenizer:
    def __init__(self, chars):
        self.char_to_ix = {ch: i for i, ch in enumerate(chars)}
        self.ix_to_char = {i: ch for i, ch in enumerate(chars)}
    
    def char_to_index(self, ch):
        return self.char_to_ix[ch]
    
    def index_to_char(self, ix):
        return self.ix_to_char[ix]

    def encode(self, text):
        return [self.char_to_index(ch) for ch in text]
    
    def decode(self, indices):
        return ''.join([self.index_to_char(ix) for ix in indices])


In [128]:
tokenizer = Tokenizer(chars)
tokens = tokenizer.encode("Hello")
print(tokens)
print(tokenizer.decode(tokens))

[20, 43, 50, 50, 53]
Hello


In [129]:
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data = torch.tensor(tokenizer.encode(text), dtype=torch.long, device=device)

print(data.shape, data.dtype)

torch.Size([1115394]) torch.int64


In [130]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.1, shuffle=False)

In [131]:
batch_size = 8
block_size = 32

def get_batch(data):
    indices = torch.randint(0, data.size(0) - block_size, (batch_size,))

    x = torch.stack([data[i:i + block_size] for i in indices])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in indices])

    x = x.to(device)
    y = y.to(device)

    return x, y


In [132]:
xb, yb = get_batch(train_data)

print(xb.shape, yb.shape)
print("X:")
print(tokenizer.decode(xb[0].tolist()))
print("-" * 20)
print("Y:")
print(tokenizer.decode(yb[0].tolist()))

torch.Size([8, 32]) torch.Size([8, 32])
X:
The valiant heart is not whipt o
--------------------
Y:
he valiant heart is not whipt ou


In [133]:
import torch.nn as nn
import torch.nn.functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets = None):
        logits = self.embedding(idx) # (batch, seq_len, vocab_size)

        if targets is None:
            loss = None
        else:

            batch_size, seq_len, vocab_size = logits.shape
            logits = logits.view(batch_size * seq_len, vocab_size)
            targets = targets.view(batch_size * seq_len)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens = 25):
        # idx: (batch, seq_len)

        for _ in range(max_new_tokens):
            logits, _ = self(idx)                # (batch, seq_len, vocab_size) for logits

            # this preserves the each sequence in the batch, and the vocab logits, but only the last token in each sequence
            logits = logits[:, -1, :]                           # (batch, vocab_size)

            probs = F.softmax(logits, dim=-1)                   # (batch, vocab_size)

            idx_next = torch.multinomial(probs, num_samples=1)  # (batch, 1)
            idx = torch.cat([idx, idx_next], dim=1)             # (batch, seq_len + 1)
        
        return idx



model = BigramLanguageModel(vocab_size)
model = model.to(device)

logits, loss = model(xb, yb)
print(logits.shape, loss)

idx = torch.zeros((1,1), dtype=torch.long).to(device)
logits = model.generate(idx, max_new_tokens=100)
print(tokenizer.decode(logits[0].tolist()))

torch.Size([256, 65]) tensor(4.6709, device='cuda:0', grad_fn=<NllLossBackward0>)

N;MPjL
.KyF,nIPXUrWfEqA.$Tm!FdfKNlkIxIjXOKRJ;SY:$VlRIgk'DRDeq?kVwxdlSzlhSy&c'GP,RWx'AdOjTrzPFwhuMsoo


In [134]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

batch_size = 32
for step in range(10000):
    model.train()
    xb, yb = get_batch(train_data)

    logits, loss = model(xb, yb)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f"Loss {loss.item()}")

Loss 2.451451063156128


In [136]:
idx = torch.zeros((1,1), dtype=torch.long).to(device)
logits = model.generate(idx, max_new_tokens=250)
print(tokenizer.decode(logits[0].tolist()))



FRI yoppe stimigaser, fath,VAs th
To t-o anofoveouick.

PENI s
Ththet g;
Ale bl wano pro anthoor a daveathornais fuknonalil me pofove sterdesasthousour wetshe t cran
Ant,
Bur,
Try merh lat ous, pt?
AGBY: minord hanoras ke y cu

LUCaven cere t VI:

C


In [137]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()

    for split in ['train', 'val']:
        losses = []

        for step in range(100):
            X, Y = get_batch(train_data if split == 'train' else test_data)
            _, loss = model(X, Y)
            losses.append(loss.item())

        out[split] = sum(losses) / len(losses)

    model.train()
    return out

In [138]:
def train(model, optimizer, train_data, test_data, n_steps=1000):
    for step in range(n_steps):
        model.train()
        X, Y = get_batch(train_data)

        logits, loss = model(xb, yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step % 100 == 0:
            print(estimate_loss())

In [None]:
train(model, optimizer, train_data, test_data, n_steps=1000)