In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [2]:
# hyperparameters
batch_size = 64 # how many number of independent sequences will we process in parallel?
block_size = 256 # what is the maximum context length for predictions?
max_iters = 10000
eval_interval = 500
learning_rate = 1e-4
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 256 # number of dimensions for embeddings
n_head = 8
n_layer = 8
dropout = 0.2
# ----------------------------------------------------------------
print(device)

cuda


In [3]:
torch.manual_seed(1337)

<torch._C.Generator at 0x7f07a6de8930>

In [4]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt


--2024-01-10 12:30:09--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.3’


2024-01-10 12:30:09 (217 MB/s) - ‘input.txt.3’ saved [1115394/1115394]



In [5]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [6]:
# # All the unique characters that occur in the input
# chars = sorted(list(set(text)))
# vocab_size = len(chars)
# # create a mapping from characters to integers
# stoi = {ch:i for i, ch in enumerate(chars)}    
# itos = {i:ch for i, ch in enumerate(chars)}
# encode = lambda s: [stoi[c] for c in s]
# decode = lambda l: ''.join([itos[i] for i in l])


In [7]:
import re
text_words = re.split(r"(\s|\n)", text)

words = sorted(list(set(text_words)))
vocab_size = len(words)

# create a mapping from characters to integers
stoi = {ch:i for i, ch in enumerate(words)}    
itos = {i:ch for i, ch in enumerate(words)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [8]:
# import sentencepiece as spm

# # train sentencepiece model from `botchan.txt` and makes `m.model` and `m.vocab`
# # `m.vocab` is just a reference. not used in the segmentation.
# spm.SentencePieceTrainer.train('--input=input.txt --model_prefix=m --vocab_size=10000')

# # makes segmenter instance and loads the model file (m.model)
# sp = spm.SentencePieceProcessor()
# sp.load('m.model')

# text_words = sp.encode_as_ids(text)

# # print(sp.decode_ids([250, 28, 15, 330, 180]))

In [9]:
# train and test splits
data = torch.tensor(encode(text_words), dtype=torch.long)
n = int(0.9*len(data)) # Forst 90% is train and the rest is eval
train_data = data[:n]
val_data = data[n:]

# # train and test splits
# data = torch.tensor(text_words, dtype=torch.long)
# n = int(0.9*len(data)) # Forst 90% is train and the rest is eval
# train_data = data[:n]
# val_data = data[n:]

In [10]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split=='train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [11]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [12]:
# Single attention head model
class Head(nn.Module):
    """ one head of self-attention"""

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2, -1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T]==0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out
        
    
# multiple single heads running in parallel
class MultiHeadAttention(nn.Module):
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.proj(out)
        return out


class FeedForward(nn.Module):
    """a simple feed forward layer followed by a non-linearity"""

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation."""
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

    
class BigramLanguageModel(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
    
    def forward(self, idx, targets=None):
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        logits = self.lm_head(x)

        if targets == None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get logits and predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :]
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            # append sampled index to the running sequence
            idx = torch.cat([idx, idx_next], dim=1)
        return idx

In [13]:
model = BigramLanguageModel()
m = model.to(device)

#create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [14]:
%%time
for iter in range(max_iters):

    # every one in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 10.6862, val loss 10.6812
step 500: train loss 3.7337, val loss 4.0633
step 1000: train loss 3.6551, val loss 4.0685
step 1500: train loss 3.5423, val loss 4.0351
step 2000: train loss 3.3951, val loss 4.0468
step 2500: train loss 3.2038, val loss 4.1116
step 3000: train loss 3.0019, val loss 4.1833
step 3500: train loss 2.7860, val loss 4.3171
step 4000: train loss 2.5715, val loss 4.4908
step 4500: train loss 2.3866, val loss 4.6751
step 5000: train loss 2.2356, val loss 4.8261
step 5500: train loss 2.1073, val loss 4.9531
step 6000: train loss 1.9905, val loss 5.0809
step 6500: train loss 1.8826, val loss 5.2147
step 7000: train loss 1.7855, val loss 5.3216
step 7500: train loss 1.6906, val loss 5.4403
step 8000: train loss 1.5978, val loss 5.5374
step 8500: train loss 1.5108, val loss 5.6361
step 9000: train loss 1.4259, val loss 5.7402
step 9500: train loss 1.3418, val loss 5.8808
CPU times: user 30min 20s, sys: 20min 36s, total: 50min 56s
Wall time: 50min 56s


In [15]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=500)[0].tolist()))


LADY ANNE:
What do you not fear?

DUCHESS OF YORK:
I warrant thee fear,--

GLOUCESTER:

Third Servant:
Why, Rivers I spoke dead, I so. but well,

DUCHESS Citizen:
At this night, of all that prayers is words
God's name, hope must have more love.

DUCHESS OF YORK:
I'll be it well: I loved you tell thee,
We'll see you all that you have made a husband.
Ere you were you knew no such true:
Nor would do:

Messenger:
Your dagger, is enough, you were so brief with being of love.

HASTINGS:
I mean, your honour's beseech not you:
Look grant your times is at the flint, deliver
Your brother and very well in this place with either
therefore not hold their harms and your hearts
For kind, unto him ten thousand and wailing my kin,
Who yet plainly me before my mother, I home,
Whose being proof to instruct any jot that
Desire even to this desperate good as far is
forty full of prince: to or that concerns out of their
joy waded forth in mine eyes,
Of no place are twelve at on them nothing
With out superf

In [16]:
total_parameters = 0
for param in m.parameters():
    num_p = param.size().numel()
    total_parameters += num_p
print(total_parameters)    

19548233


In [18]:
# # generate from the model
# context = torch.zeros((1, 1), dtype=torch.long, device=device)
# generated_ids = m.generate(context, max_new_tokens=500)[0].tolist()
# print(sp.decode_ids(generated_ids))
# # print(decode(generated_ids))