In [36]:
# Tutorial: https://www.youtube.com/watch?v=kCc8FmEb1nY&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=9

In [1]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [1]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
print(text[:101])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You 


In [3]:
#number of unique characters in the text
len(set(text)) #present character level vocab size

65

In [4]:
vocab = sorted(set(text))
vocab_size = len(vocab)

In [5]:
print(vocab_size)

65


In [5]:
print(''.join(vocab))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [6]:
#considering simple character to ID level encoding
stoi = {chr_:i for i, chr_ in enumerate(vocab)}
itos = {i:chr_ for i, chr_ in enumerate(vocab)}

In [7]:
encode = lambda s: [stoi[j] for j in s]
decode = lambda k: ''.join([itos[m] for m in k])

In [8]:
print(encode("I am Atanu"))
print("\n=============\n")
print(decode(encode("I am Atanu")))

[21, 1, 39, 51, 1, 13, 58, 39, 52, 59]


I am Atanu


In [9]:
import torch

In [12]:
len(text)

1115395

In [13]:
data = torch.tensor(encode(text), dtype=torch.long)

In [14]:
print(data.shape, data.dtype)

torch.Size([1115395]) torch.int64


In [12]:
print(data[:1000]) # the 1000 characters we looked at earier will to the GPT look like this

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [16]:
#Lets now split the data into train and test
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [17]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [18]:
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([18]) the target: 47
when input is tensor([18, 47]) the target: 56
when input is tensor([18, 47, 56]) the target: 57
when input is tensor([18, 47, 56, 57]) the target: 58
when input is tensor([18, 47, 56, 57, 58]) the target: 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target: 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target: 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target: 58


In [19]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
device = "cuda" if torch.cuda.is_available() else "cpu"


def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)

print('----')

inputs:
torch.Size([4, 8])
tensor([[56,  6,  0, 24, 43, 58,  1, 61],
        [39, 47, 51,  1, 58, 46, 39, 58],
        [52, 45,  1, 58, 53,  1, 57, 39],
        [43, 47, 52, 45,  1, 46, 53, 50]])
targets:
torch.Size([4, 8])
tensor([[ 6,  0, 24, 43, 58,  1, 61, 46],
        [47, 51,  1, 58, 46, 39, 58,  1],
        [45,  1, 58, 53,  1, 57, 39, 63],
        [47, 52, 45,  1, 46, 53, 50, 47]])
----


In [20]:
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

when input is [56] the target: 6
when input is [56, 6] the target: 0
when input is [56, 6, 0] the target: 24
when input is [56, 6, 0, 24] the target: 43
when input is [56, 6, 0, 24, 43] the target: 58
when input is [56, 6, 0, 24, 43, 58] the target: 1
when input is [56, 6, 0, 24, 43, 58, 1] the target: 61
when input is [56, 6, 0, 24, 43, 58, 1, 61] the target: 46
when input is [39] the target: 47
when input is [39, 47] the target: 51
when input is [39, 47, 51] the target: 1
when input is [39, 47, 51, 1] the target: 58
when input is [39, 47, 51, 1, 58] the target: 46
when input is [39, 47, 51, 1, 58, 46] the target: 39
when input is [39, 47, 51, 1, 58, 46, 39] the target: 58
when input is [39, 47, 51, 1, 58, 46, 39, 58] the target: 1
when input is [52] the target: 45
when input is [52, 45] the target: 1
when input is [52, 45, 1] the target: 58
when input is [52, 45, 1, 58] the target: 53
when input is [52, 45, 1, 58, 53] the target: 1
when input is [52, 45, 1, 58, 53, 1] the target: 57


The input data dimension **before embedding** for text data is not \( (\text{batch}, \text{block\_size}, \text{vocab\_size}) \). That specific shape refers to a **one-hot encoded representation**, which is an intermediate representation but not the most common pre-embedding input format.

Here's the clarification:

### Common Input Dimension for Text Data
- **Dimension:** \( (B, L) \)
  - \( B \): Batch size
  - \( L \): Sequence length (or block size, i.e., number of tokens in a sequence)
- **Representation:** This format contains token IDs (integers), where each token ID corresponds to a specific entry in the vocabulary. For example:
  ```
  Input: [[101, 2054, 2003, 102], [101, 2129, 2024, 102]]
  ```

### One-Hot Encoding Format
- **Dimension:** \( (B, L, V) \)
  - \( B \): Batch size
  - \( L \): Sequence length (block size)
  - \( V \): Vocabulary size
- **Representation:** Each token is represented as a one-hot vector of length \( V \), where only one element is 1 (the position of the token ID in the vocabulary), and the rest are 0.

Example:
If \( V = 5 \) and a token ID is 2, the one-hot vector is:
```
[0, 0, 1, 0, 0]
```

### Why One-Hot Encoding is Rarely Used Before Embedding
1. **Memory Inefficiency:** Representing text with \( (B, L, V) \) requires significant memory for large vocabularies.
2. **Embedding Layers Expect Integer IDs:** Modern transformer architectures (like BERT, GPT, etc.) expect integer token IDs \( (B, L) \) as input and map them to embeddings internally.

### Summary
The standard input dimension for text data **before embedding** is \( (B, L) \), where \( L \) is the sequence length. If you're referring to \( (B, L, V) \), that's likely a one-hot encoded intermediate representation and is not typical as the input to a transformer model.

## In the GPT2 transformer, the token embedding layer input dimension layer is 50257, but the raw input_batch dimension is [8,120]. How it is being passed at embedding layer as the dimensions are not matching?

The dimensions of the raw input batch and the token embedding layer in GPT-2 may seem mismatched initially, but they are entirely compatible due to how the embedding layer works. Let’s break it down:

### Input Batch Dimensions: \( [8, 120] \)
- **8**: Batch size — number of sequences in a batch.
- **120**: Sequence length (tokens per sequence).
- **Raw Input**: This is a tensor containing token IDs (integers) with values ranging from \( 0 \) to \( 50256 \) (since the vocabulary size is \( 50257 \)).

### Token Embedding Layer Dimensions: \( [50257, D] \)
- **50257**: Vocabulary size — the number of unique tokens in the tokenizer's vocabulary.
- **\( D \)**: Embedding size — the dimensionality of the token embeddings (e.g., 768 or 1024 for GPT-2 small and medium).

### How It Works
The embedding layer maps each token ID in the input batch to its corresponding vector from the embedding table. Here’s the process:

1. **Input Token IDs**:
   - Each integer in the \( [8, 120] \) batch is treated as an index into the embedding table of size \( [50257, D] \).

2. **Embedding Lookup**:
   - For each token ID, the embedding layer retrieves the corresponding embedding vector from the table.
   - The embedding lookup transforms the \( [8, 120] \) tensor into a new tensor of shape \( [8, 120, D] \), where \( D \) is the embedding dimension.

3. **Resulting Tensor**:
   - The resulting tensor \( [8, 120, D] \) is then passed to subsequent layers in the transformer.

### Why Dimensions Match
- The embedding layer expects token IDs as input, not one-hot encoded vectors. The input \( [8, 120] \) simply serves as indices to query the embedding table.
- The embedding table itself is a learnable matrix of shape \( [50257, D] \), and each token ID directly indexes into it, producing an embedding vector of size \( D \).

### Visualization
1. Input batch: 
   ```
   [[5, 17, 102, ...],  # Sequence 1
    [23, 76, 3, ...],   # Sequence 2
    ...
    [12, 7, 89, ...]]   # Sequence 8
   Shape: [8, 120]
   ```

2. Embedding table (learnable):
   ```
   [[e_0], [e_1], ..., [e_50256]]  # Each row is an embedding of size D
   Shape: [50257, D]
   ```

3. Output:
   ```
   [[[e_5], [e_17], [e_102], ...],   # Sequence 1 embeddings
    [[e_23], [e_76], [e_3], ...],    # Sequence 2 embeddings
    ...
    [[e_12], [e_7], [e_89], ...]]    # Sequence 8 embeddings
   Shape: [8, 120, D]
   ```

---

### Key Takeaway
The "mismatch" in dimensions is resolved because the embedding layer uses the token IDs in \( [8, 120] \) to index into the \( [50257, D] \) embedding table, producing a tensor of shape \( [8, 120, D] \) as output. This process is efficient and well-suited for transformers.

In [23]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):

        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C) <- C is the embedding dimension
        #print(logits,"#########################")

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

m = BigramLanguageModel(vocab_size)
m = m.to(device)
logits, loss = m(xb, yb)
print(logits.shape)
print(loss)

print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=100)[0].tolist()))


torch.Size([32, 65])
tensor(4.6437, grad_fn=<NllLossBackward0>)

Sr?qP-QWktXoL&jLDJgOLVz'RIoDqHdhsV&vLLxatjscMpwLERSPyao.qfzs$Ys$zF-w,;eEkzxjgCKFChs!iWW.ObzDnxA Ms$3


In [19]:
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-4)

In [20]:
batch_size = 32
for steps in range(10000): # increase number of steps for good results...

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = m(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())

3.687521457672119


In [21]:
print(decode(m.generate(idx = torch.zeros((1, 1), dtype=torch.long, device=device), max_new_tokens=500)[0].tolist()))


W?w3cHPyZWk,f's$a-oizCjmuX
YoR&$FMVofXisEvB!!BA!$W:CdYlixcaeg ireeYERnkcin;lxWiHFliqmoGSKtSV&BLqWk p.SGFo.
SGjbo!UelIlind,pea!.
-huD3SPyckzby:CUup;MOissX3Qwty.OJlvBPUSIkyBf&patelgCIEJMk:Chll,SPlyltSPkqmoRW-wNAXQbjxCevib3s 'T:C-&dE$HZAETENehhir$Fstp-LK3:CJ-xTrg

ALkOdmnunruf?qA so;;3QQkhWTE:CEt,jep$vUMUE$Ew,fMf PRD?d KISKI.JrZKINLIk!as,iyb&y&a
SadapbWPT:VEGDxlYBTEin KNukqfa!ateyCRry ts-I&fy VE?!3Myk!qEEYFEPkURJG&y.linXy'WWhiRUFhm sEra CERWs$.-w?n;mNX&qq-w'eY.rdaJR?; s-z;K:WhsBota qHugUvxIERTI'dul


In [22]:
torch.manual_seed(42)
a = torch.tril(torch.ones(3, 3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0,10,(3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [23]:
# consider the following toy example:

torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [24]:
# We want x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)

In [25]:
xbow

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

In [26]:
# version 2: using matrix multiply for a weighted aggregation
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keepdim=True)
xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
# torch.allclose(xbow, xbow2)

In [27]:
wei.shape, xbow2.shape

(torch.Size([8, 8]), torch.Size([4, 8, 2]))

In [28]:
torch.allclose(xbow, xbow2)

True

In [29]:
from torch.nn import functional as F

In [30]:
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
torch.allclose(xbow, xbow3)

True

In [34]:
wei

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [31]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)

# let's see a single Head perform self-attention
head_size = 16

# Linear1(x)-> Q,  Linear2(x)-> K, Linear3(x)-> V //// For each token x
# In some rough sense, 
# for a token--> 
#    q: what information the token looking for, 
#    k: what information it has or can contribute
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)   # (B, T, 16)
q = query(x) # (B, T, 16)
v = value(x) # (B, T, 16)

# For each token in the form of q, now checking the similarity with all other tokens in the form k
# and it forms the weight matrix
wei =  q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) ---> (B, T, T)

# Weighted aggregation of the token and all of its previous tokens, for each token
# this to replicate not to include later token information
tril = torch.tril(torch.ones(T, T))

# this to make the weight distribution positive and normal
wei = wei.masked_fill(tril == 0, float('-inf')) # 
wei = F.softmax(wei, dim=-1)

# weighted aggregation
out = wei @ v

out.shape

torch.Size([4, 8, 16])

In [43]:
v.shape

torch.Size([4, 8, 16])

Notes:
- Attention is a **communication mechanism**. Can be seen as nodes in a directed graph looking at each other and aggregating information with a weighted sum from all nodes that point to them, with data-dependent weights.
- There is no notion of space. Attention simply acts over a set of vectors. This is why we need to positionally encode tokens.
- Each example across batch dimension is of course processed completely independently and never "talk" to each other
- In an "encoder" attention block just delete the single line that does masking with `tril`, allowing all tokens to communicate. This block here is called a "decoder" attention block because it has triangular masking, and is usually used in autoregressive settings, like language modeling.
- "self-attention" just means that the keys and values are produced from the same source as queries. In "cross-attention", the queries still get produced from x, but the keys and values come from some other, external source (e.g. an encoder module)
- "Scaled" attention additional divides `wei` by 1/sqrt(head_size). This makes it so when input Q,K are unit variance, wei will be unit variance too and Softmax will stay diffuse and not saturate too much. Illustration below

------------------------------------------------------------------------------------

Tuning hyperparameters for transformer models can be complex, but there are some guidelines and strategies to make it more systematic and effective. Here are some tips for each parameter:

### 1. **Batch Size**
   - Larger batch sizes can speed up training, but they also r1.220673 Mequire more memory, which could be a constraint depending on your hardware (like your RTX 2060 GPU).
   - Start with a smaller batch size (e.g., 8 or 16) and increase gradually if your GPU allows.
   - Alternatively, gradient accumulation can mimic larger batch sizes without increasing actual memory usage.

### 2. **Block Size (Sequence Length)**
   - This controls the length of the sequence used as input. Higher values improve the model's capacity to capture long-range dependencies, but at the cost of increased memory usage.
   - Experiment with small values (e.g., 32, 64) and increase based on your dataset's average sequence length. For very long sequences, consider breaking them up and applying techniques like masking to capture contextual information.

### 3. **Learning Rate**
   - Start with a learning rate around \(1e-4\) or \(1e-3\) and use learning rate schedulers like cosine decay or step decay.
   - If training is unstable, reduce the learning rate, as high learning rates can cause exploding gradients in transformer models.
   - Use smaller learning rates for fine-tuning pre-trained models.

### 4. **Dropout Rate**
   - Generally, 0.1 to 0.3 is a good range to start with. For larger datasets, lower dropout (around 0.1) may be suitable, while smaller datasets may need a higher dropout to avoid overfitting.

### 5. **Embedding Size (`n_embd`)**
   - Start with small values like 64 or 128 if your model size is restricted by hardware. However, `n_embd` should ideally be divisible by `n_head`.
   - Increase `n_embd` if you’re observing underfitting (model is unable to learn the complexities of the dataset).
   
### 6. **Number of Heads (`n_head`)**
   - Usually, `n_embd` divided by `n_head` should result in an integer for ease of implementation.
   - For a small model, starting with 8 heads should work well. Increasing it may allow the model to capture more complex relationships but will require more memory.

### 7. **Number of Layers (`n_layer`)**
   - Start with 4–6 layers and increase as needed, depending on your hardware.
   - More layers can improve model performance but may lead to overfitting or excessive computational load. You can also experiment with 12 layers if using techniques like regularization or dropout.

### 8. **Evaluation Interval and Iterations**
   - For `max_iters`, set it high, but use early stopping based on validation loss to prevent unnecessary computation.
   - `eval_interval` and `eval_iters` can be adjusted to balance frequency of evaluations with performance overhead. More frequent evaluations provide more insights into learning dynamics but increase runtime.

### **Practical Tips for Hyperparameter Tuning**
- **Learning Rate Finder**: Gradually increase the learning rate and observe the loss. It can give you an idea of a good learning rate range.
- **Grid Search or Random Search**: Start with a coarse grid or a randomized search to find promising values and then perform a finer search around these values.
- **Optuna and Hyperopt**: Tools like Optuna can automate hyperparameter tuning by efficiently navigating the search space based on Bayesian optimization.

Let me know if you’d like to dive deeper into tuning specific parameters, especially given your GPU constraints!

Training a transformer model, even with a smaller vocabulary size like 512, can still be slow due to the complexity of self-attention operations and the number of parameters in the model. Here are some tips to speed up training on your setup:

1. **Mixed-Precision Training (FP16)**:
   - Mixed-precision training allows computations to be done in 16-bit floating-point precision instead of 32-bit, reducing memory usage and speeding up training. NVIDIA’s **Apex library** or **PyTorch’s `torch.cuda.amp`** module can help with this. The RTX 2060 supports FP16, so this could give a performance boost.

2. **Reduce Model Depth or Width**:
   - If feasible, consider using fewer layers (reduce the depth) or fewer attention heads in the model. This will reduce the number of parameters and speed up training. For example, using a 6-layer transformer instead of a 12-layer one or reducing the number of heads from 12 to 8 could make a noticeable difference.

3. **Gradient Accumulation**:
   - If batch size is small due to memory limitations, gradient accumulation can simulate a larger batch size by accumulating gradients over multiple forward passes before updating weights. This can help with stability and possibly improve convergence speed.

4. **Optimize Data Loading**:
   - Ensure your data pipeline is efficient. Use **`DataLoader`** with `num_workers` for parallel data loading if using PyTorch, which can reduce waiting time for each batch.

5. **Reduce Sequence Length (if possible)**:
   - Training time scales with sequence length due to the self-attention mechanism. If you can shorten your input sequences without losing context, it can speed up each training iteration.

6. **Increase Learning Rate or Use Warmup**:
   - Using a slightly higher learning rate or a learning rate schedule with warmup can help the model converge faster. Be cautious, as too high of a learning rate can destabilize training.

7. **Use a Smaller Batch Size with Gradient Accumulation**:
   - If your model’s batch size is limited by memory, setting a smaller batch size and accumulating gradients over multiple steps may help.

8. **Profile the Training Loop**:
   - Tools like **PyTorch’s profiler** can help identify bottlenecks. For example, you might find that a large portion of time is spent on specific operations or data loading.

Let me know if you want more details on any of these techniques!

Reducing the number of attention heads in a multihead attention mechanism doesn’t necessarily decrease the total number of model parameters because of how the parameters are allocated across different parts of the model. Here’s why this happens:

1. **Fixed Projection Dimension**:
   - In multihead attention, the total dimensionality of the attention projection is typically fixed (e.g., 512 or 768) regardless of the number of heads. For example, if the model's hidden dimension is 512, it will still remain 512 even if you reduce the number of heads. 
   - Reducing the number of heads doesn’t change the overall dimensionality of the query, key, and value matrices; instead, it just changes how the dimensionality is split across each head.

2. **Parameter Distribution Across Attention Layers**:
   - The query, key, and value matrices in the multihead attention layer are still projected to the same dimensionality, so the number of parameters in each of these matrices remains unchanged.
   - When you reduce the number of heads, the model simply divides the same total projection size into fewer heads, making each head slightly "wider" but keeping the total parameter count constant.

3. **Feed-Forward Layers Remain Unchanged**:
   - In transformer layers, a significant portion of parameters is also in the feed-forward network (FFN) layer after the attention layer. This layer is independent of the number of attention heads and typically has a large number of parameters that remain the same.

### How to Actually Reduce Parameters
To reduce the number of parameters effectively, you could try the following:

- **Reduce the Model’s Hidden Dimension**: Reducing the hidden dimension of the transformer (e.g., from 512 to 384) will decrease the size of both the attention and feed-forward layers, lowering parameter count overall.
- **Reduce the Number of Layers**: Reducing the number of transformer layers (depth of the model) will decrease the number of parameters as each layer contains independent parameters.
- **Decrease FFN Dimension**: The FFN layer often has a higher dimension than the model's hidden size (e.g., 4x the hidden dimension). Reducing this factor will also reduce the parameter count significantly.

Let me know if you’d like more specifics on adjusting these aspects in your model!

In [1]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import sys
sys.path.append("/home/karapathy_trainings/")
from tokenizer.minbpe.minbpe import BasicTokenizer
import tiktoken
import datetime

In [2]:
tokenizer_ = BasicTokenizer()
tokenizer_.load("/home/karapathy_trainings/tokenizer/minbpe/bpe_encoding.model")

In [3]:
# hyperparameters
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 100 # what is the maximum context length for predictions?
max_iters = 500
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 768 # should be a number divisible by n_head
n_head = 16 
n_blocks = 4
dropout = 0.2
# ------------

In [4]:
device

'cuda'

In [5]:
# torch.manual_seed(1337)

# # wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# with open('input.txt', 'r', encoding='utf-8') as f:
#     text = f.read()

# # here are all the unique characters that occur in this text
# chars = sorted(list(set(text)))
# vocab_size = len(chars)
# # create a mapping from characters to integers
# stoi = { ch:i for i,ch in enumerate(chars) }
# itos = { i:ch for i,ch in enumerate(chars) }

# encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
# decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string


# # Train and test splits
# data = torch.tensor(encode(text), dtype=torch.long)
# n = int(0.9*len(data)) # first 90% will be train, rest val
# train_data = data[:n]
# val_data = data[n:]

In [6]:
torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# here are all the unique characters that occur in this text
vocab = tokenizer_.vocab
vocab_size = len(vocab)
                 
# Train and test splits
#data = torch.tensor(encode(text), dtype=torch.long)

data = torch.tensor(tokenizer_.encode(text), dtype=torch.long)
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [7]:
vocab_size

512

In [5]:
# data loading
def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [6]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2,-1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

class FeedFoward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            #nn.Linear(4*n_embd, 16 * n_embd),
            #nn.ReLU(),
            #nn.Dropout(dropout),
            #nn.Linear(16*n_embd, 4 * n_embd),
            #nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        # n_embd: embedding dimension, n_head: the number of heads we'd like
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x)) # residual connection
        x = x + self.ffwd(self.ln2(x)) # residual connection
        return x

In [7]:
# super simple bigram model
class BigramLanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        #self.sa_head = MultiHeadAttention(n_head, n_embd//4) #Head(n_embd)
        #self.ff = FeedFoward(n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_blocks)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        #x = self.sa_head(x)
        #x = self.ff(x)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            #print("****",idx_cond)
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
            #print(idx)
        return idx

In [11]:
model = BigramLanguageModel()
m = model.to(device)
# print the number of parameters in the model
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0 or iter == max_iters - 1:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

In [12]:
get_datetime = lambda x: f"{x.day}_{x.month}_{x.year}_{x.hour}_{x.minute}"

In [13]:
s = get_datetime(datetime.datetime.now())

In [14]:
model_directory = {1:{"model_path":s}}

In [15]:
torch.save(m,f"model_checkpoints/minigpt_{model_directory[1]['model_path']}.model")

In [8]:
m_ = torch.load("model_checkpoints/minigpt_2_11_2024_21_13.model")

In [9]:
# generate from the model
context = torch.zeros((1, 1), dtype=torch.long
                      , device=device)
print(tokenizer_.decode(m_.generate(context, max_new_tokens=2000)[0].tolist()))

  my ys' ell pt
Which wardthy swouramapey ke th, sp enour was as arany of favans or leay dodr.

Thenclasicor ke to youg a call selfaike.

LUK:
TI from Marre,
KI prixck se wills and wause therf'To whim I wou ears ink my se.
Wille our wouo's whe-thous uk'drathat a weee SAR:
GLI HWAs and or hour somall shat raand least m frigalg'd I oftearsetts d's ofts may, wou pow the Can ble me; fuly mut ill ret ty, swarmore in the ck and sppatipyshORO:
ABut ade isple and boall learmageld ple,
Thaks pp the due ation. IComes of reat this indo besce with ds ely thing daidach's h and ke tos fidere of wave that mart a ck that mdower scornow 'st Fonte less.
As fe der is'lls scan peing athougothem; Yous se thence sles ous? ce poousbe god thord
G antegadly with ut west a idense stalame with my heee thing hearthell firue shos! Commoooingevtavise his and ter morndaking laiscracethat ghat haly
Yethen wold mand; of prop'd as ldeaan tofice thane spare from hat stitueiparneeise ke s.

Had haten hres whimy trut ce h