In [8]:
import torch
import tiktoken
import torch.nn as nn

In [9]:
input_text = "This is a pen."

In [14]:
tokenizer = tiktoken.get_encoding("gpt2")

In [18]:
token_ids = tokenizer.encode(input_text)
token_ids

[1212, 318, 257, 3112, 13]

In [30]:
torch.manual_seed(42)

token_embeddings = torch.randn(len(token_ids), 5) # going with 5 embedding dimensions for simplicity
print(f"token_embeddings.shape: {token_embeddings.shape}")
print(f"token_embeddings[0]: {token_embeddings[0]}")
print(f"token_embeddings[0].shape: {token_embeddings[0].shape}")
print(f"token_embeddings[0].unsqueeze(0).shape: {token_embeddings[0].unsqueeze(0).shape}")

torch.Size([1, 5])

In [31]:
token_embeddings

tensor([[ 1.9269,  1.4873,  0.9007, -2.1055,  0.6784],
        [-1.2345, -0.0431, -1.6047, -0.7521, -0.6866],
        [-0.4934,  0.2415, -1.1109,  0.0915, -2.3169],
        [-0.2168, -1.3847, -0.3957,  0.8034, -0.6216],
        [-0.5920, -0.0631, -0.8286,  0.3309, -1.5576]])

```python
token_embeddings = tensor([[ 1.9269,  1.4873,  0.9007, -2.1055,  0.6784],
        [-1.2345, -0.0431, -1.6047, -0.7521, -0.6866],
        [-0.4934,  0.2415, -1.1109,  0.0915, -2.3169],
        [-0.2168, -1.3847, -0.3957,  0.8034, -0.6216],
        [-0.5920, -0.0631, -0.8286,  0.3309, -1.5576]])
```

writing the token embeddings tensor above in latex below:

$$
\begin{bmatrix}
\begin{bmatrix} 1.9269 & 1.4873 & 0.9007 & -2.1055 & 0.6784 \\ -1.2345 & -0.0431 & -1.6047 & -0.7521 & -0.6866 \\ -0.4934 & 0.2415 & -1.1109 & 0.0915 & -2.3169 \\ -0.2168 & -1.3847 & -0.3957 & 0.8034 & -0.6216 \\ -0.5920 & -0.0631 & -0.8286 & 0.3309 & -1.5576 \end{bmatrix}
\end{bmatrix}
$$


In [44]:
def pos_encoding(seq_len, d, n=10000):
    P = torch.zeros((seq_len, d))
    for k in range(seq_len):
        for i in range(d):
            denominator = torch.tensor(n).pow(-i/d)
            P[k, i] = torch.sin(k * denominator) if i % 2 == 0 else torch.cos(k * denominator)
    return P

In [45]:
token_embed_w_pos_encoding = token_embeddings + pos_encoding(len(token_ids), token_embeddings.shape[1])
token_embed_w_pos_encoding

tensor([[ 1.9269,  2.4873,  0.9007, -1.1055,  0.6784],
        [-0.3931,  0.9444, -1.5796,  0.2479, -0.6860],
        [ 0.4159,  1.1917, -1.0607,  1.0915, -2.3157],
        [-0.0757, -0.4956, -0.3204,  1.8033, -0.6197],
        [-1.3488,  0.7426, -0.7282,  1.3308, -1.5550]])

Positional Encoding Logic:

The formula for the positional encoding is:

$$
P(pos, 2i) = \sin\left(\frac{pos}{n^{2i/d}}\right)
$$
$$
P(pos, 2i+1) = \cos\left(\frac{pos}{n^{2i/d}}\right)
$$

Here, the first $P(pos, 2i)$ is the sine function and the second $P(pos, 2i+1)$ is the cosine function.

The logic here is that we use sine when the position is even and cosine when the position is odd.

This is because the sine and cosine functions are orthogonal to each other, which allows us to represent any function as a sum of sine and cosine functions.

token_embed_w_pos_encoding

$$
\begin{bmatrix}
\begin{bmatrix} 1.9269 & 2.4873 & 0.9007 & -1.1055 & 0.6784 \\ -0.3931 & 0.9444 & -1.5796 & 0.2479 & -0.6860 \\ 0.4159 & 1.1917 & -1.0607 & 1.0915 & -2.3157 \\ -0.0757 & -0.4956 & -0.3204 & 1.8033 & -0.6197 \\ -1.3488 & 0.7426 & -0.7282 & 1.3308 & -1.5550 \end{bmatrix}
\end{bmatrix}
$$


In [None]:
class SelfAttention(nn.Module):
    def __init__(self, dim_size, row_size=0, col_size=1):
        super().__init__()
        self.key = nn.Linear(dim_size, dim_size, bias=False)
        self.query = nn.Linear(dim_size, dim_size, bias=False)
        self.value = nn.Linear(dim_size, dim_size, bias=False)
        self.row_size = row_size
        self.col_size = col_size
        
    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        