# NanoGPT - Embeddings

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
from torch.nn import functional as F

# Two-decimal tensor printing keeps the tables shown below compact.
torch.set_printoptions(precision=2)

# A bigram model maps every token to one score per possible next token, so the
# table is square: one row per token, one column per candidate successor.
vocab_size = 8  # 8 characters or language tokens possible

# Seed the global RNG right before creating the layer so that the randomly
# initialised weights are reproducible from run to run.
generator = torch.manual_seed(42)
token_embedding_table = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=vocab_size,
)

Let's look at the contents of our embedding table now:

In [None]:
# The weight matrix has one row per token id and `vocab_size` columns.
print(token_embedding_table.weight)

Parameter containing:
tensor([[ 1.93,  1.49,  0.90, -2.11,  0.68, -1.23, -0.04, -1.60],
        [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
        [ 1.64, -0.16, -0.50,  0.44, -0.76,  1.08,  0.80,  1.68],
        [ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
        [-1.38, -0.87, -0.22,  1.72,  0.32, -0.42,  0.31, -0.77],
        [-1.56,  1.00, -0.88, -0.60, -1.27,  2.12, -1.23, -0.49],
        [-0.91, -0.66,  0.08,  0.53, -0.49,  1.19, -0.81, -0.74],
        [-1.40,  0.04, -0.06,  0.68, -0.10,  1.84, -1.18,  1.38]],
       requires_grad=True)


We can index into this embedding table:

In [None]:
# A batch holds 4 independent sequences (B dimension), each 6 tokens long
# (T dimension). Every entry is a random token id in [0, vocab_size).
batch_size, context_length = 4, 6
idx = torch.randint(vocab_size, (batch_size, context_length))
idx

tensor([[6, 1, 3, 0, 3, 5],
        [1, 1, 0, 1, 4, 1],
        [3, 3, 6, 3, 6, 3],
        [4, 7, 6, 2, 5, 0]])

In [None]:
# let's use that to index into our embedding table: every token id in `idx`
# is replaced by its row of the table
logits = token_embedding_table(idx)
logits

tensor([[[-0.91, -0.66,  0.08,  0.53, -0.49,  1.19, -0.81, -0.74],
         [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
         [ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
         [ 1.93,  1.49,  0.90, -2.11,  0.68, -1.23, -0.04, -1.60],
         [ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
         [-1.56,  1.00, -0.88, -0.60, -1.27,  2.12, -1.23, -0.49]],

        [[-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
         [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
         [ 1.93,  1.49,  0.90, -2.11,  0.68, -1.23, -0.04, -1.60],
         [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
         [-1.38, -0.87, -0.22,  1.72,  0.32, -0.42,  0.31, -0.77],
         [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76]],

        [[ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
         [ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
         [-0.91, -0.66,  0.08,  0.53, -0.49,  1.19, -0.81,

So, for each of our independent batch items (the first dimension, `batch_size`, 4 in total) we get back a 6 by 8 matrix.  Let's have a look at our first batch item: `[6, 1, 3, 0, 3, 5]`.  Here `6`, `1`, `3` etc. are integer indexes, each representing a token (a character or subword in LLM-world).

Those numbers index into the embedding table:

- `6` looks up in the 7th row of our embedding and gives: `[-0.91, -0.66,  0.08,  0.53, -0.49,  1.19, -0.81, -0.74]`
- `1` looks up in the 2nd row of our embedding and gives: `[-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76]`
- `3` looks up in the 4th row of our embedding and gives: `[ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86]`

So what we get back, for each token index of our batch item, is a list (of size `vocab_size`, here 8) of scores, one for each possible next token.  These scores are logits (unnormalized values that can be negative), not yet probabilities; a softmax would turn them into probabilities.  That's why we get an additional dimension returned: every token we feed into our embedding returns a list of scores for the next token.  So while our:

- input dimension is `batch_size` x `context_length`, here 4 x 6, the 
- output dimension is `batch_size` x `context_length` x `vocab_size`, here 4 x 6 x 8

> This last tensor is a B x T x C tensor. (Batch, Time, Channel).
>
> When calculating the Cross Entropy Loss, it wants to have a B x C x T tensor.

# Calculating the Cross Entropy Loss

Now that we have embeddings, for a simple bigram model these represent, for each character in the vocabulary, how likely it is to be the next character/token.  These are logits: the unprocessed outcome of our network, before they're turned into probabilities.  We can compare them to what we actually see in our training data to calculate our loss using Cross Entropy.

First let's see how we can shape tensors using their `view()` method.

## Reshaping tensors using `view()`

In [None]:
# Toy integer tensor of shape (2, 4, 6) with entries drawn from 0..9,
# used below to demonstrate reshaping.
t = torch.randint(low=0, high=10, size=(2, 4, 6))
t

tensor([[[5, 3, 7, 7, 5, 9],
         [1, 5, 1, 9, 1, 4],
         [0, 3, 7, 5, 7, 1],
         [5, 7, 5, 8, 5, 4]],

        [[1, 1, 0, 9, 0, 9],
         [1, 8, 9, 6, 7, 6],
         [0, 9, 5, 2, 9, 1],
         [7, 8, 6, 0, 6, 8]]])

In [None]:
# view() presents the same 2*4*6 = 48 elements under a new shape; the total
# element count of the requested shape must match.
t.view(2,24)  # combine the second and third dimension into one (4x6=24)

tensor([[5, 3, 7, 7, 5, 9, 1, 5, 1, 9, 1, 4, 0, 3, 7, 5, 7, 1, 5, 7, 5, 8, 5, 4],
        [1, 1, 0, 9, 0, 9, 1, 8, 9, 6, 7, 6, 0, 9, 5, 2, 9, 1, 7, 8, 6, 0, 6, 8]])

In [None]:
# we can do the same by having pytorch figure out the size of the remaining dimension, using `-1`
# (here 48 elements / 2 rows = 24 columns)
t.view(2, -1)

tensor([[5, 3, 7, 7, 5, 9, 1, 5, 1, 9, 1, 4, 0, 3, 7, 5, 7, 1, 5, 7, 5, 8, 5, 4],
        [1, 1, 0, 9, 0, 9, 1, 8, 9, 6, 7, 6, 0, 9, 5, 2, 9, 1, 7, 8, 6, 0, 6, 8]])

In [None]:
# `-1` also works with more than one fixed dimension: 48 / (2*2) = 12,
# so the result has shape (2, 2, 12)
t.view(2,2,-1)

tensor([[[5, 3, 7, 7, 5, 9, 1, 5, 1, 9, 1, 4],
         [0, 3, 7, 5, 7, 1, 5, 7, 5, 8, 5, 4]],

        [[1, 1, 0, 9, 0, 9, 1, 8, 9, 6, 7, 6],
         [0, 9, 5, 2, 9, 1, 7, 8, 6, 0, 6, 8]]])

## Preparing logits tensor (BxTxC) for Cross Entropy Loss 

In [None]:
# Unpack the three axes of the logits: batch (B), time/context (T), channels (C).
B, T, C = logits.size()
(B, T, C)

(4, 6, 8)

For a multidimensional input, the cross entropy loss function expects the channels (C) to be the second dimension.  Our logits are B x T x C, so below we merge the B and T dimensions into one, giving a (B*T) x C tensor.

In [None]:
# Display the logits once more, still shaped (B, T, C), before reshaping them.
logits

tensor([[[-0.91, -0.66,  0.08,  0.53, -0.49,  1.19, -0.81, -0.74],
         [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
         [ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
         [ 1.93,  1.49,  0.90, -2.11,  0.68, -1.23, -0.04, -1.60],
         [ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
         [-1.56,  1.00, -0.88, -0.60, -1.27,  2.12, -1.23, -0.49]],

        [[-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
         [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
         [ 1.93,  1.49,  0.90, -2.11,  0.68, -1.23, -0.04, -1.60],
         [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
         [-1.38, -0.87, -0.22,  1.72,  0.32, -0.42,  0.31, -0.77],
         [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76]],

        [[ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
         [ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
         [-0.91, -0.66,  0.08,  0.53, -0.49,  1.19, -0.81,

In [None]:
# Flatten batch and time into a single axis: (B, T, C) -> (B*T, C). Each row now
# holds the C next-token logits for one position, so the channel axis is the
# second dimension; `-1` lets PyTorch infer the B*T row count.
logits = logits.view(-1, C)
logits

tensor([[-0.91, -0.66,  0.08,  0.53, -0.49,  1.19, -0.81, -0.74],
        [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
        [ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
        [ 1.93,  1.49,  0.90, -2.11,  0.68, -1.23, -0.04, -1.60],
        [ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
        [-1.56,  1.00, -0.88, -0.60, -1.27,  2.12, -1.23, -0.49],
        [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
        [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
        [ 1.93,  1.49,  0.90, -2.11,  0.68, -1.23, -0.04, -1.60],
        [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
        [-1.38, -0.87, -0.22,  1.72,  0.32, -0.42,  0.31, -0.77],
        [-0.75,  1.65, -0.39, -1.40, -0.73, -0.56, -0.77,  0.76],
        [ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
        [ 1.28,  1.30,  0.61,  1.33, -0.23,  0.04, -0.25,  0.86],
        [-0.91, -0.66,  0.08,  0.53, -0.49,  1.19, -0.81, -0.74],
        [ 

## Calculate loss versus targets

In [None]:
# Made-up targets to play with: one random token id per input position,
# shaped B x T.
idy = torch.randint(vocab_size, (batch_size, context_length))
idy

tensor([[0, 0, 6, 0, 7, 0],
        [3, 7, 7, 6, 2, 2],
        [0, 7, 2, 2, 0, 2],
        [4, 1, 6, 1, 0, 3]])

Because our logits are now a two-dimensional (B*T) x C tensor, the Cross Entropy Loss in pytorch expects the targets as a one-dimensional tensor of B*T class indexes:

In [None]:
# Flatten the (B, T) targets into a single axis of length B*T so that each
# entry lines up with one row of the flattened logits; `-1` infers that length.
targets = idy.view(-1)
targets

tensor([0, 0, 6, 0, 7, 0, 3, 7, 7, 6, 2, 2, 0, 7, 2, 2, 0, 2, 4, 1, 6, 1, 0, 3])

In [None]:
# Cross entropy between the (B*T, C) logits and the (B*T,) target ids; with the
# default reduction this is the mean loss over all B*T positions.
loss = F.cross_entropy(input=logits, target=targets)
loss

tensor(2.83, grad_fn=<NllLossBackward0>)