In [7]:
# some theory on attention mechanism

import torch

torch.manual_seed(1337) # fixed seed so that x (and everything derived from it) is reproducible on a fresh kernel
B,T,C = 4,8,2 # B = batch, T = time (sequence length), C = channels (features per token)
x = torch.randn(B,T,C) # same (B,T,C) layout as the logits tensor of our language model
print(x.shape)

# We would like the tokens to talk to each other so that each position can use
# information from the positions that came before it.
# Neither the Bigram model nor any other language model we have developed so far
# has this functionality.

# The simplest way to do this is to average over the current and all preceding
# elements along the time dimension.
# This is very rudimentary and lossy, but it is a useful starting point.
# It can be written explicitly as two nested loops:
xbow = torch.zeros(B,T,C) # "bow" = bag of words, NLP jargon for averaging a set of token vectors
for b in range(B): # loop over the batch dimension
    for t in range(T): # loop over the positions in the sequence
        xprev = x[b,:t+1,:] # (t+1, C): positions 0..t inclusive (the slice end is exclusive, hence the +1)
        xbow[b,t] = xprev.mean(dim=0) # average over time, leaving one C-vector

print(xbow.shape,x[:,:2,:].shape)



torch.Size([4, 8, 2])
torch.Size([4, 8, 2]) torch.Size([4, 2, 2])


In [13]:
# While the explicit loop works, it is inefficient. The same causal average
# can be computed with a single matrix multiplication.

torch.manual_seed(42)
a = torch.ones(3,3)
b = torch.randint(0,10,(3,2)).float()
# NOTE: the product is named `c` (lowercase) on purpose. `C` already holds the
# integer channel count from `B,T,C = ...`; assigning a tensor to it would
# silently change what `C` means for any cell executed afterwards.
c = a @ b
print(a.shape,b.shape,c.shape)
print(a,b,c,sep='\n')
# If a contains only ones, every row of the product is the sum of all rows of b
# (i.e. each column of b is summed from top to bottom).
# However, this sums over all the elements, whereas we want row t to sum only
# over rows 0..t of b. A lower-triangular mask of ones does exactly that:
a = torch.tril(torch.ones(3,3))
c = a @ b
print(a,b,c,sep='\n')
# Now row t only takes rows 0..t of b into account.
# To turn the sum into an average, we divide by the number of elements that were summed.
# That count is the row sum of the mask itself.
a_count = a.sum(dim=1)
print(a_count)
# Therefore, we get the following
c = (a @ b) / a_count[:,None] # [:,None] makes the counts a (3,1) column so they broadcast across rows
print(c)


torch.Size([3, 3]) torch.Size([3, 2]) torch.Size([3, 2])
tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[14., 16.],
        [14., 16.],
        [14., 16.]])
tensor([[1., 0., 0.],
        [1., 1., 0.],
        [1., 1., 1.]])
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
tensor([[ 2.,  7.],
        [ 8., 11.],
        [14., 16.]])
tensor([1., 2., 3.])
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [16]:
# Let us now implement this for the xbow matrix

wei = torch.tril(torch.ones(T,T))
wei = wei / wei.sum(dim=1, keepdim=True) # normalise each row to sum to 1 -> uniform average over positions 0..t
xbow2 = wei @ x # (T,T) @ (B,T,C): wei has no batch dimension, so it is broadcast to (B,T,T)
# For every batch element independently, (T,T) multiplies (T,C) to give (T,C),
# giving us a (B,T,C) tensor.
print(xbow2.shape)
wei # must be the last expression of the cell, otherwise the notebook does not display it

# Therefore, this simple form of attention is a matrix multiplication
# that computes a weighted sum over the current and all preceding elements in the sequence.
# The weights we have used thus far have the same value for all of these elements,
# so every element is weighted equally. This means that
# all the previous characters have an equal effect on the next-character prediction.


torch.Size([4, 8, 2])


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [22]:
# A further way to build the same wei matrix: mask first, then softmax.

tril = torch.tril(torch.ones(T,T))
# Start from all-zero affinities and overwrite every entry above the diagonal
# (where tril is 0) with -inf.
masked = torch.zeros(T,T).masked_fill(tril==0,float('-inf'))
print(masked)
# Row-wise softmax: exp(-inf) = 0 removes the masked entries, and the remaining
# equal entries of each row share the probability mass evenly.
wei = torch.softmax(masked,dim=-1)
# The result is the same uniform lower-triangular averaging matrix as before:
print(wei)
# Same weighted aggregation as before, only the weights were produced by a softmax.
xbow3 = torch.matmul(wei,x)
torch.allclose(xbow,xbow3)

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

In [38]:
# Self attention: one head with data-dependent weights
torch.manual_seed(1337)
B,T,C = 4,8,32 # B = batch, T = time (sequence length), C = channels (features per token)
x = torch.randn(B,T,C)

# Until now wei averaged uniformly; we want the weights to depend on the data.
# Self attention achieves this by letting every node emit two vectors:
#   query: what am I looking for?
#   key:   what do I contain?
# The dot product between a query and a key is the affinity between two nodes.
# A third projection, the value, is what actually gets aggregated (out = wei @ v).

# A single Head performing self attention
head_size = 16
# Bias-free projections. They are created in the order key, query, value so
# that the seeded weight initialisation stays the same.
key = torch.nn.Linear(C,head_size,bias=False)
query = torch.nn.Linear(C,head_size,bias=False)
value = torch.nn.Linear(C,head_size,bias=False)
k = key(x)   # (B,T,head_size)
q = query(x) # (B,T,head_size)
v = value(x) # (B,T,head_size)

# Raw affinities: (B,T,head_size) @ (B,head_size,T) -> (B,T,T)
scores = torch.matmul(q,k.transpose(-2,-1))

# Apply the causal mask to the learned affinities instead of to zeros.
tril = torch.tril(torch.ones(T,T))
is_future = tril==0 # True above the diagonal
wei = torch.softmax(scores.masked_fill(is_future,float('-inf')),dim=-1)
out = torch.matmul(wei,v) # (B,T,T) @ (B,T,head_size) -> (B,T,head_size)
print(wei[0]) # (T,T) attention weights of the first batch element

# So self attention is a way to get a weighted sum over the current and preceding elements,
# where the weights are determined by the query and key vectors,
# and the query and key projections are learned from the data through training.

# Attention can be seen as a communication mechanism between the nodes of a directed graph.
# In our case, the communication is autoregressive: each node only aggregates information
# from itself and from earlier nodes, so information from future nodes never reaches the past.

# As the operation is done batch-wise, batches do not communicate.

# If we remove the tril masking step, all the nodes are allowed to talk to each other.
# This is called an 'encoder block'.

# In a 'decoder block' the tril mask is kept, as in the code above, so that a node cannot attend to later positions.

# Self attention means the keys, values and queries all come from the same source (here: x).
# Cross attention means the queries come from one source while the keys and values come from another
# (e.g. the output of an encoder).


tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)


In [36]:
# Scaled dot-product attention
# The raw affinities in wei are divided by sqrt(head_size) before the softmax.

# The reason for the scaling:
k = torch.randn(B,T,head_size)
q = torch.randn(B,T,head_size)
wei = torch.matmul(q,k.transpose(-2,-1)) # (B,T,head_size) @ (B,head_size,T) -> (B,T,T)
print(k.var(),q.var(),wei.var())
# k and q have roughly unit variance, yet the variance of wei is on the order of head_size.
# Such large affinities make training unstable.
# Dividing by the square root of head_size brings the variance back to about 1.
scale = head_size**0.5
wei = wei / scale
print(wei.var())
# This matters because wei is fed into a softmax, which exponentiates its input.
# With large-magnitude inputs the softmax sharpens towards the maximum, so each
# row of wei would approach a one-hot vector.


tensor(0.9946) tensor(0.9790) tensor(15.1783)
tensor(0.9486)
