In [9]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt
%matplotlib inline


In [10]:
with open('tiny_shakespear.txt', 'r', encoding='utf-8') as f:
    text = f.read()

print("length of dataset in characters: ", len(text))
print(text[:500])

length of dataset in characters:  1115393
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [11]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

char_to_int = dict()
int_to_char = dict()

for i in range (len(chars)):
    char_to_int[chars[i]] = i
    int_to_char[i] = chars[i]

print(char_to_int)
print(int_to_char)

data = []
for char in text:
    data.append(char_to_int[char])

data = torch.tensor(data)
split_idx = int(len(data)*0.8)
train_data = data[:split_idx]
val_data = data[split_idx:]
print(len(train_data))
print(len(val_data))


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65
{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
{0: '\n', 1: ' ', 2: '!', 3: '$', 4: '&', 5: "'", 6: ',', 7: '-', 8: '.', 9: '3', 10: ':', 11: ';', 12: '?', 13: 'A', 14: 'B', 15: 'C', 16: 'D', 17: 'E', 18: 'F', 19: 'G', 20: 'H', 21: 'I', 22: 'J', 23: 'K', 24: 'L', 25: 'M', 26: 'N', 27: 'O', 28: 'P', 29: 'Q', 30: 'R', 31: 'S', 32: 'T', 33: 'U', 34: 'V', 35: 'W', 36: 'X', 37: 'Y', 38: 'Z', 39: 'a', 40

In [13]:

batch_size = 4 
context_length = 8

def get_batch(n = batch_size, type = "train"):
    if type == "train":
        data = train_data

    elif type == "val":
        data = val_data

    data_len = len(data) - context_length
    ix = torch.randint(data_len, size=(n,))

    x = torch.stack([data[i:i+context_length] for i in ix])
    y = torch.stack([data[i+1:i+1+context_length] for i in ix])

    return x, y

x, y = get_batch()

for i in range(context_length):
    print(f"input: {x[0,:i+1]} target: {y[0,i]}")

print(x.shape)

input: tensor([33]) target: 25
input: tensor([33, 25]) target: 14
input: tensor([33, 25, 14]) target: 17
input: tensor([33, 25, 14, 17]) target: 30
input: tensor([33, 25, 14, 17, 30]) target: 24
input: tensor([33, 25, 14, 17, 30, 24]) target: 13
input: tensor([33, 25, 14, 17, 30, 24, 13]) target: 26
input: tensor([33, 25, 14, 17, 30, 24, 13, 26]) target: 16
torch.Size([4, 8])


In [18]:
class Model(torch.nn.Module):
    def __init__(self, vocab_size, context_length = 8, vocab_embed_dim = 32):
        super().__init__()

        self.vocab_size = vocab_size
        self.context_length = context_length
        self.vocab_embed_dim = vocab_embed_dim

        self.token_embedding_table = nn.Embedding(self.vocab_size, self.vocab_embed_dim)
        self.position_embedding_table = nn.Embedding(self.context_length, self.vocab_embed_dim)

        self.l1 = nn.Linear(32, self.vocab_size)


    def forward(self, x, y=None):
        print(x.shape)
        
        # (B, T, vocab_embed) + (T, vocab_embed)
        tok_emb = self.token_embedding_table(x)
        pos_emb = self.position_embedding_table(torch.arange(self.context_length)) 
        x_enc =  tok_emb + pos_emb
        print(x_enc.shape)
        logits = self.l1(x_enc)

        if y == None:
            loss = None
        else:
            B, T, C = logits.shape #Batch, Time, Classes
            logits_flat = logits.view(B*T, C)
            y = y.view(B*T)
            loss = F.cross_entropy(logits_flat, y)

        return logits, loss
    
    def generate(self, prompt, max_response_len):
        for _ in range(max_response_len):
            logits, loss = self.forward(prompt)
            logits = logits[:,-1,:]
            probs = F.softmax(logits,dim=-1)

            next_token = torch.multinomial(probs, num_samples=1)
            prompt = torch.cat((prompt, next_token), dim=1)

        return prompt

In [19]:
m = Model(65)

In [20]:
logits, loss = m.forward(x, y)
print(logits.shape)
print(loss)

torch.Size([4, 8])
torch.Size([4, 8, 32])
torch.Size([4, 8, 65])
tensor(4.4324, grad_fn=<NllLossBackward0>)


In [14]:
logits, loss = m.forward(torch.zeros((1,1), dtype=torch.long))
print(logits.shape)


torch.Size([1, 1, 65])


In [123]:
response = m.generate(prompt= torch.zeros((1,1), dtype=torch.long), max_response_len=100)

data = ''.join([int_to_char[v.item()] for v in response[0]])
print(data)


#Pure randomness, but structure can be found... simply by looking at probabilites, conditioned on state...


Se, gos bene thear ou O,
I surulllee pp, bepade acow, hese lld ndyousoondre y bureerars se irade,
Fi


In [41]:

optimizer = torch.optim.Adam(m.parameters(), lr=0.001, amsgrad=False)


In [104]:
for i in range(10000):
    X, Y = get_batch()
    logits, loss = m.forward(X, Y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
print(loss.item())


2.630314588546753


Mathematical trick for efficent self-attention (vectorized)

In [50]:
# How to extend to multi-dimensions?
# https://pytorch.org/docs/stable/generated/torch.matmul.html

B, T, C = 1, 8, 32
x = torch.randn(B, T, C)

head_size = 16

#encode node into key...
key = nn.Linear(32, head_size, bias=False)

#encode node to query. Dimension must match key dim.
query = nn.Linear(32, head_size, bias=False)

#Determine what information to share (x -> private information, v -> public information)

value = nn.Linear(32, head_size, bias=False)

# Each token, which right now is represented by a C-dimensional vector we are now going to encode it into a key/query. We do this by taking a linear sum of it's values
# That key represents what each token "has", and the query is what it's looking for. 
# It's a soft search in contious space, so if things are not exactly the same they can still show up.

K = key(x)
Q = query(x)
V = value(x)


print(Q.transpose(-2,-1).shape)

print(K.shape)
print(Q.shape)
A = K @ Q.transpose(-2,-1)
print(A.shape)
print(A[0])
tril = torch.tril(torch.ones((B,T,T)))
A = A.masked_fill(tril == 0, float('-inf'))
A = F.softmax(A, dim=-1)

output = A @ V

print(A[0][0])
print(output[0][0])


torch.Size([1, 16, 8])
torch.Size([1, 8, 16])
torch.Size([1, 8, 16])
torch.Size([1, 8, 8])
tensor([[-0.5249,  0.7228,  1.5818,  3.6537, -0.3164, -0.4668,  0.2571, -0.6779],
        [-1.9168, -0.0153, -0.6245, -1.5969,  1.4872, -0.7036, -0.4554,  0.4036],
        [-1.1580,  1.6528,  0.1651, -0.4505,  1.2689, -1.2856, -0.1286, -1.5790],
        [ 0.1870,  1.7720,  2.0020,  0.9950,  0.7860, -1.0472, -1.0654, -2.0595],
        [-2.0954,  1.1104,  0.8074, -3.3844,  0.6437, -0.6074, -0.3781,  0.4568],
        [-0.4413,  1.6315,  0.4316,  0.0982, -0.4590,  0.1698,  1.1595, -0.4861],
        [-0.1293,  0.6867,  1.0631,  2.9071, -1.3107,  1.2985,  0.6031, -1.0113],
        [-1.5245,  2.3170,  0.8273, -3.2456,  2.0506, -2.5999,  0.6020,  1.0496]],
       grad_fn=<SelectBackward0>)
tensor([1., 0., 0., 0., 0., 0., 0., 0.], grad_fn=<SelectBackward0>)
tensor([ 0.1824,  0.6020,  0.0880,  0.0272,  0.1562,  0.4874,  0.0839, -0.8607,
         0.0247,  0.6225,  0.2833,  0.3509, -0.0809,  0.5528,  0.9446,

In [47]:
x = torch.randn(11)

x @ x


tensor(9.7836)

How does uniform averaging work?

Lets say want to determine the sentiment of document. Actually just a 1 dimensional vector would be enough.

Enocde each word with a [+1] if positive, and [-1] if negative. You can then average all vectors together. the more positive it is, the better! Doesn't depend on document length, but rather of the percent of words that are positive vs negative...

**Why do we add the position encoding?**

https://datascience.stackexchange.com/questions/55901/in-a-transformer-model-why-does-one-sum-positional-encoding-to-the-embedding-ra

Example:

in a single dimension a key could be 1 and the query could be /1. actually hard to make it pluck out a single positional encoding...

1 / -1 + 1

2D vector, I can have basically infite going around a circle, and then I can have another pointing in the same direction, and it will be the max!...

**Common defs**

- in Encoder blocks we typically remove the masked attention, to allow all tokens to communicate with each other
- in decoder blocks, we have the masked attention, beacuse the future is basically unknown?
- self-attention is when the same nodes produce the key, queries, values
- cross attention is when one set of nodes produce the keys and values, and another the queries.



In [57]:
k = torch.randn(B, T, 16)
q = torch.randn(B, T, 16)

a = k @ q.transpose(-2,-1)

print(k.var())
print(q.var())
print(a.var())

tensor(0.9817)
tensor(0.9941)
tensor(29.5017)
