In [28]:
import numpy as np
import math

In [64]:
# numpy playground 
data = [100, 200, 300]
key = np.array([[1,2,3], [10, 20, 30]])
query = np.array([[4,5,6], [40, 50, 60]])
value = np.array([[7,8,9], [70, 80, 90]])

print("key[0] @ query[0]:", key[0] @ query[0])

print("key @ query.T:", key @ query.T)

print("\nkey.T @ query:" , key.T @ query)

key[0] @ query[0]: 32
key @ query.T: [[  32  320]
 [ 320 3200]]

key.T @ query: [[ 404  505  606]
 [ 808 1010 1212]
 [1212 1515 1818]]


In [30]:
# each node is a token
class Node:
    def __init__(self):
        # the vector storoed at this node
        self.data = np.random.rand(20)
        
        # weights governing how this node interacts with other nodes
        # generates a 20x20 NumPy array filled with random numbers 
        self.wkey = np.random.randn(20,20)
        self.wquery = np.random.randn(20,20)
        self.wvalue = np.random.randn(20,20)
        
    def __str__(self):
        return f"""
        self.data shape : {self.data.shape} ; 
        self.wquery shape : {self.wquery.shape} ; 
        self.wkey shape : {self.wkey.shape} ; 
        self.wvalue shape : { self.wvalue.shape} ; 
        """
        
    def key(self):
        # @ Multiplication in NumPy
        return self.wkey@self.data
    
    def query(self):
        return self.wquery@self.data
    
    def value(self):
        return self.wvalue@self.data
    
node = Node()
print(node)
            


        self.data shape : (20,) ; 
        self.wquery shape : (20, 20) ; 
        self.wkey shape : (20, 20) ; 
        self.wvalue shape : (20, 20) ; 
        


In [58]:
def softmax(scores):
    """
    Calculates the softmax function for a vector of scores.
    Args:
        scores: A NumPy array or list of scores.
    Returns:
        A NumPy array representing the softmax probabilities.
    """
    scores = np.array(scores)  # Ensure scores is a NumPy array
    exp_scores = np.exp(scores - np.max(scores))  # Subtract max for numerical stability
    return exp_scores / np.sum(exp_scores)

class Graph:
    def __init__(self):
        # 5 nodes
        self.nodes = [Node() for _ in range(5)]
        
        # 40 edegs, each edge is [node_index_from, node_index_to]
        randi = lambda: np.random.randint(len(self.nodes))
        self.edges = [[randi(), randi()] for _ in range(40)]
        
    def run(self):
        updates = []
        
        # for each node in the graph
        for i,n in enumerate(self.nodes):
            
            q = n.query()
            
            # find all edges that are input to this node n
            inputs = [self.nodes[ifrom] for (ifrom, ito) in self.edges if ito == i]
            if(len(inputs) == 0):
                continue;
                
            keys = [m.key() for m in inputs]
            
            scores = [q @ k for k in keys]
            
            # calcuate the softmax for scores[] sum to 1
            scores = softmax(scores)
            
            values = [m.value() for m in inputs]
            update = sum([s*v for s, v in zip(scores, values)])
            updates.append(update)
        
        # 
        for n, u in zip(self.nodes, updates):
            n.data = n.data + u # residual connection
            

graph = Graph()
graph.run()

In [65]:
# tokenize
def read_file(file):
    f = open(file, "r")
    journal = f.read()
    f.close()
    return journal

text = read_file("./oracle.txt")

# unique chars occur in the text 
chars = sorted(list(set(text)))
vocab_size = len(chars)
print("vocab_size: ", vocab_size)
print("everying in vocab", ''.join(chars))

# create mapping/dict between chars to integers
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}

# encoder, take a string s, output a list of integers
encode = lambda s: [stoi[c] for c in s]
# decoder, take a list of integers l, output a string
decode = lambda l: ''.join([itos[i] for i in l])

print("all encode:", encode("all"))
print("decode:", decode([56,67,67]))

vocab_size:  90
everying in vocab 
 !"$%&'()*+,-./0123456789:;=?@ABCDEFGHIJKLMNOPQRSTUVWXYabcdefghijklmnopqrstuvwxyz~–—‘’“”…
all encode: [56, 67, 67]
decode: all


In [66]:
import torch


data = torch.tensor(encode(text), dtype=torch.long)
print(f"data.type: {data.type}, data.shape: {data.shape}", )
print("data[:1000]", data[:1000])

data.type: <built-in method type of Tensor object at 0x115c39360>, data.shape: torch.Size([81716])
data[:1000] tensor([45, 73, 56, 58, 67, 60, 26,  1, 35, 67, 67, 64, 74, 70, 69,  7, 74,  1,
        52, 70, 80, 56, 62, 60,  1, 75, 70,  1, 49, 70, 61, 75, 78, 56, 73, 60,
         1, 56, 69, 59,  1, 32, 60, 80, 70, 69, 59,  0, 55, 70, 76,  1, 58, 56,
        69,  1, 67, 64, 74, 75, 60, 69,  1, 75, 70,  1, 75, 63, 64, 74,  1, 34,
        60, 60, 71,  1, 34, 64, 77, 60,  1, 63, 60, 73, 60,  0,  0,  8, 44, 70,
        75, 60, 26,  1, 78, 63, 64, 67, 60,  1, 39,  1, 76, 74, 76, 56, 67, 67,
        80,  1, 76, 74, 60,  1, 68, 80,  1, 70, 78, 69,  1, 31, 39,  1, 77, 70,
        64, 58, 60,  1, 75, 70,  1, 69, 56, 73, 73, 56, 75, 60,  1, 75, 63, 60,
         1, 34, 60, 60, 71,  1, 34, 64, 77, 60, 12,  1, 39,  1, 63, 56, 77, 60,
         1, 59, 60, 58, 64, 59, 60, 59,  1, 75, 70,  1, 76, 74, 60,  1, 56,  1,
         3, 71, 73, 70, 61, 60, 74, 74, 64, 70, 69, 56, 67,  1, 31, 39,  1, 77,
        7

In [70]:
# building a batch of data
# each batch of data (x, y)

# how many independent chuncks of text we have in the batch
# this is how many sequence we will process in parrell
batch_size = 4 

# max context length, predicting the 9th char in the sequence 
block_size = 8 

# Get a random batch from data;
# x: input ; y: desired ouput. y is behind x by 1 in time
# both has shape: batch_size * block_size (B*T)
def get_bacth(data):    
    # generates a vector length of batch_size, each element is a random integer in the range [0, high).
    # (batch_size,) creates a 1D tensor (a vector) with batch_size elements.
    high = len(data) - block_size
    ix = torch.randint(0, high, (batch_size,))
    print("ix.shape: ", ix.shape)
    print("ix: ", ix)
    # wrong x = torch.stack(data[i:i+block_size] for i in ix)
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    
    return x,y
    
    
data = torch.tensor(encode(text), dtype=torch.long)
xb, yb = get_bacth(data)
print("input xb shape: ", xb.shape)
print(xb)
print("target yb shape: ", yb.shape)
print(yb)

ix.shape:  torch.Size([4])
ix:  tensor([ 3147,  7054, 74286, 32000])
input xb shape:  torch.Size([4, 8])
tensor([[80,  1, 61, 64, 69, 56, 67, 67],
        [70,  1, 62, 56, 77, 60,  1, 75],
        [58, 56, 68, 60,  1, 56,  1, 74],
        [56, 67, 67, 80,  1, 57, 60, 60]])
target yb shape:  torch.Size([4, 8])
tensor([[ 1, 61, 64, 69, 56, 67, 67, 80],
        [ 1, 62, 56, 77, 60,  1, 75, 63],
        [56, 68, 60,  1, 56,  1, 74, 60],
        [67, 67, 80,  1, 57, 60, 60, 69]])


In [71]:
# process the batch
# each batch_i in the batch can be trained in parallel
# time dimention can also be trained in parallel
# the real "batch size" / "parallel training" is Batch_size * time (block_size) 
def process_batch(xb, yb):
    for batch_i in range(0, batch_size): # batch dimension
        print(f"""for batch: {batch_i}""")
        for t in range(0, block_size): # time dimension
            context = xb[batch_i, 0:t+1]
            target = yb[batch_i,t]
            print(f""" when input is: {context.tolist()} ; output is: {target.tolist()} """)
            
# the print out is the examples that model can learn from for a single batch xb, yb
process_batch(xb, yb)  


for batch: 0
 when input is: [80] ; output is: 1 
 when input is: [80, 1] ; output is: 61 
 when input is: [80, 1, 61] ; output is: 64 
 when input is: [80, 1, 61, 64] ; output is: 69 
 when input is: [80, 1, 61, 64, 69] ; output is: 56 
 when input is: [80, 1, 61, 64, 69, 56] ; output is: 67 
 when input is: [80, 1, 61, 64, 69, 56, 67] ; output is: 67 
 when input is: [80, 1, 61, 64, 69, 56, 67, 67] ; output is: 80 
for batch: 1
 when input is: [70] ; output is: 1 
 when input is: [70, 1] ; output is: 62 
 when input is: [70, 1, 62] ; output is: 56 
 when input is: [70, 1, 62, 56] ; output is: 77 
 when input is: [70, 1, 62, 56, 77] ; output is: 60 
 when input is: [70, 1, 62, 56, 77, 60] ; output is: 1 
 when input is: [70, 1, 62, 56, 77, 60, 1] ; output is: 75 
 when input is: [70, 1, 62, 56, 77, 60, 1, 75] ; output is: 63 
for batch: 2
 when input is: [58] ; output is: 56 
 when input is: [58, 56] ; output is: 68 
 when input is: [58, 56, 68] ; output is: 60 
 when input is: [58, 5