In [1]:
import torch

In [2]:
# A 2x3 float32 tensor holding 0..5, plus a same-shaped tensor of ones.
x = torch.arange(0, 6, dtype=torch.float32).view(2, 3)
y = torch.ones(x.shape, dtype=x.dtype, device=x.device)
# Display both shapes and the device the data lives on.
(x.shape, y.shape, x.device)

(torch.Size([2, 3]), torch.Size([2, 3]), device(type='cpu'))

In [3]:
# Show the values of the 2x3 tensor built above.
x

tensor([[0., 1., 2.],
        [3., 4., 5.]])

In [4]:
# Broadcasting: the length-3 vector b is added to every row of x,
# so the sum keeps x's shape.
b = torch.tensor([10., 20., 30.])
torch.add(x, b).shape

torch.Size([2, 3])

In [5]:
# First row of the broadcast sum: row 0 of x plus b.
(x+b)[0]

tensor([10., 21., 32.])

In [6]:
# Choose a backend in order of preference: CUDA, then Apple MPS, then CPU.
# The getattr guard covers torch builds that have no `backends.mps` attribute.
if torch.cuda.is_available():
    device = 'cuda'
elif getattr(torch.backends, 'mps', None) and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'
device

'cpu'

In [7]:
# The result of .to(device) is only displayed here; it is not assigned back to x.
x.to(device)

tensor([[0., 1., 2.],
        [3., 4., 5.]])

***Autograd in a nutshell***

In [8]:
# loss = sum_i w_i * v_i, hence d(loss)/dw = v; backward() stores that in w.grad.
w = torch.tensor([2.0, -3.0, 0.5], requires_grad=True)
v = torch.tensor([1.0, 2.0, 3.0])
loss = torch.sum(torch.mul(w, v))
loss.backward()
w.grad

tensor([1., 2., 3.])

In [9]:
# Zero the stored gradient in place so it does not carry over into a later backward().
w.grad.zero_()

tensor([0., 0., 0.])

In [10]:
# The three pieces of a training setup: a model (3 inputs -> 1 output),
# an optimizer over its parameters, and a loss criterion.
model = torch.nn.Linear(in_features=3, out_features=1)
opt = torch.optim.AdamW(params=model.parameters(), lr=0.003)
crit = torch.nn.MSELoss()

In [11]:
# Show the repr of the loss module.
crit

MSELoss()

***Linear Regression***

In [12]:
import torch

# Seed the global RNG so the synthetic data, the Linear layer's initial
# weights and the printed losses are the same on every Restart & Run All.
torch.manual_seed(0)

# This toy problem is tiny, so it is pinned to the CPU.
# Note: this rebinds the notebook-level `device` name.
device = 'cpu'

# Ground-truth parameters the model is expected to recover.
w_true = torch.tensor([2.0, -3.5], device=device)
b_true = torch.tensor(0.5, device=device)

# Synthetic data: y = X @ w_true + b_true + Gaussian noise (std 0.1).
N = 128
X = torch.randn(N, 2, device=device)
y = (X @ w_true) + b_true + 0.1 * torch.randn(N, device=device)

model = torch.nn.Linear(2, 1).to(device)
opt = torch.optim.Adam(model.parameters(), lr=3e-2)
loss_fn = torch.nn.MSELoss()

for step in range(401):
    opt.zero_grad()                  # clear gradients from the previous step
    pred = model(X).squeeze(-1)      # (N, 1) -> (N,) so it lines up with y
    loss = loss_fn(pred, y)
    loss.backward()
    opt.step()
    if step % 100 == 0:
        print(step, round(loss.item(), 4))


0 17.6037
100 2.0197
200 0.1039
300 0.01
400 0.0087


***From ids to vectors***

In [18]:
# Build a character-level tokenizer from mini.txt, then check the round trip:
# decoding the encoded ids gives back the original string (see output).
from ch6_tokenize import SimpleTokenizer
tok = SimpleTokenizer.from_file('mini.txt',level='char')
ids=tok.encode('Hello world')
tok.decode(ids)

'Hello world'

In [19]:
# One embedding row per vocabulary entry, each of width 8.
# NOTE(review): no padding_idx is passed, so PAD positions are embedded like
# any other id -- confirm that is intended.
E=torch.nn.Embedding(num_embeddings=len(tok.vocab),embedding_dim=8)
batch=[tok.encode('Hello'),tok.encode('Vectors')]
lens=max(len(x) for x in batch)  # length of the longest sequence in the batch
# pad to the same length (simple RIGHT pad: PAD ids are appended after the tokens)
P=tok.pad
x=torch.tensor([s+[P]*(lens-len(s)) for s in batch])
# Embedding lookup adds a trailing dimension of size embedding_dim.
E(x).shape

torch.Size([2, 7, 8])

In [20]:
# Padded id batch: the shorter row ends in PAD ids.
x

tensor([[ 5,  4,  2,  2,  3,  0,  0],
        [ 1,  4, 14, 15,  3,  7, 16]])

***Word-level example***

In [23]:
# Word-level tokenizer over the same file, to compare its vocabulary size
# with the character-level `tok` built earlier.
from ch6_tokenize import SimpleTokenizer
tok_w = SimpleTokenizer.from_file('mini.txt', level='word')
len(tok.vocab), len(tok_w.vocab)

(18, 9)

In [24]:
# Encode with the word-level tokenizer. The output has three ids --
# presumably 'Hello', 'vectors' and '.'; verify against the tokenizer.
tok_w.encode('Hello vectors.')

[4, 7, 2]

***Padding***

In [26]:
# Pad every sequence to the length of the longest one, in two flavours:
# right padding appends PAD ids, left padding prepends them.
P = tok.pad
batch = [tok.encode(text) for text in ('Hello', 'vectors')]
L = max(map(len, batch))
right_pad = [seq + (L - len(seq)) * [P] for seq in batch]
left_pad = [(L - len(seq)) * [P] + seq for seq in batch]
(right_pad, left_pad)

([[5, 4, 2, 2, 3, 0, 0], [13, 4, 14, 15, 3, 7, 16]],
 [[0, 0, 5, 4, 2, 2, 3], [13, 4, 14, 15, 3, 7, 16]])

***Attention masks***

In [33]:
x = torch.tensor(right_pad)
# 1.0 where the token is real, 0.0 where it is PAD.
pad_mask = x.ne(P).to(torch.float32)
T = x.shape[1]
# Lower-triangular ones: row i keeps columns j <= i.
causal = torch.ones(T, T).tril()
# Broadcasting the pad mask over rows gives one (T, T) mask per sequence:
# entry [b, i, j] is 1 only if j <= i and token j of sequence b is not PAD.
combined = pad_mask.unsqueeze(1) * causal
(pad_mask.shape, causal.shape, combined.shape)

(torch.Size([2, 7]), torch.Size([7, 7]), torch.Size([2, 7, 7]))

***Tiny NumPy implementation of scaled dot-product attention***

In [45]:
import numpy as np

def scaled_dot_product_attention(X, WQ, WK, WV, causal=True):
    """
    Single-head scaled dot-product attention in plain NumPy.

    X: (T, d_model) token embeddings, or (..., T, d_model) for a batch
    WQ: (d_model, d_k) query projection
    WK: (d_model, d_k) key projection
    WV: (d_model, d_v) value projection
    causal: when True, position i may only attend to positions j <= i

    Returns (Y, A):
    Y: (..., T, d_v) attention outputs
    A: (..., T, T) attention weights; every row sums to 1
    """

    # 1. build Q, K, V
    Q = X @ WQ # (..., T, d_k)
    K = X @ WK # (..., T, d_k)
    V = X @ WV # (..., T, d_v)

    # 2. Dot products between all queries and keys.
    #    Only the last two axes of K are transposed (K.T would reverse every
    #    axis and break batched input), and d_k is read from the last axis.
    dk = Q.shape[-1]
    S = (Q @ np.swapaxes(K, -1, -2)) / np.sqrt(dk) # (..., T, T)

    # 3. causal mask: prevent attending to future tokens
    if causal:
        T = S.shape[-1]
        # Large negative number (not -inf) strictly above the diagonal; after
        # the softmax those entries underflow to 0. The (T, T) mask broadcasts
        # over any leading batch axes.
        mask = np.triu(np.ones((T, T)), k=1) * -1e9
        S = S + mask

    # 4. Softmax along each row (last axis); subtracting the row max keeps
    #    np.exp from overflowing without changing the result.
    S_exp = np.exp(S - np.max(S, axis=-1, keepdims=True))
    A = S_exp / np.sum(S_exp, axis=-1, keepdims=True)

    # 5. Weighted sum of values
    Y = A @ V # (..., T, d_v)

    return Y, A # return outputs and attention weights

In [46]:
# Toy problem sizes: sequence length, input embedding size,
# query/key size, value size.
T, d_model, d_k, d_v = 4, 6, 4, 5

# Random toy data. The seed plus the draw order (X, WQ, WK, WV) fixes the values.
np.random.seed(0)
X = np.random.standard_normal((T, d_model))
WQ = np.random.standard_normal((d_model, d_k))
WK = np.random.standard_normal((d_model, d_k))
WV = np.random.standard_normal((d_model, d_v))

# causal=True by default, so A comes out lower-triangular.
Y, A = scaled_dot_product_attention(X, WQ, WK, WV)

print("Output Y:\n", Y)
print("|attention weights A:\n", A)


Output Y:
 [[ 0.47457463 -4.50608804  4.82442894 -1.24297684  4.05417016]
 [ 0.47531953 -4.50529114  4.82360616 -1.24222098  4.05263508]
 [ 0.4980276  -4.28557389  4.65295421 -1.13288223  3.86366691]
 [-0.97233489  2.97425006 -2.13287955 -3.93428412 -1.86363758]]
|attention weights A:
 [[1.00000000e+00 0.00000000e+00 0.00000000e+00 0.00000000e+00]
 [9.99767274e-01 2.32725656e-04 0.00000000e+00 0.00000000e+00]
 [9.43771823e-01 7.13238239e-05 5.61568536e-02 0.00000000e+00]
 [1.72126286e-03 5.60257549e-02 1.32270120e-01 8.09982862e-01]]


In [41]:
# Smaller variant of the toy problem: sequence length, input embedding size,
# query/key size, value size.
# NOTE(review): the saved output under this cell is a bare 6x5 array, which
# these print calls cannot produce -- it looks stale; re-run the cell.
T, d_model, d_k, d_v = 3, 6, 2, 2

# Random toy data. The seed plus the draw order (X, WQ, WK, WV) fixes the values.
np.random.seed(0)
X = np.random.standard_normal((T, d_model))
WQ = np.random.standard_normal((d_model, d_k))
WK = np.random.standard_normal((d_model, d_k))
WV = np.random.standard_normal((d_model, d_v))

# causal=True by default, so A comes out lower-triangular.
Y, A = scaled_dot_product_attention(X, WQ, WK, WV)

print("Output Y:\n", Y)
print("|attention weights A:\n", A)

array([[ 1.13940068, -1.23482582,  0.40234164, -0.68481009, -0.87079715],
       [-0.57884966, -0.31155253,  0.05616534, -1.16514984,  0.90082649],
       [ 0.46566244, -1.53624369,  1.48825219,  1.89588918,  1.17877957],
       [-0.17992484, -1.07075262,  1.05445173, -0.40317695,  1.22244507],
       [ 0.20827498,  0.97663904,  0.3563664 ,  0.70657317,  0.01050002],
       [ 1.78587049,  0.12691209,  0.40198936,  1.8831507 , -1.34775906]])