# SELF-ATTENTION

## Self Attention

$$
\text{self attention} = \text{softmax}\left(\frac{QK^T}{\sqrt{d_k}} + M\right)
$$

$$
\text{new V} = \text{self attention} \cdot V
$$ 

In [1]:
import numpy as np
# Toy dimensions used throughout the walkthrough:
#   L   -- sequence length; the example sentence "My name is Akshata" has 4 words
#   d_k -- size of each query/key vector (8 is an arbitrary choice)
#   d_v -- size of each value vector (8 is an arbitrary choice)
L = 4
d_k = 8
d_v = 8

In [2]:
# Draw toy query, key and value matrices from a standard normal distribution.
# A fixed seed makes Restart Kernel -> Run All reproduce the same numbers
# (outputs recorded before the seed was introduced will differ until re-run).
SEED = 42
rng = np.random.default_rng(SEED)

q = rng.standard_normal((L, d_k))  # one d_k-sized query per position
k = rng.standard_normal((L, d_k))  # one d_k-sized key per position
v = rng.standard_normal((L, d_v))  # one d_v-sized value per position

In [3]:
# Show the value matrix: one row per position, d_v columns.
print("v\n",v)

v
 [[-1.45501613  1.12104651  0.1601064  -0.64056377 -1.44402873  0.79275413
   1.72701777 -0.18697646]
 [-0.0090312  -0.85755707  0.25167518 -1.44080247 -0.88817109  0.61161258
   0.17531728  0.1869186 ]
 [-0.28668059 -0.42184698 -2.36067457 -2.42037027  0.90825863  0.39698552
  -0.43702076  0.41276585]
 [-1.38236774 -1.53473945  0.76468046 -1.05179959  1.63642933 -0.06838755
   0.58329393  0.53298586]]


In [4]:
# Raw (unscaled) attention scores: entry [i, j] is the dot product of query i with key j.
q @ k.T

array([[ 0.50996634,  0.78673348,  1.64510885,  3.10856557],
       [ 3.98074989,  0.75993506,  2.19098897, -1.59813831],
       [-0.85928699,  0.33467945,  0.91460177, -0.428045  ],
       [ 0.63647349,  4.17020268,  1.08279774,  7.43765164]])

In [5]:
# Variance of the queries, of the keys, and of their raw dot products.
# The products have a visibly larger variance than q and k themselves,
# which is what the 1/sqrt(d_k) scaling in the next step compensates for.
np.var(q), np.var(k), np.var(q @ k.T)

(0.990742029467248, 0.9085049295453562, 4.696035782038038)

In [6]:
import math
# Scale the raw scores by 1/sqrt(d_k) to bring their variance back down.
scaled = (q @ k.T) / math.sqrt(d_k)

In [7]:
# Variance of the scores after dividing by sqrt(d_k).
scaled.var()

0.5870044727547546

In [8]:
# Lower-triangular matrix of ones: position i may look at positions 0..i only.
mask = np.tril(np.ones(shape=(L, L), dtype=float))

In [9]:
# Scaled scores before any masking is applied.
scaled

array([[ 0.18030033,  0.27815229,  0.58163381,  1.0990439 ],
       [ 1.40740762,  0.26867762,  0.77463158, -0.56502722],
       [-0.30380383,  0.11832706,  0.32336056, -0.15133676],
       [ 0.22502736,  1.4743893 ,  0.38282681,  2.62960696]])

In [10]:
# Display the mask: row i has entries for columns 0..i, the rest are blocked.
mask

array([[1., 0., 0., 0.],
       [1., 1., 0., 0.],
       [1., 1., 1., 0.],
       [1., 1., 1., 1.]])

In [11]:
# Convert the 0/1 lower-triangular pattern into an additive mask:
# allowed positions become 0 (score unchanged), blocked positions become -inf
# (softmax weight of exactly 0).
# The pattern is rebuilt from scratch here so the cell is idempotent; mutating
# the existing mask in place turns every entry into -inf when the cell is re-run.
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
mask = np.tril(np.ones((L, L)))
mask[mask == 0] = -np.inf
mask[mask == 1] = 0

In [12]:
# Adding the mask leaves allowed scores untouched and pushes blocked ones to -inf.
scaled+mask

array([[ 0.18030033,        -inf,        -inf,        -inf],
       [ 1.40740762,  0.26867762,        -inf,        -inf],
       [-0.30380383,  0.11832706,  0.32336056,        -inf],
       [ 0.22502736,  1.4743893 ,  0.38282681,  2.62960696]])

In [13]:
def softmax(x, axis=-1):
    """Numerically stable softmax along one axis.

    Args:
        x: Array-like of scores. Entries equal to -inf receive weight 0.
        axis: Axis along which the outputs sum to 1 (default: last axis).

    Returns:
        ndarray of the same shape as ``x`` whose entries along ``axis`` are
        non-negative and sum to 1. A slice consisting only of -inf yields nan,
        because there is nothing left to normalise.
    """
    x = np.asarray(x)
    # Subtracting the per-slice maximum does not change the result
    # mathematically, but keeps np.exp from overflowing to inf (inf/inf -> nan)
    # when the scores are large.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / np.sum(e_x, axis=axis, keepdims=True)

In [14]:
# Attention weights: each row is normalised to sum to 1, and positions that were
# masked with -inf end up with weight 0.
print(softmax(scaled+mask))

[[1.         0.         0.         0.        ]
 [0.75744639 0.24255361 0.         0.        ]
 [0.22740183 0.34683511 0.42576306 0.        ]
 [0.05976278 0.20845956 0.06997812 0.66179954]]


In [15]:
# The last axis of q is the query/key dimension d_k; attention() reads it the same way.
q.shape[-1]

8

In [16]:
def attention(q, k, v, mask=None):
    """Scaled dot-product attention.

    Computes ``softmax(q @ k^T / sqrt(d_k) + mask) @ v``.

    Args:
        q: Queries, shape ``(..., L_q, d_k)``.
        k: Keys, shape ``(..., L_k, d_k)``.
        v: Values, shape ``(..., L_k, d_v)``.
        mask: Optional additive mask broadcastable to the score matrix
            ``(..., L_q, L_k)``; use 0 to keep a position and -inf to block it.

    Returns:
        Tuple ``(out, weights)``: the attended values ``(..., L_q, d_v)`` and
        the attention weights ``(..., L_q, L_k)``.
    """
    d_k = q.shape[-1]

    # Swap only the last two axes: a plain `.T` reverses *every* axis and so
    # breaks batched (3-D or higher) inputs. 1-D keys are passed through
    # unchanged, matching what `.T` does for them.
    k_t = np.swapaxes(k, -1, -2) if np.ndim(k) > 1 else k

    scores = np.matmul(q, k_t) / math.sqrt(d_k)
    if mask is not None:
        scores = scores + mask

    # Named `weights` rather than `attention` to avoid shadowing this function.
    weights = softmax(scores)
    out = np.matmul(weights, v)

    return out, weights

In [17]:
import torch
import random

def early_stopping(train_fn, val_fn, patience=3, max_epochs=20):
    """Run a training loop that halts once validation loss stops improving.

    Each epoch calls ``train_fn()`` then ``val_fn()`` and prints both losses.
    The loop ends early when the validation loss has failed to beat its best
    value for ``patience`` consecutive epochs.

    Args:
        train_fn: Zero-argument callable returning the epoch's training loss.
        val_fn: Zero-argument callable returning the epoch's validation loss.
        patience: Consecutive non-improving epochs tolerated before stopping.
        max_epochs: Upper bound on the number of epochs (default 20).

    Returns:
        None. Progress is reported via ``print``.
    """
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(max_epochs):
        train_loss = train_fn()
        val_loss = val_fn()

        print(f"Epoch {epoch}: Train={train_loss:.3f}, Val={val_loss:.3f}")

        if val_loss < best_val_loss:
            # New best: remember it and start counting from zero again.
            best_val_loss = val_loss
            patience_counter = 0
            continue

        patience_counter += 1
        if patience_counter >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

# Stand-ins for a real training loop: each call simply draws a random loss.
def train_fn():
    """Return a simulated training loss drawn uniformly from [0.3, 0.7]."""
    return random.uniform(0.3, 0.7)


def val_fn():
    """Return a simulated validation loss drawn uniformly from [0.2, 0.8]."""
    return random.uniform(0.2, 0.8)

# Demo run: stops once the validation loss has failed to improve for 2 consecutive
# epochs, or when the loop's epoch cap is reached. The losses are random, so the
# stopping epoch varies between runs.
early_stopping(train_fn, val_fn, patience=2)


Epoch 0: Train=0.457, Val=0.300
Epoch 1: Train=0.647, Val=0.378
Epoch 2: Train=0.628, Val=0.346
Early stopping at epoch 2
