In [4]:
import numpy as np
def softmax(x):
     return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T
list1 = [2.0, 1.0, 1.0]
print(np.exp(list1))
print(softmax(list1))


[7.3890561  2.71828183 2.71828183]
[0.57611688 0.21194156 0.21194156]


In [3]:
print(np.sum(np.exp(list1), axis=-1))

12.82561975584874


# Python Functions Description

## softmax(x)

This function computes the softmax activation function over the input array `x`.

- It exponentiates each element of `x`.
- It sums along the last axis.
- It divides each element by the sum to normalize the values into a probability distribution.
- The transpose operations ensure correct broadcasting and alignment of dimensions.

## scaleddotproduct(q, k, v, mask=None)

This function computes the scaled dot-product attention mechanism commonly used in transformer-based architectures.

- It takes in three inputs: query (`q`), key (`k`), and value (`v`) matrices, along with an optional mask matrix (`mask`).
- It calculates the dot product of `q` and the transpose of `k`.
- It scales the result by the square root of the dimension of `k`.
- It applies the softmax function to obtain attention scores.
- If a mask is provided, it adds the mask to the scaled dot product.
- Finally, it computes the weighted sum of `v` based on the attention scores and returns the result along with the attention scores.


In [7]:
# L, dk and dv are having dimension query
L,dk,dv = 4,8,8
q = np.random.randn(L,dk)
k = np.random.randn(L,dk)
v = np.random.randn(L,dv)
print(q)
print(k)
print(v)

[[-0.05077099 -0.13968095  0.28794113 -0.11155697  0.75073963 -0.52329682
   1.04938617  1.04672942]
 [-0.54786664 -0.50825837 -0.69942501 -0.6401618   1.39375713 -1.50627131
  -0.30578947 -0.5676279 ]
 [ 0.64067017  0.79348516 -1.3860305   0.06075065  0.56191492  0.03487981
   0.26233773  0.7136653 ]
 [ 0.06370629  0.28620743  1.64315643  1.30902162  1.37355287  0.73041375
  -1.53309107 -0.79906111]]
[[-0.86570656  0.78788674 -1.80806456 -0.92054313  0.17036489 -0.30346231
   0.64347536  0.93605272]
 [-0.52105998 -0.0313016   1.34147804 -2.25087616 -1.35403948 -0.65355577
  -1.1342191  -0.36495378]
 [ 1.61557539 -0.96355223 -1.31860745 -0.88487251  0.82636694 -0.25747757
  -0.43899771 -0.29155016]
 [ 1.91897767  0.99055125 -0.36616923 -0.32582732  0.45342366 -0.79588718
  -0.8223103  -1.32627147]]
[[ 1.88377581  1.74823426  1.19604261  1.28779537 -0.93109576  0.13238352
  -1.21585038  0.89339695]
 [-0.5080888  -0.84069984  0.3165722   0.28480814 -0.13819108  0.18558206
   0.24996386  

In [5]:




def softmax(x):
     return (np.exp(x).T / np.sum(np.exp(x), axis=-1)).T
def scaleddotproduct(q,k,v,mask=None):
      dk = q.shape[-1]
      scaled = np.matmul(q, k.T) / np.sqrt(dk)
      if mask is not None:
           scaled = scaled + mask
      print(scaled)
      attention = softmax(scaled)
      print(attention)
      out = np.matmul(attention, v)
      return out, attention





In [12]:
values, attention = scaleddotproduct(q, k, v, mask=None)
print(values)
print(attention)

[[ 0.51538378 -0.55811038 -0.08454548 -0.63609788]
 [ 0.66969732  0.16095966  1.03685225  0.61693558]
 [ 1.21715534 -1.30687151  0.76950626  0.55434879]
 [-2.02505242 -0.38575158 -0.58154784  0.61500213]]
[[0.45313856 0.15488823 0.24870531 0.1432679 ]
 [0.25040806 0.15055887 0.36149448 0.23753859]
 [0.44749377 0.03586048 0.28600621 0.23063954]
 [0.04098185 0.21112071 0.17357916 0.57431828]]
[[ 0.8462015   0.94276141  0.65825374  0.89297512  0.03293829 -0.22626037
  -0.58643318  0.20988252]
 [ 0.47806371  0.71333222  0.40491278  0.81477456  0.47330919 -0.44448659
  -0.35912241 -0.04738871]
 [ 0.86003452  1.06161203  0.53397194  1.0350406   0.2048366  -0.40368168
  -0.58547501  0.20342478]
 [-0.31653878 -0.00677676 -0.48469504  1.3307802   0.83211977 -0.94803479
   0.20296573  0.1122782 ]]
[[0.45313856 0.15488823 0.24870531 0.1432679 ]
 [0.25040806 0.15055887 0.36149448 0.23753859]
 [0.44749377 0.03586048 0.28600621 0.23063954]
 [0.04098185 0.21112071 0.17357916 0.57431828]]


In [10]:
mask = np.tril(np.ones((L,L)))
print(mask)

[[1. 0. 0. 0.]
 [1. 1. 0. 0.]
 [1. 1. 1. 0.]
 [1. 1. 1. 1.]]


In [11]:
mask[mask==0]=-np.inf
mask[mask==1]=0
print(mask)

[[  0. -inf -inf -inf]
 [  0.   0. -inf -inf]
 [  0.   0.   0. -inf]
 [  0.   0.   0.   0.]]
