<a href="https://colab.research.google.com/github/yeabwang/tensorflow/blob/main/linear_algebra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import tensorflow as tf

In [None]:
# Matrix multiplication (not element-wise).
# The inner dimensions must agree: the number of columns of the first matrix
# has to equal the number of rows of the second matrix.
#   2x3 @ 3x2 -> valid   (result is 2x2)
#   2x3 @ 2x3 -> invalid (3 columns vs. 2 rows)
# For 3-D tensors the leading dimension is a batch dimension: the first matrix
# of the first tensor is multiplied by the first matrix of the second, and so on.
# a_is_sparse / b_is_sparse are hints that the corresponding (still dense)
# operand is mostly zeros, so TensorFlow can apply optimizations for that case.

x1 = tf.constant([[1, 2, 0],
                  [3, 5, -1]])      # shape (2, 3)
x2 = tf.constant([[1, 2, 0],
                  [3, 5, -1],
                  [3, 5, -1]])      # shape (3, 3)

# The @ operator is shorthand for tf.linalg.matmul.
print(x1 @ x2)

# Same product with the optional arguments spelled out at their default values.
# As the last expression of the cell, this result is displayed automatically.
tf.linalg.matmul(x1, x2, transpose_a=False, transpose_b=False, adjoint_a=False,
                 a_is_sparse=False, output_type=None, name=None)


tf.Tensor(
[[ 7 12 -2]
 [15 26 -4]], shape=(2, 3), dtype=int32)


<tf.Tensor: shape=(2, 3), dtype=int32, numpy=
array([[ 7, 12, -2],
       [15, 26, -4]], dtype=int32)>

In [None]:
# Transposing: swap the rows and columns of a matrix.
# Only the last expression of a cell is displayed automatically, so the first
# result is printed explicitly instead of being computed and thrown away.
print(tf.transpose(x1), "\n")
tf.transpose(x2)

<tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[ 1,  3,  3],
       [ 2,  5,  5],
       [ 0, -1, -1]], dtype=int32)>

In [22]:
# Adjoint = conjugate transpose. For a real-valued tensor such as x1 this is
# the same as the plain transpose.
x1_adjoint = tf.linalg.adjoint(x1)
x1_adjoint

<tf.Tensor: shape=(3, 2), dtype=int32, numpy=
array([[ 1,  3],
       [ 2,  5],
       [ 0, -1]], dtype=int32)>

In [34]:
# Band part: a band matrix keeps its non-zero entries in a band around the main
# diagonal; every entry outside that band is zero.
#   tf.linalg.band_part(input, num_lower, num_upper)
# The entry in row m, column n is kept when
#   (num_lower < 0 or m - n <= num_lower) and (num_upper < 0 or n - m <= num_upper)
# i.e. num_lower / num_upper are the number of sub- / super-diagonals to keep,
# and a negative value keeps the entire triangle on that side.
#
# Special cases
#   num_lower = 0,  num_upper = -1 -> upper triangular part
#   num_lower = -1, num_upper = 0  -> lower triangular part
#   num_lower = 0,  num_upper = 0  -> diagonal only

x = tf.constant([[1, 2, 3],
                 [4, 5, 6],
                 [7, 8, 9]])

print("Upper triangle = \n")
print(tf.linalg.band_part(x, 0, -1), "\n")

print("Lower triangle = \n")
print(tf.linalg.band_part(x, -1, 0), "\n")

# Diagonal only; as the last expression it becomes the cell's displayed output.
tf.linalg.band_part(x, 0, 0)


<tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[1, 0, 0],
       [0, 5, 0],
       [0, 0, 9]], dtype=int32)>

In [46]:
# Inverse of a square matrix (built as float32 for tf.linalg.inv).
tens_matrix = tf.constant(
    [[1, 2, 3],
     [0, 1, 4],
     [5, 6, 0]],
    dtype=tf.float32,
)
tensor_inverse = tf.linalg.inv(tens_matrix)

# A multiplied by its inverse gives the identity matrix, up to float32
# rounding error in the individual entries.
tf.linalg.matmul(tens_matrix, tensor_inverse)



<tf.Tensor: shape=(3, 3), dtype=float32, numpy=
array([[ 1.0000010e+00, -1.4305115e-06,  0.0000000e+00],
       [ 0.0000000e+00,  1.0000000e+00,  0.0000000e+00],
       [ 0.0000000e+00,  0.0000000e+00,  1.0000000e+00]], dtype=float32)>

In [47]:
# svd(singular value decomposition) -- A=UΣV^T
# The main goal here is to eliminate the parts that contain less important information
# Σ(s) - Singular values - represent the magnitude of the stretching (or shrinking) applied by A
      # To calculate the singular value
      # Transpose the matrix.
      # Multiply the matrix with its transpose.
      # Find the eigenvalues of the resulting matrix.
      # Take the square roots of those eigenvalues to get the singular values.

# U - Left singular vectors (eigenvectors of A * A Transpose)
      # (A*AT − λI)u = 0 --> (A * A Transpose - lambda * Identity) u = 0

# V - Right singular vectors (eigenvectors of A Transpose * A)
      # (AT*A − λI)v = 0 --> (A Transpose * A - lambda * Identity) v = 0



In [54]:
tens_matrix = tf.constant([[1, 2, 3],
                           [0, 1, 4],
                           [5, 6, 0]], dtype=tf.float32)

# Decompose once and keep the result. full_matrices=False and compute_uv=True
# are the default values; they are written out here only for clarity.
# Return order: singular values first, then U, then V.
# TensorFlow returns V itself (not V^T), so A = U @ diag(s) @ V^T.
s, u, v = tf.linalg.svd(tens_matrix, full_matrices=False, compute_uv=True)
print(s)
print(u)
print(v)

tf.Tensor([8.27884    4.8435745  0.02493798], shape=(3,), dtype=float32)
tf.Tensor(
[[ 0.33780327  0.5131884  -0.7890036 ]
 [ 0.19885956  0.78044283  0.5927598 ]
 [ 0.9199693  -0.3571372   0.16158365]], shape=(3, 3), dtype=float32)
tf.Tensor(
[[ 0.5964182  -0.26271883  0.7584618 ]
 [ 0.7723647  -0.06937096 -0.63137984]
 [ 0.21849047  0.9623757   0.16154054]], shape=(3, 3), dtype=float32)


In [60]:
# einsum - Einstein summation notation.
# tf.einsum describes many tensor operations (summation, contraction, index
# re-ordering) with one index string, which keeps them clean and efficient.

# Left operand
x1 = tf.constant([[2, 6, 4, 2],
                  [2, -2, 2, 3],
                  [1, 5, 4, 0]])
# Right operand
x2 = tf.constant([[2, 9, 0, 3, 0],
                  [3, 6, 8, -2, 2],
                  [1, 3, 5, 0, 1],
                  [3, 0, 2, 0, 5]])

print(x1.shape)
print(x2.shape, "\n")

# Reference result from the regular matrix product.
print("Multiplication with mul = \n")
x_mult = x1 @ x2
print(x_mult, "\n")

# 'ij,jk -> ik': the shared index j is summed over, i.e. a matrix product.
print("Multioplication with einsum = \n")
x_ein = tf.einsum('ij,jk -> ik', x1, x2)
print(x_ein)

(3, 4)
(4, 5) 

Multiplication with mul = 

tf.Tensor(
[[32 66 72 -6 26]
 [ 9 12  0 10 13]
 [21 51 60 -7 14]], shape=(3, 5), dtype=int32) 

Multioplication with einsum = 

tf.Tensor(
[[32 66 72 -6 26]
 [ 9 12  0 10 13]
 [21 51 60 -7 14]], shape=(3, 5), dtype=int32)


In [65]:
# Element-wise product with einsum - both operands must have the same shape.

x1 = tf.constant([[2, 6, 4, 2],
                  [2, -2, 2, 3],
                  [1, 5, 4, 0]])
x2 = tf.constant([[2, 9, 0, 3],
                  [3, 6, 8, -2],
                  [1, 3, 5, 0]])

print(x1.shape)
print(x2.shape, "\n")

# Reference result from the regular element-wise multiply.
print("Element wise multiplication = \n")
el_mul = tf.multiply(x1, x2)
print(el_mul, "\n")

# Every index also appears in the output, so nothing is summed away.
el_ein = tf.einsum('ij, ij -> ij', x1, x2)
print(el_ein)

(3, 4)
(3, 4) 

Element wise multiplication = 

tf.Tensor(
[[  4  54   0   6]
 [  6 -12  16  -6]
 [  1  15  20   0]], shape=(3, 4), dtype=int32) 

tf.Tensor(
[[  4  54   0   6]
 [  6 -12  16  -6]
 [  1  15  20   0]], shape=(3, 4), dtype=int32)


In [67]:
# Transposing a matrix with einsum.
x1 = tf.constant([[2, 6, 4, 2],
                  [2, -2, 2, 3],
                  [1, 5, 4, 0]])

# Reference result: swap axis 0 and axis 1 explicitly.
print("Matrix Transpose = \n")
el_trn = tf.transpose(x1, perm=[1, 0])
print(el_trn, "\n")

# 'ij -> ji' writes the two indices in swapped order in the output.
print("Transpose with ein = \n")
ein_trn = tf.einsum('ij -> ji', x1)
print(ein_trn, "\n")

Matrix Transpose = 

tf.Tensor(
[[ 2  2  1]
 [ 6 -2  5]
 [ 4  2  4]
 [ 2  3  0]], shape=(4, 3), dtype=int32) 

Transpose with ein = 

tf.Tensor(
[[ 2  2  1]
 [ 6 -2  5]
 [ 4  2  4]
 [ 2  3  0]], shape=(4, 3), dtype=int32) 



In [74]:
# einsum with 3-D tensors.
# For a batched operation the batch size of both tensors must be equal:
# a = (b, i, j) and x = (b, i, j) share the same b.
# The operation is then carried out batch by batch.

# Element-wise operation
x1 = tf.constant([
    [[2, 6, 4, 2], [2, -2, 2, 3], [1, 5, 4, 0]],
    [[2, 6, 4, 2], [2, -2, 2, 3], [1, 5, 4, 0]],
])
x2 = tf.constant([
    [[2, 9, 0, 3], [3, 6, 8, -2], [1, 3, 5, 0]],
    [[2, 9, 0, 3], [3, 6, 8, -2], [1, 3, 5, 0]],
])

# Reference result from the regular element-wise multiply.
print("3D multiplication with normal = \n")
threed_mul = tf.multiply(x1, x2)
print(threed_mul, "\n")

# All three indices are kept in the output, so nothing is summed away.
print("3D multiplication with einsum = \n")
three_einsum = tf.einsum('bij,bij -> bij', x1, x2)
print(three_einsum)

3D multiplication with normal = 

tf.Tensor(
[[[  4  54   0   6]
  [  6 -12  16  -6]
  [  1  15  20   0]]

 [[  4  54   0   6]
  [  6 -12  16  -6]
  [  1  15  20   0]]], shape=(2, 3, 4), dtype=int32) 

3D multiplication with einsum = 

tf.Tensor(
[[[  4  54   0   6]
  [  6 -12  16  -6]
  [  1  15  20   0]]

 [[  4  54   0   6]
  [  6 -12  16  -6]
  [  1  15  20   0]]], shape=(2, 3, 4), dtype=int32)


In [80]:
# Batched (3-D) matrix multiplication.
#   x1 has shape (b, i, j) and x2 has shape (b, j, k)
#   - b (batch dimension) must be equal in both tensors.
#   - j (third dimension of x1) must match the second dimension of x2.
# The result has shape (b, i, k),
# where i is the second dimension of x1 and k is the third dimension of x2.

# x1.shape = (2, 3, 4)
x1 = tf.constant([
    [[2, 6, 4, 2],
     [2, -2, 2, 3],
     [1, 5, 4, 0]],

    [[2, 6, 4, 2],
     [2, -2, 2, 3],
     [1, 5, 4, 0]],
])

# x2.shape = (2, 4, 4)
x2 = tf.constant([
    [[2, 9, 0, 3],
     [3, 6, 8, -2],
     [1, 3, 5, 0],
     [1, 3, 5, 0]],

    [[2, 9, 0, 3],
     [3, 6, 8, -2],
     [1, 3, 5, 0],
     [1, 3, 5, 0]],
])

# tf.linalg.matmul (equivalently x1 @ x2) multiplies the matrices batch by batch.
three_mx_mul = tf.linalg.matmul(x1, x2)
print("The matrix multiplication result will be \n", three_mx_mul, "\n")

# Same product in einsum notation: keep the batch index b, sum over the shared index j.
three_einsum = tf.einsum('bij,bjk -> bik', x1, x2)
print("The einsum matrix multiplication result will be \n", three_einsum)

The matrix multiplication result will be 
 tf.Tensor(
[[[28 72 78 -6]
  [ 3 21  9 10]
  [21 51 60 -7]]

 [[28 72 78 -6]
  [ 3 21  9 10]
  [21 51 60 -7]]], shape=(2, 3, 4), dtype=int32) 

The einsum matrix multiplication result will be 
 tf.Tensor(
[[[28 72 78 -6]
  [ 3 21  9 10]
  [21 51 60 -7]]

 [[28 72 78 -6]
  [ 3 21  9 10]
  [21 51 60 -7]]], shape=(2, 3, 4), dtype=int32)


In [88]:
# Reducing 3-D tensors with einsum.
# Every index that is missing from the output is summed over. These expressions
# take two operands, so each one first forms the product
# x1[b, i, j] * x2[b, i, k] and then sums it over the dropped indices.
# This is NOT a plain sum of the elements of one tensor; that would be
# written with a single operand, e.g. tf.einsum('bij -> j', x1).

# x1.shape = (2, 3, 4)
x1 = tf.constant([
    [[2, 6, 4, 2],
     [2, -2, 2, 3],
     [1, 5, 4, 0]],

    [[2, 6, 4, 2],
     [2, -2, 2, 3],
     [1, 5, 4, 0]],
])

# x2.shape = (2, 3, 4)
x2 = tf.constant([
    [[2, 9, 0, 3],
     [3, 6, 8, -2],
     [1, 3, 5, 0]],

    [[2, 9, 0, 3],
     [3, 6, 8, -2],
     [1, 3, 5, 0]],
])

# Keep j (the column index of x1); sum the product over b, i and k.
eins_sum = tf.einsum('bij,bik -> j', x1, x2)
print("Column wise addition", eins_sum, "\n")

# Keep i (the row index); sum the product over b, j and k.
eins_sum = tf.einsum('bij,bik -> i', x1, x2)
print("Row wise addition", eins_sum, "\n")

# Keep b (the batch index); sum the product over i, j and k.
eins_sum = tf.einsum('bij, bik -> b', x1, x2)
print("Batch wise addition", eins_sum, "\n")

Column wise addition tf.Tensor([134 198 244 146], shape=(4,), dtype=int32) 

Row wise addition tf.Tensor([392 150 180], shape=(3,), dtype=int32) 

Batch wise addition tf.Tensor([361 361], shape=(2,), dtype=int32) 



In [90]:
# Attention Mechanism

# '''

# batch_size: This is the number of samples in a single batch.
# s_q, s_k, s_d, s_v: the sequence length (number of tokens) of Q, K, D and V respectively.
# model_size: This is the size of the embedding or feature vector for each token in the sequence.  (e.g., 512, 1024, etc.).

# Query
# Q = batchsize,s_q,modelsize  - What are we asking? - Queries are used to compare against keys to compute attention scores.
# # The attention mechanism calculates how much focus (or weight) each key should have in relation to each query.

# Key
# K = batchsize,s_k,modelsize # Key - What can we compare to? - Keys are used to compute the attention scores in relation to queries.
# # Each key vector corresponds to a token in the sequence and is compared with the queries to determine how much attention a query should pay to a particular key.

# Dimensionality
# D = batchsize,s_d,modelsize # Dimensionality of model size
# # It defines the number of features (or dimensions) used to represent each token in the sequence. Common values are 512, 1024, etc., depending on the model architecture.

# Value
# V = batchsize,s_v,modelsize # Value - the actual data that will be used to produce the output after attention.
# # Values hold the actual content that the model needs to process after the attention mechanism has determined which parts of the input are most important.

# Why Was It Needed?
# Before attention mechanisms, models like RNNs and LSTMs would process input sequences sequentially and remember information through hidden states. However, they often struggled with long sequences because the model had to compress all information into a single fixed-length vector (the context vector) before making predictions. This led to information loss, especially for long sentences.

# Problems with this approach:

# Vanishing Gradient: In long sequences, important details from earlier parts of the sequence could be lost.
# Fixed-size context: Compressing the entire input into one vector means losing nuanced or complex dependencies in longer sequences.

# How the Attention Mechanism Solves It:
# The attention mechanism allows the model to dynamically focus on the relevant parts of the input sequence, in parallel, at each step of the output generation, instead of relying on a fixed-size context vector.
# It computes attention scores to determine which tokens (words) in the input are most important for producing each output.

# How it works:
# Query (Q): What are we trying to focus on?
# Key (K): What information from the input could be relevant?
# Value (V): What actual content do we want to use for the output?
# By comparing Q (query) with K (key), we get an attention score, which tells the model how much focus (weight) to give to each part of the input when computing the output.

# '''

In [None]:
# Random stand-ins for the attention inputs, laid out as
# (batch_size, sequence_length, model_size).
Q = tf.random.normal((32, 64, 512))
K = tf.random.normal((32, 128, 512))
V = tf.random.uniform((32, 200, 512))
D = tf.random.uniform((32, 512, 512))

# Raw attention scores: for every batch b, the dot product over the model
# dimension m between each query position q and each key position k.
# The contraction is computed once and reused for both the shape and the values.
atten_e = tf.einsum('bqm, bkm -> bqk', Q, K)
atten_e_shape = atten_e.shape   # (batch, s_q, s_k)
print(atten_e_shape)
print(atten_e)

In [16]:
# Batched B^T @ A over three leading batch dimensions (b, c, d).
A = tf.random.normal((2, 4, 6, 4, 2))  # indices: b c d i j
B = tf.random.normal((2, 4, 6, 4, 1))  # indices: b c d i k
print(A.shape)
print(B.shape)
# B^T * A: the shared index i is summed over, which leaves one (k, j) matrix
# for every (b, c, d) entry.
C = tf.einsum('bcdik, bcdij-> bcdkj', B, A)
print(C)
print(C.shape)

(2, 4, 6, 4, 2)
(2, 4, 6, 4, 1)
tf.Tensor(
[[[[[ 2.406842   -0.04552498]]

   [[-0.31809473  0.9641504 ]]

   [[ 1.116933    0.2236335 ]]

   [[-0.3253295   0.6036027 ]]

   [[-0.75789577 -1.5392165 ]]

   [[ 0.05710427  1.588268  ]]]


  [[[-2.2975512  -2.1292772 ]]

   [[ 1.8588835  -0.6293715 ]]

   [[-0.0529765  -0.18849061]]

   [[-2.3053195   2.959392  ]]

   [[ 1.4402726   0.4412862 ]]

   [[ 0.22686625 -2.5896754 ]]]


  [[[-1.7841402  -0.4344794 ]]

   [[-1.4639571   0.08281344]]

   [[-0.57325864 -2.0132964 ]]

   [[ 0.06471872  1.133959  ]]

   [[ 1.5523272  -0.6473383 ]]

   [[-0.17417105 -1.9038916 ]]]


  [[[ 1.9693708   0.08062457]]

   [[-3.432708    2.900423  ]]

   [[-0.49154678  1.2095435 ]]

   [[-1.6264522  -1.0453346 ]]

   [[ 1.460628   -3.0688553 ]]

   [[ 2.3701985   2.0919256 ]]]]



 [[[[ 2.10514    -1.5235132 ]]

   [[-1.9733005  -1.339136  ]]

   [[-0.88734376  2.937897  ]]

   [[ 0.58056843  1.1942999 ]]

   [[-1.8686435  -2.9539752 ]]

   [[ 1.1370962  -0