In [1]:
import numpy as np
import tensorflow as tf

### Similarity Scores

In [4]:
# Two-vector example: a pair of nearly parallel 3-D inputs.
v1 = np.array([1.0, 2.0, 3.0])
v2 = np.array([1.0, 2.0, 3.5])

# Experiment: replace v2 with one of the alternatives below and re-run the cell
# to see how the cosine similarity responds.
# v2 = v1                                   # identical vector
# v2 = v1 * -1                              # opposite vector
# v2 = np.array([0, -42, 1], dtype=float)   # arbitrary example

print("-- Input --")
print("V1 : ", v1)
print("V2 : ", v2, "\n")

def cosine_similarity(v1, v2):
  """Return the cosine similarity of two vectors as a scalar tensor.

  Computes sum(v1 * v2) / sqrt(sum(v1 * v1) * sum(v2 * v2)), where the sums
  run over every element of the element-wise products.

  Args:
    v1: First vector of floats (NumPy array or tensor).
    v2: Second vector of floats; it is multiplied element-wise with `v1`.

  Returns:
    A scalar `tf.Tensor`. When either input has zero norm the denominator is
    0; in that case 0.0 is returned rather than NaN.
  """
  numerator = tf.math.reduce_sum(v1 * v2)
  denominator = tf.math.sqrt(tf.math.reduce_sum(v1 * v1) * tf.math.reduce_sum(v2 * v2))
  # A plain division gives 0/0 = NaN for a zero vector, which would then
  # propagate through any similarity matrix built from this function.
  return tf.math.divide_no_nan(numerator, denominator)

# Report the similarity score of the two input vectors.
print("-- Outputs --")
similarity = cosine_similarity(v1, v2)
print("Cosine similarity : ", similarity)

-- Input --
V1 :  [1. 2. 3.]
V2 :  [-1. -2. -3.] 

-- Outputs --
Cosine similarity :  tf.Tensor(-1.0, shape=(), dtype=float64)


### Two Batches of Vectors

In [11]:
# Two batches of vectors example
# Input data

# v1 : a batch of four 3-D vectors, one vector per row:
#   [[ 1.,  2.,  3.],
#    [ 9.,  8.,  7.],
#    [-1., -4., -2.],
#    [ 1., -7.,  2.]]
v1_1 = np.array([1.0, 2.0, 3.0])
v1_2 = np.array([9.0, 8.0, 7.0])
v1_3 = np.array([-1.0, -4.0, -2.0])
v1_4 = np.array([1.0, -7.0, 2.0])
v1 = np.vstack([v1_1, v1_2, v1_3, v1_4])

# v2 : each row of v1 plus Gaussian noise, giving an approximate duplicate of
# the matching v1 row. The generator is seeded so that a fresh
# "Restart Kernel -> Run All" reproduces exactly the same v2 (and therefore
# the same similarity scores further down).
NOISE_SEED = 42
NOISE_STD = 2  # standard deviation of the noise added to every component
noise_rng = np.random.default_rng(NOISE_SEED)
v2_1 = v1_1 + noise_rng.normal(0, NOISE_STD, 3)
v2_2 = v1_2 + noise_rng.normal(0, NOISE_STD, 3)
v2_3 = v1_3 + noise_rng.normal(0, NOISE_STD, 3)
v2_4 = v1_4 + noise_rng.normal(0, NOISE_STD, 3)
v2 = np.vstack([v2_1, v2_2, v2_3, v2_4])

print("-- Input --")
print(f"v1 : \n{v1}\n")
print(f"v2 : \n{v2}\n")

# Both batches are expected to hold the same number of vectors.
b = v1.shape[0]
print(f"Batch sizes match : {b == len(v2)}\n")

# Similarity scores
# Option 1 : score every (v2 row, v1 row) pair one at a time with the
# cosine similarity function; entry [row, col] compares v2[row] with v1[col].
sim_1 = np.zeros((b, b))
for row, col in np.ndindex(*sim_1.shape):
  sim_1[row, col] = cosine_similarity(v2[row], v1[col]).numpy()

print("-- Outputs --")
print("Option 1 : loop")
print(sim_1)

-- Input --
v1 : 
[[ 1.  2.  3.]
 [ 9.  8.  7.]
 [-1. -4. -2.]
 [ 1. -7.  2.]]

v2 : 
[[-2.92852677  1.60778267  3.41559857]
 [ 6.15808817  7.36827865  5.78058324]
 [-3.76842578 -2.95374122 -1.0572413 ]
 [-0.81672465 -6.6080942   3.82946569]]

Batch sizes match : True

-- Outputs --
Option 1 : loop
[[ 0.58924083  0.15650085 -0.47197698 -0.20939565]
 [ 0.91173463  0.99178717 -0.91879297 -0.41108239]
 [-0.70026323 -0.95094808  0.78761446  0.41055358]
 [-0.08853597 -0.312278    0.55655378  0.94073422]]


In [12]:
# Option 2 : vector normalization and dot product
def norm(x):
  """L2-normalize `x` along axis 1 with TensorFlow's built-in routine.

  For a 2-D batch this rescales every row to unit length.
  """
  unit_rows = tf.math.l2_normalize(x, axis=1)
  return unit_rows

# With both batches normalized, one matrix product of v2 against v1 transposed
# yields every pairwise cosine similarity at once.
v2_unit = norm(v2)
v1_unit = norm(v1)
sim_2 = tf.linalg.matmul(v2_unit, v1_unit, transpose_b=True)

print("-- Outputs --")
print("Option 2 : vector normalization and dot product")
print(sim_2, "\n")

# Check that the vectorized result agrees with the nested-loop result.
outputs_match = np.allclose(sim_1, sim_2)
print(f"Outputs are the same : {outputs_match}")

-- Outputs --
Option 2 : vector normalization and dot product
tf.Tensor(
[[ 0.58924083  0.15650085 -0.47197698 -0.20939565]
 [ 0.91173463  0.99178717 -0.91879297 -0.41108239]
 [-0.70026323 -0.95094808  0.78761446  0.41055358]
 [-0.08853597 -0.312278    0.55655378  0.94073422]], shape=(4, 4), dtype=float64) 

Outputs are the same : True


### Hard Negative Mining
$$\mathcal{L}_1 = \max\left(\mathrm{mean\_neg} - s(A, P) + \alpha,\ 0\right)$$

$$\mathcal{L}_2 = \max\left(\mathrm{closest\_neg} - s(A, P) + \alpha,\ 0\right)$$