# Implementation using the cross-entropy loss (as in n_pairs_loss)

In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

In [40]:
import time

class SupervisedContrastiveLoss(keras.losses.Loss):
    """Supervised contrastive loss over two views of the same batch.

    Adapted from https://github.com/HobbitLong/SupContrast/blob/master/losses.py.
    The difference from that reference is the numerical-stability shift: the
    row maximum is taken over the off-diagonal (non-self) logits only.

    Args:
        temperature: Divisor applied to the cosine similarities.
        name: Optional name forwarded to ``keras.losses.Loss``.
    """

    def __init__(self, temperature=1, name=None):
        super().__init__(reduction=tf.keras.losses.Reduction.NONE, name=name)
        self.temperature = temperature

    def __call__(self, y_true, feature_vectors1, feature_vectors2, sample_weight=None):
        """Compute the loss for two sets of embeddings sharing the same labels.

        Args:
            y_true: Labels, one per sample, shaped ``(batch_size,)`` or
                ``(batch_size, 1)``. They are cast to the logits dtype before
                being compared for equality.
            feature_vectors1: Embeddings of the first view, ``(batch_size, dim)``.
            feature_vectors2: Embeddings of the second view, ``(batch_size, dim)``.
            sample_weight: Accepted for signature compatibility; not used.

        Returns:
            A scalar tensor: the mean over all ``2 * batch_size`` anchors of
            the negated average log-probability of their positives.

        Note:
            Wall-clock timings of each stage are printed on every call.
        """
        t = time.time()
        feature_vectors = tf.concat([feature_vectors1, feature_vectors2], axis=0)
        print("contcatenation", time.time() - t)

        t = time.time()
        # Cosine similarity between every pair of samples, scaled by temperature.
        feature_vectors_normalized = tf.math.l2_normalize(feature_vectors, axis=1)
        logits = tf.divide(
            tf.matmul(
                feature_vectors_normalized,
                feature_vectors_normalized,
                transpose_b=True,
            ),
            self.temperature,
        )
        print("similarity computation", time.time() - t)

        t = time.time()
        # Force a column vector: transposing a rank-1 tensor is a no-op, so
        # without this reshape the equality below would compare each label with
        # itself and every pair would be flagged as similar.
        labels = tf.reshape(tf.cast(y_true, logits.dtype), [-1, 1])
        # Labels are duplicated because both views share them: [2*batch_size, 1].
        labels = tf.concat([labels, labels], axis=0)
        is_similar = tf.cast(tf.equal(labels, tf.transpose(labels)), logits.dtype)
        print("label computation", time.time() - t)

        t = time.time()
        # Mask for self-contrasting terms: 1 everywhere, 0 on the diagonal.
        # Built from the dynamic shape so an unknown batch size is supported.
        mask_logits = tf.ones_like(logits) - tf.eye(
            tf.shape(logits)[0], dtype=logits.dtype
        )
        not_self = mask_logits > 0
        is_similar = is_similar * mask_logits
        print("mask computation", time.time() - t)

        t = time.time()
        # For numerical stability, shift by the largest non-self logit of each
        # row. The diagonal is replaced by the lowest representable value
        # rather than multiplied by zero, so it can never win the max.
        lowest = tf.ones_like(logits) * logits.dtype.min
        max_logits = tf.math.reduce_max(
            tf.where(not_self, logits, lowest), axis=1, keepdims=True
        )
        logits = logits - tf.stop_gradient(max_logits)
        print("numerical stability computation", time.time() - t)

        t = time.time()
        # Exclude the diagonal before exponentiating: after the shift the self
        # term may be large and positive, and 0 * exp(large) would yield NaN.
        exp_logits = tf.math.exp(tf.where(not_self, logits, lowest))
        log_softmax = logits - tf.math.log(
            tf.reduce_sum(exp_logits, axis=1, keepdims=True)
        )

        # Mean log-likelihood over each anchor's positives. Every anchor has at
        # least one positive (its other view), so the denominator is >= 1.
        positives_per_anchor = tf.reduce_sum(is_similar, axis=1)
        loss = tf.reduce_sum(is_similar * log_softmax, axis=1) / positives_per_anchor
        loss = -tf.math.reduce_mean(loss)
        print("loss computation", time.time() - t)

        return loss

    
class SupervisedContrastiveLoss2(keras.losses.Loss):
    """Supervised contrastive loss over two views of the same batch.

    Adapted from https://github.com/HobbitLong/SupContrast/blob/master/losses.py.
    In this variant the numerical-stability shift uses the row maximum over
    all logits, including the self-similarity on the diagonal.

    Args:
        temperature: Divisor applied to the cosine similarities.
        name: Optional name forwarded to ``keras.losses.Loss``.
    """

    def __init__(self, temperature=1, name=None):
        super().__init__(reduction=tf.keras.losses.Reduction.NONE, name=name)
        self.temperature = temperature

    def __call__(self, y_true, feature_vectors1, feature_vectors2, sample_weight=None):
        """Compute the loss for two sets of embeddings sharing the same labels.

        Args:
            y_true: Labels, one per sample, shaped ``(batch_size,)`` or
                ``(batch_size, 1)``. They are cast to the logits dtype before
                being compared for equality.
            feature_vectors1: Embeddings of the first view, ``(batch_size, dim)``.
            feature_vectors2: Embeddings of the second view, ``(batch_size, dim)``.
            sample_weight: Accepted for signature compatibility; not used.

        Returns:
            A scalar tensor: the mean over all ``2 * batch_size`` anchors of
            the negated average log-probability of their positives.

        Note:
            Wall-clock timings of each stage are printed on every call.
        """
        t = time.time()
        feature_vectors = tf.concat([feature_vectors1, feature_vectors2], axis=0)
        print("contcatenation", time.time() - t)

        t = time.time()
        # Cosine similarity between every pair of samples, scaled by temperature.
        feature_vectors_normalized = tf.math.l2_normalize(feature_vectors, axis=1)
        logits = tf.divide(
            tf.matmul(
                feature_vectors_normalized,
                feature_vectors_normalized,
                transpose_b=True,
            ),
            self.temperature,
        )
        print("similarity computation", time.time() - t)

        t = time.time()
        # Force a column vector: transposing a rank-1 tensor is a no-op, so
        # without this reshape the equality below would compare each label with
        # itself and every pair would be flagged as similar.
        labels = tf.reshape(tf.cast(y_true, logits.dtype), [-1, 1])
        # Labels are duplicated because both views share them: [2*batch_size, 1].
        labels = tf.concat([labels, labels], axis=0)
        is_similar = tf.cast(tf.equal(labels, tf.transpose(labels)), logits.dtype)
        print("label computation", time.time() - t)

        t = time.time()
        # Mask for self-contrasting terms: 1 everywhere, 0 on the diagonal.
        # Built from the dynamic shape so an unknown batch size is supported.
        mask_logits = tf.ones_like(logits) - tf.eye(
            tf.shape(logits)[0], dtype=logits.dtype
        )
        is_similar = is_similar * mask_logits
        print("mask computation", time.time() - t)

        t = time.time()
        # For numerical stability, shift each row by its maximum over all
        # entries (diagonal included), so every shifted logit is <= 0.
        max_logits = tf.math.reduce_max(logits, axis=1, keepdims=True)
        logits = logits - tf.stop_gradient(max_logits)
        print("numerical stability computation", time.time() - t)

        t = time.time()
        # The self term is removed from the softmax denominator by the mask.
        log_softmax = logits - tf.math.log(
            tf.reduce_sum(mask_logits * tf.math.exp(logits), axis=1, keepdims=True)
        )

        # Mean log-likelihood over each anchor's positives. Every anchor has at
        # least one positive (its other view), so the denominator is >= 1.
        positives_per_anchor = tf.reduce_sum(is_similar, axis=1)
        loss = tf.reduce_sum(is_similar * log_softmax, axis=1) / positives_per_anchor
        loss = -tf.math.reduce_mean(loss)
        print("loss computation", time.time() - t)

        return loss
               
# Check that both loss variants produce the same gradient with respect to the
# first set of embeddings when given identical inputs.
size = 128
temperature = 0.1

contrastive_loss = SupervisedContrastiveLoss(temperature=temperature)
contrastive_loss2 = SupervisedContrastiveLoss2(temperature=temperature)

embeding1 = tf.convert_to_tensor(np.random.randint(0, 2, (size, 1048)), dtype=tf.float64)
embeding2 = tf.convert_to_tensor(np.random.randint(0, 2, (size, 1048)), dtype=tf.float64)

# Labels are generated once and shared by both runs so the comparison is made
# on identical inputs. They are a (size, 1) column so that comparing them with
# their transpose inside the loss yields a full pairwise similarity matrix.
y_true = tf.convert_to_tensor(np.random.randint(0, 2, (size, 1)))

with tf.GradientTape() as tape:
    # embeding1 is a plain tensor, not a Variable, so it must be watched.
    tape.watch(embeding1)
    loss = contrastive_loss(y_true, embeding1, embeding2)

gradients = tape.gradient(loss, embeding1)

with tf.GradientTape() as tape:
    tape.watch(embeding1)
    loss = contrastive_loss2(y_true, embeding1, embeding2)

gradients2 = tape.gradient(loss, embeding1)

# Largest element-wise relative difference between the two gradients.
np.max(np.abs(((gradients2 - gradients) / gradients).numpy()))



contcatenation 0.0
similarity computation 0.0
label computation 0.0
mask computation 0.01604318618774414
numerical stability computation 0.0
loss computation 0.0
contcatenation 0.0
similarity computation 0.0
label computation 0.03124690055847168
mask computation 0.0
numerical stability computation 0.0
loss computation 0.0


6.859067013451332e-11