# ARCFace


This is a simply guide that helped me to understand how arcface (https://arxiv.org/abs/1801.07698) and sphere-face (https://arxiv.org/abs/1704.08063) works.

This guide will gently moves from softmax to arcface and sphere face.
To better visualize what's going on MNIST dataset is used and a very simple architecture is used as a backbone.

Have fun!


In [1]:
%matplotlib widget

# Checking if everything is alright with the GPU
import tensorflow as tf
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ["TF_DETERMINISTIC_OPS"] = "1"
from plot import plot_scatter

print(tf.test.gpu_device_name())

/device:GPU:0


In [2]:
# Simple architecture used in our experiments

def backbone(inputs, include_top=False, n_classes=10, training=None):
    tf.random.set_seed(0)

    x = tf.keras.layers.Conv2D(64, 3, padding="same", activation="relu", name="conv1")(inputs)
    x = tf.keras.layers.MaxPool2D((2,2), name="maxp1")(x)
    x = tf.keras.layers.BatchNormalization(name="batch_norm1")(x, training=training)
    x = tf.keras.layers.Conv2D(32, 3, padding="same", activation="relu", name="conv2")(x)
    x = tf.keras.layers.MaxPool2D((2,2), name="maxp2")(x)
    x = tf.keras.layers.BatchNormalization(name="batch_norm2")(x, training=training)
    x = tf.keras.layers.Flatten(input_shape=x.shape[1:])(x)
    x = tf.keras.layers.Dense(128, activation='relu', name="fc1")(x)
    x = tf.keras.layers.BatchNormalization(name="batch_norm3")(x, training=training)
    x = tf.keras.layers.Dense(20, activation=None, name="embeddings")(x)
    x = tf.keras.layers.Dropout(0.2)(x, training=training)
    if include_top:
        x = tf.keras.layers.Dense(n_classes, name="hot", activation=None)(x)
    return x


# From Softmax cross entropy to ArcFace

One of the work horses of Machine Learning is the Softmax + Cross Entropy loss.
The softmax is defined as:


$$\text{soft}(x_i) = \frac{ \text{exp}(Wx_i + b) }{\sum_{j=1}^n \text{exp}(Wx_j + b)}  $$

and the cross entropy loss is defined as:

$$L_1 = -\frac{1}{m}\sum\limits_{i=1}^m y_i \text{log}(\text{soft}(x_i)) $$

Below follow an example using Tensorflow on how to train a CNN using this loss.
What is important in this exercise is to observe the how the embedding space is organized in this 10 class classification problem.

One of the main drawbacks for the cross-entropy loss for face representation is its hability to generate a face space discriminative enough in open set scenarious.
It's possible to have a grasp of this in the t-SNE below where the representation for each one of the 10 classes is less "visual" compact (within class variability) compared with other examples .


In [3]:
import tensorflow as tf
import numpy as np

# Loading MNIST
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

train_dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train))
train_dataset = train_dataset.batch(64)

def softmax_cross_entropy(target, output, sparse=True, n_classes=10):
    
    target = tf.squeeze(target)
    
    ## SIMPLIFIED
    
    logits_max = tf.math.reduce_max(output, axis=-1, keepdims=True)
    N = logits_max.shape[0]
    shifted_logits = output - logits_max
    exp_shifted_logits = tf.math.exp(shifted_logits)
    
    if sparse:
        y = tf.one_hot(tf.cast(target,"int32"), depth=n_classes)
    else:
        y = target
        
    sum_exp = tf.math.reduce_sum(exp_shifted_logits,axis=-1, keepdims=True)
    log_sum_exp = tf.math.log(sum_exp)
    
    sub = shifted_logits - log_sum_exp
    mul = tf.math.multiply(tf.math.negative(y), sub)
    #L = tf.math.reduce_sum(mul)/N
    L = tf.math.reduce_mean(tf.math.reduce_sum(mul, axis=-1))
    """    


    ### NOT SIMPLIFIED
    logits_max = tf.math.reduce_max(output, axis=-1, keepdims=True)
    shifted_logits = output - logits_max    
    exp_shifted_logits = tf.math.exp(shifted_logits)
    sum_exp = tf.math.reduce_sum(exp_shifted_logits,axis=-1, keepdims=True)
    softmax = exp_shifted_logits/sum_exp

    if sparse:
        y = tf.one_hot(tf.cast(target,"int32"), depth=n_classes)
    else:
        y = target

    mul = tf.math.multiply(y, tf.math.log(tf.clip_by_value(softmax,1e-10,1.0)))
    #L = -tf.reduce_sum(mul)/N    
    L = -tf.math.reduce_mean(tf.math.reduce_sum(mul, axis=-1))
    """

    return L


tf.random.set_seed(0)

inputs = tf.keras.layers.Input([28, 28, 1], name="Input")
logits = backbone(inputs, include_top=True)
model = tf.keras.Model(inputs=inputs, outputs=logits)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer=tf.keras.optimizers.Adam(learning_rate=0.01)


model.compile(optimizer=optimizer,
              loss=softmax_cross_entropy,
              metrics=['accuracy'])

#print(model.summary())

model.fit(train_dataset, epochs=10, steps_per_epoch=None)


predict_model = tf.keras.Model(inputs=inputs, outputs=model.get_layer("embeddings").output)
embeddings = predict_model.predict(x_test[0:1000])
plot_scatter(embeddings, y_test[0:1000])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# CenterLoss

Centerloss addresses this representation issue from the cross-entropy by tackling explicitly the within-class variability of the face space (the layer where the embeddings will be taken).
This is carried out by centering each class at a particular point in space.
Then, the euclidean norm between a sample from one class and its center is minimized using the loss below.

$$L_2 = 0.5 \sum\limits_{i=1}^{m}||x_i - x_{\text{center }i} ||_{2}^{2} $$

Normally, a face recognition deep model is trained jointly using $L_1$ and $L_2$.

Below follow the same example as aforementioned, but using the center loss.
It's possible to observe from the t-SNE below that the representation for each one of the 10 classes is more "visual" compact (within class variability) compared with the cross-entropy example .


In [59]:
import tensorflow as tf
import numpy as np


class CenterLossLayer(tf.keras.layers.Layer):

    def __init__(self, n_classes, n_features, **kwargs):
        super().__init__(**kwargs)
        self.n_classes = n_classes
        self.n_features = n_features
        self.centers = tf.Variable(
            tf.zeros([n_classes, n_features]),
            name="centers",
            trainable=False,
            # in a distributed strategy, we want updates to this variable to be summed.
            aggregation=tf.VariableAggregation.SUM,
        )

    def call(self, x):
        # pass through layer
        return tf.identity(x)

    def get_config(self):
        config = super().get_config()
        config.update({"n_classes": self.n_classes, "n_features": self.n_features})
        return config
    
class CenterLoss(tf.keras.losses.Loss):

    def __init__(
        self,
        centers_layer,
        alpha=0.9,
        update_centers=True,
        name="center_loss",
        **kwargs
    ):
        super().__init__(name=name, **kwargs)
        self.centers_layer = centers_layer
        self.centers = self.centers_layer.centers
        self.alpha = alpha
        self.update_centers = update_centers

    def call(self, sparse_labels, prelogits):
        sparse_labels = tf.reshape(sparse_labels, (-1,))
        centers_batch = tf.gather(self.centers, sparse_labels)
        # the reduction of batch dimension will be done by the parent class
        center_loss = tf.keras.losses.mean_squared_error(prelogits, centers_batch)

        # update centers
        if self.update_centers:
            diff = (1 - self.alpha) * (centers_batch - prelogits)
            updates = tf.scatter_nd(sparse_labels[:, None], diff, self.centers.shape)
            # using assign_sub will make sure updates are added during distributed
            # training
            self.centers.assign_sub(updates)

        return center_loss


class CenterLossModel(tf.keras.Model):
    def compile(
        self,
        cross_entropy,
        center_loss,
        loss_weights,
        train_loss,
        train_cross_entropy,
        train_center_loss,
        test_acc,
        **kwargs,
    ):
        super().compile(**kwargs)
        self.cross_entropy = cross_entropy
        self.center_loss = center_loss
        self.loss_weights = loss_weights
        self.train_loss = train_loss
        self.train_cross_entropy = train_cross_entropy
        self.train_center_loss = train_center_loss
        self.test_acc = test_acc

    def train_step(self, data):
        images, labels = data
        with tf.GradientTape() as tape:
            logits, prelogits = self(images, training=True)
            loss_cross = self.cross_entropy(labels, logits)
            loss_center = self.center_loss(labels, prelogits)
            loss = (
                loss_cross * self.loss_weights[self.cross_entropy.name]
                + loss_center * self.loss_weights[self.center_loss.name]
            )
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)        
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        self.train_loss(loss)
        self.train_cross_entropy(loss_cross)
        self.train_center_loss(loss_center)
        return {
            m.name: m.result()
            for m in [self.train_loss, self.train_cross_entropy, self.train_center_loss]
        }

    def test_step(self, data):
        images, labels = data
        logits, prelogits = self(images, training=False)
        self.test_acc(accuracy_from_embeddings(labels, prelogits))
        return {m.name: m.result() for m in [self.test_acc]}
    
    

# Loading MNIST
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
y_train = y_train.astype("int32")
y_test = y_test.astype("int32")

train_dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train))
train_dataset = train_dataset.batch(64)


n_classes = 10

tf.random.set_seed(0)
inputs = tf.keras.layers.Input([28, 28, 1], name="Input")

embeddings = backbone(inputs, include_top=False)
embeddings = CenterLossLayer(
        n_classes=n_classes, n_features=embeddings.shape[-1], name="centers"
    )(embeddings)

logits = tf.keras.layers.Dense(n_classes, name="hot", activation=None)(embeddings)


model = CenterLossModel(
    inputs=inputs, outputs=[logits, embeddings]
)

print(model.summary())


cross_entropy = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, name="cross_entropy"
)
center_loss = CenterLoss(
    centers_layer=model.get_layer("centers"),
    alpha=0.9,
    name="center_loss",
)


optimizer=tf.keras.optimizers.Adam(learning_rate=0.1)

train_loss = tf.keras.metrics.Mean(name="loss")
train_cross_entropy = tf.keras.metrics.Mean(name="cross_entropy")
train_center_loss = tf.keras.metrics.Mean(name="center_loss")
test_acc = tf.keras.metrics.Mean(name="accuracy")

model.compile(
    optimizer=optimizer,
    cross_entropy=cross_entropy,
    center_loss=center_loss,
    loss_weights= {"cross_entropy": 1.0, "center_loss": 0.5},
    train_loss=train_loss,
    train_cross_entropy=train_cross_entropy,
    train_center_loss=train_center_loss,
    test_acc=test_acc,
    metrics=['accuracy']
)


model.fit(train_dataset, epochs=10)


predict_model = tf.keras.Model(inputs=inputs, outputs=model.get_layer("embeddings").output)
embeddings = predict_model.predict(x_test[0:1000])
plot_scatter(embeddings, y_test[0:1000])


Model: "center_loss_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Input (InputLayer)           [(None, 28, 28, 1)]       0         
_________________________________________________________________
conv1 (Conv2D)               (None, 28, 28, 64)        640       
_________________________________________________________________
maxp1 (MaxPooling2D)         (None, 14, 14, 64)        0         
_________________________________________________________________
batch_norm1 (BatchNormalizat (None, 14, 14, 64)        256       
_________________________________________________________________
conv2 (Conv2D)               (None, 14, 14, 32)        18464     
_________________________________________________________________
maxp2 (MaxPooling2D)         (None, 7, 7, 32)          0         
_________________________________________________________________
batch_norm2 (BatchNormalizat (None, 7, 7, 32)    

  fig = plt.figure()


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Angular Boundaries

## Modified Softmax from the paper Sphere-Face

$$\text{soft}(x_i) = \frac{exp(||x_i||\text{cos}(\theta_{yi}))}{\sum_j  exp(||x_i||\text{cos}(\theta_{j}))   }$$, where $cos(\theta_i)=W_i^{\intercal}x_i$

Below follow the same example as aforementioned, but using the modified softmax.




In [19]:
import tensorflow as tf
import math
import numpy as np

class ModifiedSoftMaxHead(tf.keras.layers.Layer):

    def __init__(self, n_classes=10):
        super(ModifiedSoftMaxHead, self).__init__(name ="modified_softmax_logits")
        self.n_classes = n_classes

        
    def build(self, input_shape):
        super(ModifiedSoftMaxHead, self).build(input_shape[0])
        shape = [input_shape[-1], self.n_classes]
        
        self.W = self.add_variable("W", shape=shape)
        

    def call(self, X, training=None):
        
        # normalize feature
        W = tf.nn.l2_normalize(self.W, axis=0)

        # cos between X and W            
        cos_yi = tf.nn.l2_normalize(X, axis=1) @ W
                
        logits = tf.norm(X)*cos_yi
        
        return logits
    
        
tf.random.set_seed(0)

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

train_dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train))
train_dataset = train_dataset.batch(64)


# PRE MODEL WITH CROSS ENTROPY
n_classes = 10
inputs = tf.keras.layers.Input([28, 28, 1], name="input")
labels = tf.keras.layers.Input([], name="label")

embeddings = backbone(inputs, include_top=False)
logits_cross_entropy = tf.keras.layers.Dense(n_classes, name="hot", activation=None)(embeddings)
pre_model = tf.keras.Model(inputs=inputs, outputs=logits_cross_entropy)

#### NOW THE MODIFIED CROSS ENTROPY

logits_modsoft = ModifiedSoftMaxHead()(embeddings)

modsoft_model = tf.keras.Model(inputs, outputs=logits_modsoft)
#print(arcface_model.summary())

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer=tf.keras.optimizers.Adam(learning_rate=0.01)


# First do cross entropy
pre_model.compile(optimizer=optimizer,
              loss=softmax_cross_entropy,
              metrics=['accuracy'])
pre_model.fit(x_train, y_train, epochs=1)


# second do arcface 
modsoft_model.compile(optimizer=optimizer,
              loss=softmax_cross_entropy,
              metrics=['accuracy'])
modsoft_model.fit(train_dataset, epochs=9)


predict_model = tf.keras.Model(inputs=inputs, outputs=modsoft_model.get_layer("embeddings").output)
embeddings = predict_model.predict(x_train[0:1000])
plot_scatter(embeddings, y_train[0:1000])



Tensor("softmax_cross_entropy/Squeeze:0", shape=(32,), dtype=uint8)
Tensor("softmax_cross_entropy/one_hot:0", shape=(32, 10), dtype=float32)
Tensor("softmax_cross_entropy/Squeeze:0", shape=(32,), dtype=uint8)
Tensor("softmax_cross_entropy/one_hot:0", shape=(32, 10), dtype=float32)
Epoch 1/9
Tensor("softmax_cross_entropy/Squeeze:0", dtype=uint8)
Tensor("softmax_cross_entropy/one_hot:0", dtype=float32)
Tensor("softmax_cross_entropy/Squeeze:0", dtype=uint8)
Tensor("softmax_cross_entropy/one_hot:0", dtype=float32)
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

# Sphere Face

$$\text{soft}(x_i) = \frac{exp(||x_i||\text{cos}(\psi(\theta_{yi})))}{exp(||x_i||\text{cos}(\psi(\theta_{yi}))) + \sum_{j;j\neq yi}  exp(||x_i||\text{cos}(\psi(\theta_{j})))   }$$,

where, $\psi(\theta) = -1^k \text{cos}(m\theta)-2k$.

In this case, $theta \in [\frac{k\pi}{m} , \frac{(k+1)\pi}{m}]$  $k \in [0, m-1]$

In [4]:
import tensorflow as tf
import math
import numpy as np


# HYPERPARAMETERS

M = 4

class AngularHead(tf.keras.layers.Layer):

    def __init__(self, n_classes=10, scale=1.):
        super(AngularHead, self).__init__(name ="sphereface_logits")
        self.n_classes = n_classes
        self.scale = scale

        
    def build(self, input_shape):
        super(AngularHead, self).build(input_shape[0])
        shape = [input_shape[-1], self.n_classes]
        
        self.W = self.add_variable("W", shape=shape)
        
    def call(self, X, training=None):
        
        # normalize feature
        X_ = tf.nn.l2_normalize(X, axis=1)
        W = tf.nn.l2_normalize(self.W, axis=0)

        # cos between X and W            
        logits = self.scale*(X_ @ W)
        
        return logits

    
def sphere_face_loss(target, output, sparse=True, n_classes=10, m=4):
    
    # The output is cos(theta)
    
    target = tf.squeeze(target)
    pi = tf.constant(math.pi)
    
    if sparse:
        y = tf.one_hot(tf.cast(target,"int32"), depth=n_classes)
    else:
        y = target

    # Creating a mask to transforme the targets 
    # from [[1,0,..0]..] to [[0,1,..1]..]
    mask = 1-y
    
    # cos(m*theta)
    theta = tf.math.acos(output)    
    cos_theta_m = tf.math.cos(m*theta)

    # k [] 
    k = m * (theta / pi)
        
    # phi = -1**k * cos(m \theta) - 2k
    phi = (-1**k)* tf.math.cos(m*theta) - 2*k
    
    
    # ||x||
    x_norm = tf.norm(output, axis=-1, keepdims=True)

    
    # exp( ||x||cos(m \theta) )
    exp_x_cos_mtheta = x_norm * phi
        
    #  \sum_{j;j\neq yi}  exp(||x_i||\text{cos}(\psi(\theta_{j})))
    sum_exp_x_cos_mtheta = tf.math.reduce_sum(mask * exp_x_cos_mtheta, axis=-1, keepdims=True)
    
    
    mod_soft_max = exp_x_cos_mtheta / (exp_x_cos_mtheta + sum_exp_x_cos_mtheta)
    

    ### NOT SIMPLIFIED

    mul = -tf.math.log(tf.clip_by_value(mod_soft_max,1e-10,1.0))
    L = tf.math.reduce_mean(tf.math.reduce_sum(mul, axis=-1))

    return L
    

tf.random.set_seed(0)

(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

train_dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train))
train_dataset = train_dataset.batch(64)


# PRE MODEL WITH CROSS ENTROPY
n_classes = 10
inputs = tf.keras.layers.Input([28, 28, 1], name="input")
labels = tf.keras.layers.Input([], name="label")

embeddings = backbone(inputs, include_top=False)
logits_cross_entropy = tf.keras.layers.Dense(n_classes, name="hot", activation=None)(embeddings)
pre_model = tf.keras.Model(inputs=inputs, outputs=logits_cross_entropy)

#### NOW THE MODIFIED CROSS ENTROPY

logits_sphereface = AngularHead()(embeddings)

sphereface_model = tf.keras.Model(inputs, outputs=logits_sphereface)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer=tf.keras.optimizers.Adam(learning_rate=0.01)


# First do cross entropy
pre_model.compile(optimizer=optimizer,
              loss=loss_fn,
              metrics=['accuracy'])
pre_model.fit(train_dataset, epochs=1)


# second do arcface 
sphereface_model.compile(optimizer=optimizer,
              loss=sphere_face_loss,
              metrics=['accuracy'])
sphereface_model.fit(train_dataset, epochs=9)


predict_model = tf.keras.Model(inputs=inputs, outputs=sphereface_model.get_layer("embeddings").output)
embeddings = predict_model.predict(x_test[0:1000])
plot_scatter(embeddings, y_test[0:1000])





Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## ArcFace simple

Essentially, in ArcFace, the logit is transform as follows $W^{\intercal}x_i = ||W|| ||x_i|| cos(\theta)$ where $\theta$ is the angle between the weight $W$ and the feture $x_i$.
Then basically the softmax-cors is replaced by:


$$\text{arc}(x_i) = \frac{\text{exp}(s(cos(\theta_i) + m))}{\text{exp}(s(cos(\theta_i) + m)) + \sum\limits_{j=1;j\neq i}^{m} \text{exp}(s(cos(\theta_j) + m))} $$,
where $s$ is a scaling factor and $m$ is a margin penalty.

Below follow the same example as aforementioned, but using the center loss.
It's possible to observe from the t-SNE below that the representation for each one of the 10 classes is more "visual" compact (within class variability) compared with the cross-entropy example .

In [None]:
import tensorflow as tf
import math
import numpy as np
from functools import partial

# ARCFACE HYPER PARAMETERS
S = 30.
M = 3.


def arcface_loss(target, output, sparse=True, n_classes=10, m=5., s=30.):
        
    # output = s*cos(theta)    
    cos_theta = output # just to wrap my head well.
    cos_m = tf.math.cos(m)
    sin_m = tf.math.sin(m)
        
    target = tf.squeeze(target)
    
    if sparse:
        y = tf.one_hot(tf.cast(target,"int32"), depth=n_classes)
    else:
        y = target
    
    # Creating a mask to transforme the targets 
    # from [[1,0,..0]..] to [[0,1,..1]..]
    mask = 1-y

    # sin(x) = 1-cos(x)**2
    sin_theta = tf.clip_by_value(tf.math.sqrt(1 - cos_theta ** 2), 0, 1)
    
    
    # cos(x+m) = cos(x)*cos(m) - sin(x)*sin(m)
    cos_theta_m = cos_theta * cos_m - sin_theta * sin_m    
    
    #exp(s*cos(theta))
    exp_s_cos_theta = tf.math.exp(s*cos_theta)
    
    #exp(s* cos(theta + m))
    exp_s_cos_theta_m = tf.math.exp(s*cos_theta_m)
    
    
    #  \sum\limits_{j=1;j\neq i}^{m} \text{exp}(s(cos(\theta_j) + m))
    sum_exp_cos_theta = tf.math.reduce_sum(mask * exp_s_cos_theta, axis=-1, keepdims=True)
    
        
    # Modified softmax    
    mod_soft_max = (exp_s_cos_theta_m) / (exp_s_cos_theta_m + sum_exp_cos_theta)
    

    mul = -tf.math.log(tf.clip_by_value(mod_soft_max,1e-10,1.0))
    L = tf.math.reduce_mean(tf.math.reduce_sum(mul, axis=-1))

    return L        

                    
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

train_dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train))
train_dataset = train_dataset.batch(64)


tf.random.set_seed(0)


# PRE MODEL WITH CROSS ENTROPY
n_classes = 10
inputs = tf.keras.layers.Input([28, 28, 1], name="input")
labels = tf.keras.layers.Input([], name="label")

embeddings = backbone(inputs, include_top=False)
logits_cross_entropy = tf.keras.layers.Dense(n_classes, name="hot", activation=None)(embeddings)
pre_model = tf.keras.Model(inputs=inputs, outputs=logits_cross_entropy)
#print(pre_model.summary())

# ARC FACE MODEL
logits_arcface = AngularHead()(embeddings)
arcface_model = tf.keras.Model(inputs, outputs=logits_arcface)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer=tf.keras.optimizers.Adam(learning_rate=0.01)


# First do cross entropy
pre_model.compile(optimizer=optimizer,
              loss=loss_fn,
              metrics=['accuracy'])
pre_model.fit(train_dataset, epochs=1)


# second do arcface
#
arcface_model.compile(optimizer=optimizer,
              loss=partial(arcface_loss,m=M,s=S),
              metrics=['accuracy'])
arcface_model.fit(train_dataset, epochs=9)


predict_model = tf.keras.Model(inputs=inputs, outputs=arcface_model.get_layer("embeddings").output)
embeddings = predict_model.predict(x_test[0:1000])
plot_scatter(embeddings, y_test[0:1000])






## ArcFace + Sphere Face + CosFace

In [7]:
import tensorflow as tf
import math
import numpy as np
from functools import partial

# ARCFACE HYPER PARAMETERS
S = 30.
M1 = 1.
M2 = 0.5
M3 = 0.



def arcface_three_penalties_loss(target, output, sparse=True, n_classes=10, m1=5., m2=5., m3=5., s=30.):
        
    # The output is cos(theta)    
    cos_theta = output # just to wrap my head well.

    # Getting the angle
    theta = tf.math.acos(cos_theta)
        
    target = tf.squeeze(target)    
    if sparse:
        y = tf.one_hot(tf.cast(target,"int32"), depth=n_classes)
    else:
        y = target
    
    # Creating a mask to transforme the targets 
    # from [[1,0,..0]..] to [[0,1,..1]..]
    mask = 1-y

    
    # exp(s * cos(\theta))
    exp_s_cos_theta = tf.math.exp(s*cos_theta)
    
    #exp(cos_theta + m)
    exp_s_cos_theta_m = tf.math.exp( s* tf.math.cos(m1*theta+m2)-m3 )
    
    
    #  \sum\limits_{j=1;j\neq i}^{m} \text{exp}(s(cos(\theta_j) + m))
    sum_exp_s_cos_theta = tf.math.reduce_sum(mask * exp_s_cos_theta, axis=-1, keepdims=True)
    
        
    # Modified softmax    
    mod_soft_max = (exp_s_cos_theta_m) / (exp_s_cos_theta_m + sum_exp_s_cos_theta)
    

    mul = -tf.math.log(tf.clip_by_value(mod_soft_max,1e-10,1.0))
    L = tf.math.reduce_mean(tf.math.reduce_sum(mul, axis=-1))

    return L        

                    
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0

train_dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train))
train_dataset = train_dataset.batch(64)


tf.random.set_seed(0)


# PRE MODEL WITH CROSS ENTROPY
n_classes = 10
inputs = tf.keras.layers.Input([28, 28, 1], name="input")
labels = tf.keras.layers.Input([], name="label")

embeddings = backbone(inputs, include_top=False)
logits_cross_entropy = tf.keras.layers.Dense(n_classes, name="hot", activation=None)(embeddings)
pre_model = tf.keras.Model(inputs=inputs, outputs=logits_cross_entropy)
#print(pre_model.summary())

# ARC FACE MODEL
logits_arcface = AngularHead()(embeddings)
arcface_model = tf.keras.Model(inputs, outputs=logits_arcface)

loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer=tf.keras.optimizers.Adam(learning_rate=0.01)


# First do cross entropy
pre_model.compile(optimizer=optimizer,
              loss=loss_fn,
              metrics=['accuracy'])
pre_model.fit(train_dataset, epochs=1)


# second do arcface
#
arcface_model.compile(optimizer=optimizer,
              loss=partial(arcface_three_penalties_loss, m1=M1, m2=M2, m3=M3, s=S),
              metrics=['accuracy'])
arcface_model.fit(train_dataset, epochs=9)


predict_model = tf.keras.Model(inputs=inputs, outputs=arcface_model.get_layer("embeddings").output)
embeddings = predict_model.predict(x_test[0:1000])
plot_scatter(embeddings, y_test[0:1000])



Epoch 1/9
Epoch 2/9
Epoch 3/9
Epoch 4/9
Epoch 5/9
Epoch 6/9
Epoch 7/9
Epoch 8/9
Epoch 9/9


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').