In [1]:
import os
# Select the PyTorch backend for Keras. This is set before the first
# `keras` import (next cell), which is when Keras reads the variable.
os.environ["KERAS_BACKEND"] = "torch"

In [2]:
from deel.lip.layers import (
    SpectralDense,
    SpectralConv2D,
    ScaledL2NormPooling2D,
    FrobeniusDense,
)

from keras import Sequential
# from deel.lip.model import Sequential
from deel.lip.activations import GroupSort
from deel.lip.losses import MulticlassHKR, MulticlassKR
from keras.layers import Input, Flatten, Conv2D, Dense, Layer
from keras.optimizers import Adam
from keras.datasets import fashion_mnist
from keras.utils import to_categorical
import keras.ops as K
import numpy as np
import keras

In [3]:
# Fetch the Fashion-MNIST train/test splits (images plus integer class labels).
(x_train, y_train_ord), (x_test, y_test_ord) = fashion_mnist.load_data()

# Append a trailing channel axis and divide the pixel values by 255.
x_train = x_train[..., np.newaxis] / 255
x_test = x_test[..., np.newaxis] / 255

# One-hot encode the integer labels of both splits.
y_train, y_test = (
    to_categorical(labels) for labels in (y_train_ord, y_test_ord)
)

In [25]:
@keras.saving.register_keras_serializable()
class MyScaledL2NormPooling2D(keras.layers.AveragePooling2D):
    """Scaled L2-norm pooling built on top of ``AveragePooling2D``.

    For every pooling window the layer returns
    ``sqrt(mean(x ** 2) + eps_grad_sqrt) * sqrt(prod(pool_size))``.
    For the default 2x2 window the scale factor is 2.

    Args:
        pool_size: Pooling window size; it is also used as the stride.
        strides: Must be ``None`` or equal to ``pool_size``.
        padding: Only ``"valid"`` is accepted.
        data_format: Forwarded to ``AveragePooling2D``.
        k_coef_lip: Accepted for signature compatibility only; this
            implementation does not use it.
        eps_grad_sqrt: Non-negative constant added under the square root.
        **kwargs: Forwarded to ``AveragePooling2D``.

    Raises:
        RuntimeError: If ``strides`` differs from ``pool_size``, if
            ``padding`` is not ``"valid"``, or if ``eps_grad_sqrt`` is negative.
    """

    def __init__(
        self,
        pool_size=(2, 2),
        strides=None,
        padding="valid",
        data_format=None,
        k_coef_lip=1.0,
        eps_grad_sqrt=1e-6,
        **kwargs,
    ):
        if not ((strides == pool_size) or (strides is None)):
            raise RuntimeError("stride must be equal to pool_size")
        if padding != "valid":
            raise RuntimeError("ScaledL2NormPooling2D only supports padding='valid'")
        if eps_grad_sqrt < 0.0:
            raise RuntimeError("eps_grad_sqrt must be positive")
        super().__init__(
            pool_size=pool_size,
            strides=pool_size,
            padding=padding,
            data_format=data_format,
            **kwargs,
        )
        self.built = False
        self.eps_grad_sqrt = eps_grad_sqrt
        self._kwargs = kwargs

    def build(self, input_shape):
        """Run the parent ``build`` and mark the layer as built."""
        super().build(input_shape)
        self.built = True

    def _compute_lip_coef(self, input_shape=None):
        """Return ``sqrt(prod(pool_size))``, the output scale factor."""
        return np.sqrt(np.prod(np.asarray(self.pool_size)))

    def call(self, x):
        """Apply scaled L2-norm pooling to ``x``."""
        # Derive the scale from pool_size instead of hard-coding 2, so that
        # windows other than 2x2 are scaled correctly. ``float`` turns the
        # NumPy scalar into a plain Python number before it meets a tensor.
        scale = float(self._compute_lip_coef())
        return K.sqrt(super().call(K.square(x)) + self.eps_grad_sqrt) * scale

    def get_config(self):
        """Return the parent config extended with ``eps_grad_sqrt``."""
        config = dict(super().get_config())
        # Without this entry a non-default epsilon is lost on save/load.
        config["eps_grad_sqrt"] = self.eps_grad_sqrt
        return config

In [26]:
@keras.saving.register_keras_serializable()
class GroupSort(Layer):
    """Activation that sorts channel values inside groups of size ``n``.

    The last axis is reshaped to ``(channels // n, n)``, each group is sorted
    along its last axis, and the result is reshaped back to the input shape.

    Note: this definition rebinds the name ``GroupSort`` that the imports
    cell brought in from ``deel.lip.activations``.

    Args:
        n: Group size. ``None`` (or a value larger than the channel count)
            means a single group covering all channels; resolved in ``build``.
        data_format: Only ``"channels_last"`` is implemented.
        k_coef_lip: Accepted but not used by this implementation.
        **kwargs: Forwarded to ``Layer``.

    Raises:
        RuntimeError: For ``"channels_first"``, for an unknown
            ``data_format``, or (in ``build``) when ``n`` does not divide the
            number of channels.
    """

    def __init__(self, n=None, data_format="channels_last", k_coef_lip=1.0, **kwargs):
        super().__init__(**kwargs)
        if data_format == "channels_first":
            raise RuntimeError(
                "channels_first not implemented for GroupSort activation"
            )
        if data_format != "channels_last":
            raise RuntimeError("data format not understood")
        self.channel_axis = -1
        self.n = n
        self.data_format = data_format

    def build(self, input_shape):
        """Resolve ``n`` and precompute the reshape targets used by ``call``."""
        super().build(input_shape)
        channels = input_shape[self.channel_axis]
        if self.n is None or self.n > channels:
            self.n = channels
        if channels % self.n != 0:
            raise RuntimeError("self.n has to be a divisor of the number of channels")
        shape = tuple(input_shape)
        # (-1, <middle dims>, groups, n): one trailing axis per group.
        self.flat_shape = (-1, *shape[1:-1], shape[-1] // self.n, self.n)
        self.out_shape = (-1, *shape[1:])

    def _compute_lip_coef(self, input_shape=None):
        return 1.0

    def call(self, x):
        """Sort the values of each group and restore the input layout."""
        grouped = K.reshape(x, self.flat_shape)
        if self.n == 2:
            # Pairs are ordered with an element-wise min/max.
            first, second = K.split(grouped, 2, axis=-1)
            ordered = K.concatenate(
                [K.minimum(first, second), K.maximum(first, second)], axis=-1
            )
        else:
            ordered = K.sort(grouped)
        return K.reshape(ordered, self.out_shape)

    def get_config(self):
        """Return the parent config plus ``n`` and ``data_format``."""
        return {
            **super().get_config(),
            "n": self.n,
            "data_format": self.data_format,
        }

    def compute_output_shape(self, input_shape):
        """The output has the same shape as the input."""
        return input_shape

In [27]:
@keras.saving.register_keras_serializable()
class GroupSort2(GroupSort):
    """``GroupSort`` with the group size pinned to 2."""

    def __init__(self, **kwargs):
        # Any caller-supplied ``n`` is overridden with 2.
        super().__init__(**{**kwargs, "n": 2})

In [None]:
# Build the network with the plain `keras.Sequential` container.
# (The deel-lip `Sequential`, which adds Lipschitz-specific features such as
# condensation and vanilla export, is commented out in the imports cell.)
model = Sequential(
    [
        Input(shape=x_train.shape[1:]),
        # Standard `keras.layers.Conv2D` with an orthogonal initializer;
        # no `k_coef_lip` argument is passed to any layer of this model.
        Conv2D(
            filters=16,
            kernel_size=(3, 3),
            # activation=GroupSort(2),
            use_bias=True,
            kernel_initializer="orthogonal",
            padding='same'
        ),
        GroupSort2(),
        # Custom scaled L2-norm pooling layer defined earlier in this notebook.
        MyScaledL2NormPooling2D(pool_size=(2, 2), data_format="channels_last"),
        Conv2D(
            filters=32,
            kernel_size=(3, 3),
            # activation=GroupSort(2),
            use_bias=True,
            kernel_initializer="orthogonal",
            padding='same',
        ),
        GroupSort2(),
        MyScaledL2NormPooling2D(pool_size=(2, 2), data_format="channels_last"),
        # Flatten the feature maps before the dense head.
        Flatten(),
        Dense(
            64,
            # activation=GroupSort(2),
            use_bias=True,
            kernel_initializer="orthogonal",
        ),
        GroupSort2(),
        # Output layer: one unit per class, no activation and no bias.
        Dense(
            y_train.shape[-1],
            activation=None,
            use_bias=False,
            kernel_initializer="orthogonal",
        ),
    ],
    # Only a name is given; no Lipschitz constant is set at model level.
    name="hkr_model",
)

# The HKR (Hinge-Kantorovich-Rubinstein) loss optimizes robustness along with accuracy.
model.compile(
    # Decreasing alpha and increasing min_margin improves robustness (at the cost of accuracy).
    # Note that for Lipschitz networks, more robustness requires more parameters.
    loss=MulticlassHKR(alpha=100, min_margin=0.25),
    optimizer=Adam(1e-4),
    metrics=["accuracy", MulticlassKR()],
)
model.summary()

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import LPPool2d

In [9]:
@keras.saving.register_keras_serializable()
class MyLPPool2D(Layer):
    """L2-norm pooling implemented directly with PyTorch ops.

    Computes ``sqrt(avg_pool2d(x ** 2) + eps_grad_sqrt)`` with the stride
    equal to the pooling window and no padding.

    Args:
        pool_size: Int or pair of ints; the pooling window, also used as
            the stride.
        eps_grad_sqrt: Non-negative constant added under the square root.
        data_format: ``"channels_first"`` (default) pools over the last two
            axes, which is what ``F.avg_pool2d`` does natively and what this
            layer has always done. ``"channels_last"`` moves the trailing
            channel axis in front of the spatial axes before pooling and
            back afterwards, so the spatial axes are the ones pooled.
        **kwargs: Forwarded to ``Layer`` (``name``, ``dtype``, ...). Needed
            so the layer can be rebuilt from its saved config.

    Raises:
        RuntimeError: If ``eps_grad_sqrt`` is negative or ``data_format`` is
            not one of the two supported values.
    """

    def __init__(
        self,
        pool_size=(2, 2),
        eps_grad_sqrt=1e-6,
        data_format="channels_first",
        **kwargs,
    ):
        if eps_grad_sqrt < 0.0:
            raise RuntimeError("eps_grad_sqrt must be positive")
        if data_format not in ("channels_first", "channels_last"):
            raise RuntimeError("data format not understood")
        super().__init__(**kwargs)

        # A config restored from JSON yields a list; normalise to a tuple.
        self.pool_size = pool_size if isinstance(pool_size, int) else tuple(pool_size)
        self.stride = self.pool_size  # Stride is fixed to pool_size.
        self.eps_grad_sqrt = eps_grad_sqrt
        self.data_format = data_format

    def call(self, x):
        """Apply L2-norm pooling to ``x``."""
        channels_last = self.data_format == "channels_last"
        if channels_last:
            # F.avg_pool2d pools the last two axes, so put channels before them.
            x = torch.movedim(x, -1, -3)
        # Mean of the squared values over each window.
        pooled = F.avg_pool2d(
            torch.square(x),
            kernel_size=self.pool_size,
            stride=self.stride,
            padding=0,
        )
        # Add the epsilon and take the square root.
        out = torch.sqrt(pooled + self.eps_grad_sqrt)
        if channels_last:
            out = torch.movedim(out, -3, -1)
        return out

    def get_config(self):
        """Return the parent config plus this layer's constructor arguments."""
        config = dict(super().get_config())
        config.update(
            pool_size=self.pool_size,
            eps_grad_sqrt=self.eps_grad_sqrt,
            data_format=self.data_format,
        )
        return config


In [10]:
@keras.saving.register_keras_serializable()
class My2LPPool2D(keras.layers.AveragePooling2D):
    """Unscaled L2-norm pooling built on top of ``AveragePooling2D``.

    For every pooling window the layer returns
    ``sqrt(mean(x ** 2) + eps_grad_sqrt)``.

    Args:
        pool_size: Pooling window size; it is also used as the stride.
        strides: Must be ``None`` or equal to ``pool_size``.
        padding: Only ``"valid"`` is accepted.
        data_format: Forwarded to ``AveragePooling2D``.
        eps_grad_sqrt: Non-negative constant added under the square root.
        **kwargs: Forwarded to ``AveragePooling2D``.

    Raises:
        RuntimeError: If ``strides`` differs from ``pool_size``, if
            ``padding`` is not ``"valid"``, or if ``eps_grad_sqrt`` is negative.
    """

    def __init__(
        self,
        pool_size=(2, 2),
        strides=None,
        padding="valid",
        data_format=None,
        eps_grad_sqrt=1e-6,
        **kwargs,
    ):
        if not ((strides == pool_size) or (strides is None)):
            raise RuntimeError("stride must be equal to pool_size")
        if padding != "valid":
            raise RuntimeError("ScaledL2NormPooling2D only supports padding='valid'")
        if eps_grad_sqrt < 0.0:
            raise RuntimeError("eps_grad_sqrt must be positive")
        super().__init__(
            pool_size=pool_size,
            strides=pool_size,
            padding=padding,
            data_format=data_format,
            **kwargs,
        )
        self.eps_grad_sqrt = eps_grad_sqrt
        self._kwargs = kwargs

    def call(self, x):
        """Apply L2-norm pooling to ``x``."""
        return K.sqrt(super().call(K.square(x)) + self.eps_grad_sqrt)

    def get_config(self):
        """Return the parent config extended with ``eps_grad_sqrt``."""
        config = dict(super().get_config())
        # Without this entry a non-default epsilon is lost on save/load.
        config["eps_grad_sqrt"] = self.eps_grad_sqrt
        return config

In [11]:
# Build the same architecture with the plain `keras.Sequential` container,
# this time using `My2LPPool2D` as the pooling layer.
# (The deel-lip `Sequential`, which adds Lipschitz-specific features such as
# condensation and vanilla export, is commented out in the imports cell.)
model = Sequential(
    [
        Input(shape=x_train.shape[1:]),
        # Standard `keras.layers.Conv2D` with an orthogonal initializer;
        # no `k_coef_lip` argument is passed to any layer of this model.
        Conv2D(
            filters=16,
            kernel_size=(3, 3),
            # activation=GroupSort(2),
            use_bias=True,
            kernel_initializer="orthogonal",
            padding='same'
        ),
        GroupSort2(),
        # Custom L2-norm pooling layer with its default arguments.
        My2LPPool2D(),
        Conv2D(
            filters=32,
            kernel_size=(3, 3),
            # activation=GroupSort(2),
            use_bias=True,
            kernel_initializer="orthogonal",
            padding='same',
        ),
        GroupSort2(),
        My2LPPool2D(),
        # Flatten the feature maps before the dense head.
        Flatten(),
        Dense(
            64,
            # activation=GroupSort(2),
            use_bias=True,
            kernel_initializer="orthogonal",
        ),
        GroupSort2(),
        # Output layer: one unit per class, no activation and no bias.
        Dense(
            y_train.shape[-1],
            activation=None,
            use_bias=False,
            kernel_initializer="orthogonal",
        ),
    ],
    # Only a name is given; no Lipschitz constant is set at model level.
    name="hkr_model",
)

# The HKR (Hinge-Kantorovich-Rubinstein) loss optimizes robustness along with accuracy.
model.compile(
    # Decreasing alpha and increasing min_margin improves robustness (at the cost of accuracy).
    # Note that for Lipschitz networks, more robustness requires more parameters.
    loss=MulticlassHKR(alpha=100, min_margin=0.25),
    optimizer=Adam(1e-4),
    metrics=["accuracy", MulticlassKR()],
)
model.summary()

In [29]:
# Load a saved model from a `.keras` file in the working directory.
# NOTE(review): the file is not created anywhere in the visible notebook;
# presumably it was exported from a TensorFlow-backend run (`_tf`) — confirm.
model_tf = keras.saving.load_model("convdense_gs2_pool_fashionMNIST.keras")

In [40]:
# Compile the loaded model with the same loss, optimizer and metrics as `model`.
model_tf.compile(
    # Decreasing alpha and increasing min_margin improves robustness (at the cost of accuracy).
    # Note that for Lipschitz networks, more robustness requires more parameters.
    loss=MulticlassHKR(alpha=100, min_margin=0.25),
    optimizer=Adam(1e-4),
    metrics=["accuracy", MulticlassKR()],
)

In [37]:
# Check whether the loaded model reports itself as built.
model_tf.built

True

In [41]:
# Reference output of the loaded model for the first test image.
model_tf.predict(x_test[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


array([[-0.45261514,  0.6381362 ,  0.04317522,  0.40430403, -1.2057297 ,
        -0.71725094, -0.66472304, -0.01536794,  0.50754654, -0.5321394 ]],
      dtype=float32)

In [30]:
# Print the layer summary of the loaded model.
model_tf.summary()

In [31]:
# Copy the loaded model's weights into `model`.
# NOTE(review): `model` is assigned in two different cells and the execution
# counts are out of order, so confirm which definition was live when this ran.
model.set_weights(model_tf.get_weights())

In [32]:
# Output of `model` for the first test image, to compare with `model_tf`.
model.predict(x_test[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


array([[-0.4526153 ,  0.63813627,  0.04317522,  0.4043044 , -1.2057296 ,
        -0.7172508 , -0.664723  , -0.0153679 ,  0.50754684, -0.5321399 ]],
      dtype=float32)

array([[-0.45264804,  0.63833827,  0.04332048,  0.40417057, -1.2056267 ,
        -0.7172017 , -0.6649481 , -0.01542646,  0.50755   , -0.53229195]],
      dtype=float32)

In [33]:
# Wrap the custom scaled L2-norm pooling layer in a one-layer model so its
# output can be inspected in isolation.
submodel = Sequential([MyScaledL2NormPooling2D()])
submodel.compile(
    # NOTE(review): same loss/optimizer/metrics as the full model; this cell
    # only calls predict() on the submodel.
    loss=MulticlassHKR(alpha=100, min_margin=0.25),
    optimizer=Adam(1e-4),
    metrics=["accuracy", MulticlassKR()],
)
# Output of the layer for the first test image.
pred1 = submodel.predict(x_test[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [34]:
# Display the output of the `MyScaledL2NormPooling2D` submodel.
pred1

array([[[[0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ]],

        [[0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ]],

        [[0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ]],

        [[0.002     ],
     

In [18]:
# Wrap `My2LPPool2D` in a one-layer model so its output can be inspected in
# isolation. This rebinds `submodel`.
submodel = Sequential([My2LPPool2D()])
submodel.compile(
    # NOTE(review): same loss/optimizer/metrics as the full model; this cell
    # only calls predict() on the submodel.
    loss=MulticlassHKR(alpha=100, min_margin=0.25),
    optimizer=Adam(1e-4),
    metrics=["accuracy", MulticlassKR()],
)
# Output of the layer for the first test image.
pred2 = submodel.predict(x_test[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [19]:
# Display the output of the `My2LPPool2D` submodel.
pred2

array([[[[0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ]],

        [[0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ]],

        [[0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ],
         [0.001     ]],

        [[0.001     ],
     

In [20]:
# Largest absolute element-wise difference between the two pooling outputs.
# NOTE(review): the saved result (0.0) disagrees with the displayed pred1
# (0.002) and pred2 (0.001) values; the execution counts are out of order, so
# this output is presumably stale — re-check after Restart & Run All.
np.max(np.abs(pred1-pred2))

np.float32(0.0)

In [21]:
# Load a saved model from `toy.keras` and run it on the first test image.
# NOTE(review): the file is not created anywhere in the visible notebook.
submodel = keras.saving.load_model('toy.keras')
submodel.compile(
    # NOTE(review): same loss/optimizer/metrics as the full model; this cell
    # only calls predict() on the submodel.
    loss=MulticlassHKR(alpha=100, min_margin=0.25),
    optimizer=Adam(1e-4),
    metrics=["accuracy", MulticlassKR()],
)
pred3 = submodel.predict(x_test[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [22]:
# Display the output of the model loaded from `toy.keras`.
pred3

array([[[[0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ]],

        [[0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ]],

        [[0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ]],

        [[0.002     ],
     

In [23]:
# Load a saved model from `toy2.keras` and run it on the first test image.
# NOTE(review): the file is not created anywhere in the visible notebook.
submodel = keras.saving.load_model('toy2.keras')
submodel.compile(
    # NOTE(review): same loss/optimizer/metrics as the full model; this cell
    # only calls predict() on the submodel.
    loss=MulticlassHKR(alpha=100, min_margin=0.25),
    optimizer=Adam(1e-4),
    metrics=["accuracy", MulticlassKR()],
)
pred4 = submodel.predict(x_test[:1])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [24]:
# Display the output of the model loaded from `toy2.keras`.
pred4

array([[[[0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ]],

        [[0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ]],

        [[0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ],
         [0.002     ]],

        [[0.002     ],
     