# Regularizing gradient norm

To see whether the gradient norm (or any other metric) is implicitly regularized, we can regularize it _explicitly_ and see how the results change. If explicitly regularizing the gradient norm does not decrease it significantly, then it is probably already being regularized implicitly.

In [None]:
import os
import sys
# If we don't need CUDA, do this before importing TF
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import numpy as np
import pandas as pd
import tqdm
# `import tqdm` does not load submodules; TqdmCallback lives in tqdm.keras.
import tqdm.keras
import tqdm.notebook
import matplotlib.pyplot as plt
import seaborn as sns
import IPython
sns.set()

In [None]:
# Enable IPython's autoreload extension and register the project modules used
# in this notebook, so edits to them are picked up without a kernel restart.
%load_ext autoreload
%aimport smooth.datasets
%aimport smooth.model
%aimport smooth.analysis
%aimport smooth.callbacks
%aimport smooth.measures
%aimport smooth.util
# Mode 1: reload only the modules registered with %aimport above.
%autoreload 1

In [None]:
# One-dimensional GaussianProcessDataset with 10 training samples and a fixed seed.
dataset = smooth.datasets.GaussianProcessDataset(samples_train=10, lengthscale=0.3, seed=1, dim=1)

In [None]:
class RegularizedGradient(tf.keras.Model):
    """Keras model wrapping a shallow ReLU network from ``smooth.model``.

    The wrapped network's ``loss``, ``loss_functions`` and ``optimizer`` are
    re-exported on the wrapper. The explicit gradient penalty is currently
    switched off (see ``call``), so the forward pass simply returns the
    wrapped network's output.
    """

    def __init__(self, dataset):
        """Create the wrapped network for ``dataset`` (16 hidden units)."""
        super().__init__()
        shallow = smooth.model.get_shallow(
            dataset,
            learning_rate=1e-2,
            init_scale=100,
            hidden_size=16,
            activation="relu",
        )
        self.model = shallow
        # Mirror the training configuration of the wrapped network.
        self.loss = shallow.loss
        self.loss_functions = shallow.loss_functions
        self.optimizer = shallow.optimizer

    def call(self, x):
        """Return the wrapped network's output for ``x``."""
        # The forward pass is recorded with the inputs watched so that the
        # input Jacobian could be taken from the tape; it is not used while
        # the penalty below stays disabled.
        with tf.GradientTape() as input_tape:
            input_tape.watch(x)
            outputs = self.model(x)

        # Disabled experiment -- squared-Jacobian penalty on the inputs:
        #     g = input_tape.batch_jacobian(outputs, x)
        #     self.add_loss(1e-3 * tf.reduce_sum(g ** 2))

        return outputs

    
def get_model(dataset):
    """Build a shallow ReLU network for ``dataset``.

    Calls ``smooth.model.get_shallow`` with learning rate 1e-2, init scale 1
    and 32 hidden units.

    Args:
        dataset: Dataset object forwarded to ``smooth.model.get_shallow``.

    Returns:
        The model created by ``smooth.model.get_shallow``.
    """
    # The model used to be built and then dropped, so callers received None.
    return smooth.model.get_shallow(
        dataset,
        learning_rate=1e-2,
        init_scale=1,
        hidden_size=32,
        activation="relu",
    )

In [None]:
def train(dataset, reg_coef, sparsity, *, epochs=5000, batch_size=64,
          loss_threshold=1e-6):
    """Train a gradient-regularized shallow network with a custom loop.

    A shallow ReLU network (100 hidden units, init scale 100, learning rate
    1e-2) is wrapped in ``smooth.model.RegularizedGradientModel``, which is
    given the test inputs together with ``reg_coef`` and ``sparsity``. Each
    step minimizes the model's first loss function on a training mini-batch
    plus whatever extra losses the model registered during that forward pass.

    Args:
        dataset: Object exposing ``x_train``, ``y_train`` and ``x_test``.
        reg_coef: Forwarded as ``coef`` to ``RegularizedGradientModel``.
        sparsity: Forwarded as ``sparsity`` to ``RegularizedGradientModel``.
        epochs: Maximum number of passes over the training data.
        batch_size: Mini-batch size.
        loss_threshold: Stop early once the loss of the last mini-batch of an
            epoch falls below this value.

    Returns:
        The trained ``RegularizedGradientModel`` instance.

    Raises:
        ValueError: If the training data yields no batches.
    """
    model = smooth.model.get_shallow(
        dataset,
        learning_rate=1e-2,
        init_scale=100,
        hidden_size=100,
        activation="relu",
    )
    model = smooth.model.RegularizedGradientModel(
        model, dataset.x_test, coef=reg_coef, sparsity=sparsity
    )

    train_dataset = tf.data.Dataset.from_tensor_slices(
        (dataset.x_train, dataset.y_train)
    )
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

    progress = tqdm.notebook.tqdm(range(epochs))

    for epoch in progress:
        loss_value = None
        for x_batch_train, y_batch_train in train_dataset:
            with tf.GradientTape() as tape:
                # Only the training batch is passed through the model here.
                # The earlier per-step forward pass over the whole test set
                # fed a Jacobian penalty that is no longer computed in this
                # loop, so it was pure overhead.
                predictions = model(x_batch_train)
                loss_value = model.loss_functions[0](y_batch_train, predictions)
                # Add extra losses created during this forward pass:
                loss_value += sum(model.losses)

            grads = tape.gradient(loss_value, model.trainable_weights)
            model.optimizer.apply_gradients(zip(grads, model.trainable_weights))

        if loss_value is None:
            # Previously surfaced as a NameError on the comparison below.
            raise ValueError("training dataset produced no batches")

        if loss_value < loss_threshold:
            break

        if epoch % 10 == 0:
            progress.set_postfix(loss_train=float(loss_value))

    return model

# NOTE(review): `train` is declared as train(dataset, reg_coef, sparsity) with
# no default for `sparsity`, so this two-argument call raises TypeError as
# written. TODO: pass the intended `sparsity` value.
model = train(dataset, 1)
# Plot the trained model on the dataset.
smooth.analysis.plot_shallow(model, dataset)

In [None]:
# 32-dimensional GaussianProcessDataset with 500 training samples.
dataset = smooth.datasets.GaussianProcessDataset(dim=32, lengthscale=0.3, samples_train=500, seed=3)

In [None]:
# MnistParityDataset; when the cells are run in order this rebinds `dataset`
# and replaces the one created in the previous cell.
dataset = smooth.datasets.MnistParityDataset()
# Keep only the first 10000 test examples.
dataset.x_test = dataset.x_test[:10000]
dataset.y_test = dataset.y_test[:10000]

In [None]:
# Sweep the gradient-norm regularization coefficient: train one shallow network
# per value and collect its measures. `model` stays bound to the last network.
l = []

# NOTE(review): 1e-4 appears twice in the list -- confirm whether this is a
# deliberate repeat run or a typo for another value.
for coef in tqdm.notebook.tqdm([1e-4, 1e-4, 1e-3, 1e-2, 1e-1, 1e0]):
    model = smooth.model.train_shallow(
        dataset=dataset,
        learning_rate=1e-2,
        init_scale=1,
        hidden_size=100,
        epochs=20000,
        verbose=0,
        loss_threshold=1e-5,
        gradient_norm_reg_coef=coef,
        callbacks=[
            tqdm.keras.TqdmCallback(verbose=0)
        ]
    )
#     print("Coef:", coef)
#     smooth.analysis.plot_shallow(model, dataset)
    l.append(smooth.measures.get_measures(model, dataset))

# Tabulate the collected measures.
ms = pd.DataFrame(l)

In [None]:
# Save the `.model` attribute of the last trained model to "test.h5".
tf.keras.models.save_model(model.model, "test.h5")

In [None]:
ms

In [None]:
# NOTE(review): the same plot call appears twice -- likely an accidental duplicate.
smooth.analysis.plot_shallow(model, dataset)
smooth.analysis.plot_shallow(model, dataset)

In [None]:
# Run the custom training loop for two `sparsity` values at a fixed
# regularization coefficient of 1 and collect the measures of each model.
l = []
for sparsity in [10, 100]:
    model = train(dataset, 1., sparsity)
    measures = smooth.measures.get_measures(model, dataset)
    l.append(measures)

# Tabulate the collected measures.
ms = pd.DataFrame(l)

In [None]:
# Per-example Jacobian of the model output with respect to the training inputs.
x = tf.constant(dataset.x_train)
with tf.GradientTape() as tape:
    # `x` is a constant tensor, so the tape has to be told to track it.
    tape.watch(x)
    y = model(x)

tape.batch_jacobian(y, x)

In [None]:
y

In [None]:
model

In [None]:
# Single Dense(1) layer fitted on random data with the project's Stopping
# callback and a tqdm progress bar -- apparently a quick check of the callbacks.
# Note that this rebinds `model`.
model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(1,))])
model.compile(optimizer=tf.keras.optimizers.SGD(1e-2), loss="mse")

model.fit(np.random.rand(10, 1), np.random.rand(10, 1), epochs=100, verbose=0, callbacks=[
    smooth.callbacks.Stopping(0.2),
    tqdm.keras.TqdmCallback(),
#     tf.keras.callbacks.EarlyStopping(monitor="loss", min_delta=10, patience=50)
])

In [None]:
np.random.rand(4)

In [None]:
# Load the measures of a logged run and order them by regularization coefficient.
# NOTE: os.chdir changes the working directory for every later cell; relative
# paths further down resolve against whichever chdir ran most recently.
os.chdir("/nfs/scistore12/chlgrp/vvolhejn/smooth/logs_debug/0318-163604/")
ms = pd.read_feather("./measures.feather")
ms = ms.sort_values("model.gradient_norm_reg_coef")

In [None]:
ms

In [None]:
# Same as above for a second logged run; `ms` is overwritten.
os.chdir("/nfs/scistore12/chlgrp/vvolhejn/smooth/logs_debug/0318-171746/")
ms = pd.read_feather("./measures.feather")
ms = ms.sort_values("model.gradient_norm_reg_coef")
ms

In [None]:
# Third logged run; only the first 8 rows after sorting are kept.
os.chdir("/nfs/scistore12/chlgrp/vvolhejn/smooth/logs_debug/0320-154905/")
ms = pd.read_feather("./measures.feather")
ms = ms.sort_values("model.gradient_norm_reg_coef")
ms = ms.iloc[:8]
ms

In [None]:
# Rows whose regularization coefficient equals 0.001.
ms[ms["model.gradient_norm_reg_coef"] == 0.001]

In [None]:
# Training loss and gradient norm next to the coefficient that produced them.
ms[["loss_train", "gradient_norm", "model.gradient_norm_reg_coef"]]

In [None]:
# Gradient norm scaled by its coefficient, relative to the training loss.
(ms["gradient_norm"] * ms["model.gradient_norm_reg_coef"]) / ms["loss_train"]

In [None]:
# Ratio of the `path_length_d_test` column to the `weights_rms` column.
ms["path_length_d_test"] / ms["weights_rms"]

In [None]:
# Relative path: it resolves against the current working directory, which the
# os.chdir calls in the earlier cells change.
model_name = "./bs=64_d=mnistparity-300_e=10000_hs=64_is=1.0_i=0_lr=0.01_rc=0.001/model.h5"
model = tf.keras.models.load_model(model_name)

In [None]:
# Dataset built from parameters matching the "d=mnistparity-300" part of the model path.
dataset = smooth.datasets.from_params(name="mnistparity", samples_train=300)

In [None]:
# Gradient norm of the loaded model on the test inputs.
smooth.measures.gradient_norm(model, dataset.x_test)
# model.predict(dataset.x_train[:10])

In [None]:
smooth.measures.gradient_norm(model, dataset.x_test)

In [None]:
# NOTE(review): `x` is not defined in this cell; it relies on a value left over
# from an earlier cell.
y = model(x)

In [None]:
y

In [None]:
with tf.GradientTape() as g:
    x = tf.constant(dataset.x_train[:13])
    g.watch(x)
    y = model(x)

dy_dx = g.batch_jacobian(y, x)
# axes_to_sum = tuple(range(2, len(dy_dx.shape)))

In [None]:
tf.norm(tf.reshape(dy_dx, (len(x),) + (model.output_shape[1:]) + (-1,)), axis=-1)

In [None]:
np.linalg.norm(dy_dx, axis=axes_to_sum)

In [None]:
isinstance(tf.constant(0), tf.Tensor)