# Regularizing gradient norm

To see whether the gradient norm (or any other metric) is implicitly regularized, we can regularize it _explicitly_ and see how the results change. If explicit regularization does not decrease the gradient norm significantly, then it is probably already being regularized implicitly.

In [None]:
import os
import sys
# If we don't need CUDA, do this before importing TF
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import numpy as np
import pandas as pd
import tqdm
import tqdm.notebook
import matplotlib.pyplot as plt
import seaborn as sns
import IPython
sns.set()

In [None]:
# Load IPython's autoreload extension so that edits to the project code are
# picked up without restarting the kernel.
%load_ext autoreload
# %aimport imports each module and marks it for reloading; this is also what
# brings the `smooth` package into the notebook namespace.
%aimport smooth.datasets
%aimport smooth.model
%aimport smooth.analysis
%aimport smooth.callbacks
%aimport smooth.measures
%aimport smooth.util
# Mode 1: before running a cell, reload only the modules marked with %aimport.
%autoreload 1

In [None]:
# Build the GaussianProcessDataset shared by the cells below; the seed is fixed
# so that re-running the notebook yields the same data.
dataset = smooth.datasets.GaussianProcessDataset(
    samples_train=10,
    lengthscale=1.0,
    seed=1,
    dim=1,
)

In [None]:
from types import MethodType

model = smooth.model.interpolate_relu_network(dataset, use_test_set=False)


def f(self, x):
    """Print a marker, then run the model's original forward pass.

    Relies on the original ``call`` having been stashed on the instance as
    ``call2`` before this wrapper is installed.
    """
    print("yaboi")
    output = self.call2(x)
    return output


# Stash the original forward pass under a new attribute first, then replace
# ``call`` with the wrapper bound to this particular instance.
setattr(model, "call2", model.call)
setattr(model, "call", MethodType(f, model))

In [None]:
# Call the model on a single input. If Keras routes the call through the
# instance's `call` attribute (TODO confirm), the wrapper installed in the
# previous cell prints "yaboi" before the output is displayed.
model([1])

In [None]:
from types import MethodType

# Capture the current forward pass *before* overriding it, so the wrapper
# delegates to it instead of looking `call` up again on the model (which would
# resolve to the wrapper itself and recurse without end).
# NOTE: re-running this cell wraps the wrapper again, adding one more print.
_wrapped_call = model.call


def f(self, x):
    """Print a marker, then delegate to the forward pass captured above."""
    print("yaboi")
    return _wrapped_call(x)


# Bind the wrapper to the instance: a plain function stored on an instance is
# not bound automatically, so `model.call(x)` would receive `x` as `self` and
# fail with a missing-argument TypeError.
model.call = MethodType(f, model)

In [None]:
# Call the model again to exercise the `call` override assigned in the
# previous cell.
model([1])

In [None]:
# Plot the current model together with the dataset using the project's
# plotting helper.
smooth.analysis.plot_shallow(model, dataset)

In [None]:
# Evaluate the model's first loss function on a single hand-picked pair.
# The training loop in this notebook calls it as loss(y_true, y_pred), so this
# is target 0 against prediction 2.
model.loss_functions[0]([0], [2])

In [None]:
class RegularizedGradient(tf.keras.Model):
    """Shallow ReLU network with an optional penalty on its input gradient.

    The wrapped network is built with ``smooth.model.get_shallow``; its
    ``loss``, ``loss_functions`` and ``optimizer`` are re-exported on the
    wrapper.

    Args:
        dataset: Dataset object forwarded to ``smooth.model.get_shallow``.
        coef: Weight of the penalty ``coef * sum(J ** 2)``, where ``J`` is the
            per-example Jacobian of the outputs with respect to the inputs.
            The default ``0.0`` disables the penalty, so ``call`` is a plain
            forward pass through the wrapped network.
    """

    def __init__(self, dataset, coef=0.0):
        super().__init__()
        self.coef = coef
        self.model = smooth.model.get_shallow(
            dataset,
            learning_rate=1e-2,
            init_scale=100,
            hidden_size=16,
            activation="relu",
        )
        self.loss = self.model.loss
        self.loss_functions = self.model.loss_functions
        self.optimizer = self.model.optimizer

    def call(self, x):
        """Run the wrapped network and, if enabled, register the penalty.

        Args:
            x: Batch of inputs. NOTE(review): ``batch_jacobian`` needs the
                leading dimension of ``x`` and of the output to match —
                confirm against the shapes produced by the dataset.

        Returns:
            The output of the wrapped network for ``x``.
        """
        if not self.coef:
            # Penalty disabled: no need to record anything on a tape.
            return self.model(x)

        # The tape can only watch tensors, so convert array-like inputs first.
        x = tf.convert_to_tensor(x)
        with tf.GradientTape() as tape:
            tape.watch(x)
            y = self.model(x)

        # Squared norm of dy/dx summed over the batch, added as an extra loss
        # that is exposed through ``self.losses``.
        g = tape.batch_jacobian(y, x)
        self.add_loss(self.coef * tf.reduce_sum(g ** 2))

        return y

    
def get_model(dataset):
    """Build a shallow ReLU network for ``dataset``.

    Args:
        dataset: Dataset object forwarded to ``smooth.model.get_shallow``.

    Returns:
        The model created by ``smooth.model.get_shallow`` with learning rate
        1e-2, init scale 1 and 32 hidden units.
    """
    # The original built the model but never returned it, so callers got None.
    return smooth.model.get_shallow(
        dataset,
        learning_rate=1e-2,
        init_scale=1,
        hidden_size=32,
        activation="relu",
    )

In [None]:
def train(dataset, reg=1e-3, epochs=1000, batch_size=64):
    """Train a gradient-regularized shallow ReLU network with a custom loop.

    Args:
        dataset: Object exposing ``x_train``, ``y_train`` and ``x_test``.
        reg: Regularization coefficient, passed to
            ``smooth.model.RegularizedGradientModel`` as ``coef``.
        epochs: Maximum number of passes over the training data.
        batch_size: Number of training examples per optimization step.

    Returns:
        The trained ``RegularizedGradientModel``.

    Raises:
        ValueError: If the training data yields no batches.
    """
    model = smooth.model.get_shallow(
        dataset,
        learning_rate=1e-2,
        init_scale=100,
        hidden_size=100,
        activation="relu",
    )
    # `reg` was previously ignored in favour of a hard-coded coef=1.
    # NOTE(review): assumes the wrapper evaluates its penalty on `x_test` and
    # reports it through `model.losses` — confirm in smooth.model.
    model = smooth.model.RegularizedGradientModel(model, dataset.x_test, coef=reg)

    train_dataset = tf.data.Dataset.from_tensor_slices((dataset.x_train, dataset.y_train))
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)

    progress = tqdm.notebook.tqdm(range(epochs))

    for epoch in progress:
        loss_value = None
        for x_batch_train, y_batch_train in train_dataset:
            # The earlier per-step forward pass on x_test under a second tape
            # was dropped: its result was never used once the in-loop Jacobian
            # penalty was replaced by the model's own losses.
            with tf.GradientTape() as tape:
                loss_value = model.loss_functions[0](y_batch_train, model(x_batch_train))
                # Add extra losses created during this forward pass:
                loss_value += sum(model.losses)

            grads = tape.gradient(loss_value, model.trainable_weights)
            model.optimizer.apply_gradients(zip(grads, model.trainable_weights))

        if loss_value is None:
            # Previously this surfaced as an unbound-variable error below.
            raise ValueError("training set is empty; nothing to train on")

        # Early stopping is based on the loss of the epoch's last batch.
        if loss_value < 1e-6:
            break

        if epoch % 10 == 0:
            progress.set_postfix(loss_train=float(loss_value))

    return model

# Train with the regularization argument set to 1, then plot the resulting
# model together with the dataset.
model = train(dataset, reg=1)
smooth.analysis.plot_shallow(model, dataset)

In [None]:
# Differentiate the model's outputs with respect to its inputs at the
# training points.
x = tf.constant(dataset.x_train)
with tf.GradientTape() as tape:
    # Plain tensors are not tracked automatically, so watch the input explicitly.
    tape.watch(x)
    y = model(x)

# Per-example Jacobian dy/dx; as the cell's last expression it is displayed.
tape.batch_jacobian(y, x)

In [None]:
# Display the model outputs `y` computed in the previous cell.
y

In [None]:
# Display the current `model` object.
model