# Use Deuterium to bound the Gradient Norm

In [21]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from deuterium import Variable, to_vec, random_symbols, get_gradients
from sympy import sympify
from scipy.optimize import shgo


import symengine as se
from sklearn.metrics import accuracy_score
import sys
sys.setrecursionlimit(1_000_000)
import warnings
warnings.filterwarnings("ignore")

Define some utility functions, notably the loss functions and tempered sigmoid activation functions.

In [22]:
to_data = np.vectorize(lambda x: x.data)

def sigmoid(x, s=1, T=1, o=0):
        return (s/(1+np.exp(-T*x)))-o

def tanh(x):
    return sigmoid(x, 2, 2, 1)

bce_loss = lambda y_pred, y_true: -np.mean(np.multiply(y_true, np.log(y_pred)) + np.multiply((1 - y_true), np.log(1 - y_pred)))
normalize = lambda x: (x-x.min())/(x.max()-x.min())

mse_loss = lambda y_pred, y_true: 1

Define the network architecture

In [139]:
IN=100
INTERMEDIATE=100

In [140]:
# This creates symbolic representations for all the layers
x = to_vec(np.array(random_symbols(IN, "x")).reshape((1,IN))) 
y = to_vec(np.array(random_symbols(1, "y")))

w1 = to_vec(np.array(random_symbols(IN*INTERMEDIATE, "w1")).reshape(IN, INTERMEDIATE))
b = to_vec(np.array(random_symbols(INTERMEDIATE, "b")).reshape(1, INTERMEDIATE))
w2 = to_vec(np.array(random_symbols(INTERMEDIATE, "w2")).reshape(INTERMEDIATE,1))

In [141]:
# This replaces some of the variables with *values*
# w1 (weights layer 1)
# w2 (weights layer 2)
# b (bias terms)
# y (label)

w1 = to_vec(np.random.normal(size=IN*INTERMEDIATE).reshape(IN, INTERMEDIATE))
b = to_vec(np.random.normal(size=INTERMEDIATE).reshape(1, INTERMEDIATE))
w2 = to_vec(np.random.normal(size=INTERMEDIATE).reshape(INTERMEDIATE,1))
y = to_vec(np.array(1))


Symbolically calculate the network output

In [142]:
layer_1 = (x@w1)+b
y_pred = (layer_1@w2)
loss = bce_loss(y_pred, y)

Obtain the gradients w.r.t all inputs

In [143]:
loss.backward()

In [144]:
x_grad = np.array([i.grad for i in x.flatten().tolist()])
y_grad = np.array([i.grad for i in y.flatten().tolist()])
w1_grad = np.array([i.grad for i in w1.flatten().tolist()])
b_grad = np.array([i.grad for i in b.flatten().tolist()])
w2_grad = np.array([i.grad for i in w2.flatten().tolist()])

full_grad = to_vec(np.concatenate((x_grad, y_grad, w1_grad, b_grad, w2_grad)))


In [145]:
# the gradient we care about is just wrt the weights

my_grad = to_vec(np.concatenate((w1_grad, b_grad, w2_grad)))
len(my_grad)
#sympify(my_grad)

10200

In [146]:
import symengine.lib.symengine_wrapper as sew

def op(e, depth):
    if isinstance(e, sew.RealDouble):
        print(' '* depth, 'found double:', e)
    elif isinstance(e, sew.Symbol):
        print(' '* depth, 'found symbol:', e)
    elif isinstance(e, sew.Mul):
        print(' '* depth, 'found Mul\t # args:', len(e.args))
        for a in e.args:
            op(a, depth+1)
    elif isinstance(e, sew.Pow):
        print(' '* depth, 'found Pow\t # args:', len(e.args))
        for a in e.args:
            op(a, depth+1)
    elif isinstance(e, sew.Add):
        print(' '* depth, 'found Add\t # args:', len(e.args))
        for a in e.args:
            op(a, depth+1)
    else:
        print('fail:', type(e))
    

In [147]:
def int_analysis(e):
    if isinstance(e, sew.RealDouble):
        return (float(e), float(e))
    elif isinstance(e, sew.Integer):
        return (float(e), float(e))
    elif isinstance(e, sew.Symbol):
        return (-1, 1)
    elif isinstance(e, sew.Mul):
        results = [int_analysis(a) for a in e.args]
        lowers = [r[0] for r in results]
        uppers = [r[1] for r in results]
        
        out_lower = lowers[0]
        out_upper = uppers[0]

        for i in range(1, len(e.args)):
            options = [out_lower*lowers[i],
                       out_lower*uppers[i],
                       out_upper*lowers[i],
                       out_upper*uppers[i]]
            out_lower = np.min(options)
            out_upper = np.max(options)

        return out_lower, out_upper
    elif isinstance(e, sew.Pow):
        a, b = e.args
        a_lower, a_upper = int_analysis(a)
        b_lower, b_upper = int_analysis(b)
        options = [a_lower**b_lower,
                   a_upper**b_lower,
                   a_lower**b_upper,
                   a_upper**b_upper]
        return (np.min(options), np.max(options))
    elif isinstance(e, sew.Add):
        results = [int_analysis(a) for a in e.args]
        lowers = [r[0] for r in results]
        uppers = [r[1] for r in results]
        return np.sum(lowers), np.sum(uppers)
    else:
        print('fail:', type(e))

In [148]:
%%time
lower, upper = int_analysis(np.linalg.norm(my_grad, ord=2).data)
upper - lower

CPU times: user 27.7 s, sys: 20 ms, total: 27.7 s
Wall time: 27.8 s


0.0037524471764013856

# Optimize the Gradient Norm one Element at a Time

In [379]:
%%time

print(my_grad_norm.data.free_symbols)
intervals = [(-1, 1) for _ in np.sum(my_grad).data.free_symbols]

elems = []
for g in my_grad:
    gp = g**2
    f = se.Lambdify(list(gp.data.free_symbols), gp.data)
    sol = shgo(f, intervals)
    elems.append(sol.fun)

print(elems)
np.sqrt(np.sum(elems))

{x_1, x_0}
[0.0, 0.0, 0.0, 0.0, 0.037232887245820745, 0.1297134117750557, 3.3288945997629245e-13, 4.543889776067925e-16]
CPU times: user 20.4 ms, sys: 34 µs, total: 20.4 ms
Wall time: 28.9 ms


0.4085906252243311

# Optimize the Gradient Norm all at once

In [380]:
my_grad_norm = np.linalg.norm(my_grad, ord=2)
my_grad_norm.data.free_symbols
my_grad_norm_func = se.Lambdify(list(my_grad_norm.data.free_symbols), my_grad_norm.data)

In [340]:
%%time

print(my_grad_norm.data.free_symbols)
intervals = [(-1, 1) for _ in my_grad_norm.data.free_symbols]
sol = shgo(my_grad_norm_func, intervals)
sol.fun

{x_1, x_0}
CPU times: user 6.76 ms, sys: 0 ns, total: 6.76 ms
Wall time: 6.66 ms


0.5778750969544236