## Exploding Gradients

In [7]:
import time
from tqdm import tqdm

import numpy as np
from sklearn.datasets import make_regression
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

import neptune

# The init() function called this way assumes that
NEPTUNE_API_TOKEN="<api-token-here>"

neptune.init('<username>/sandbox',api_token = NEPTUNE_API_TOKEN)
neptune.create_experiment('Gradient-Clipping-monitoring-example')

https://ui.neptune.ai/theaayushbajaj/sandbox/e/SAN-7


Experiment(SAN-7)

In [8]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(20, 25)
        self.fc2 = nn.Linear(25, 1)

        self.ordered_layers = [self.fc1,
                               self.fc2]


    def forward(self, x):
        x = F.relu(self.fc1(x))
        outputs = self.fc2(x)
        return outputs

In [11]:
def train_model(model,  
                criterion, 
                optimizer,  
                num_epochs):
    since = time.time()
    dataset_size = 1000

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs))
        print('-' * 10)
        time.sleep(0.1)
        model.train()  # Set model to training mode
        
        running_loss = 0.0
        batch_norm = []

        # Iterate over data.
        for idx,(inputs,label) in enumerate(tqdm(train_loader)):
            inputs = inputs.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            logits = model(inputs)
            loss = criterion(logits, label)

            # backward
            loss.backward()
            # calculate gradient norms
            for layer in model.ordered_layers:
                norm_grad = layer.weight.grad.norm()
                batch_norm.append(norm_grad.numpy())

            optimizer.step()

            # statistics
            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / dataset_size
        
        neptune.log_metric('loss', epoch_loss)
        neptune.log_metric('Gradient Norm', np.mean(batch_norm))

        print('Train Loss: {:.4f}'.format(epoch_loss))

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

In [21]:
if __name__ == "__main__":
    device = torch.device("cpu")

    # prepare data
    X,y = make_regression(n_samples=1000, n_features=20, noise=0.1, random_state=1)

    X = torch.Tensor(X)
    y = torch.Tensor(y)

    dataset = torch.utils.data.TensorDataset(X,y)
    train_loader = torch.utils.data.DataLoader(dataset=dataset,batch_size=128, shuffle=True)
        
    model = Net().to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.7)
    criterion = nn.MSELoss()

    norms = train_model(model=model,
                        criterion=criterion,
                        optimizer=optimizer,
                        num_epochs=50
                        )

100%|██████████| 8/8 [00:00<00:00, 137.62it/s]
Invalid metric value: inf for channel loss. Metrics with nan or +/-inf values will not be sent to server
Invalid metric value: inf for channel Gradient Norm. Metrics with nan or +/-inf values will not be sent to server
Epoch 0/50
----------
Train Loss: inf

Epoch 1/50
----------
100%|██████████| 8/8 [00:00<00:00, 246.19it/s]
Invalid metric value: nan for channel loss. Metrics with nan or +/-inf values will not be sent to server
Invalid metric value: nan for channel Gradient Norm. Metrics with nan or +/-inf values will not be sent to server
100%|██████████| 8/8 [00:00<00:00, 257.88it/s]
Invalid metric value: nan for channel loss. Metrics with nan or +/-inf values will not be sent to server
Invalid metric value: nan for channel Gradient Norm. Metrics with nan or +/-inf values will not be sent to server
Train Loss: nan

Epoch 2/50
----------
Train Loss: nan

Epoch 3/50
----------
100%|██████████| 8/8 [00:00<00:00, 298.12it/s]
Invalid metric v

## Tensorflow

In [21]:
import tensorflow as tf
from tensorflow.keras import Model, layers
import numpy as np
import tensorflow_datasets as tfds

print(tf.__version__)

import neptune

# The init() function called this way assumes that
NEPTUNE_API_TOKEN="<api-token-here>"

neptune.init('<username>/sandbox',api_token = NEPTUNE_API_TOKEN)
neptune.create_experiment('Tensorflow-Gradient-Clipping-Example')

2.3.1
psutil is not installed. You will not be able to abort this experiment from the UI.
psutil is not installed. Hardware metrics will not be collected.
https://ui.neptune.ai/theaayushbajaj/sandbox/e/SAN-4


Experiment(SAN-4)

In [22]:
num_classes = 10 # total classes (0-9 digits).
num_features = 784 # data features (img shape: 28*28).

# Training Parameters
learning_rate = 0.001
training_steps = 1000
batch_size = 32
display_step = 100

# Network Parameters
# MNIST image shape is 28*28px, we will then handle 28 sequences of 28 timesteps for every sample.
num_input = 28 # number of sequences.
timesteps = 28 # timesteps.
num_units = 32 # number of neurons for the LSTM layer.

In [23]:
from tensorflow.keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# Convert to float32.
x_train, x_test = np.array(x_train, np.float32), np.array(x_test, np.float32)
# Flatten images to 1-D vector of 784 features (28*28).
x_train, x_test = x_train.reshape([-1, 28, 28]), x_test.reshape([-1, num_features])
# Normalize images value from [0, 255] to [0, 1].
x_train, x_test = x_train / 255., x_test / 255.

In [24]:
# Use tf.data API to shuffle and batch data.
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.repeat().shuffle(5000).batch(batch_size).prefetch(1)


In [25]:
# Create LSTM Model.
class LSTM(Model):
    # Set layers.
    def __init__(self):
        super(LSTM, self).__init__()
        # RNN (LSTM) hidden layer.
        self.lstm_layer = layers.LSTM(units=num_units)
        self.out = layers.Dense(num_classes)

    # Set forward pass.
    def __call__(self, x, is_training=False):
        # LSTM layer.
        x = self.lstm_layer(x)
        # Output layer (num_classes).
        x = self.out(x)
        if not is_training:
            # tf cross entropy expect logits without softmax, so only
            # apply softmax when not training.
            x = tf.nn.softmax(x)
        return x

# Build LSTM model.
lstm_net = LSTM()

In [26]:
# Cross-Entropy Loss.
# Note that this will apply 'softmax' to the logits.
def cross_entropy_loss(x, y):
    # Convert labels to int 64 for tf cross-entropy function.
    y = tf.cast(y, tf.int64)
    # Apply softmax to logits and compute cross-entropy.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=x)
    # Average loss across the batch.
    return tf.reduce_mean(loss)

# Accuracy metric.
def accuracy(y_pred, y_true):
    # Predicted class is the index of highest score in prediction vector (i.e. argmax).
    correct_prediction = tf.equal(tf.argmax(y_pred, 1), tf.cast(y_true, tf.int64))
    return tf.reduce_mean(tf.cast(correct_prediction, tf.float32), axis=-1)

# Adam optimizer.
optimizer = tf.optimizers.Adam(learning_rate)

In [27]:
# Optimization process. 
def run_optimization(x, y):
    # Wrap computation inside a GradientTape for automatic differentiation.
    with tf.GradientTape() as tape:
        # Forward pass.
        pred = lstm_net(x, is_training=True)
        # Compute loss.
        loss = cross_entropy_loss(pred, y)
        
    # Variables to update, i.e. trainable variables.
    trainable_variables = lstm_net.trainable_variables

    # Compute gradients.
    gradients = tape.gradient(loss, trainable_variables)

    # Clip-by-value
    #gradients = [(tf.clip_by_value(grad, clip_value_min=-1.0, clip_value_max=1.0))
    #                              for grad in gradients]

    # clip-by-norm
    gradients = [(tf.clip_by_norm(grad, clip_norm=2.0)) for grad in gradients]
    
    # Update weights following gradients.
    optimizer.apply_gradients(zip(gradients, trainable_variables))

In [28]:
# Run training for the given number of steps.
for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), 1):
    # Run the optimization to update W and b values.
    run_optimization(batch_x, batch_y)
    
    if step % display_step == 0:
        pred = lstm_net(batch_x, is_training=True)
        loss = cross_entropy_loss(pred, batch_y)
        acc = accuracy(pred, batch_y)

        neptune.log_metric('loss', loss)
        neptune.log_metric('accuracy', acc)
        
        print("step: %i, loss: %f, accuracy: %f" % (step, loss, acc))

step: 100, loss: 1.832472, accuracy: 0.406250
step: 200, loss: 1.200897, accuracy: 0.625000
step: 300, loss: 0.689998, accuracy: 0.812500
step: 400, loss: 0.624656, accuracy: 0.812500
step: 500, loss: 0.746091, accuracy: 0.843750
step: 600, loss: 0.310500, accuracy: 0.968750
step: 700, loss: 0.338364, accuracy: 0.937500
step: 800, loss: 0.542293, accuracy: 0.843750
step: 900, loss: 0.315637, accuracy: 0.937500
step: 1000, loss: 0.291412, accuracy: 0.906250


## Keras

In [37]:
from tensorflow.keras.callbacks import Callback

In [29]:
from keras.layers import Dense, LSTM
from keras.models import Sequential
from keras.optimizers import SGD
from keras.callbacks import Callback

import neptune

# The init() function called this way assumes that
NEPTUNE_API_TOKEN="<api-token-here>"

neptune.init('<username>/sandbox',api_token = NEPTUNE_API_TOKEN)
neptune.create_experiment('Keras-Gradient-Clipping-Example')

psutil is not installed. You will not be able to abort this experiment from the UI.
psutil is not installed. Hardware metrics will not be collected.
https://ui.neptune.ai/theaayushbajaj/sandbox/e/SAN-5


Experiment(SAN-5)

In [34]:
# define model
model = Sequential()
model.add(LSTM(units=num_units))
model.add(Dense(num_classes))

In [40]:
class MonitoringCallback(Callback):
    def on_epoch_end(self, epochs,logs={}):
        for metric_name, metric_value in logs.items():
            neptune.log_metric(metric_name, metric_value)

In [41]:
# compile model
optimizer = SGD(lr=0.01, momentum=0.9, clipvalue=1.0)
model.compile(loss='binary_crossentropy', optimizer=optimizer)
# fit model
history = model.fit(train_data, epochs=5, verbose=1,steps_per_epoch=training_steps,callbacks=[MonitoringCallback()])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Pytorch

In [12]:
import os
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from torch.optim.lr_scheduler import StepLR

import neptune

# The init() function called this way assumes that
NEPTUNE_API_TOKEN="<api-token-here>"

neptune.init('<username>/sandbox',api_token = NEPTUNE_API_TOKEN)
neptune.create_experiment('Pytorch-Gradient-Clipping-Example')

psutil is not installed. You will not be able to abort this experiment from the UI.
psutil is not installed. Hardware metrics will not be collected.
https://ui.neptune.ai/theaayushbajaj/sandbox/e/SAN-3


Experiment(SAN-3)

In [13]:
n_epochs = 2
batch_size_train = 64
batch_size_test = 1000
learning_rate = 0.01
momentum = 0.5
log_interval = 10

random_seed = 1
torch.backends.cudnn.enabled = False
torch.manual_seed(random_seed)

&lt;torch._C.Generator at 0x7f2b167a7dc8&gt;

In [14]:
os.makedirs('files',exist_ok=True)
train_loader = torch.utils.data.DataLoader(
  torchvision.datasets.MNIST('files/', train=True, download=False,
                             transform=torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                               torchvision.transforms.Normalize(
                                 (0.1307,), (0.3081,))
                             ])),
  batch_size=batch_size_train, shuffle=True)

test_loader = torch.utils.data.DataLoader(
  torchvision.datasets.MNIST('files/', train=False, download=False,
                             transform=torchvision.transforms.Compose([
                               torchvision.transforms.ToTensor(),
                               torchvision.transforms.Normalize(
                                 (0.1307,), (0.3081,))
                             ])),
  batch_size=batch_size_test, shuffle=True)

In [15]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters
sequence_length = 28
input_size = 28
hidden_size = 128
num_layers = 2
num_classes = 10
batch_size = 100
num_epochs = 2
learning_rate = 0.01

In [16]:
# Recurrent neural network (many-to-one)
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x):
        # Set initial hidden and cell states 
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device) 
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
        
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

In [17]:
# Instantiate the model with hyperparameters
model = RNN(input_size, hidden_size, num_layers, num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [18]:
# Train the model
total_step = len(train_loader)
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.reshape(-1, sequence_length, input_size).to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()

        # Gradient Norm Clipping
        #nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)

        #Gradient Value Clipping
        nn.utils.clip_grad_value_(model.parameters(), clip_value=1.0)

        optimizer.step()
        
        neptune.log_metric('loss', loss)
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

Epoch [1/2], Step [100/938], Loss: 0.3979
Epoch [1/2], Step [200/938], Loss: 0.3080
Epoch [1/2], Step [300/938], Loss: 0.1197
Epoch [1/2], Step [400/938], Loss: 0.1868
Epoch [1/2], Step [500/938], Loss: 0.1720
Epoch [1/2], Step [600/938], Loss: 0.2918
Epoch [1/2], Step [700/938], Loss: 0.0542
Epoch [1/2], Step [800/938], Loss: 0.1335
Epoch [1/2], Step [900/938], Loss: 0.2048
Epoch [2/2], Step [100/938], Loss: 0.1028
Epoch [2/2], Step [200/938], Loss: 0.2353
Epoch [2/2], Step [300/938], Loss: 0.1596
Epoch [2/2], Step [400/938], Loss: 0.3049
Epoch [2/2], Step [500/938], Loss: 0.1495
Epoch [2/2], Step [600/938], Loss: 0.0203
Epoch [2/2], Step [700/938], Loss: 0.1887
Epoch [2/2], Step [800/938], Loss: 0.2933
Epoch [2/2], Step [900/938], Loss: 0.1753
