# Optimization Tutorial

# Import required libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
%matplotlib inline

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor, Lambda

from torchsummary import summary

# Optimizing Model Parameters

Once we have a model and data it's time to train, validate and test our model by optimizing its parameters on our data. Training a model is an iterative process; in each iteration (called an *epoch*) the model makes a guess about the output, calculates the error in its guess (*loss*), collects the derivatives of the error with respect to its parameters, and **optimizes** these parameters using gradient descent. 

# Load dataset

In [3]:
# Define dataset path
dataset_path = os.path.normpath(r'E:\Sync_With_NAS_Ext\Datasets\Image_Datasets\Pytorch_Datasets')

# Load training and test data
training_data = datasets.FashionMNIST(root = dataset_path, train = True, download = False,\
                                      transform = ToTensor())

test_data = datasets.FashionMNIST(root = dataset_path, train = False, download = False,\
                                  transform = ToTensor())

print(training_data)
print()
print(test_data)

Dataset FashionMNIST
    Number of datapoints: 60000
    Root location: E:\Sync_With_NAS_Ext\Datasets\Image_Datasets\Pytorch_Datasets
    Split: Train
    StandardTransform
Transform: ToTensor()

Dataset FashionMNIST
    Number of datapoints: 10000
    Root location: E:\Sync_With_NAS_Ext\Datasets\Image_Datasets\Pytorch_Datasets
    Split: Test
    StandardTransform
Transform: ToTensor()


In [4]:
# Create dataloader objects
train_dataloader = DataLoader(training_data, batch_size = 64)
test_dataloader = DataLoader(test_data, batch_size = 64)

In [5]:
size = len(train_dataloader.dataset)
size

60000

# Define NN architecture

In [6]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [7]:
# Check GPU availability
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using {} device'.format(device))

Using cpu device


In [8]:
model = NeuralNetwork().to(device)
print(summary(model, input_size = (1, 28, 28)))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
           Flatten-1                  [-1, 784]               0
            Linear-2                  [-1, 512]         401,920
              ReLU-3                  [-1, 512]               0
            Linear-4                  [-1, 512]         262,656
              ReLU-5                  [-1, 512]               0
            Linear-6                   [-1, 10]           5,130
              ReLU-7                   [-1, 10]               0
Total params: 669,706
Trainable params: 669,706
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.02
Params size (MB): 2.55
Estimated Total Size (MB): 2.58
----------------------------------------------------------------
None


In [9]:
model.parameters()

<generator object Module.parameters at 0x0000015CDF42E660>

# Hyperparameters 

Hyperparameters are adjustable parameters that let us control the model optimization process. Different hyperparameter values can impact model training and convergence rates ([`read more`](https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html) about hyperparameter tuning)

We define the following hyperparameters for training:
 - **Number of Epochs** - the number times to iterate over the dataset
 - **Batch Size** - the number of data samples seen by the model in each epoch
 - **Learning Rate** - how much to update models parameters at each batch/epoch. Smaller values yield slow learning speed, while large values may result in unpredictable behavior during training.

In [10]:
learning_rate = 1e-3
batch_size = 64
epochs = 5

# Optimization Loop


Once we set our hyperparameters, we can then train and optimize our model with an optimization loop. Each iteration of the optimization loop is called an **epoch**. 

Each epoch consists of two main parts:
 - **The Train Loop** - iterate over the training dataset and try to converge to optimal parameters.
 - **The Validation/Test Loop** - iterate over the test dataset to check if model performance is improving.

## Common Loss Functions

Common loss functions include:

* [`nn.MSELoss`](https://pytorch.org/docs/stable/generated/torch.nn.MSELoss.html#torch.nn.MSELoss) (Mean Square Error) for regression tasks,
* [`nn.NLLLoss`](https://pytorch.org/docs/stable/generated/torch.nn.NLLLoss.html#torch.nn.NLLLoss)(Negative Log Likelihood) for classification. 
* [`nn.CrossEntropyLoss`](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html#torch.nn.CrossEntropyLoss)combines ``nn.LogSoftmax`` and ``nn.NLLLoss``.

We pass our model's output logits to ``nn.CrossEntropyLoss``, which will normalize the logits and compute the prediction error.

In [11]:
# Initialize the loss function
loss_fn = nn.CrossEntropyLoss()

## Optimizer

All optimization logic is encapsulated in  the ``optimizer`` object. Here, we use the SGD optimizer; additionally, there are many [`different optimizers`](https://pytorch.org/docs/stable/optim.html)
available in PyTorch such as ADAM and RMSProp, that work better for different kinds of models and data.

We initialize the optimizer by registering the model's parameters that need to be trained, and passing in the learning rate hyperparameter.

In [12]:
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

Inside the training loop, optimization happens in three steps:
 * Call ``optimizer.zero_grad()`` to reset the gradients of model parameters. Gradients by default add up; to prevent double-counting, we explicitly zero them at each iteration.
 * Backpropagate the prediction loss with a call to ``loss.backwards()``. PyTorch deposits the gradients of the loss w.r.t. each parameter. 
 * Once we have our gradients, we call ``optimizer.step()`` to adjust the parameters by the gradients collected in the backward pass.  




# Define training and testing loops

We define a ``train_loop`` that loops over our optimization code, and a ``test_loop`` that evaluates the model's performance against our test data.

## Function: train_loop

In [13]:
def train_loop(dataloader, model, loss_fn, optimizer):
    
    size = len(dataloader.dataset) # Get size of training dataset
    for batch, (X, y) in enumerate(dataloader):  
        pred = model(X) # Forward-prop current batch through model
        loss = loss_fn(pred, y) # Obtain loss
        optimizer.zero_grad() # Reset gradients
        loss.backward() # Back-prop to compute gradients
        optimizer.step() # Update weights

        if batch % 100 == 0: # Print loss every 100 batches
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

## Verify function: train_loop

In [14]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
train_loop(train_dataloader, model, loss_fn, optimizer)

loss: 2.302107  [    0/60000]
loss: 2.294908  [ 6400/60000]
loss: 2.287379  [12800/60000]
loss: 2.296827  [19200/60000]
loss: 2.272985  [25600/60000]
loss: 2.254083  [32000/60000]
loss: 2.258187  [38400/60000]
loss: 2.229783  [44800/60000]
loss: 2.239270  [51200/60000]
loss: 2.225167  [57600/60000]


## Function: test_loop

In [15]:
def test_loop(dataloader, model, loss_fn):
    
    size = len(dataloader.dataset) # Get size of test set
    test_loss, correct = 0, 0

    with torch.no_grad(): # Turn off gradient computation to save memory
        for X, y in dataloader: # Iterate through dataloader
            pred = model(X) # Forward prop current batch through model
            test_loss += loss_fn(pred, y).item() # Update test loss
            correct += (pred.argmax(1) == y).type(torch.float).sum().item() # Update # correct preds
            
    test_loss /= size # Obtain averaged test_loss
    correct /= size # Obtain accuracy
    print(f"Test Error: \nAccuracy: {(100 * correct):>0.1f}%, Avg loss: {test_loss:>8f}\n")

In [16]:
test_loop(test_dataloader, model, loss_fn)

Test Error: 
Accuracy: 35.6%, Avg loss: 0.034966



## Train model

In [17]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr = learning_rate)

epochs = 10
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dataloader, model, loss_fn, optimizer)
    test_loop(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 2.217454  [    0/60000]
loss: 2.221717  [ 6400/60000]
loss: 2.192701  [12800/60000]
loss: 2.234955  [19200/60000]
loss: 2.167405  [25600/60000]
loss: 2.102466  [32000/60000]
loss: 2.150390  [38400/60000]
loss: 2.085717  [44800/60000]
loss: 2.113500  [51200/60000]
loss: 2.067840  [57600/60000]
Test Error: 
Accuracy: 41.7%, Avg loss: 0.032585

Epoch 2
-------------------------------
loss: 2.062531  [    0/60000]
loss: 2.059867  [ 6400/60000]
loss: 1.995784  [12800/60000]
loss: 2.085312  [19200/60000]
loss: 1.949120  [25600/60000]
loss: 1.862573  [32000/60000]
loss: 1.953682  [38400/60000]
loss: 1.846495  [44800/60000]
loss: 1.899884  [51200/60000]
loss: 1.826386  [57600/60000]
Test Error: 
Accuracy: 48.2%, Avg loss: 0.028861

Epoch 3
-------------------------------
loss: 1.817644  [    0/60000]
loss: 1.810714  [ 6400/60000]
loss: 1.714838  [12800/60000]
loss: 1.872805  [19200/60000]
loss: 1.685818  [25600/60000]
loss: 1.597331  [32000/60000]


# Further Reading

- [`Loss Functions`](https://pytorch.org/docs/stable/nn.html#loss-functions)
- [`torch.optim`](https://pytorch.org/docs/stable/optim.html)
- [`Warmstart`](https://pytorch.org/tutorials/recipes/recipes/warmstarting_model_using_parameters_from_a_different_model.html)