# 1. Data Preparation
1. `torch.utils.data.Dataset`
2. `torch.utils.data.DataLoader`
   - Batching the data.
   - Shuffling the data.
   - Loading the data in parallel using `multiprocessing` workers.

## 1-1. Cross Validation with Manual Separation

In [None]:
# fraction of the samples used for training (a proportion, not a percent)
propTraining = .8
nSamples = len(labels)
nTraining = int(nSamples*propTraining)

# boolean selector over all samples: True -> training, False -> test
traintestBool = np.full(nSamples,False)

# is this the correct way to select samples?
# traintestBool[range(nTraining)] = True

# this is better, but why?
# (a random draw without replacement does not depend on the order
#  in which the samples happen to be stored)
items2use4train = np.random.choice(range(nSamples),nTraining,replace=False)
traintestBool[items2use4train] = True

traintestBool

In [None]:
# test whether it's balanced
print('Average of full data:')
print( torch.mean(labels.float()) ) # =1 by definition
print(' ')

print('Average of training data:')
print( torch.mean(labels[traintestBool].float()) ) # should be 1...
print(' ')

print('Average of test data:')
print( torch.mean(labels[~traintestBool].float()) ) # should also be 1...

In [None]:
# entire dataset
print( data.shape )

# training set
print( data[traintestBool,:].shape )

# test set
print( data[~traintestBool,:].shape )

In [None]:
# create the ANN model

# layer stack: 4 input features -> 64 -> 64 -> 3 output units,
# with a ReLU after each of the first two linear layers
_irisLayers = [
    nn.Linear(4,64),
    nn.ReLU(),
    nn.Linear(64,64),
    nn.ReLU(),
    nn.Linear(64,3),
]
ANNiris = nn.Sequential(*_irisLayers)

# loss function (applied to the raw outputs of the last linear layer)
lossfun = nn.CrossEntropyLoss()

# plain SGD over all model parameters
optimizer = torch.optim.SGD(params=ANNiris.parameters(),lr=.01)

In [None]:
# train the model
#
# Full-batch training: every epoch runs one forward/backward pass over the
# whole training subset selected by traintestBool.

numepochs = 1000

# per-epoch loss (plain numbers) and per-epoch training accuracy (0-d tensors)
losses = torch.zeros(numepochs)
ongoingAcc = []

# the training subset is the same in every epoch, so select it once
trainData   = data[traintestBool,:]
trainLabels = labels[traintestBool]

# loop over epochs
for epochi in range(numepochs):

  # forward pass
  yHat = ANNiris(trainData)

  # compute accuracy: percentage of samples whose largest output matches the label
  ongoingAcc.append( 100*torch.mean(
              (torch.argmax(yHat,axis=1) == trainLabels).float()) )

  # compute loss
  loss = lossfun(yHat,trainLabels)

  # store only the value: assigning the loss tensor itself would attach
  # `losses` to the autograd graph and keep every epoch's graph in memory
  losses[epochi] = loss.item()

  # backprop
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

In [None]:
# compute train and test accuracies

# final forward pass USING TRAINING DATA
predictions = ANNiris(data[traintestBool,:])
trainacc = 100*torch.mean((torch.argmax(predictions,axis=1) == labels[traintestBool]).float())


# final forward pass USING TEST DATA!
predictions = ANNiris(data[~traintestBool,:])
testacc = 100*torch.mean((torch.argmax(predictions,axis=1) == labels[~traintestBool]).float())

In [None]:
# report accuracies

print('Final TRAIN accuracy: %g%%' %trainacc)
print('Final TEST accuracy:  %g%%' %testacc)

In [None]:
# [2]
# 10 samples x 4 features: row i holds [1,2,3,4] + 10*(i+1),
# so every row can be recognised by its tens digit
fakedata = np.array([1,2,3,4]) + 10*np.arange(1,11)[:,np.newaxis]

# boolean labels: False for the first five rows, True for the last five
fakelabels = np.arange(10) >= 5

print(fakedata)
print(' ')
print(fakelabels)

In [None]:
# partition sizes in proportion
partitions = np.array([.8,.1,.1])
print('Partition proportions:')
print(partitions)
print(' ')

# convert those into integers
partitionBnd = np.cumsum(partitions*len(fakelabels)).astype(int)
print('Partition boundaries:')
print(partitionBnd)
print(' ')


# random indices
randindices = np.random.permutation(range(len(fakelabels)))
print('Randomized data indices:')
print(randindices)
print(' ')

In [None]:
# cut the shuffled indices at the first two partition boundaries:
# [0,bnd0) -> train, [bnd0,bnd1) -> devset, [bnd1,end) -> test
trainIdx,devsetIdx,testIdx = np.split(randindices,partitionBnd[:2])

# select rows for the training data
train_dataN   = fakedata[trainIdx,:]
train_labelsN = fakelabels[trainIdx]

# select rows for the devset data
devset_dataN   = fakedata[devsetIdx,:]
devset_labelsN = fakelabels[devsetIdx]

# select rows for the test data
test_dataN   = fakedata[testIdx,:]
test_labelsN = fakelabels[testIdx]

In [None]:
# print out the sizes
print('Training data size: ' + str(train_dataN.shape))
print('Devset size: '        + str(devset_dataN.shape))
print('Test data size: '     + str(test_dataN.shape))
print(' ')

# print out the train/test data
print('Training data: ')
print(train_dataN)
print(' ')

print('Devset data: ')
print(devset_dataN)
print(' ')

print('Test data: ')
print(test_dataN)

## 1-2. Cross Validation with scikit-learn 

In [None]:
# [1]
from sklearn.model_selection import train_test_split

train_data,test_data, train_labels,test_labels = train_test_split(fakedata, fakelabels, test_size=.2)

# print out the sizes
print('Training data size: ' + str(train_data.shape))
print('Test data size: ' + str(test_data.shape))
print(' ')

# print out the train/test data
print('Training data: ')
print(train_data)
print(' ')

print('Test data: ')
print(test_data)

In [None]:
def createANewModel():
  """Build a fresh, untrained classifier plus its loss function and optimizer.

  The network maps 4 inputs to 3 outputs through two 64-unit linear layers,
  each followed by a ReLU.

  Returns:
    (model, loss function, optimizer): an nn.Sequential, an
    nn.CrossEntropyLoss, and an SGD optimizer (lr=.01) over the model's
    parameters.
  """

  # layer widths from input to output
  widths = [4,64,64,3]

  # linear layers, with a ReLU between consecutive ones (none after the output)
  modules = []
  for k in range(len(widths)-1):
    modules.append( nn.Linear(widths[k],widths[k+1]) )
    if k < len(widths)-2:
      modules.append( nn.ReLU() )
  net = nn.Sequential(*modules)

  # loss function
  criterion = nn.CrossEntropyLoss()

  # optimizer
  sgd = torch.optim.SGD(net.parameters(),lr=.01)

  return net,criterion,sgd

In [None]:
# train the model

# global parameter
numepochs = 200

def trainTheModel(trainProp):
  """Train the global model, drawing a new train/test split in every epoch.

  Reads the globals ANNiris, lossfun, optimizer, data, labels and numepochs.

  Because the split is redrawn each epoch, a sample that is in the test set
  in one epoch can be in the training set in another, so the reported test
  accuracy is not measured on data the model has never been trained on.

  Args:
    trainProp: passed to train_test_split as train_size (the TRAINING
      proportion, not the test proportion).

  Returns:
    (trainAcc, testAcc): two lists with one accuracy (in percent) per epoch.
  """

  # per-epoch accuracies
  trainAcc = []
  testAcc  = []

  # loop over epochs
  for epochi in range(numepochs):

    # separate train from test data
    # Note 1: unique split for each epoch!
    # Note 2: here we specify the training size, not the testing size!
    X_train,X_test, y_train,y_test = train_test_split(data,labels, train_size=trainProp)

    # forward pass and loss
    yHat = ANNiris(X_train)
    loss = lossfun(yHat,y_train)

    # backprop
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # training accuracy, from the predictions made before this epoch's update
    trainAcc.append( 100*torch.mean((torch.argmax(yHat,axis=1) == y_train).float()).item() )

    # test accuracy; no gradients are needed for evaluation
    with torch.no_grad():
      predlabels = torch.argmax( ANNiris(X_test),axis=1 )
    testAcc.append( 100*torch.mean((predlabels == y_test).float()).item() )

  # function output
  return trainAcc,testAcc

In [None]:
# create a model
ANNiris,lossfun,optimizer = createANewModel()

# train the model
# NOTE: the input is the training proportion, not the test proportion!
trainAcc,testAcc = trainTheModel(.8)

In [None]:
# plot the results
fig = plt.figure(figsize=(10,5))

plt.plot(trainAcc,'ro-')
plt.plot(testAcc,'bs-')
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')
plt.legend(['Train','Test'])
plt.show()

In [None]:
# ten training proportions, evenly spaced from 20% to 95%
trainSetSizes = np.linspace(.2,.95,10)

# one row per training proportion, one column per epoch
resultShape = (len(trainSetSizes),numepochs)
allTrainAcc = np.zeros(resultShape)
allTestAcc  = np.zeros(resultShape)

for i,thisTrainProp in enumerate(trainSetSizes):

  # start from a new model each time (trainTheModel reads these globals)
  ANNiris,lossfun,optimizer = createANewModel()

  # train it with this training proportion
  trainAcc,testAcc = trainTheModel(thisTrainProp)

  # keep the per-epoch accuracies
  allTrainAcc[i,:] = trainAcc
  allTestAcc[i,:]  = testAcc

In [None]:
fig,ax = plt.subplots(1,2,figsize=(13,5))

ax[0].imshow(allTrainAcc,aspect='auto',
             vmin=50,vmax=90, extent=[0,numepochs,trainSetSizes[-1],trainSetSizes[0]])
ax[0].set_xlabel('Epochs')
ax[0].set_ylabel('Training size proportion')
ax[0].set_title('Training accuracy')

p = ax[1].imshow(allTestAcc,aspect='auto',
             vmin=50,vmax=90, extent=[0,numepochs,trainSetSizes[-1],trainSetSizes[0]])
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Training size proportion')
ax[1].set_title('Test accuracy')
fig.colorbar(p,ax=ax[1])

plt.show()

In [None]:
# [2]
### create fake dataset (same as in previous videos)

fakedata = np.tile(np.array([1,2,3,4]),(10,1)) + np.tile(10*np.arange(1,11),(4,1)).T
fakelabels = np.arange(10)>4
print(fakedata), print(' ')
print(fakelabels)

In [None]:
# specify sizes of the partitions
# order is train,devset,test
partitions = [.8,.1,.1]

# first split: training data vs. everything else
# (note the third input, and the TMP in the variable name)
train_data,testTMP_data, train_labels,testTMP_labels = \
                   train_test_split(fakedata, fakelabels, train_size=partitions[0])

# second split: divide the TMP data into devset and test.
# train_size is a proportion of the data being split (the TMP data), not of
# the full dataset, so the devset share must be rescaled to the held-out part:
# .1/(.1+.1) = .5
split = partitions[1] / np.sum(partitions[1:])
devset_data,test_data, devset_labels,test_labels = \
              train_test_split(testTMP_data, testTMP_labels, train_size=split)

# print out the sizes
print('Training data size: ' + str(train_data.shape))
print('Devset data size: '   + str(devset_data.shape))
print('Test data size: '     + str(test_data.shape))
print(' ')

# print out the train/test data
print('Training data: ')
print(train_data)
print(' ')

print('Devset data: ')
print(devset_data)
print(' ')

print('Test data: ')
print(test_data)

## 1-3. Cross Validation with Dataset & DataLoader

In [None]:
# [1]
# create our fake dataset

fakedata = np.tile(np.array([1,2,3,4]),(10,1)) + np.tile(10*np.arange(1,11),(4,1)).T
fakelabels = np.arange(10)>4
print(fakedata), print(' ')
print(fakelabels)

In [None]:
# dataloader object with all data
fakedataLdr = DataLoader(fakedata, shuffle=True)
print( fakedataLdr )
print( fakedataLdr.batch_size )

In [None]:
# iterate through the data
for i,oneSample in enumerate(fakedataLdr):
  print(i,oneSample,oneSample.shape)

# but where are the labels??

In [None]:
# we need to create a Dataset that contains the data and labels
fakeDataset = torch.utils.data.TensorDataset(torch.Tensor(fakedata),torch.Tensor(fakelabels))
print( fakeDataset.tensors ), print(' ')

# then create another DataLoader
fakedataLdr = DataLoader(fakeDataset, shuffle=True)

# iterate through the data
for dat,lab in fakedataLdr:
  print(dat,lab)

In [None]:
# use scikitlearn to split the data
train_data,test_data, train_labels,test_labels = train_test_split(fakedata, fakelabels, test_size=.2)

# then convert them into PyTorch Datasets
train_data = torch.utils.data.TensorDataset(
     torch.Tensor(train_data),torch.Tensor(train_labels))

test_data = torch.utils.data.TensorDataset(
     torch.Tensor(test_data),torch.Tensor(test_labels))

# finally, translate into dataloader objects
# notice the batches (see next cell)!
train_loader = DataLoader(train_data,batch_size=4)
test_loader  = DataLoader(test_data)

In [None]:
# examine the contents of the dataloader (batching is an advantage of dataloader!)
print('TRAINING DATA')
for batch,label in train_loader: # iterable
  print(batch,label)
  print(' ')


print(' ')
print('TESTING DATA')
for batch,label in test_loader: # iterable
  print(batch,label)
  print(' ')

In [None]:
# [2] 
# use scikitlearn to split the data
train_data,test_data, train_labels,test_labels = \
                              train_test_split(data, labels, train_size=.8)


# then convert them into PyTorch Datasets (note: already converted to tensors)
train_data = torch.utils.data.TensorDataset(train_data,train_labels)
test_data  = torch.utils.data.TensorDataset(test_data,test_labels)


# finally, translate into dataloader objects
train_loader = DataLoader(train_data,shuffle=True,batch_size=12)
test_loader  = DataLoader(test_data,batch_size=test_data.tensors[0].shape[0])

In [None]:
# check sizes of data batches
for X,y in train_loader:
  print(X.shape,y.shape)

X,y

In [None]:
# a function that creates the ANN model

def createANewModel():
  """Return a new (model, loss function, optimizer) triple.

  The model is an nn.Sequential of three linear layers (4->64->64->3) with a
  ReLU after the first two; the loss is cross-entropy and the optimizer is
  SGD with lr=.01 over the model's parameters.
  """

  # the three linear layers, created in input-to-output order
  inputLayer  = nn.Linear(4,64)
  hiddenLayer = nn.Linear(64,64)
  outputLayer = nn.Linear(64,3)

  # model architecture
  net = nn.Sequential(inputLayer, nn.ReLU(), hiddenLayer, nn.ReLU(), outputLayer)

  # loss function and optimizer
  criterion = nn.CrossEntropyLoss()
  sgd = torch.optim.SGD(net.parameters(),lr=.01)

  return net,criterion,sgd

In [None]:
# train the model

# global parameter
numepochs = 500

def trainTheModel():
  """Train the global model in mini-batches and track accuracy per epoch.

  Reads the globals ANNiris, lossfun, optimizer, train_loader, test_loader
  and numepochs.

  Returns:
    (trainAcc, testAcc): per-epoch lists, in percent. Training accuracy is
    the mean of the per-batch accuracies; test accuracy is measured on the
    first batch produced by test_loader.
  """

  # per-epoch accuracies (losses are not stored here)
  trainAcc,testAcc = [],[]

  for _ in range(numepochs):

    # one pass over the training batches
    batchAcc = []
    for X,y in train_loader:

      # forward pass and loss
      yHat = ANNiris(X)
      loss = lossfun(yHat,y)

      # accuracy for this batch, from the predictions made before the update
      hits = (yHat.argmax(dim=1) == y).float()
      batchAcc.append( 100*hits.mean().item() )

      # backprop
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    # average training accuracy over the batches of this epoch
    trainAcc.append( np.mean(batchAcc) )

    # test accuracy on the first batch of the test dataloader
    X,y = next(iter(test_loader))
    testHits = (ANNiris(X).argmax(dim=1) == y).float()
    testAcc.append( 100*testHits.mean().item() )

  return trainAcc,testAcc

In [None]:
# create a model
ANNiris,lossfun,optimizer = createANewModel()

# train the model
trainAcc,testAcc = trainTheModel()

In [None]:
# plot the results
fig = plt.figure(figsize=(10,5))

plt.plot(trainAcc,'ro-')
plt.plot(testAcc,'bs-')
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')
plt.legend(['Train','Test'])

# optional zoom-in to final epochs
# plt.xlim([300,500])
# plt.ylim([90,100.5])

plt.show()

# 2. Model Building
## 2-1. Sequential
The `torch.nn.Sequential` class is easy to set up & read but with limited flexibility & interactivity. Suitable for creating small models.  
1. `torch.nn.Sequential(*args: Module)`: Modules will be added to the sequential container in the order they are passed in the constructor.
2. `torch.nn.Sequential(arg: OrderedDict[str, Module])`: Passes in an `OrderedDict` of modules.

In [2]:
import torch

# Positional form: modules run in the order they are passed and are named
# by their position ('0', '1', ...).
# Only `torch` is imported in this cell, so everything is referenced through
# `torch.nn` (a bare `nn` would be undefined here).
model = torch.nn.Sequential(
    torch.nn.Conv2d(1, 20, 5),
    torch.nn.ReLU(),
    torch.nn.Conv2d(20, 64, 5),
    torch.nn.ReLU()
)
model

Sequential(
  (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (1): ReLU()
  (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (3): ReLU()
)

In [5]:
from collections import OrderedDict

# Named form: each module is registered under the key it has in the
# OrderedDict, and modules run in insertion order.
named_layers = OrderedDict()
named_layers["conv_1"] = torch.nn.Conv2d(1, 20, 5)
named_layers["relu_1"] = torch.nn.ReLU()
named_layers["conv_2"] = torch.nn.Conv2d(20, 64, 5)
named_layers["relu_2"] = torch.nn.ReLU()

model = torch.nn.Sequential(named_layers)
model

Sequential(
  (conv_1): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
  (relu_1): ReLU()
  (conv_2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
  (relu_2): ReLU()
)

## 2-2. Class
Your model should subclass `torch.nn.Module`. `torch.nn` holds the basic building blocks for graphs.
1. `torch.nn.Module.parameters(recurse=True)`: Returns an iterator over module learnable parameters (weights & biases). This is typically passed to an optimizer.
2. `torch.nn.Module.named_parameters(prefix='', recurse=True, remove_duplicate=True)`: Returns an iterator over module parameters, yielding both the name of the parameter as well as the parameter itself.
3. `torch.nn.Module.to(device=None, dtype=None, non_blocking=False)`: Moves and/or casts the parameters and buffers.
4. `torch.nn.Module.load_state_dict(state_dict, strict=True, assign=False)`: Copies parameters and buffers from `state_dict` into this module and its descendants.
5. `torch.nn.Module.state_dict(*, destination: T_destination, prefix: str = '', keep_vars: bool = False)`: Returns a Python dictionary containing the module's parameters & persistent buffers.

In [14]:
class CNNModel(torch.nn.Module):
    """Two 5x5 convolutions (1 -> 20 -> 64 channels), each followed by a ReLU."""

    def __init__(self):
        super().__init__()

        # convolutions are created in the order they are applied
        first_conv = torch.nn.Conv2d(1, 20, 5)
        second_conv = torch.nn.Conv2d(20, 64, 5)

        self.conv_layers = torch.nn.Sequential(
            first_conv,
            torch.nn.ReLU(),
            second_conv,
            torch.nn.ReLU(),
        )

    def forward(self, X):
        """Apply the convolutional stack to X and return the result."""
        return self.conv_layers(X)

model = CNNModel()
model

CNNModel(
  (conv_layers): Sequential(
    (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
    (1): ReLU()
    (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
    (3): ReLU()
  )
)

In [17]:
# `parameters()`
for param in model.parameters():
    print(param)

Parameter containing:
tensor([[[[ 1.3241e-01,  1.5604e-01, -1.0033e-01,  1.9216e-01, -6.9605e-02],
          [ 1.7387e-01, -8.7238e-02, -8.7400e-02, -6.6454e-03, -1.3710e-02],
          [ 4.7818e-02, -1.3161e-01, -1.2073e-01, -8.0846e-02, -1.4806e-01],
          [ 1.5466e-01,  1.7643e-01,  6.8757e-02, -1.3546e-01,  8.1396e-02],
          [ 1.4838e-01, -1.3129e-01, -3.4942e-02,  1.3256e-01,  1.8711e-01]]],


        [[[-1.7838e-01,  9.0442e-02,  9.4969e-02,  1.9040e-01, -5.6426e-02],
          [ 1.4222e-01,  1.0127e-01, -9.1116e-03, -2.4985e-02, -1.6505e-01],
          [ 1.7914e-01, -7.6648e-02, -1.7809e-01,  1.2110e-01, -5.1342e-02],
          [ 2.2690e-02,  1.5465e-01, -8.5308e-02, -4.7745e-02,  3.1571e-02],
          [ 2.8867e-02, -3.0587e-02, -3.1570e-02, -1.9829e-01, -4.0238e-02]]],


        [[[ 1.3186e-01, -1.4602e-01,  9.5862e-02, -8.8489e-02, -9.6553e-02],
          [-1.3332e-01,  1.1675e-02,  1.6270e-01,  1.4342e-01,  1.4186e-02],
          [ 6.7934e-02,  1.0176e-02,  5.6962e-

In [22]:
# `named_parameters()`
for name, param in model.named_parameters():
    print("Name:", name)
    print(param)

Name: conv_layers.0.weight
Parameter containing:
tensor([[[[ 1.3241e-01,  1.5604e-01, -1.0033e-01,  1.9216e-01, -6.9605e-02],
          [ 1.7387e-01, -8.7238e-02, -8.7400e-02, -6.6454e-03, -1.3710e-02],
          [ 4.7818e-02, -1.3161e-01, -1.2073e-01, -8.0846e-02, -1.4806e-01],
          [ 1.5466e-01,  1.7643e-01,  6.8757e-02, -1.3546e-01,  8.1396e-02],
          [ 1.4838e-01, -1.3129e-01, -3.4942e-02,  1.3256e-01,  1.8711e-01]]],


        [[[-1.7838e-01,  9.0442e-02,  9.4969e-02,  1.9040e-01, -5.6426e-02],
          [ 1.4222e-01,  1.0127e-01, -9.1116e-03, -2.4985e-02, -1.6505e-01],
          [ 1.7914e-01, -7.6648e-02, -1.7809e-01,  1.2110e-01, -5.1342e-02],
          [ 2.2690e-02,  1.5465e-01, -8.5308e-02, -4.7745e-02,  3.1571e-02],
          [ 2.8867e-02, -3.0587e-02, -3.1570e-02, -1.9829e-01, -4.0238e-02]]],


        [[[ 1.3186e-01, -1.4602e-01,  9.5862e-02, -8.8489e-02, -9.6553e-02],
          [-1.3332e-01,  1.1675e-02,  1.6270e-01,  1.4342e-01,  1.4186e-02],
          [ 6.7934e

In [23]:
# `named_parameters()` & `data` attribute
for name, param in model.named_parameters():
    print("Name:", name)
    print("Parameters:", param.data)

Name: conv_layers.0.weight
Parameters: tensor([[[[ 1.3241e-01,  1.5604e-01, -1.0033e-01,  1.9216e-01, -6.9605e-02],
          [ 1.7387e-01, -8.7238e-02, -8.7400e-02, -6.6454e-03, -1.3710e-02],
          [ 4.7818e-02, -1.3161e-01, -1.2073e-01, -8.0846e-02, -1.4806e-01],
          [ 1.5466e-01,  1.7643e-01,  6.8757e-02, -1.3546e-01,  8.1396e-02],
          [ 1.4838e-01, -1.3129e-01, -3.4942e-02,  1.3256e-01,  1.8711e-01]]],


        [[[-1.7838e-01,  9.0442e-02,  9.4969e-02,  1.9040e-01, -5.6426e-02],
          [ 1.4222e-01,  1.0127e-01, -9.1116e-03, -2.4985e-02, -1.6505e-01],
          [ 1.7914e-01, -7.6648e-02, -1.7809e-01,  1.2110e-01, -5.1342e-02],
          [ 2.2690e-02,  1.5465e-01, -8.5308e-02, -4.7745e-02,  3.1571e-02],
          [ 2.8867e-02, -3.0587e-02, -3.1570e-02, -1.9829e-01, -4.0238e-02]]],


        [[[ 1.3186e-01, -1.4602e-01,  9.5862e-02, -8.8489e-02, -9.6553e-02],
          [-1.3332e-01,  1.1675e-02,  1.6270e-01,  1.4342e-01,  1.4186e-02],
          [ 6.7934e-02,  1.01

In [25]:
# `state_dict()`
model.state_dict()

OrderedDict([('conv_layers.0.weight',
              tensor([[[[ 1.3241e-01,  1.5604e-01, -1.0033e-01,  1.9216e-01, -6.9605e-02],
                        [ 1.7387e-01, -8.7238e-02, -8.7400e-02, -6.6454e-03, -1.3710e-02],
                        [ 4.7818e-02, -1.3161e-01, -1.2073e-01, -8.0846e-02, -1.4806e-01],
                        [ 1.5466e-01,  1.7643e-01,  6.8757e-02, -1.3546e-01,  8.1396e-02],
                        [ 1.4838e-01, -1.3129e-01, -3.4942e-02,  1.3256e-01,  1.8711e-01]]],
              
              
                      [[[-1.7838e-01,  9.0442e-02,  9.4969e-02,  1.9040e-01, -5.6426e-02],
                        [ 1.4222e-01,  1.0127e-01, -9.1116e-03, -2.4985e-02, -1.6505e-01],
                        [ 1.7914e-01, -7.6648e-02, -1.7809e-01,  1.2110e-01, -5.1342e-02],
                        [ 2.2690e-02,  1.5465e-01, -8.5308e-02, -4.7745e-02,  3.1571e-02],
                        [ 2.8867e-02, -3.0587e-02, -3.1570e-02, -1.9829e-01, -4.0238e-02]]],
              
   

## 2-3. Activation Functions

## 2-4. Summary
- [torchinfo](https://github.com/TylerYep/torchinfo)

In [28]:
from torchinfo import summary

summary(model)

Layer (type:depth-idx)                   Param #
CNNModel                                 --
├─Sequential: 1-1                        --
│    └─Conv2d: 2-1                       520
│    └─ReLU: 2-2                         --
│    └─Conv2d: 2-3                       32,064
│    └─ReLU: 2-4                         --
Total params: 32,584
Trainable params: 32,584
Non-trainable params: 0

In [29]:
summary(model, input_size=(32, 1, 28, 28))

Layer (type:depth-idx)                   Output Shape              Param #
CNNModel                                 [32, 64, 20, 20]          --
├─Sequential: 1-1                        [32, 64, 20, 20]          --
│    └─Conv2d: 2-1                       [32, 20, 24, 24]          520
│    └─ReLU: 2-2                         [32, 20, 24, 24]          --
│    └─Conv2d: 2-3                       [32, 64, 20, 20]          32,064
│    └─ReLU: 2-4                         [32, 64, 20, 20]          --
Total params: 32,584
Trainable params: 32,584
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 420.00
Input size (MB): 0.10
Forward/backward pass size (MB): 9.50
Params size (MB): 0.13
Estimated Total Size (MB): 9.73

# 3. Training
## 3-1. Loss Functions
1. Each PyTorch Loss function creates a criterion that measures the loss between `output` from a model and `target`, returning a `torch.Tensor`.
   - [Loss Functions](https://pytorch.org/docs/stable/nn.html#loss-functions)
   - Metrics supporting backpropagation (`is_differentiable == True`) in **TorchMetrics** can be used. More details in [Metrics & Differentiability](https://lightning.ai/docs/torchmetrics/stable/pages/overview.html#metrics-and-differentiability).
2. `torch.Tensor.backward(gradient=None, retain_graph=None, create_graph=False, inputs=None)`: Computes the gradient of the current tensor with respect to the graph leaves. The graph is differentiated using the chain rule. This function accumulates gradients in the leaves.

In [10]:
import torch
import torch.nn as nn

# criterion: mean squared error between its two arguments
loss = nn.MSELoss()

# stand-in for a model output (tracks gradients) and the target it is compared to
input = torch.randn(3, 5, requires_grad=True)
target = torch.randn(3, 5)
print(input)
print(target)

# the loss is a single value that carries a grad_fn
output = loss(input, target)
print(output)

# backward() fills input.grad with the gradient of the loss w.r.t. input
output.backward()
print(input.grad)

tensor([[-0.6368,  0.6902, -1.1642,  1.7340,  0.3089],
        [-0.5741, -2.2496,  0.1549,  0.2255, -0.5959],
        [ 0.6467,  0.2575,  1.8013,  0.4424, -0.4409]], requires_grad=True)
tensor([[-0.7584,  0.2138, -0.3933,  0.1054,  0.2149],
        [ 0.2793,  0.7960, -0.1902,  2.1231, -0.5180],
        [ 1.6648, -0.5898,  2.1185,  0.5762,  1.2281]])
tensor(1.4591, grad_fn=<MseLossBackward0>)
tensor([[ 0.0162,  0.0635, -0.1028,  0.2171,  0.0125],
        [-0.1138, -0.4061,  0.0460, -0.2530, -0.0104],
        [-0.1358,  0.1130, -0.0423, -0.0178, -0.2225]])


## 3-2. Optimizer
1. `torch.optim` implements various [Optimization Algorithms](https://pytorch.org/docs/stable/optim.html#algorithms).
2. `torch.optim.Optimizer.step(closure=None)`: Performs a single optimization step (parameter update).
    - For example, [torch.optim.SGD.step(closure=None)](https://github.com/pytorch/pytorch/blob/cd9b27231b51633e76e28b6a34002ab83b0660fc/torch/optim/sgd.py#L63).
3. `torch.optim.Optimizer.zero_grad(set_to_none=True)`: Resets the gradients of all optimized `torch.Tensor`s.
4. `torch.optim.Optimizer.load_state_dict(state_dict)`: Loads the optimizer state. Use this function when loading a general checkpoint for inference or resuming training.
5. `torch.optim.Optimizer.state_dict`: Contains information about the optimizer's state (parameters to be optimized), as well as the hyperparameters used.
6. `torch.optim.Optimizer.add_param_group(param_group)`: Adds a param group to the `Optimizer`'s `param_groups`. Use this function when fine-tuning a pre-trained network, as frozen layers can be made trainable and added to the `Optimizer` as training progresses.

## 3-3. Mini-Batch Gradient Descent

In [None]:
# [1]
# import dataset
import pandas as pd
iris = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv')

In [None]:
# plot the data
iris.plot(marker='o',linestyle='none',figsize=(12,6))
plt.xlabel('Sample number')
plt.ylabel('Value')
plt.show()

In [None]:
# organize the data

# features: the first four dataframe columns, as a float tensor
data = torch.tensor( iris.iloc[:,0:4].values ).float()

# targets: one integer class code per sample (long dtype)
labels = torch.zeros(len(data), dtype=torch.long)

# species name -> class code; 'setosa' maps to 0, which is already the
# initial value, so that assignment changes nothing
for classCode,speciesName in enumerate(['setosa','versicolor','virginica']):
  labels[iris.species==speciesName] = classCode

In [None]:
# use scikitlearn to split the data
train_data,test_data, train_labels,test_labels = train_test_split(data, labels, test_size=.2)


# then convert them into PyTorch Datasets (note: already converted to tensors)
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)


# finally, translate into dataloader objects
batchsize    = 16
train_loader = DataLoader(train_data,batch_size=batchsize,shuffle=True,drop_last=True)
test_loader  = DataLoader(test_data,batch_size=test_data.tensors[0].shape[0]) # how big should these batches be??

In [None]:
# check sizes of data batches
for X,y in train_loader:
  print(X.shape,y.shape)

# go back and set drop_last=True in training DataLoader

In [None]:
# a function that creates the ANN model

def createANewModel():
  """Create a new classifier together with its loss function and optimizer.

  Returns:
    (model, loss function, optimizer): a 4->64->64->3 nn.Sequential with ReLU
    after the first two linear layers, an nn.CrossEntropyLoss, and an SGD
    optimizer with lr=.0005 over the model's parameters.
  """

  # model architecture, listed in the order the modules are applied
  stack = [
      nn.Linear(4,64),   # input layer
      nn.ReLU(),
      nn.Linear(64,64),  # hidden layer
      nn.ReLU(),
      nn.Linear(64,3),   # output units
  ]
  net = nn.Sequential(*stack)

  # loss function
  criterion = nn.CrossEntropyLoss()

  # optimizer
  sgd = torch.optim.SGD(params=net.parameters(),lr=.0005)

  return net,criterion,sgd

In [None]:
# train the model

# global parameter
numepochs = 2500

def trainTheModel():
  """Train the global model in mini-batches; track accuracy and loss per epoch.

  Reads the globals ANNiris, lossfun, optimizer, train_loader, test_loader
  and numepochs.

  Returns:
    (trainAcc, testAcc, losses): per-epoch lists. trainAcc and losses are
    means over the training batches of the epoch; testAcc is measured on the
    first batch produced by test_loader. Accuracies are in percent.
  """

  # per-epoch results
  trainAcc,testAcc,losses = [],[],[]

  for _ in range(numepochs):

    # per-batch results for this epoch
    batchAcc,batchLoss = [],[]

    for X,y in train_loader:

      # forward pass and loss
      yHat = ANNiris(X)
      loss = lossfun(yHat,y)

      # metrics for this batch, from the predictions made before the update
      hits = (yHat.argmax(dim=1) == y).float()
      batchAcc.append( 100*hits.mean().item() )
      batchLoss.append( loss.item() )

      # backprop
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    # average over the batches of this epoch
    trainAcc.append( np.mean(batchAcc) )
    losses.append( np.mean(batchLoss) )

    # test accuracy on the first batch of the test dataloader
    X,y = next(iter(test_loader))
    testHits = (ANNiris(X).argmax(dim=1) == y).float()
    testAcc.append( 100*testHits.mean().item() )

  return trainAcc,testAcc,losses

In [None]:
# create a model
ANNiris,lossfun,optimizer = createANewModel()

# train the model
trainAcc,testAcc,losses = trainTheModel()

In [None]:
# plot the results
fig,ax = plt.subplots(1,2,figsize=(15,5))


ax[0].plot(losses,'k^-')
ax[0].set_ylabel('Loss')
ax[0].set_xlabel('Epochs')
ax[0].set_title('Losses with minibatch size=' + str(batchsize))

ax[1].plot(trainAcc,'ro-')
ax[1].plot(testAcc,'bs-')
ax[1].set_title('Accuracy with minibatch size=' + str(batchsize))
ax[1].set_xlabel('Epochs')
ax[1].set_ylabel('Accuracy (%)')
ax[1].legend(['Train','Test'])
ax[1].set_ylim([27,103])

plt.show()

In [None]:
# [2]
# create data: two noisy circular point clouds with different radii,
# one per class

# number of points per class
nPerClust = 200

# angles of the points, and the radius of each class
th = np.linspace(0,4*np.pi,nPerClust)
r1 = 10
r2 = 15

# generate data: [x-coordinates, y-coordinates] per class.
# The noise is scaled by 3 on the x-coordinate of class a and on the
# y-coordinate of class b.
a = [ r1*np.cos(th) + np.random.randn(nPerClust)*3 ,
      r1*np.sin(th) + np.random.randn(nPerClust) ]
b = [ r2*np.cos(th) + np.random.randn(nPerClust) ,
      r2*np.sin(th) + np.random.randn(nPerClust)*3 ]

# true labels: a column of nPerClust zeros (class a) stacked on nPerClust ones (class b)
labels_np = np.vstack((np.zeros((nPerClust,1)),np.ones((nPerClust,1))))

# concatenate into a matrix with one row per point: (2*nPerClust, 2)
data_np = np.hstack((a,b)).T

# convert to a pytorch tensor
data = torch.tensor(data_np).float()
labels = torch.tensor(labels_np).float()

# show the data: class 0 as blue squares, class 1 as black circles
fig = plt.figure(figsize=(5,5))
plt.plot(data[np.where(labels==0)[0],0],data[np.where(labels==0)[0],1],'bs')
plt.plot(data[np.where(labels==1)[0],0],data[np.where(labels==1)[0],1],'ko')
plt.title("The qwerties' doughnuts!")
plt.xlabel('qwerty dimension 1')
plt.ylabel('qwerty dimension 2')
plt.show()

In [None]:
# use scikitlearn to split the data
train_data,test_data, train_labels,test_labels = train_test_split(data, labels, test_size=.1)


# then convert them into PyTorch Datasets (note: already converted to tensors)
train_data = TensorDataset(train_data,train_labels)
test_data  = TensorDataset(test_data,test_labels)


# finally, translate into dataloader objects
train_batchsize = 16
test_batchsize  = test_data.tensors[0].shape[0]-2
train_loader = DataLoader(train_data,batch_size=train_batchsize,shuffle=True,drop_last=True)
test_loader  = DataLoader(test_data,batch_size=test_batchsize)

In [None]:
# check sizes of data batches
for X,y in test_loader:
  print(X.shape,y.shape)

In [None]:
class theModelClass(nn.Module):
  """Fully connected network: 2 inputs -> 128 -> 128 -> 1 output.

  The output layer has no activation, so forward() returns raw values.
  """

  def __init__(self):
    super().__init__()

    ### layers
    self.input  = nn.Linear(in_features=2,  out_features=128)
    self.hidden = nn.Linear(in_features=128,out_features=128)
    self.output = nn.Linear(in_features=128,out_features=1)

  # forward pass
  def forward(self,x):
    # ReLU after the input and hidden layers
    for layer in (self.input,self.hidden):
      x = F.relu( layer(x) )

    # output layer without activation
    return self.output(x)

In [None]:
# a function that creates the ANN model

def createANewModel():
  """Return a new theModelClass instance with its loss function and optimizer.

  Returns:
    (model, loss function, optimizer): the model, an nn.BCEWithLogitsLoss,
    and an SGD optimizer with lr=.01 over the model's parameters.
  """

  # new model instance
  net = theModelClass()

  # loss function
  criterion = nn.BCEWithLogitsLoss()

  # optimizer
  sgd = torch.optim.SGD(params=net.parameters(),lr=.01)

  return net,criterion,sgd

In [None]:
# train the model

# global parameter
numepochs = 500

# NOTE: this time, the model, lossfun, and optimizer are inputs into the function!
def trainTheModel(ANNQC,lossfun,optimizer):
  """Train a model in mini-batches and track train/test accuracy per epoch.

  Reads the notebook-level globals `numepochs`, `train_loader` and
  `test_loader`.

  Args:
    ANNQC: model to train; its outputs are thresholded at 0 to obtain the
      predicted class when computing accuracy.
    lossfun: loss function, called as lossfun(yHat, y).
    optimizer: optimizer that updates the model's parameters.

  Returns:
    tuple: (trainAcc, testAcc) -- two lists holding one accuracy value
    (in percent) per epoch.
  """

  # initialize accuracies as empties (not storing losses here)
  trainAcc = []
  testAcc  = []

  # loop over epochs
  for epochi in range(numepochs):

    # loop over training data batches
    batchAcc = []
    for X,y in train_loader:

      # forward pass and loss
      yHat = ANNQC(X)
      loss = lossfun(yHat,y)

      # backprop
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # compute training accuracy just for this batch
      # (uses the predictions made before this batch's weight update)
      batchAcc.append( 100*torch.mean(((yHat>0)==y).float()).item() )
    # end of batch loop...

    # now that we've trained through the batches, get their average training accuracy
    trainAcc.append( np.mean(batchAcc) )

    # test accuracy (NOTE: testing in batches!)
    tstacc = []
    # evaluation only: no gradients are needed, so skip building the autograd graph
    with torch.no_grad():
      for X,y in test_loader:
        yHat = ANNQC(X)
        tstacc.append( 100*torch.mean(((yHat>0) == y).float()).item() )
    # now get the average accuracy over test-batches
    # (every batch counts equally here, regardless of how many samples it holds)
    testAcc.append(np.mean(tstacc))

  # function output
  return trainAcc,testAcc


In [None]:
# create a model
# (a fresh model, loss function, and optimizer from the factory function)
ANNQC,lossfun,optimizer = createANewModel()

# train the model (note the inputs!)
# returns the per-epoch training and test accuracies
trainAcc,testAcc = trainTheModel(ANNQC,lossfun,optimizer)

In [None]:
# plot the results
fig = plt.figure(figsize=(10,5))

# per-epoch accuracies: training as blue squares, test as red circles
plt.plot(trainAcc,'bs')
plt.plot(testAcc,'ro')
plt.xlabel('Epochs')
plt.ylabel('Accuracy (%)')
plt.legend(['Train','Test'])

plt.show()

## 3-4. Gradient Accumulation

**Gradient Accumulation** means performing multiple backward passes before updating the parameters. The model's parameters stay the same across several inputs (batches) and are then updated once, based on the gradients accumulated over all of these batches, instead of being updated after every single batch. This technique is used to overcome GPU memory limitations when training neural networks.

Gradient accumulation adds gradients over an effective batch of size `batch_per_iter * iters_to_accumulate` (`* num_procs` if distributed). The steps are:
- Specify the `iters_to_accumulate` parameter, which sets how many batches are accumulated before each update of the network weights.
- Condition the weight update on the index of the running batch. This requires using `enumerate(DataLoader)` to get the batch index when looping through the data.
- Divide the running loss by `iters_to_accumulate`. This normalizes the loss to reduce the contribution of each mini-batch being processed. Depending on how you compute the loss, you might not need this step. If you average the loss within each batch, dividing by `iters_to_accumulate` is already the correct normalization and no extra normalization is needed.

In [None]:
# baseline training loop: the weights are updated after every single batch
# (`...` marks placeholders to be filled in)
optimizer = ...

for epoch in range(...):
    # loop through batches
    for inputs, labels in data_loader:
        # extract inputs and labels
        inputs = inputs.to(device)
        labels = labels.to(device)

        # passes and weights update
        with torch.set_grad_enabled(True):
            # forward pass
            preds = model(inputs)
            loss = criterion(preds, labels)

            # backward pass
            loss.backward()

            # weights update (then clear the gradients for the next batch)
            optimizer.step()
            optimizer.zero_grad()

In [None]:
# batch accumulation parameter
# (number of batches whose gradients are accumulated before each weight update)
iters_to_accumulate = 4
optimizer = ...

for epoch in range(...):
    # loop through enumerated batches
    for batch_idx, (inputs, labels) in enumerate(data_loader):
        # extract inputs and labels
        inputs = inputs.to(device)
        labels = labels.to(device)

        # passes and weights update
        with torch.set_grad_enabled(True):
            # forward pass
            preds = model(inputs)
            loss = criterion(preds, labels)

            # normalize loss to account for batch accumulation
            loss = loss / iters_to_accumulate

            # backward pass (gradients keep accumulating until zero_grad() is called)
            loss.backward()

            # weights update: every `iters_to_accumulate` batches, and also on the
            # last batch of the loader so that leftover gradients are applied
            if ((batch_idx + 1) % iters_to_accumulate == 0) or (batch_idx + 1 == len(data_loader)):
                optimizer.step()
                optimizer.zero_grad()

## 3-5. Automatic Mixed Precision

NVIDIA researchers developed a methodology that combines single-precision and half-precision floating-point numbers when training deep learning models, achieving the same level of accuracy as `float32` training. The main advantages include shorter training time, lower memory requirements, and support for larger batch sizes, models, and inputs.

In PyTorch, **Automatic Mixed Precision Training** means training with `torch.autocast` & `torch.amp.GradScaler` together.
- Instances of `torch.autocast` enable autocasting for chosen regions. Autocasting automatically chooses the precision for GPU operations to improve performance while maintaining accuracy.
- Instances of `torch.amp.GradScaler` help perform the steps of gradient scaling conveniently. Gradient scaling improves convergence for networks with `float16` gradients by minimizing gradient underflow.

In [None]:
import torch
# use the device-agnostic AMP entry points: `torch.cuda.amp.autocast` does not
# accept a `device_type` argument, whereas `torch.autocast` requires it
from torch import autocast
from torch.amp import GradScaler

# mixed-precision training loop
# (`model`, `optimizer`, `loss_fn`, `data` and `epochs` are defined elsewhere)
scaler = GradScaler("cuda")

for epoch in range(epochs):
    for input, target in data:
        optimizer.zero_grad()

        # run the forward pass and the loss computation under autocast
        with autocast(device_type='cuda', dtype=torch.float16):
            output = model(input)
            loss = loss_fn(output, target)

        # scale the loss before the backward pass to limit float16 gradient underflow
        scaler.scale(loss).backward()
        # step the optimizer through the scaler, then update the scale factor
        scaler.step(optimizer)
        scaler.update()

In [None]:
import torch
# use the device-agnostic AMP entry points: `torch.cuda.amp.autocast` does not
# accept a `device_type` argument, whereas `torch.autocast` requires it
from torch import autocast
from torch.amp import GradScaler

# mixed precision combined with gradient accumulation
# (`model`, `optimizer`, `loss_fn`, `data` and `epochs` are defined elsewhere)
scaler = GradScaler("cuda")
batch_size = 4
iters_to_accumulate = 16
# this means training will be done for an effective batch size of 4 * 16 = 64

for epoch in range(epochs):
    for batch_idx, (input, target) in enumerate(data):
        with autocast(device_type='cuda', dtype=torch.float16):
            output = model(input)
            loss = loss_fn(output, target)
            # normalize the loss to account for the accumulation
            loss = loss / iters_to_accumulate

        # scaled gradients accumulate across iterations until zero_grad() is called
        scaler.scale(loss).backward()

        # update every `iters_to_accumulate` batches, and also on the last batch so
        # that a trailing partial group is applied instead of leaking into the next epoch
        if ((batch_idx + 1) % iters_to_accumulate == 0) or (batch_idx + 1 == len(data)):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

# 4. Evaluation

# 5. Save & Reload

# 6. Inference

# 7. Optimization
PyTorch introduces `torch.compile` to speed up model training and inference.

1. `torch.compile(model=None, *, fullgraph=False, dynamic=None, backend='inductor', mode=None, options=None, disable=False)`: Optimizes given model/function using TorchDynamo & specified backend.
   - Running TorchInductor on GPU requires [Triton](https://github.com/triton-lang/triton). Check the installation instructions. Update NVIDIA drivers & PyTorch to the latest versions.
   - An arbitrary Python function can be optimized by passing the callable to `torch.compile` or decorating the function with `@torch.compile`.
   - Nested function calls & submodules will also be compiled. Or you can disable compilation using `@torch.compiler.disable(recursive=False)`.
   - [TorchDynamo APIs for Fine-Grained Tracing](https://pytorch.org/docs/stable/torch.compiler_fine_grain_apis.html#torchdynamo-apis-for-fine-grained-tracing)

Best practices:
- **Top-Level Compilation:** One approach is to compile at the highest level possible (i.e., when the top-level module is initialized/called) and selectively disable compilation when encountering excessive graph breaks or errors. If there are still many compile issues, compile individual subcomponents instead.
- **Modular Testing:** Test individual functions and modules with `torch.compile` before integrating them into larger models to isolate potential issues.
- **Disable Compilation Selectively:** If certain functions or sub-modules cannot be handled by `torch.compile`, use the `torch.compiler.disable` context managers to recursively exclude them from compilation.
- **Compile Leaf Functions First:** In complex models with multiple nested functions and modules, start by compiling the leaf functions or modules first. For more information see TorchDynamo APIs for fine-grained tracing.

In [30]:
# `torch.compile()`
def foo(x, y):
    """Return sin(x) + cos(y), computed element-wise."""
    return torch.sin(x) + torch.cos(y)

# wrap the plain function with torch.compile, then call the result like the original
opt_foo1 = torch.compile(foo)
print(opt_foo1(torch.randn(10, 10), torch.randn(10, 10)))

refreshing <module 'torch.ops.quantized_decomposed' from 'torch.ops'> quantize_per_tensor
refreshing <module 'torch.ops.quantized_decomposed' from 'torch.ops'> quantize_per_tensor
refreshing <module 'torch.ops.quantized_decomposed' from 'torch.ops'> dequantize_per_tensor
refreshing <module 'torch.ops.quantized_decomposed' from 'torch.ops'> dequantize_per_tensor
tensor([[ 8.6933e-01,  7.5847e-04,  3.9209e-01,  8.7964e-01,  9.0168e-02,
          1.8997e+00,  6.3927e-02,  1.9821e+00,  1.0241e+00,  2.5646e-01],
        [ 5.8212e-02,  1.1917e+00, -3.9244e-01,  4.9394e-01,  1.0135e+00,
         -5.8056e-02,  4.3636e-01,  3.7796e-02, -8.9714e-01,  7.5028e-01],
        [ 8.4038e-01,  5.7328e-01, -9.5642e-02,  4.7605e-02,  2.5690e-01,
         -1.4153e-01,  6.5757e-01,  4.1083e-01,  1.5635e+00,  9.8605e-01],
        [ 3.4656e-01,  2.0036e-01,  8.9890e-02,  1.6248e+00,  6.8143e-01,
          1.7246e+00,  3.4969e-01, -1.1370e+00,  1.7010e+00, -6.9169e-02],
        [-1.8587e-01,  1.9307e+00,  1.60

In [31]:
# `@torch.compile`
# random inputs (also reused by the nested-function example in the next cell)
t1 = torch.randn(10, 10)
t2 = torch.randn(10, 10)

# same computation as `foo`, this time compiled by using torch.compile as a decorator
@torch.compile
def opt_foo2(x, y):
    """Return sin(x) + cos(y)."""
    a = torch.sin(x)
    b = torch.cos(y)
    return a + b
print(opt_foo2(t1, t2))

tensor([[-4.5126e-01,  8.7730e-02, -1.5393e-03,  6.8282e-01,  1.5495e+00,
         -1.1766e-01,  4.8922e-01,  5.2666e-01,  2.9069e-01, -3.9022e-02],
        [ 1.4289e+00,  6.8740e-02,  6.8203e-01,  1.4575e+00,  1.6099e+00,
          1.9043e+00,  9.8577e-01, -1.5439e-01,  1.4233e+00, -2.6519e-01],
        [ 4.2308e-01,  1.8493e+00,  3.9818e-01,  4.7927e-01, -9.0105e-02,
          6.0482e-01,  1.2752e+00, -2.6861e-01, -3.1696e-01,  7.3602e-01],
        [ 8.0196e-01,  3.5524e-01,  1.8555e+00,  9.0838e-01, -3.1011e-01,
         -7.7550e-01,  9.3343e-01,  7.4526e-01, -3.8135e-02,  6.0181e-01],
        [ 1.6538e+00,  1.3102e+00,  3.3101e-01,  1.7505e+00,  4.6171e-01,
          8.5225e-01,  9.6259e-01,  5.2586e-01,  1.6967e+00, -2.6893e-01],
        [ 7.6992e-01,  1.0677e+00,  1.1792e-01,  1.1183e-01,  1.2136e+00,
          9.9022e-01,  1.0098e+00,  4.9332e-01, -8.8015e-01,  1.0438e+00],
        [ 1.6655e+00,  1.8387e+00,  5.5634e-01,  1.4005e-01, -6.5086e-01,
          8.6880e-01,  1.5779e+0

In [33]:
# `@torch.compile` with nested functions
# plain (undecorated) helper that is called from inside the compiled function
def nested_function(x):
    """Return sin(x)."""
    return torch.sin(x)

# only the outer function carries the decorator
@torch.compile
def outer_function(x, y):
    """Return nested_function(x) + cos(y)."""
    a = nested_function(x)
    b = torch.cos(y)
    return a + b

# the recorded output matches the one printed for opt_foo2(t1, t2)
print(outer_function(t1, t2))

tensor([[-4.5126e-01,  8.7730e-02, -1.5393e-03,  6.8282e-01,  1.5495e+00,
         -1.1766e-01,  4.8922e-01,  5.2666e-01,  2.9069e-01, -3.9022e-02],
        [ 1.4289e+00,  6.8740e-02,  6.8203e-01,  1.4575e+00,  1.6099e+00,
          1.9043e+00,  9.8577e-01, -1.5439e-01,  1.4233e+00, -2.6519e-01],
        [ 4.2308e-01,  1.8493e+00,  3.9818e-01,  4.7927e-01, -9.0105e-02,
          6.0482e-01,  1.2752e+00, -2.6861e-01, -3.1696e-01,  7.3602e-01],
        [ 8.0196e-01,  3.5524e-01,  1.8555e+00,  9.0838e-01, -3.1011e-01,
         -7.7550e-01,  9.3343e-01,  7.4526e-01, -3.8135e-02,  6.0181e-01],
        [ 1.6538e+00,  1.3102e+00,  3.3101e-01,  1.7505e+00,  4.6171e-01,
          8.5225e-01,  9.6259e-01,  5.2586e-01,  1.6967e+00, -2.6893e-01],
        [ 7.6992e-01,  1.0677e+00,  1.1792e-01,  1.1183e-01,  1.2136e+00,
          9.9022e-01,  1.0098e+00,  4.9332e-01, -8.8015e-01,  1.0438e+00],
        [ 1.6655e+00,  1.8387e+00,  5.5634e-01,  1.4005e-01, -6.5086e-01,
          8.6880e-01,  1.5779e+0

In [32]:
# `torch.compile()` with `torch.nn.Module` instances
# random input of shape (10, 100); also passed to the OuterModule example in a later cell
t = torch.randn(10, 100)

class MyModule(torch.nn.Module):
    """A single linear layer (100 -> 10 features) followed by ReLU."""

    def __init__(self):
        super().__init__()
        self.lin = torch.nn.Linear(100, 10)

    def forward(self, x):
        """Return relu(lin(x))."""
        projected = self.lin(x)
        return torch.nn.functional.relu(projected)

# instantiate the module and compile it; the compiled module is called like the original
mod = MyModule()
opt_mod = torch.compile(mod)
print(opt_mod(t))

tensor([[0.5393, 0.2917, 0.1451, 0.0000, 0.0000, 0.0000, 0.0000, 0.4562, 0.3549,
         0.5634],
        [0.2787, 0.0000, 0.0000, 0.1695, 0.4874, 0.0000, 0.0000, 0.0000, 0.9075,
         0.3468],
        [0.1160, 0.0000, 0.0000, 0.7212, 0.6043, 0.0000, 0.0000, 0.0000, 0.0000,
         0.4435],
        [0.0000, 0.0000, 0.0000, 0.6279, 0.1041, 0.0000, 0.0000, 0.0000, 0.4334,
         0.0000],
        [0.0119, 0.0000, 1.0767, 0.7205, 0.3798, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000],
        [0.2850, 0.6219, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0924, 0.6958,
         0.3071],
        [0.0000, 0.3307, 0.1828, 0.0000, 0.0000, 0.0000, 0.8457, 0.0000, 0.0000,
         0.2721],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.1516, 0.0000, 0.2325, 0.3455,
         0.0000],
        [0.0000, 1.0032, 0.0000, 0.0000, 0.0000, 0.0000, 0.7204, 0.0000, 0.0000,
         0.4558],
        [0.0000, 0.0000, 0.0000, 1.2350, 0.6283, 0.6489, 0.0000, 0.6369, 0.1068,
         0.0835]], grad_fn=<

In [34]:
# `torch.compile()` with submodules
class OuterModule(torch.nn.Module):
    """Wraps a MyModule submodule and adds a linear layer (10 -> 2) with ReLU."""

    def __init__(self):
        super().__init__()
        self.inner_module = MyModule()
        self.outer_lin = torch.nn.Linear(10, 2)

    def forward(self, x):
        """Run the inner module, then return relu(outer_lin(...)) of its output."""
        inner_out = self.inner_module(x)
        outer_out = self.outer_lin(inner_out)
        return torch.nn.functional.relu(outer_out)

# compile only the outer module; its forward pass calls the MyModule submodule
outer_mod = OuterModule()
opt_outer_mod = torch.compile(outer_mod)
print(opt_outer_mod(t))

tensor([[0.0000, 0.0747],
        [0.0000, 0.1943],
        [0.0000, 0.2722],
        [0.0000, 0.2669],
        [0.0000, 0.4329],
        [0.0000, 0.0000],
        [0.0000, 0.0878],
        [0.0000, 0.2325],
        [0.0000, 0.2807],
        [0.0000, 0.6843]], grad_fn=<CompiledFunctionBackward>)


In [1]:
# Demonstrate Speedups
def timed(fn):
    """Run `fn()` once and time it with a pair of CUDA events.

    Returns:
        tuple: (result of fn(), time elapsed between the two events divided by 1000).
    """
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    result = fn()
    end.record()
    # wait for outstanding GPU work before reading the elapsed time
    torch.cuda.synchronize()
    # elapsed_time() is documented to return milliseconds, so this value is in seconds
    return result, start.elapsed_time(end) / 1000

# Generates random input and targets data for the model, where `b` is batch size.
def generate_data(b):
    """Return a tuple (inputs, targets) of random tensors moved to the GPU.

    inputs: float32 tensor of shape (b, 3, 128, 128).
    targets: integer tensor of shape (b,) with values in [0, 1000).
    """
    return (
        torch.randn(b, 3, 128, 128).to(torch.float32).cuda(),
        torch.randint(1000, (b,)).cuda(),
    )

N_ITERS = 10

from torchvision.models import densenet121
def init_model():
    """Return a new densenet121 model converted to float32 and moved to the GPU."""
    return densenet121().to(torch.float32).cuda()

In [3]:
import numpy as np
import torch

# [1] Eager
model = init_model()
opt = torch.optim.Adam(model.parameters())

# one training step: zero the gradients, forward pass, cross-entropy loss,
# backward pass, optimizer update
# NOTE: `opt` is looked up as a global at call time, so rebinding `opt` further
# down changes which optimizer this function steps
def train(mod, data):
    opt.zero_grad(True)
    pred = mod(data[0])
    loss = torch.nn.CrossEntropyLoss()(pred, data[1])
    loss.backward()
    opt.step()

# time N_ITERS eager training steps, each on a fresh random batch of 16 samples
eager_times = []
for i in range(N_ITERS):
    inp = generate_data(16)
    _, eager_time = timed(lambda: train(model, inp))
    eager_times.append(eager_time)
    print(f"eager train time {i}: {eager_time}")
print("~" * 10)

# [2] `torch.compile()`
# start again from a fresh model and optimizer, and compile the whole training step
model = init_model()
opt = torch.optim.Adam(model.parameters())
train_opt = torch.compile(train, mode="reduce-overhead")

# same timing loop, now calling the compiled training step
# (in the recorded output, the first two iterations take far longer than the rest)
compile_times = []
for i in range(N_ITERS):
    inp = generate_data(16)
    _, compile_time = timed(lambda: train_opt(model, inp))
    compile_times.append(compile_time)
    print(f"compile train time {i}: {compile_time}")
print("~" * 10)

# compare median step times, so the slow first compiled iterations do not dominate
eager_med = np.median(eager_times)
compile_med = np.median(compile_times)
speedup = eager_med / compile_med
# stops the cell if the compiled median is not faster than the eager median
assert(speedup > 1)
print(f"(train) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x")
print("~" * 10)

eager train time 0: 0.02529996871948242
eager train time 1: 0.021381824493408204
eager train time 2: 0.023163679122924805
eager train time 3: 0.022937599182128905
eager train time 4: 0.022953983306884765
eager train time 5: 0.022991840362548827
eager train time 6: 0.023912256240844726
eager train time 7: 0.023015552520751954
eager train time 8: 0.022227968215942383
eager train time 9: 0.019496864318847656
~~~~~~~~~~
compile train time 0: 51.86091796875
compile train time 1: 5.8743623046875
compile train time 2: 0.018679807662963867
compile train time 3: 0.01844121551513672
compile train time 4: 0.017512447357177736
compile train time 5: 0.0174704647064209
compile train time 6: 0.017583103179931642
compile train time 7: 0.017505151748657227
compile train time 8: 0.017543167114257813
compile train time 9: 0.0174653434753418
~~~~~~~~~~
(train) eager median: 0.022972911834716794, compile median: 0.017563135147094726, speedup: 1.308018849841678x
~~~~~~~~~~


# 8. PyTorch Lightning Trainer
1. `pytorch_lightning.Trainer(*, accelerator='auto', strategy='auto', devices='auto', num_nodes=1, precision=None, logger=None, callbacks=None, fast_dev_run=False, max_epochs=None, min_epochs=None, max_steps=-1, min_steps=None, max_time=None, limit_train_batches=None, limit_val_batches=None, limit_test_batches=None, limit_predict_batches=None, overfit_batches=0.0, val_check_interval=None, check_val_every_n_epoch=1, num_sanity_val_steps=None, log_every_n_steps=None, enable_checkpointing=None, enable_progress_bar=None, enable_model_summary=None, accumulate_grad_batches=1, gradient_clip_val=None, gradient_clip_algorithm=None, deterministic=None, benchmark=None, inference_mode=True, use_distributed_sampler=True, profiler=None, detect_anomaly=False, barebones=False, plugins=None, sync_batchnorm=False, reload_dataloaders_every_n_epochs=0, default_root_dir=None)`
   - [Trainer Class API](https://lightning.ai/docs/pytorch/stable/common/trainer.html#trainer-class-api)
2. `Trainer.fit()`
3. `Trainer.validate()`
4. `Trainer.test()`
5. `Trainer.predict()`

In [None]:
# instantiate the LightningModule to train
model = MyLightningModule()

# create a Trainer without overriding any arguments, then fit on the
# training and validation dataloaders
trainer = Trainer()
trainer.fit(model, train_dataloader, val_dataloader)

In [None]:
# run the validation loop on the same validation dataloader that was passed to `trainer.fit`
trainer.validate(model=model, dataloaders=val_dataloader)
# run the test loop on the test dataloader
# NOTE(review): `test_dataloader` is assumed to be defined elsewhere -- confirm
trainer.test(dataloaders=test_dataloader)

# 9. Accelerate Accelerator
**Accelerate** is a library, developed by Hugging Face, that makes training & inference at scale simple, efficient & adaptable. 3 main features of Accelerate:
- **Unified Launch Interface:** A unified command line launching interface for distributed training scripts.
- **Adapt Training Code:** Enables the same PyTorch code to be run across different distributed configurations.
- **Big Model Inference:** Loads large models for inference that typically don't fit into memory.
1. `accelerate.Accelerator(gradient_accumulation_steps)`

In [None]:
# run the `accelerate config` command-line tool from the notebook
!accelerate config

In [None]:
from accelerate import Accelerator

# create the Accelerator object used by the training code in the following cells
accelerator = Accelerator()

In [None]:
# device selected by the Accelerator
device = accelerator.device

# pass the training objects through `prepare` and keep the returned versions
# (they come back in the same order in which they were passed)
model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)

In [None]:
# training loop over the objects returned by `accelerator.prepare` in the previous
# cell (`train_dataloader` and `lr_scheduler` are the names bound there)
for inputs, targets in train_dataloader:
    # the manual device transfers are intentionally left out:
    #   inputs = inputs.to(device)
    #   targets = targets.to(device)
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    # `accelerator.backward(loss)` is used in place of `loss.backward()`
    accelerator.backward(loss)
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

In [None]:
# Calculate metrics
import evaluate
from accelerate import Accelerator

accelerator = Accelerator()
# pass every training/evaluation object through `prepare` and keep the returned versions
train_dataloader, eval_dataloader, model, optimizer, scheduler = (
    accelerator.prepare(
        train_dataloader, eval_dataloader,
        model, optimizer, scheduler
    )
)

metric = evaluate.load("accuracy")

# training pass
for inputs, targets in train_dataloader:
    # the manual device transfers are intentionally left out:
    #   inputs = inputs.to(device)
    #   targets = targets.to(device)
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    # use the Accelerator's backward, consistent with the other Accelerate loops
    accelerator.backward(loss)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

# evaluation pass
model.eval()
for inputs, targets in eval_dataloader:
    # no gradients are needed for evaluation
    with torch.no_grad():
        outputs = model(inputs)
    predictions = outputs.argmax(dim=-1)
    # gather the predictions and their labels for metric computation;
    # the batch labels (`targets`) are the references the metric compares against
    predictions, references = accelerator.gather_for_metrics(
        (predictions, targets)
    )
    metric.add_batch(
        predictions = predictions,
        references = references
    )
print(metric.compute())

In [None]:
# Gradient accumulation
from accelerate import Accelerator

# tell the Accelerator how many steps gradients should be accumulated over
accelerator = Accelerator(gradient_accumulation_steps=2)
dataloader, model, optimizer, scheduler = accelerator.prepare(dataloader, model, optimizer, scheduler)

for batch in dataloader:
    # run the whole training step inside the accumulate() context for this model
    with accelerator.accumulate(model):
        inputs, targets = batch
        outputs = model(inputs)
        loss = loss_function(outputs, targets)
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

In [None]:
# Checkpointing
from accelerate import Accelerator

accelerator = Accelerator()
dataloader, model, optimizer, scheduler = accelerator.prepare(
    dataloader, model, optimizer, scheduler
)

# regular training loop
for batch in dataloader:
    inputs, targets = batch
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    accelerator.backward(loss)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

# save the current training state under "checkpoint_dir"
accelerator.save_state("checkpoint_dir")
# load the training state back from "checkpoint_dir"
accelerator.load_state("checkpoint_dir")

In [None]:
# Experiment Tracking
from accelerate import Accelerator

# select the tracking backend when creating the Accelerator
accelerator = Accelerator(log_with="wandb")
train_dataloader, model, optimizer, scheduler = accelerator.prepare(
    dataloader, model, optimizer, scheduler
)
# start the tracker(s); `init_trackers` requires a project name
# ("my_project" is a placeholder -- replace it with the real project name)
accelerator.init_trackers("my_project")
model.train()
for batch in train_dataloader:
    inputs, targets = batch
    outputs = model(inputs)
    loss = loss_function(outputs, targets)
    # send the current loss to the tracker(s)
    accelerator.log({"loss":loss})
    accelerator.backward(loss)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
# signal the tracker(s) that training has finished
accelerator.end_training()

# 10. Experiment Tracking