In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from functools import partial
from pathlib import Path
import torchvision

from scripts.dataloader import Dataset, Transforms, Resize, ToTorch, Sampler, collate, DataLoader
from scripts.custom_models import Lambda, flatten, MixedInputModel, TabularModel, CNNModel, CustomResnet

# Part 0 - Importing our data

We will be using our custom dataloaders for many of the subsequent installments, so we will create a function to fetch these variables easily moving forward.  The final thing that we need to add, however, is a validation set and a dataloader for it.  We can accomplish this using the 'indices' input from our custom dataset class. 

In [38]:
#export
def get_example_dataloaders(bs = 16, train_frac = 0.8, seed = None):
    """Build the training and validation dataloaders for the example dataset.

    The processed dataframe is read once to draw a random train/validation
    split of its row index; the two index sets are then handed to the custom
    `Dataset` class through its `indices` argument.

    Args:
        bs: batch size passed to both samplers (default 16).
        train_frac: fraction of rows sampled into the training set
            (default 0.8); the remaining rows form the validation set.
        seed: optional `random_state` for the split.  The default `None`
            draws a different split on every call; pass an int to make the
            split reproducible.

    Returns:
        (dl_train, dl_valid): the training and validation `DataLoader`s.
    """
    df_path = r'data/processed_dataframe.csv'
    image_path = Path(r'data/Images')
    tfms = Transforms([Resize(256), ToTorch()])

    # Column configuration shared by both datasets.
    # NOTE(review): 'day_name' sounds categorical -- confirm that it is
    # numerically encoded in the processed CSV before treating it as continuous.
    ds_kwargs = dict(img_col = 'filename',
                     cont_cols = ['followers', 'following', 'engagement_factor_std',
                                  'month', 'year', 'day_name', 'hour'],
                     cat_cols = [],
                     target_col = 'engagement_factor_moving_avg',
                     image_path = image_path,
                     transforms = tfms)

    # Read from the same path the datasets use, so the split indices always
    # refer to the file the datasets load.
    df = pd.read_csv(df_path)
    train_idx = df.sample(frac = train_frac, random_state = seed).index
    valid_idx = df.loc[~df.index.isin(train_idx)].index

    ds_train = Dataset(df_path, indices = train_idx, **ds_kwargs)
    ds_valid = Dataset(df_path, indices = valid_idx, **ds_kwargs)

    dl_train = DataLoader(dataset = ds_train,
                          sampler = Sampler(ds_train, bs = bs),
                          collate_func = collate)

    dl_valid = DataLoader(dataset = ds_valid,
                          sampler = Sampler(ds_valid, bs = bs),
                          collate_func = collate)

    return dl_train, dl_valid

In [39]:
# Build the train/validation dataloaders used by the cells below.
dl_train, dl_valid = get_example_dataloaders()

In [5]:
# Pull a single batch so the input sizes can be read off the data itself.
xb, yb = next(iter(dl_train))
x_image, x_tab = xb   # xb is a pair: (image batch, tabular batch)

# The image batch is unpacked as 4-D and the tabular batch as 2-D
# (presumably (batch, channels, height, width) and (batch, features) -- TODO confirm).
bs, ch_img, h_img, w_img = x_image.shape
bs, tab_inputs = x_tab.shape

# Number of outputs each sub-model contributes to the combined head.
num_cnn_outputs = 10
num_tabular_outputs = 4

# The mixed model receives the two sub-model output counts added together.
num_mixed_inputs = num_cnn_outputs + num_tabular_outputs


# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` -- confirm against the installed version.
# The list arguments are presumably layer sizes (input -> hidden... -> output); verify in scripts.custom_models.
input_cnn_model = CustomResnet(torchvision.models.resnet34(pretrained = True), [1000,50,20, num_cnn_outputs])
input_tabular_model = TabularModel([tab_inputs, 10, num_tabular_outputs])
input_mixed_model = TabularModel([num_mixed_inputs, 7, 1])

model = MixedInputModel(input_cnn_model, input_tabular_model, input_mixed_model)

# Step 1 - Overview of The Training Loop

Training a cycle of a deep learning model requires a set of consistent steps that are fairly data-type agnostic.  These are: 
1. Load the xb and yb using a **Dataloader**
2. Run xb through the **model**
3. Assign how differences between the model's predictions and the target data (yb) are calculated as loss using a **loss function**
4. Starting with the loss, use a **backpropagation method** to traverse through the network to determine the gradient of each parameter with respect to the final loss tensor
5. Update each parameter accordingly using an **optimizer**
6. Reset the gradients of each parameter to prepare for the next cycle

We have already created many of the components required, but still require a **loss function**, a **backpropagation method**, and an **optimizer**.  Those will be the subject of today's installment.  

Before we continue, we should take a quick look at the output from our model.  With a batch size of 16 and a single prediction for each item, we are left with a prediction tensor of size (16,1).  Our targets (yb) are of shape (16), so we should make sure to squeeze out the extra dimension from our model's output before doing any operations between the two; otherwise broadcasting would silently produce a (16,16) result.

In [6]:
# Forward pass on the sample batch, then compare prediction and target shapes.
preds = model(xb)
preds.shape, yb.shape

(torch.Size([16, 1]), torch.Size([16]))

In [7]:
# Squeeze only the last dimension so the batch dimension is never touched.
torch.squeeze(preds, -1).shape

torch.Size([16])

# Step 2 - The Loss Function

One of the most critical decisions in a deep learning model is choosing how differences between the model's outputs and the true values should be penalized.  This is particularly important because the objective of training is to reduce this loss value to as small a number as possible over many cycles and epochs of training.  Therefore, the loss values calculated by the loss function should ideally be directly correlated with your desired model performance, with a decrease in loss always representing an increase in performance for your purposes.  

Our example is a simple regression problem, which means that we have a few options including:
- Mean Squared Error
- Mean Absolute Error
- Mean Squared Logarithmic Error

Today, we will be using mean squared error, as it is a safe first choice and is easy to understand.  Mean squared error is defined as: $$\operatorname{MSE}=\frac{1}{n}\sum_{i=1}^n(Y_i-\hat{Y_i})^2$$
which can be written in python as `((inputs-targets)**2).mean()`.  We can write this as a simple function and test the results.

In [8]:
def mse(inp, target):
    """Mean squared error between `inp` and `target`.

    When `inp` has more dimensions than `target` and the two differ only by
    singleton dimensions (e.g. predictions of shape (N, 1) against targets of
    shape (N,)), `inp` is reshaped to `target`'s shape.  This avoids the
    silent broadcasting that a bare `torch.squeeze(inp)` can cause, e.g.
    (2, 1, 1) vs (2, 1) becoming a (2, 2) difference matrix.

    Args:
        inp: tensor of predictions.
        target: tensor of true values.

    Returns:
        0-dim tensor holding the mean of the squared differences.
    """
    if inp.dim() > target.dim():
        if inp.squeeze().shape == target.squeeze().shape:
            # Same elements, only singleton dims differ: align exactly.
            inp = inp.reshape(target.shape)
        else:
            # Shapes genuinely differ; keep the previous squeeze-everything
            # behaviour and let broadcasting rules decide (or raise).
            inp = torch.squeeze(inp)
    return ((inp-target)**2).mean()

In [9]:
# Evaluate the hand-written MSE on the sample batch.
mse(preds, yb)

tensor(10470802., grad_fn=<MeanBackward0>)

We can also set this up as an nn.Module to fit better into our model's flow.  

In [10]:
class MSE_Loss(nn.Module):
    """Mean squared error loss as an `nn.Module`.

    Args:
        reduce: when True, the trailing dimension of the input is squeezed
            (only if it has size 1) before the loss is computed, so that
            predictions of shape (N, 1) line up with targets of shape (N,).
            Note that this flag controls the squeeze, not the reduction:
            the result is always the mean.
    """
    def __init__(self, reduce = False):
        super().__init__()
        self.reduce = reduce

    def forward(self, inp, target):
        """Return the mean of the squared differences as a 0-dim tensor.

        A warning is emitted when the shapes still differ after the optional
        squeeze, because the subtraction then broadcasts (e.g. (N, 1) - (N,)
        gives an (N, N) matrix) and the loss is almost certainly not what was
        intended.  The returned value itself is unchanged.
        """
        if self.reduce: inp = torch.squeeze(inp, -1)
        if inp.shape != target.shape:
            import warnings
            warnings.warn("Using a target size ({}) that is different to the input size ({}). "
                          "This will likely lead to incorrect results due to broadcasting."
                          .format(tuple(target.shape), tuple(inp.shape)),
                          stacklevel=2)
        return ((inp-target)**2).mean()

In [11]:
# reduce=True makes forward() squeeze the trailing dimension of preds
# before it is compared with yb.
loss_func = MSE_Loss(reduce = True)
loss = loss_func(preds, yb)
loss

tensor(10470802., grad_fn=<MeanBackward0>)

As we can see, we can treat the loss as a module like any other.  In fact, we could start the process of backpropagation (discussed next) from any tensor in the network, although it likely wouldn't be useful to improving performance.  The loss function is an excellent way to add customization or use domain knowledge to help your model reach peak performance.  For instance, if you want a segmentation model to segment your object but never go over the object's bound (preferring to undersegment it), you can increase the penalty for labelling the background as the object, thereby encouraging your model to act in your desired manner (this can also be implemented using weighted cross-entropy loss)

The loss function is also an easy way to implement certain regularization techniques, such as L1 or L2 regularization, so make sure you explore all the low-hanging fruit during your own custom implementation.

If you want to keep things simple, PyTorch also has a wide range of loss functions available to use.  MSE is available from the MSELoss class, which essentially performs the same operation as our module with a few extra options and warnings

```python 
class MSELoss(_Loss):
    __constants__ = ['reduction']

    def __init__(self, size_average=None, reduce=None, reduction='mean'):
        super(MSELoss, self).__init__(size_average, reduce, reduction)

    def forward(self, input, target):
        return F.mse_loss(input, target, reduction=self.reduction)
    
```


```python
def mse_loss(input, target, size_average=None, reduce=None, reduction='mean'):
    if not (target.size() == input.size()):
        warnings.warn("Using a target size ({}) that is different to the input size ({}). "
                      "This will likely lead to incorrect results due to broadcasting. "
                      "Please ensure they have the same size.".format(target.size(), input.size()),
                      stacklevel=2)
    if size_average is not None or reduce is not None:
        reduction = _Reduction.legacy_get_string(size_average, reduce)
    if target.requires_grad:
        ret = (input - target) ** 2
        if reduction != 'none':
            ret = torch.mean(ret) if reduction == 'mean' else torch.sum(ret)
    else:
        expanded_input, expanded_target = torch.broadcast_tensors(input, target)
        ret = torch._C._nn.mse_loss(expanded_input, expanded_target, _Reduction.get_enum(reduction))
    return ret


```

In [12]:
# PyTorch's built-in MSE loss with its default arguments.
torch_loss_func = torch.nn.MSELoss()

In [13]:
# Squeeze only the trailing dimension: a bare torch.squeeze(preds) would also
# drop the batch dimension whenever the batch holds a single item.
torch_loss_func(torch.squeeze(preds, -1), yb)

tensor(10470802., grad_fn=<MseLossBackward>)

As we can see, the loss calculated by all three methods is identical

# Step 3 - Backpropagation

As the goal of training is to minimize loss, there needs to be a method of calculating how each parameter contributes to an increase in loss.  In other words, the gradient of that parameter with respect to the loss.  This would be an exceptionally complex calculation to do in a single step, but can be broken down into simple steps by using the chain rule and tracing the gradients back one by one from the loss to the parameter of interest.  How deeply you want to get in your understanding of this process is up to you, but fortunately PyTorch has an autograd feature that tracks the operations of each tensor and can then be used to calculate all of these gradients automatically.  To start the process, choose the starting tensor of choice (in this case the output from the loss function) and call the class method `.backward()`

Note: After calculating the gradients, the buffers keeping track of the operations are freed.  Running backward twice in a row will result in a `RuntimeError: Trying to backward through the graph a second time, but the buffers have already been freed. Specify retain_graph=True when calling backward the first time.`

In [14]:
# Backpropagate from the loss.  The graph buffers are freed afterwards, so
# this cell cannot be re-run without recomputing `loss` first.
loss.backward()

We can now take a look at some of the parameters by examining their gradients.  As we can see, the shapes of the grads and parameters are identical: there is one gradient for every element of each parameter

In [15]:
# Inspect the second-to-last parameter tensor and its gradient
# (presumably the weight of the final linear layer -- verify against the model).
params = list(model.parameters())
params[-2], params[-2].grad

(Parameter containing:
 tensor([[ 0.2824, -0.2682,  0.2561, -0.3252,  0.1252, -0.1312,  0.1911]],
        requires_grad=True),
 tensor([[42169308., 35757724., 46602248.,        0., 81614776., 26811706.,
                 0.]]))

Also note that the gradients in this example are either very high or very low (exploding or vanishing gradients).  This is something that we hope to avoid through proper initialization and training

# Step 4 - The Optimizer (Updating the Parameters)

Once we have the gradients calculated, we know that by moving each parameter in the direction of the gradient, it will act to increase the loss.  Instead, we want to move each parameter in the opposite direction proportional to the intensity of the gradient.  The easiest way to do that is to choose a scaling factor (the learning rate), then subtract the (learning rate * gradient) from each parameter.  

As before, we can access all of the parameters in the model using the `.parameters()` class method, an ability imparted by the nn.Module base class.  We can then iterate through all the parameters and subtract the gradient * the learning rate from each one.  It's important to note that we need to use the context of torch.no_grad(), since we don't want these operations to be tracked by the autograd function.  

When we're finished, we have to set all of the gradients to zero to prepare them for the next cycle of training.  The reason that we don't automatically do it with the step operation is to provide more flexibility to training.  For instance, if you have limited GPU RAM, but would like to train with large batch sizes, one option is to run multiple training cycles without either updating the parameters or zeroing the gradients.  This allows the gradients to accumulate, so multiple cycles can be combined in a single update pass.

In [16]:
# Same parameter/gradient pair as inspected earlier, shown here as the
# "before" state for the optimizer step.
params = list(model.parameters())
params[-2], params[-2].grad

(Parameter containing:
 tensor([[ 0.2824, -0.2682,  0.2561, -0.3252,  0.1252, -0.1312,  0.1911]],
        requires_grad=True),
 tensor([[42169308., 35757724., 46602248.,        0., 81614776., 26811706.,
                 0.]]))

In [17]:
class Optimizer():
    """Minimal gradient-descent optimizer: p <- p - lr * p.grad.

    Args:
        model: an `nn.Module`; its parameters are captured once at construction.
        lr: learning rate, the factor applied to each gradient in `step`.
    """
    def __init__(self, model, lr):
        self.params = list(model.parameters())
        self.lr = lr   #learning rate

    def step(self):
        """Update every parameter that currently has a gradient, in place."""
        # no_grad: the update itself must not be recorded by autograd.
        with torch.no_grad():
            for p in self.params:
                # Frozen or unused parameters (and any parameter before the
                # first backward pass) have no gradient; leave them untouched.
                if p.grad is None: continue
                p -= self.lr*p.grad

    def zero_grad(self):
        """Zero the gradients in place so the next backward pass starts clean."""
        with torch.no_grad():
            for p in self.params:
                if p.grad is None: continue
                p.grad.zero_()
    

In [18]:
# lr is the factor applied to each gradient inside step().
opt = Optimizer(model, lr = 0.02)

In [19]:
# Apply one gradient-descent update to the model's parameters.
opt.step()

In [20]:
# Reset the gradients; otherwise the next backward() would accumulate onto them.
opt.zero_grad()

In [21]:
# "After" state: the parameter values have moved and the gradient is all zeros.
params = list(model.parameters())
params[-2], params[-2].grad

(Parameter containing:
 tensor([[-8.4339e+05, -7.1515e+05, -9.3204e+05, -3.2516e-01, -1.6323e+06,
          -5.3623e+05,  1.9110e-01]], requires_grad=True),
 tensor([[0., 0., 0., 0., 0., 0., 0.]]))

We see that after the calls to step and zero_grad, the values of our parameters have changed and the gradients have been zeroed

# Step 5 - Putting it all together

We now have all the tools required to train our model using all of the batches from our training set.  Passing through our training set in this manner is called an 'epoch'.  The goal now is just to run through all of the steps in sequential order.

Note: we will handle moving all the components onto the GPU later, but for now, we will use one additional function to make the process easier

In [42]:
# Move the model onto the GPU (requires a CUDA device to be available).
model.cuda()

def move_data_to_GPU(xb, yb, device = 'cuda'):
    """Move one batch onto `device`.

    Args:
        xb: iterable of input tensors (e.g. the image and tabular batches).
        yb: target tensor.
        device: destination device; defaults to 'cuda', which matches the
            previous hard-coded `.cuda()` calls.  Pass 'cpu' (or any other
            `torch.device`) to run on a machine without a GPU.

    Returns:
        (xb, yb) where `xb` is a tuple of the moved input tensors and `yb`
        is the moved target tensor.
    """
    xb = tuple(x.to(device) for x in xb)
    yb = yb.to(device)
    return xb, yb



def epoch(dl_train, dl_valid, model, loss_func, opt, early_stop = None):
    """Run one training pass followed by one validation pass.

    Args:
        dl_train: iterable yielding (xb, yb) training batches.
        dl_valid: iterable yielding (xb, yb) validation batches.
        model: the `nn.Module` being trained.
        loss_func: callable `loss_func(preds, yb)` returning a tensor
            that supports `.backward()`.
        opt: optimizer exposing `step()` and `zero_grad()`.
        early_stop: optional batch limit for quick experiments.  Each pass
            stops once its batch counter exceeds this value, i.e. after
            `early_stop + 1` batches.  `None` runs through every batch.

    Prints the loss of every training batch and the mean validation loss.
    Returns nothing.
    """
    # Remember the caller's mode so it can be restored at the end.
    was_training = model.training

    #Training Pass
    model.train()   # layers such as BatchNorm/Dropout must be in training mode
    i = 0
    for xb, yb in dl_train:
        #Temporary code
        xb, yb = move_data_to_GPU(xb,yb)

        #Actual Train Cycle
        preds = model(xb)
        loss = loss_func(preds, yb)
        loss.backward()
        opt.step()
        opt.zero_grad()

        print(loss)
        i+= 1
        if early_stop is not None and i>early_stop: break

    print('\n Validation Step:')
    # Evaluation mode: without it BatchNorm keeps using batch statistics and
    # updating its running averages on validation data, and Dropout stays active.
    model.eval()
    val_batches = 0
    tot_loss = 0
    with torch.no_grad():
        for xb, yb in dl_valid:
            #Temporary code
            xb, yb = move_data_to_GPU(xb,yb)
            preds = model(xb)
            loss = loss_func(preds, yb)
            tot_loss += loss
            val_batches+= 1
            if early_stop is not None and val_batches>early_stop: break

    model.train(was_training)

    # Guard against an empty validation loader instead of dividing by zero.
    if val_batches == 0:
        print('No validation batches were processed')
    else:
        print(tot_loss/val_batches)
        
# NOTE: each pass stops once its batch counter exceeds early_stop, so
# early_stop = 4 processes five batches per pass.
epoch(dl_train, dl_valid,model, loss_func, opt, early_stop = 4)

tensor(nan, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(nan, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(nan, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(nan, device='cuda:0', grad_fn=<MeanBackward0>)
tensor(nan, device='cuda:0', grad_fn=<MeanBackward0>)

 Validation Step:
tensor(nan, device='cuda:0')




We're able to run the training loop, but we can see that the loss is `nan` from the very first batch: the gradients exploded on our first parameter update.  This is not surprising given everything that we've thrown together.  Training a complex deep learning model is no easy feat!  In the next few iterations, we will set up a callback system to make things more flexible, as well as investigate other optimizations such as learning rate, initialization and tuning hyperparameters.  Using the model building blocks (cnn_model, mixed_model, tab_model), we can see how we can create new models that share the internal components.  This allows us to train components of the model separately, then combine everything together once the network is fairly stable.