In [None]:
# default_exp models

# Image sequence models
> Models to predict the action class from a sequence of frames

We will build a bunch of models to read the `ImageTuple` and output the corresponding `Category`.

In [None]:
#export
from fastai2.vision.all import *

## A resnet based Encoder
> Extracting features of images to latent variable space

Let's build a tensor representing a batch of images:
- `(batch_size, channels, height, width)`

In [None]:
# Dummy batch of 8 three-channel 64x64 images with uniform random values.
x = torch.rand(8, 3, 64, 64)

We will build a basic Resnet based encoder:

In [None]:
#export
@delegates(create_cnn_model)
class Encoder(Module):
    "Image encoder made of the body and (optionally truncated) head returned by `create_cnn_model`"
    def __init__(self, arch=resnet34, n_in=3, weights_file=None, head=True, pretrained=True, **kwargs):
        """Encoder based on resnet; if head=False only the first three layers of the head are kept.

        arch:         architecture passed to `create_cnn_model`
        n_in:         number of input channels
        weights_file: optional file loaded into the full model with `load_model` before it is split
        head:         True keeps the whole head of the model (built with `n_out=1`);
                      False keeps only `model[1][0:3]`, so `forward` returns one flat vector per image
        pretrained:   forwarded to `create_cnn_model`
        kwargs:       any other argument accepted by `create_cnn_model`
        """
        # `pretrained` is an explicit argument because `@delegates` advertises it in the signature:
        # if it were hard-coded here, passing it through `kwargs` would raise a duplicate-keyword TypeError.
        model = create_cnn_model(arch, n_out=1, n_in=n_in, pretrained=pretrained, **kwargs)
        if weights_file is not None: load_model(weights_file, model, opt=None)
        self.body = model[0]
        if head: self.head = model[1]
        else:    self.head = nn.Sequential(*(model[1][0:3]))

    def forward(self, x):
        "Run `x` through the body and then through the head"
        return self.head(self.body(x))

This encoder reduces each image to a vector in a latent space:

In [None]:
# head=False keeps only the first three layers of the head, so the encoder outputs features.
enc = Encoder(n_in=3, weights_file=None, head=False)

In [None]:
# Show the layers kept in the truncated head.
enc.head

Sequential(
  (0): AdaptiveConcatPool2d(
    (ap): AdaptiveAvgPool2d(output_size=1)
    (mp): AdaptiveMaxPool2d(output_size=1)
  )
  (1): Flatten(full=False)
  (2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

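In this case, the latent dimension is 1024 (the `AdaptiveConcatPool2d` layer concatenates the average-pooled and max-pooled features).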

In [None]:
# Encode the dummy batch and look at the shape of the result.
encoded_var = enc(x)
encoded_var.shape

torch.Size([8, 1024])

In [None]:
# One 1024-long feature vector is expected for each of the 8 images.
test_eq(encoded_var.shape, [8,1024])

## Simple Model
> A very basic CNN model

This network just uses a plain resnet and folds the sequence dimension into the batch dimension. It is not optimal.

In [None]:
#export
class SimpleModel(Module):
    "Per-frame CNN encoder, attention pooling over the frames, then a linear classification head"
    def __init__(self, arch=resnet34, weights_file=None, num_classes=30, seq_len=40, debug=False):
        "Build the encoder, the head and the attention scorer; `seq_len` is accepted but not used"
        frame_encoder = Encoder(arch, 3, weights_file, head=False)
        # Twice the number of features of the body, which is the width of the encoder output.
        body_layers = nn.Sequential(*frame_encoder.body.children())
        nf = 2 * num_features_model(body_layers)
        self.encoder = frame_encoder
        self.head = nn.Sequential(
            LinBnDrop(nf, nf//2, p=0.2, act=nn.ReLU()),
            LinBnDrop(nf//2, num_classes, p=0.05),
        )
        # Produces one scalar score per frame, turned into weights in `forward`.
        self.attention_layer = nn.Linear(nf, 1)
        self.debug = debug

    def forward(self, x):
        "`x` is a sequence of frame batches; returns one row of class scores per sequence"
        if self.debug:  print(f' input len:   {len(x), x[0].shape}')
        frames = torch.stack(x, dim=1)
        if self.debug:  print(f' after stack:   {frames.shape}')
        bs, n_frames, ch, height, width = frames.shape
        # Fold the frames into the batch so the encoder sees ordinary image batches.
        flat_frames = frames.view(bs * n_frames, ch, height, width)
        feats = self.encoder(flat_frames).view(bs, n_frames, -1)
        if self.debug:  print(f' encoded shape: {feats.shape}')
        # Softmax over the frames gives the weights of the weighted sum below.
        scores = self.attention_layer(feats).squeeze(-1)
        weights = F.softmax(scores, dim=-1).unsqueeze(-1)
        pooled = torch.sum(weights * feats, dim=1)
        if self.debug:  print(f' after attention shape: {pooled.shape}')
        return self.head(pooled)

A splitter function to train the parameters of the encoder and of the head separately; the `Learner` needs this argument to be able to call `Learner.freeze()`.

In [None]:
#export
def simple_splitter(model):
    "Split the parameters of `model` into two groups: the encoder, then the attention layer plus the head"
    encoder_group = params(model.encoder)
    head_group = params(model.attention_layer) + params(model.head)
    return [encoder_group, head_group]

A sequence of 10 images:

In [None]:
# The sequence is a list of 10 frame batches, each of shape (bs=64, ch=3, 64, 64).
inp = [torch.rand(64, 3, 64, 64) for _ in range(10)]

In [None]:
# debug=True prints the intermediate shapes during the forward pass.
sm = SimpleModel(debug=True, seq_len=10)
out = sm(inp)
# One score per class (num_classes defaults to 30) for each of the 64 sequences.
test_eq(out.shape, [64, 30])

 input len:   (10, torch.Size([64, 3, 64, 64]))
 after stack:   torch.Size([64, 10, 3, 64, 64])
 encoded shape: torch.Size([64, 10, 1024])
 after attention shape: torch.Size([64, 1024])


# ConvLSTM
> An LSTM encoded image model

First the LSTM wrapper, with the `reset` method to erase hidden state before each epoch.

In [None]:
#export
class LSTM(Module):
    "`nn.LSTM` (batch first) followed by dropout, keeping its hidden and cell state between calls"
    def __init__(self, input_dim, n_hidden, n_layers, bidirectional=False, p=0.5):
        "`p` is the dropout probability applied to the LSTM output"
        self.lstm = nn.LSTM(input_dim, n_hidden, n_layers, batch_first=True, bidirectional=bidirectional)
        self.drop = nn.Dropout(p)
        # (hidden state, cell state) of the previous call, or None before the first call / after `reset`.
        self.h = None

    def reset(self):
        "Forget the stored hidden and cell state"
        self.h = None

    def forward(self, x):
        "Run `x` (batch first) through the LSTM starting from the stored state; returns the dropped-out output"
        bs = x.shape[0]
        if self.h is not None:
            stored_bs = self.h[0].shape[1]
            if bs < stored_bs:
                # Smaller batch: keep the first `bs` rows of BOTH the hidden and the cell state.
                # Slicing dim 1 gives a non-contiguous view, hence `.contiguous()`.
                self.h = tuple(h_[:, :bs].contiguous() for h_ in self.h)
            elif bs > stored_bs:
                # The stored state cannot be grown to fit a larger batch: start again from
                # `nn.LSTM`'s default (zero) initial state instead of failing on a shape mismatch.
                self.h = None
        raw, h = self.lstm(x, self.h)
        out = self.drop(raw)
        # Detach so gradients do not flow back into previous batches.
        self.h = tuple(h_.detach() for h_ in h)
        return out

We will take as `input_size` the output of the encoder, i.e. the `latent_dimension`; `num_layers` is how many `nn.LSTMCell` are stacked, and the hidden dim is the same as before.

Let's build a single-layer LSTM:

In [None]:
# input_dim=512, n_hidden=512, n_layers=1
lstm = LSTM(512, 512, 1, bidirectional=False)

In [None]:
# bs, seq_len, input_dim (the LSTM is built with batch_first=True)
y = torch.rand(32, 10,  512)

We get back the input sequence, encoded on the `hidden_dim`:

In [None]:
# Output for the first sequence of the batch: (seq_len, n_hidden)
lstm(y)[0].shape

torch.Size([10, 512])

In [None]:
# Stored hidden and cell state after the call above.
lstm.h[0].shape, lstm.h[1].shape

(torch.Size([1, 32, 512]), torch.Size([1, 32, 512]))

In [None]:
#Export
class ConvLSTM(Module):
    "CNN encoder applied to every frame, followed by an LSTM over the frames and a classification head"
    def __init__(self, arch=resnet34, weights_file=None, num_classes=30, lstm_layers=1, hidden_dim=1024,
                 bidirectional=True, attention=True, debug=False):
        "Build encoder, LSTM, head and attention scorer; `attention=False` classifies from the last time step"
        frame_encoder = Encoder(arch, 3, weights_file, head=False)
        # Twice the number of features of the body, which is the width of the encoder output.
        body_layers = nn.Sequential(*frame_encoder.body.children())
        nf = 2 * num_features_model(body_layers)
        self.encoder = frame_encoder
        self.lstm = LSTM(nf, hidden_dim, lstm_layers, bidirectional)
        # A bidirectional LSTM outputs both directions concatenated, so its output is twice as wide.
        lstm_out_dim = 2 * hidden_dim if bidirectional else hidden_dim
        self.head = nn.Sequential(
            LinBnDrop(lstm_out_dim, hidden_dim, p=0.2, act=nn.ReLU()),
            nn.Linear(hidden_dim, num_classes),
        )
        self.attention = attention
        # Always created (even when `attention` is False) so the parameter groups stay the same.
        self.attention_layer = nn.Linear(lstm_out_dim, 1)
        self.debug = debug

    def forward(self, x):
        "`x` is a sequence of frame batches; returns one row of class scores per sequence"
        frames = torch.stack(x, dim=1)
        if self.debug:  print(f' after stack:   {frames.shape}')
        bs, n_frames, ch, height, width = frames.shape
        # Fold the frames into the batch so the encoder sees ordinary image batches.
        feats = self.encoder(frames.view(bs * n_frames, ch, height, width))
        if self.debug:  print(f' after encode:   {feats.shape}')
        feats = feats.view(bs, n_frames, -1)
        if self.debug:  print(f' before lstm:   {feats.shape}')
        seq_out = self.lstm(feats)
        if self.debug:  print(f' after lstm:   {seq_out.shape}')
        if not self.attention:
            # Without attention only the output of the last time step is classified.
            return self.head(seq_out[:, -1])
        # Softmax over the time steps gives the weights of the weighted sum.
        scores = self.attention_layer(seq_out).squeeze(-1)
        weights = F.softmax(scores, dim=-1).unsqueeze(-1)
        pooled = torch.sum(weights * seq_out, dim=1)
        return self.head(pooled)

    def reset(self):
        "Clear the state stored in the LSTM"
        self.lstm.reset()

In [None]:
#export
def convlstm_splitter(model):
    "Split the parameters of `model` into two groups: the encoder, then LSTM + attention layer + head"
    encoder_group = params(model.encoder)
    rest_group = params(model.lstm) + params(model.attention_layer) + params(model.head)
    return [encoder_group, rest_group]

In [None]:
# The sequence is a list of 10 frame batches, each of shape (bs=64, ch=3, 64, 64).
inp = [torch.rand(64, 3, 64, 64) for _ in range(10)]

In [None]:
# Unidirectional variant with the debug prints enabled; `attention` keeps its default (True).
clstm = ConvLSTM(bidirectional=False, debug=True)

In [None]:
# One score per class (num_classes defaults to 30) for each of the 64 sequences.
test_eq(clstm(inp).shape, [64, 30])

 after stack:   torch.Size([64, 10, 3, 64, 64])
 after encode:   torch.Size([640, 1024])
 before lstm:   torch.Size([64, 10, 1024])
 after lstm:   torch.Size([64, 10, 1024])


# Export -

In [None]:
# hide
from nbdev.export import *
# Convert the project's notebooks (see the "Converted ..." log printed by this cell).
notebook2script()

Converted 00_core.ipynb.
Converted 01_utils.ipynb.
Converted 03_models.ipynb.
Converted 04_train_baseline.ipynb.
Converted 04_train_convlstm.ipynb.
Converted index.ipynb.
