In [1]:
# default_exp models

# Image sequence models
> Models to predict the action class from a sequence of frames

We will build a set of models that read an `ImageTuple` and output the corresponding `Category`.

In [2]:
#export
from fastai.vision.all import *

In [3]:
# Prefer the second GPU (index 1) when the machine has one; on single-GPU or
# CPU-only machines keep the default device instead of raising.
if torch.cuda.device_count() > 1: torch.cuda.set_device(1)
# Displayed as the cell output: the active GPU name, or 'cpu' without CUDA.
torch.cuda.get_device_name() if torch.cuda.is_available() else 'cpu'

'GeForce RTX 2070 SUPER'

## A resnet based Encoder
> Extracting features of images to latent variable space

Let's build a tensor representing a batch of images:
- `(batch_size, channels, height, width)`

In [4]:
x = torch.rand(8, 3, 64, 64)

We will build a basic Resnet based encoder:

In [5]:
#export
@delegates(create_cnn_model)
class Encoder(Module):
    """Image encoder built from a fastai `create_cnn_model` network.

    The network is created with a single output (`n_out=1`) and split into
    `body` (`model[0]`) and `head` (`model[1]`).

    Args:
        arch: architecture passed to `create_cnn_model` (default `resnet34`).
        n_in: number of input channels.
        weights_file: optional file given to `load_model` to initialise the
            whole network before it is split.
        head: if True keep the full head; if False keep only the first three
            head layers, so `forward` returns a flat feature vector per image.
        **kwargs: forwarded to `create_cnn_model`. `pretrained` defaults to
            True but can be overridden here.
    """
    def __init__(self, arch=resnet34, n_in=3, weights_file=None, head=True, **kwargs):
        "Encoder based on resnet, if head=False returns the feature map"
        # `pretrained` is advertised through `@delegates`; pop it so that passing
        # it explicitly does not clash with the keyword used in the call below.
        pretrained = kwargs.pop('pretrained', True)
        model = create_cnn_model(arch, n_out=1, n_in=n_in, pretrained=pretrained, **kwargs)
        if weights_file is not None: load_model(weights_file, model, opt=None)
        self.body = model[0]
        if head: self.head = model[1]
        else:    self.head = nn.Sequential(*(model[1][0:3]))

    def forward(self, x):
        "Run `x` through the body and then through the (possibly truncated) head."
        return self.head(self.body(x))

this encoder will reduce images to a latent dimension space:

In [6]:
enc = Encoder(n_in=3, weights_file=None, head=False)

In [7]:
enc.head

Sequential(
  (0): AdaptiveConcatPool2d(
    (ap): AdaptiveAvgPool2d(output_size=1)
    (mp): AdaptiveMaxPool2d(output_size=1)
  )
  (1): Flatten(full=False)
  (2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In this case, the latent dimension is 1024.

In [8]:
encoded_var = enc(x)
encoded_var.shape

torch.Size([8, 1024])

In [9]:
test_eq(encoded_var.shape, [8,1024])

## Simple Model
> A very basic CNN model

This network just uses a standard ResNet and folds the sequence dimension into the batch dimension. It is not optimal.

In [10]:
#export
class SimpleModel(Module):
    "Per-frame CNN encoder, attention pooling over the frames, then a linear head"
    def __init__(self, arch=resnet34, weights_file=None, num_classes=30, seq_len=40, debug=False):
        "Build the frame encoder, the attention scorer and the classification head"
        # NOTE: `seq_len` is accepted but never read in this class.
        self.encoder = Encoder(arch, 3, weights_file, head=False)
        # The truncated encoder head emits twice the body's feature count.
        nf = 2 * num_features_model(nn.Sequential(*self.encoder.body.children()))
        head_layers = [
            LinBnDrop(nf, nf//2, p=0.2, act=nn.ReLU()),
            LinBnDrop(nf//2, num_classes, p=0.05),
        ]
        self.head = nn.Sequential(*head_layers)
        self.attention_layer = nn.Linear(nf, 1)
        self.debug = debug

    def forward(self, x):
        "`x` is a sequence of frame batches; returns one row of class scores per sample"
        verbose = self.debug
        if verbose:  print(f' input len:   {len(x), x[0].shape}')
        x = torch.stack(x, dim=1)
        if verbose:  print(f' after stack:   {x.shape}')
        bs, n_frames, ch, height, width = x.shape
        # Fold the frames into the batch so the 2D encoder sees ordinary images.
        x = self.encoder(x.view(bs * n_frames, ch, height, width))
        x = x.view(bs, n_frames, -1)
        if verbose:  print(f' encoded shape: {x.shape}')
        # One scalar score per frame, normalised over the frame axis.
        scores = self.attention_layer(x).squeeze(-1)
        weights = F.softmax(scores, dim=-1).unsqueeze(-1)
        x = torch.sum(weights * x, dim=1)
        if verbose:  print(f' after attention shape: {x.shape}')
        return self.head(x)

A splitter function to train the parameters of the encoder and of the head separately; the `Learner` needs it as an argument to be able to call `Learner.freeze()`.

In [11]:
#export
def simple_splitter(model):
    "Two parameter groups for `SimpleModel`: the encoder, then attention layer + head"
    encoder_group = params(model.encoder)
    head_group = params(model.attention_layer) + params(model.head)
    return [encoder_group, head_group]

A sequence of 10 images:

In [12]:
# a list of seq_len=10 frame batches, each of shape (bs, ch, h, w)
inp = [torch.rand(64, 3, 64, 64) for _ in range(10)]

In [13]:
sm = SimpleModel(debug=True, seq_len=10)
out = sm(inp)
test_eq(out.shape, [64, 30])

 input len:   (10, torch.Size([64, 3, 64, 64]))
 after stack:   torch.Size([64, 10, 3, 64, 64])
 encoded shape: torch.Size([64, 10, 1024])
 after attention shape: torch.Size([64, 1024])


## ConvLSTM
> An LSTM encoded image model

First the LSTM wrapper, with the `reset` method to erase hidden state before each epoch.

In [14]:
#export
class LSTM(Module):
    "Batch-first `nn.LSTM` with output dropout that carries its hidden state between calls"
    def __init__(self, input_dim, n_hidden, n_layers, bidirectional=False, p=0.5):
        "Create the recurrent layer, the dropout applied to its output, and an empty state"
        self.lstm = nn.LSTM(input_dim, n_hidden, n_layers, batch_first=True, bidirectional=bidirectional)
        self.drop = nn.Dropout(p)
        self.h = None

    def reset(self):
        "Forget the stored hidden state"
        self.h = None

    def forward(self, x):
        "Return `(dropout(output), (h_n, c_n))` and keep a detached copy of the state"
        # A stored state is only reusable when its batch size matches the input
        # (e.g. the last, smaller validation batch); otherwise start fresh.
        stale = self.h is not None and self.h[0].shape[1] != x.shape[0]
        if stale: self.reset()
        raw, h = self.lstm(x, self.h)
        out = self.drop(raw)
        # Detach so the next call does not backpropagate into this one.
        self.h = [state.detach() for state in h]
        return out, h

We take the output size of the encoder (the latent dimension) as `input_size`; `num_layers` is the number of stacked LSTM layers, and the hidden dimension is the same as before.

Let's build a single-layer LSTM:

In [15]:
lstm = LSTM(512, 512, 1, bidirectional=False)

In [16]:
# bs, seq_len, input_dim (the LSTM is batch_first)
y = torch.rand(32, 10,  512)

We get the same input, encoded on the hidden_dim

In [17]:
out, (h,c) = lstm(y)
out.shape, h.shape, c.shape

(torch.Size([32, 10, 512]), torch.Size([1, 32, 512]), torch.Size([1, 32, 512]))

It can deal with different batch sizes now:

In [18]:
out, (h,c) = lstm(torch.rand(16,10,512))
out.shape, h.shape, c.shape

(torch.Size([16, 10, 512]), torch.Size([1, 16, 512]), torch.Size([1, 16, 512]))

In [19]:
lstm = LSTM(512, 512, 3, bidirectional=True)

In [20]:
out, (h,c) = lstm(torch.rand(16,10,512))
out.shape,  h.shape, c.shape

(torch.Size([16, 10, 1024]),
 torch.Size([6, 16, 512]),
 torch.Size([6, 16, 512]))

In [21]:
#Export
class ConvLSTM(Module):
    "Per-frame CNN encoder followed by an LSTM over the frame features and a linear head"
    def __init__(self, arch=resnet34, weights_file=None, num_classes=30, lstm_layers=1, hidden_dim=1024,
                 bidirectional=True, attention=True, debug=False):
        "Build encoder, LSTM, attention scorer and head; `attention` selects the pooling used in `forward`"
        self.encoder = Encoder(arch, 3, weights_file, head=False)
        # The truncated encoder head emits twice the body's feature count.
        nf = 2 * num_features_model(nn.Sequential(*self.encoder.body.children()))
        self.lstm = LSTM(nf, hidden_dim, lstm_layers, bidirectional)
        self.attention = attention
        # Width of the LSTM output at each time step.
        lstm_out = 2 * hidden_dim if bidirectional else hidden_dim
        self.attention_layer = nn.Linear(lstm_out, 1)
        # Without attention the head consumes the flattened final hidden state of every layer.
        head_in = lstm_out if attention else lstm_layers * lstm_out
        self.head = nn.Sequential(
            LinBnDrop(head_in, hidden_dim, p=0.2, act=nn.ReLU()),
            nn.Linear(hidden_dim, num_classes),
        )
        self.debug = debug

    def forward(self, x):
        "`x` is a sequence of frame batches; returns one row of class scores per sample"
        verbose = self.debug
        x = torch.stack(x, dim=1)
        if verbose:  print(f' after stack:   {x.shape}')
        bs, n_frames, ch, height, width = x.shape
        # Fold the frames into the batch so the 2D encoder sees ordinary images.
        x = self.encoder(x.view(bs * n_frames, ch, height, width))
        if verbose:  print(f' after encode:   {x.shape}')
        x = x.view(bs, n_frames, -1)
        if verbose:  print(f' before lstm:   {x.shape}')
        x, (h, _) = self.lstm(x)
        if verbose:  print(f' after lstm:   {x.shape}')
        if not self.attention:
            # Use the final hidden state of every layer/direction, flattened per sample.
            if verbose: print(f' hidden state: {h.shape}')
            out = h.permute(1,0,2).flatten(1)
            if verbose: print(f' hidden state flat: {out.shape}')
            return self.head(out)
        # Attention pooling: softmax-normalised scalar score per time step.
        attention_w = F.softmax(self.attention_layer(x).squeeze(-1), dim=-1)
        if verbose: print(f' attention_w: {attention_w.shape}')
        out = torch.sum(attention_w.unsqueeze(-1) * x, dim=1)
        if verbose: print(f' after attention: {out.shape}')
        return self.head(out)

    def reset(self):
        "Clear the hidden state stored by the inner LSTM"
        self.lstm.reset()

In [22]:
#export
def convlstm_splitter(model):
    "Two parameter groups for `ConvLSTM`: the encoder, then LSTM + attention layer + head"
    encoder_group = params(model.encoder)
    rest_group = params(model.lstm) + params(model.attention_layer) + params(model.head)
    return [encoder_group, rest_group]

In [23]:
# a list of seq_len=10 frame batches, each of shape (bs, ch, h, w)
inp = [torch.rand(32, 3, 64, 64) for _ in range(10)]

In [24]:
clstm = ConvLSTM(attention=False, bidirectional=False, lstm_layers=2, debug=True)
test_eq(clstm(inp).shape, [32, 30])

 after stack:   torch.Size([32, 10, 3, 64, 64])
 after encode:   torch.Size([320, 1024])
 before lstm:   torch.Size([32, 10, 1024])
 after lstm:   torch.Size([32, 10, 1024])
 hidden state: torch.Size([2, 32, 1024])
 hidden state flat: torch.Size([32, 2048])


In [25]:
clstm = ConvLSTM(lstm_layers=3, debug=True)
test_eq(clstm(inp).shape, [32, 30])

 after stack:   torch.Size([32, 10, 3, 64, 64])
 after encode:   torch.Size([320, 1024])
 before lstm:   torch.Size([32, 10, 1024])
 after lstm:   torch.Size([32, 10, 2048])
 attention_w: torch.Size([32, 10])
 after attention: torch.Size([32, 2048])


In [26]:
clstm = ConvLSTM(lstm_layers=1, debug=True)
test_eq(clstm(inp).shape, [32, 30])

 after stack:   torch.Size([32, 10, 3, 64, 64])
 after encode:   torch.Size([320, 1024])
 before lstm:   torch.Size([32, 10, 1024])
 after lstm:   torch.Size([32, 10, 2048])
 attention_w: torch.Size([32, 10])
 after attention: torch.Size([32, 2048])


## Transformer model
> inspired from DETR : https://colab.research.google.com/github/facebookresearch/detr/blob/colab/notebooks/detr_demo.ipynb#scrollTo=h91rsIPl7tVl

In [27]:
encoder_layer = nn.TransformerEncoderLayer(d_model=512, nhead=8)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=6)

In [28]:
transformer_encoder(torch.rand(10, 32, 512)).shape

torch.Size([10, 32, 512])

In [29]:
#export
class DETR(Module):
    """DETR-style classifier for a sequence of frames.

    Each frame goes through the CNN backbone, a 1x1 convolution projects the
    feature maps to `hidden_dim` channels, and every (frame, row, column)
    location becomes one token for an `nn.Transformer`. A single learned query
    is decoded and mapped to `n_classes` scores.

    Args:
        arch: backbone architecture given to `Encoder`.
        n_in: number of input channels.
        n: number of learned positional embeddings, i.e. the maximum number of
            tokens (frames * feature-map height * feature-map width).
        n_classes: number of output classes.
        hidden_dim: transformer model dimension.
        nheads: number of attention heads.
        num_encoder_layers / num_decoder_layers: transformer depth.
        debug: print intermediate shapes in `forward`.
    """
    def __init__(self,  arch=resnet34, n_in=3, n=400, n_classes=30, hidden_dim=256, nheads=4, num_encoder_layers=4,
                 num_decoder_layers=4, debug=False):
        self.debug = debug

        #the image encoder
        self.backbone = Encoder(arch, n_in=n_in, head=False).body

        # create conversion layer; read the channel count from the backbone
        # instead of assuming 512 so that other architectures work too
        nf = num_features_model(nn.Sequential(*self.backbone.children()))
        self.conv = nn.Conv2d(nf, hidden_dim, 1)

        # create a default PyTorch transformer
        self.transformer = nn.Transformer(
            hidden_dim, nheads, num_encoder_layers, num_decoder_layers)

        # output positional encodings (object queries): a single query
        self.query_pos = nn.Parameter(torch.rand(1, hidden_dim))

        # learned positional encodings, one per token
        # note that in baseline DETR we use sine positional encodings
        self.pos = nn.Parameter(torch.rand(n, hidden_dim))

        #head
        self.lin = nn.Linear(hidden_dim, n_classes)

    def forward(self, x):
        "`x` is a sequence of frame batches; returns scores of shape (batch_size, n_classes)"
        x = torch.stack(x, dim=1)
        if self.debug:  print(f' after stack:   {x.shape}')
        batch_size, seq_length, c, h, w = x.shape
        x = x.view(batch_size * seq_length, c, h, w)
        # propagate inputs through the backbone (frames folded into the batch)
        x = self.backbone(x)
        if self.debug: print(f'backbone: {x.shape}')

        # convert from the latent dim to hidden_dim feature planes for the transformer
        h = self.conv(x)
        if self.debug: print(f'h: {h.shape}')
        h = h.view(batch_size, seq_length, *h.shape[1:])
        if self.debug: print(f'h: {h.shape}')

        # construct positional encodings
        H, W = h.shape[-2:]
        T = h.shape[1]
        if self.debug: print(f'T,H,W: {T}, {H}, {W}')

        n_tokens = T * H * W
        if n_tokens > self.pos.shape[0]:
            # Slicing would silently truncate and fail later with an opaque broadcast error.
            raise ValueError(f'DETR got {n_tokens} tokens (T*H*W = {T}*{H}*{W}) but only '
                             f'{self.pos.shape[0]} positional embeddings; increase `n`.')
        pos = self.pos[:n_tokens].unsqueeze(1)
        if self.debug: print(f'pos: {pos.shape}')

        # (bs, T, C, H, W) -> (bs, C, T*H*W) -> (T*H*W, bs, C), the layout nn.Transformer expects
        tf_input = pos + 0.1 * h.permute(0,2,1,3,4).flatten(2).permute(2,0,1)
        if self.debug: print(f'tf_input: {tf_input.shape}')
        # nn.Transformer requires src and tgt to share the batch size, so the
        # single query is repeated for every sample in the batch.
        query = self.query_pos.unsqueeze(1).repeat(1, batch_size, 1)
        h = self.transformer(tf_input, query)
        if self.debug: print(f'tf_out: {h.shape}')
        # drop the length-1 query axis: (1, bs, n_classes) -> (bs, n_classes)
        return self.lin(h).squeeze(0)

In [30]:
#export
# def detr_split(m):
#     return [params(m.backbone), 
#             params(m.conv)+params(m.transformer)+[m.query_pos]+[m.col_embed]+[m.row_embed]+[m.time_embed]+params(m.lin)]
def detr_split(m):
    "Two parameter groups for `DETR`: the backbone, then everything else"
    backbone_group = params(m.backbone)
    rest_group = params(m.conv) + params(m.transformer) + [m.query_pos, m.pos] + params(m.lin)
    return [backbone_group, rest_group]

In [31]:
detr = DETR(debug=True)

In [32]:
detr_split(detr);

In [33]:
detr([torch.rand(1, 3, 64, 64) for _ in range(10)]).shape

 after stack:   torch.Size([1, 10, 3, 64, 64])
backbone: torch.Size([10, 512, 2, 2])
h: torch.Size([10, 256, 2, 2])
h: torch.Size([1, 10, 256, 2, 2])
T,H,W: 10, 2, 2
pos: torch.Size([40, 1, 256])
tf_input: torch.Size([40, 1, 256])
tf_out: torch.Size([1, 1, 256])


torch.Size([1, 30])

## Simple transformer

In [34]:
#export
class PositionalEncoding(Module):
    r"""Inject some information about the relative or absolute position of the tokens
        in the sequence. The positional encodings have the same dimension as
        the embeddings, so that the two can be summed. Here, we use sine and cosine
        functions of different frequencies.
    .. math::
        \text{PosEncoder}(pos, 2i) = \sin(pos / 10000^{2i / d_{model}})
        \text{PosEncoder}(pos, 2i+1) = \cos(pos / 10000^{2i / d_{model}})
    where pos is the token position and i is the embedding index.
    Args:
        d_model: the embed dim (required); odd values are supported.
        dropout: the dropout value (default=0.1).
        max_len: the max. length of the incoming sequence (default=5000).
    Examples:
        >>> pos_encoder = PositionalEncoding(d_model)
    """

    def __init__(self, d_model, dropout=0.1, max_len=5000):
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer odd column than even columns, so
        # drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        # (max_len, d_model) -> (max_len, 1, d_model) so it broadcasts over the batch axis
        pe = pe.unsqueeze(0).transpose(0, 1)
        # a buffer, not a parameter: it follows the module's device but is not trained
        self.register_buffer('pe', pe)

    def forward(self, x):
        r"""Inputs of forward function
        Args:
            x: the sequence fed to the positional encoder model (required).
        Shape:
            x: [sequence length, batch size, embed dim]
            output: [sequence length, batch size, embed dim]
        Examples:
            >>> output = pos_encoder(x)
        """

        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

In [35]:
#export
class TransformerModel(Module):
    """Encoder-only transformer over the CNN features of a sequence of frames.

    Each frame goes through the CNN backbone, a 1x1 convolution projects the
    feature maps to `hidden_dim` channels, and every (frame, row, column)
    location becomes one token. Sinusoidal positions are added and the tokens
    are processed by an `nn.TransformerEncoder`.

    Args:
        arch: backbone architecture given to `Encoder`.
        n_in: number of input channels.
        n: accepted but unused in this class.
        n_classes: number of output classes.
        hidden_dim: transformer model dimension (also used as feed-forward width).
        nheads: number of attention heads.
        num_encoder_layers: number of encoder layers.
        num_decoder_layers: accepted but unused in this class.
        debug: print intermediate shapes in `forward`.
        pool: `None` (default) returns the logits of every token, as before;
            `'mean'` averages the tokens first and returns one prediction per
            sample, shape (batch_size, n_classes).
    """
    def __init__(self,  arch=resnet34, n_in=3, n=400, n_classes=30, hidden_dim=256, nheads=4, num_encoder_layers=4,
                 num_decoder_layers=4, debug=False, pool=None):
        if pool not in (None, 'mean'):
            raise ValueError(f"pool must be None or 'mean', got {pool!r}")
        self.debug = debug
        self.pool = pool

        #the image encoder
        self.backbone = Encoder(arch, n_in=n_in, head=False).body

        # create conversion layer; read the channel count from the backbone
        # instead of assuming 512 so that other architectures work too
        nf = num_features_model(nn.Sequential(*self.backbone.children()))
        self.conv = nn.Conv2d(nf, hidden_dim, 1)

        #Encoder of the Transformer (third argument is the feed-forward width)
        encoder_layers = nn.TransformerEncoderLayer(hidden_dim, nheads, hidden_dim)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_encoder_layers)

        # sinusoidal positional encodings over the flattened token sequence
        self.pos = PositionalEncoding(hidden_dim)

        #head
        self.lin = nn.Linear(hidden_dim, n_classes)

    def forward(self, x):
        "`x` is a sequence of frame batches; see `pool` for the shape of the result"
        x = torch.stack(x, dim=1)
        if self.debug:  print(f' after stack:   {x.shape}')
        batch_size, seq_length, c, h, w = x.shape
        x = x.view(batch_size * seq_length, c, h, w)
        # propagate inputs through the backbone (frames folded into the batch)
        x = self.backbone(x)
        if self.debug: print(f'backbone: {x.shape}')

        # convert from the latent dim to hidden_dim feature planes for the transformer
        h = self.conv(x)
        if self.debug: print(f'h: {h.shape}')
        h = h.view(batch_size, seq_length, *h.shape[1:])
        if self.debug: print(f'h: {h.shape}')

        H, W = h.shape[-2:]
        T = h.shape[1]
        if self.debug: print(f'T,H,W: {T}, {H}, {W}')

        # (bs, T, C, H, W) -> (bs, C, T*H*W) -> (T*H*W, bs, C), then add positions
        tf_input = self.pos(h.permute(0,2,1,3,4).flatten(2).permute(2,0,1))
        if self.debug: print(f'tf_input: {tf_input.shape}')
        h = self.transformer(tf_input)
        if self.debug: print(f'tf_out: {h.shape}')
        if self.pool == 'mean':
            # average over the token axis -> (bs, hidden_dim) -> (bs, n_classes)
            return self.lin(h.mean(dim=0))
        # NOTE(review): this default returns per-token logits, (T*H*W, n_classes)
        # for a batch of one, not one prediction per sample. Kept unchanged for
        # backward compatibility; confirm whether `pool='mean'` should be the default.
        return self.lin(h).squeeze(1)

In [36]:
tf = TransformerModel(debug=True)

In [37]:
tf([torch.rand(1, 3, 64, 64) for _ in range(10)]).shape

 after stack:   torch.Size([1, 10, 3, 64, 64])
backbone: torch.Size([10, 512, 2, 2])
h: torch.Size([10, 256, 2, 2])
h: torch.Size([1, 10, 256, 2, 2])
T,H,W: 10, 2, 2
tf_input: torch.Size([40, 1, 256])
tf_out: torch.Size([40, 1, 256])


torch.Size([40, 30])

# Export -

In [38]:
###### hide
from nbdev.export import *
notebook2script()

Converted 00_core.ipynb.
Converted 01_utils.ipynb.
Converted 03_models.ipynb.
Converted 04_train_baseline.ipynb.
Converted 04_train_convlstm.ipynb.
Converted 04_train_convlstm_split0.ipynb.
Converted 05_train_transformer.ipynb.
Converted index.ipynb.
