ResNet 3D CNN + Kinetics-400 pretrained + HMDB51 training  
Reference paper: Can Spatiotemporal 3D CNNs Retrace the History of 2D CNNs and ImageNet?

In [61]:
import torch
import torchvision
from torchvision import transforms, datasets
from torch.utils.data import DataLoader
from torchsummaryX import summary
import torch.optim as optim
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter

import pretrained_model as model
import os
import time
import copy
import warnings
warnings.filterwarnings("ignore")
import utils

## Loading data

In [1]:
# Hyper-parameters: batch size, number of epochs, learning rate.
BSZ = 16
NEPOCH = 10
LR = 1e-4

data_root = '/mnt/data/public/datasets/HMDB'

# The same video transform pipeline is applied to every split.
transform = transforms.Compose([
    utils.RandomResizedCropVideo(112),
    utils.ToTensorVideo(),
])


def _hmdb51_loader(fold, train):
    """Return a shuffled DataLoader over one HMDB51 fold.

    Args:
        fold: HMDB51 split index forwarded to ``datasets.HMDB51``.
        train: forwarded to ``datasets.HMDB51`` to pick the train or the
            test portion of that fold.
    """
    dataset = datasets.HMDB51(
        root=os.path.join(data_root, 'hmdb51'),
        annotation_path=os.path.join(data_root, 'splits'),
        frames_per_clip=8,
        fold=fold,
        train=train,
        transform=transform,
    )
    return DataLoader(dataset, batch_size=BSZ, shuffle=True, num_workers=4)


# NOTE(review): train / validation / test are taken from three different
# folds. HMDB51 folds are alternative splits of the same videos, so the
# fold-2 and fold-3 test portions may overlap the fold-1 training portion
# -- confirm this is intended.
train_loader = _hmdb51_loader(fold=1, train=True)
val_loader = _hmdb51_loader(fold=2, train=False)
test_loader = _hmdb51_loader(fold=3, train=False)

# Sanity check: pull one training batch and verify the shape of its first
# element before building the model.
input_size = next(iter(train_loader))[0].shape
assert input_size == torch.Size([BSZ, 3, 8, 112, 112]), 'input_size is {}'.format(input_size)

NameError: name 'transforms' is not defined

## Define model 

In [5]:
# Use GPU index 2 when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): the index is hard-coded, so a CUDA machine needs a device
# with index 2 for this cell to run.
device = torch.device('cuda:2' if torch.cuda.is_available() else 'cpu')
model_ft = model.get_model().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_ft.parameters(), lr=LR)

# The run name encodes the hyper-parameters so each configuration writes to
# its own TensorBoard directory under ./log.
path = f'bsz:{BSZ}-lr:{LR}-nepoch:{NEPOCH}'
writer = SummaryWriter(os.path.join('./log', path))

print(model_ft)
# Per-layer summary for one dummy clip; the tensor is created directly on
# the target device rather than allocated on the CPU and then copied.
summary(model_ft, torch.zeros(1, 3, 8, 112, 112, device=device))

VideoResNet(
  (stem): BasicStem(
    (0): Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2), padding=(1, 3, 3), bias=False)
    (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (conv2): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (conv1): Sequential(
        (0): Conv3DSimple(64, 64, kernel_size=(3, 3, 3), stride=(1, 1, 1), padding=(1, 1, 1), bias=False)
        (1):

Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_stem.Conv3d_0,"[3, 64, 3, 7, 7]","[1, 64, 8, 56, 56]",28224.0,708083700.0
1_stem.BatchNorm3d_1,[64],"[1, 64, 8, 56, 56]",128.0,64.0
2_stem.ReLU_2,-,"[1, 64, 8, 56, 56]",,
3_layer1.0.conv1.Conv3DSimple_0,"[64, 64, 3, 3, 3]","[1, 64, 8, 56, 56]",110592.0,2774532000.0
4_layer1.0.conv1.BatchNorm3d_1,[64],"[1, 64, 8, 56, 56]",128.0,64.0
5_layer1.0.conv1.ReLU_2,-,"[1, 64, 8, 56, 56]",,
6_layer1.0.conv2.Conv3DSimple_0,"[64, 64, 3, 3, 3]","[1, 64, 8, 56, 56]",110592.0,2774532000.0
7_layer1.0.conv2.BatchNorm3d_1,[64],"[1, 64, 8, 56, 56]",128.0,64.0
8_layer1.0.ReLU_relu,-,"[1, 64, 8, 56, 56]",,
9_layer1.1.conv1.Conv3DSimple_0,"[64, 64, 3, 3, 3]","[1, 64, 8, 56, 56]",110592.0,2774532000.0


## Train and test

In [62]:
# Track the best validation accuracy seen so far and a snapshot of the
# weights that produced it.
best_val_acc = 0.0
best_val_model = copy.deepcopy(model_ft.state_dict())

# The 51 HMDB51 action names. torchvision's HMDB51 dataset assigns label ids
# by alphabetically sorted class-folder name, so the list is sorted to line up
# with those ids.
# NOTE(review): presumably model.evaluate indexes `classes` by label id --
# confirm against its implementation.
classes = sorted([
    'brush_hair', 'eat', 'kiss', 'shake_hands', 'sword',
    'cartwheel', 'fall_floor', 'laugh', 'shoot_ball', 'sword_exercise',
    'catch', 'fencing', 'pick', 'shoot_bow', 'talk',
    'chew', 'flic_flac', 'pour', 'shoot_gun', 'throw',
    'clap', 'golf', 'pullup', 'sit', 'turn',
    'climb', 'handstand', 'punch', 'situp', 'walk',
    'climb_stairs', 'hit', 'push', 'smile', 'wave',
    'dive', 'hug', 'pushup', 'smoke', 'draw_sword',
    'jump', 'ride_bike', 'somersault', 'dribble', 'kick',
    'ride_horse', 'stand', 'drink', 'kick_ball', 'run', 'swing_baseball',
])

# NEPOCH is the epoch-count constant defined with the other hyper-parameters;
# the previous name `EPOCH` was never defined and raised NameError.
for epoch in range(NEPOCH):

    model.train(model_ft, train_loader, criterion, optimizer, epoch, writer, device)
    val_acc = model.evaluate(model_ft, val_loader, writer, device, classes, test_mode=False)

    # Keep a deep copy of the weights whenever validation accuracy improves,
    # since later epochs keep mutating the live state dict.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        best_val_model = copy.deepcopy(model_ft.state_dict())

# Restore the best-on-validation weights before the final test evaluation.
model_ft.load_state_dict(best_val_model)
test_acc = model.evaluate(model_ft, test_loader, writer, device, classes, test_mode=True)

KeyboardInterrupt: 

## Save best model

## Add hyper parameters and final results to tensorboard

In [57]:
# Run configuration logged alongside the final metrics. The epoch count is
# read from NEPOCH, the constant defined with the other hyper-parameters;
# the previous name `EPOCH` was never defined and raised NameError.
hparam_dict = {'train size': len(train_loader.dataset),
               'validation size': len(val_loader.dataset),
               'test size': len(test_loader.dataset),
               'batch number per epoch': len(train_loader),
               'batch size': train_loader.batch_size,
               'epoch number': NEPOCH,
               # torch.Size is stored as its string form.
               'input shape': str(input_size)}
# Final results reported next to the hyper-parameters.
metric_dict = {'hparam/best validation accuracy': best_val_acc,
               'hparam/test accuracy': test_acc}
writer.add_hparams(hparam_dict, metric_dict)
# Flush pending events to disk so the run is visible without closing the writer.
writer.flush()