### Finetuning and training the Pytorch-i3d model
Code taken from: https://github.com/piergiaj/pytorch-i3d/blob/master/train_i3d.py 

Note: This code was written for PyTorch 0.3. Version 0.4 and newer may cause issues.

To-dos:
1. extract videos and add labels (the line `return images, 0` assigns the label 0 to every images in the code for class `Dataset`. I have chosen the top few single-class labels in `preprocess.ipynb`, see the last cell to get the labels and associated videos through the video names, `v_names`)
2. streamline process for training pre-trained model (loaded via `i3d.load_state_dict(torch.load('rgb_imagenet.pt'))`) (no need to change layers yet)

In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]='2'
import sys
import argparse

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable

import torchvision
from torchvision import datasets, transforms
import videotransforms

import numpy as np

from pytorch_i3d import InceptionI3d

import numpy as np
import glob
import random

from tensorboardX import SummaryWriter

In [2]:
class dataset(torch.utils.data.Dataset):
    
    def __init__(self, paths, num_samples=16): # num_samples cannot be lower than 16
        self.num_samples = num_samples
        self.frames = dict()
        for p in paths:
            self.frames[p] = sorted(glob.glob(p+"/*.jpg"))
        self.data = paths
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        p = self.data[idx]
        num_frames = len(self.frames[p])-1
        sampled_idx = np.linspace(0, num_frames, self.num_samples)
        images = []
        for i in sampled_idx:
            image = torchvision.io.read_image(self.frames[p][int(i)])
            small_dim = min(image.shape[-2:])
            image = torchvision.transforms.functional.center_crop(image, (small_dim, small_dim))
            image = torchvision.transforms.functional.resize(image, (224, 224), antialias=True)
            images.append(image)
        images = torch.stack(images, axis=1)
        images = (images/255)*2 - 1 # values are between -1 and 1
        return images, 0 # 0 is just a placeholder for labels, getitem return data, labels 
        
video_frames_path = "/scratch/network/hishimwe/image" 
# add code here to only extract the videos with v_names and v_labels from preprocess.ipynb 
paths = glob.glob(video_frames_path+"/*")
random.seed(0)
random.shuffle(paths)
d=dataset(paths=paths[:800]) # first 800, change as needed
loader = torch.utils.data.DataLoader(d, shuffle=True, batch_size=10, drop_last=False, num_workers=4)

ValueError: num_samples should be a positive integer value, but got num_samples=0

In [None]:
d.__getitem__(0)[0].shape, d.__getitem__(0)[1] # data, label 

Need to make sure that each input has the same number of frames -> either by taking the minimum or even better just do uniform sampling (same number of frames regardless of video length)

In [None]:
i3d = InceptionI3d(400, in_channels=3) 
i3d.load_state_dict(torch.load('rgb_imagenet.pt'))
i3d.replace_logits(157) # replace 157 with num_classes
#i3d.load_state_dict(torch.load('/ssd/models/000920.pt'))
i3d.cuda()
i3d(torch.zeros(4,3,16,224,224).cuda()).shape # example input

Example training:

In [5]:
# set up gradient descent params
init_lr = 0.1 # default value
optimizer = optim.SGD(i3d.parameters(), 
                      lr=init_lr, 
                      momentum=0.9, 
                      weight_decay=0.0000001)
lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

# set up training variables 
epochs = 200 # random number, change
tot_loss = 0.0
writer = SummaryWriter("deleteme")
step = 0
for e in range(epochs):
    for data, label in loader:
        data = data.cuda()
        label = label.cuda()
        num_frames = data.size(2)
        per_frame_logits = i3d(data).mean(2)
        
        # compute loss 
        loss = F.cross_entropy(per_frame_logits, label) 
        print(f"epoch {e}: loss = {loss}")
        optimizer.zero_grad()
        loss.backward()
        optimizer.step() 
        writer.add_scalar("train/loss", loss.item(), step) 
        step+=1
    break # remove this to train for more than one epoch

epoch 0: loss = 5.229801177978516
epoch 0: loss = 2.0972135066986084
epoch 0: loss = 0.07437565922737122
epoch 0: loss = 0.00036732948501594365
epoch 0: loss = 5.3908115660306066e-05
epoch 0: loss = 3.576278118089249e-08
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0
epoch 0: loss = 0.0

In [6]:
writer.flush() # ensure that all loss values are recorded 