### Finetuning and training the Pytorch-i3d model
Code taken from: https://github.com/piergiaj/pytorch-i3d/blob/master/train_i3d.py 

Note: This code was written for PyTorch 0.3. Version 0.4 and newer may cause issues.

To-dos:
1. extract videos and add labels (the line `return images, 0` assigns the label 0 to every images in the code for class `Dataset`. I have chosen the top few single-class labels in `preprocess.ipynb`, see the last cell to get the labels and associated videos through the video names, `v_names`)
2. streamline process for training pre-trained model (loaded via `i3d.load_state_dict(torch.load('rgb_imagenet.pt'))`) (possibly creating a .py script, no need to change layers yet)
3. run the baseline model and record baseline performance
4. write the code for editing layers in the pretrained model

# Import packages

In [1]:
import os
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]='1'
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:<1024>"
import sys
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import torchvision
from torchvision import datasets, transforms
import videotransforms
import numpy as np
from pytorch_i3d import InceptionI3d
import numpy as np
import glob
import random
from tensorboardX import SummaryWriter
from preprocess import holdout_set
import time

# Construct a dataset class for training the model:

In [2]:
class dataset(torch.utils.data.Dataset):
    
    def __init__(self, paths, v_names, v_labels, num_samples=16): # num_samples cannot be lower than 16
        self.num_samples = num_samples
        self.frames = dict()
        for p in paths:
            self.frames[p] = sorted(glob.glob(p+"/*.jpg"))
        self.data = paths
        self.video_names = v_names
        self.video_labels = v_labels
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        p = self.data[idx]
        num_frames = len(self.frames[p])-1
        sampled_idx = np.linspace(0, num_frames, self.num_samples) #get num_samples frames from the video
        images = []
        index = np.where(self.video_names == p.split('/')[-1]) #index of p's video name in video_names
        label_video = self.video_labels[index] # the labels for the video
        for i in sampled_idx:
            image = torchvision.io.read_image(self.frames[p][int(i)])
            small_dim = min(image.shape[-2:])
            image = torchvision.transforms.functional.center_crop(image, (small_dim, small_dim))
            image = torchvision.transforms.functional.resize(image, (224, 224), antialias=True)
            images.append(image)
        images = torch.stack(images, axis=1)
        images = (images/255)*2 - 1 #values are between -1 and 1
        return images, label_video 

# Extract data and labels

In [3]:
def creating_datasets():
    video_train, video_val, label_train, label_val, unique_labels = holdout_set(0.25) #valid names and videos
    batch_size = 10 # batch size in training
    num_videos_train = len(video_train)
    num_videos_val = len(video_val)
    num_classes = len(set(label_train)) #count unique in labels

    video_frames_path = "/scratch/network/hishimwe/image" 
    # only extract the videos with v_names and v_labels from preprocess.ipynb 
    paths = glob.glob(video_frames_path+"/*")
    random.seed(0)
    random.shuffle(paths)

    good_paths_train = list(filter(lambda c: c.split('/')[-1] in video_train, paths)) #should only get path where good video name; not sure if this filtering will work 
    good_paths_val = list(filter(lambda c: c.split('/')[-1] in video_val, paths)) # validation video paths 

    d_train = dataset(paths=good_paths_train, v_names=video_train, v_labels= label_train)
    d_val = dataset(paths=good_paths_val, v_names=video_val, v_labels= label_val)

    loader_train = torch.utils.data.DataLoader(d_train, shuffle=True, batch_size=batch_size, drop_last=False, num_workers=4)
    loader_val = torch.utils.data.DataLoader(d_val, shuffle=True, batch_size=batch_size, drop_last=False, num_workers=4)
    
    return loader_train, loader_val, unique_labels, num_classes

# Construct the model:

In [4]:
loader_train, loader_val, unique_labels, num_classes = creating_datasets() 

start_time = time.time() 
i3d = InceptionI3d(400, in_channels=3) # first input is num_classes 
i3d.load_state_dict(torch.load('rgb_imagenet.pt'), strict=False) #added strict = false; theoretically this lets us add layers
i3d.replace_logits(num_classes)
i3d.cuda()

print(f"time taken: {time.time()-start_time} seconds")

time taken: 1.1172072887420654 seconds


# Function to evaluate model performance:

In [5]:
#returns accuracy, f1 score, average f1, and confusion matrix for the data
def eval_metrics(ground_truth, predictions, num_classes):

    #dictionary containing the accuracy, precision, f1, avg f1, and confusion matrix for the data
    f1 = f1_score(y_true=ground_truth, y_pred=predictions, labels=np.arange(num_classes), average=None)
    metrics = {
        "accuracy": accuracy_score(y_true=ground_truth, y_pred=predictions),
        "f1": f1,
        "average f1": np.mean(f1),
        "confusion matrix": confusion_matrix(y_true=ground_truth, y_pred=predictions, labels=np.arange(num_classes)),
        "precision": precision_score(y_true=ground_truth, y_pred=predictions, labels=np.arange(num_classes), average=None)
        }
    
    return metrics

In [6]:
def training(model, optimizer, loader, num_classes):
    losses = []
    ground_truth = []
    predictions = []
    for data, label in loader:
        data = data.cuda()
        label = label.squeeze().type(torch.LongTensor).cuda()
        num_frames = data.size(2)
        per_frame_logits = i3d(data).mean(2)
        preds = per_frame_logits.cpu().detach().numpy().argmax(axis=1) # convert logits into predictions for evaluating accuracy
        
        # calculate and save loss
        loss = F.cross_entropy(per_frame_logits, label)
        losses.append(loss.item()) # append to losses
        ground_truth.extend(list(label.cpu().detach().numpy()))
        predictions.extend(preds.tolist())
        
        # perform gradient descent
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    metrics = eval_metrics(ground_truth, predictions, num_classes)   
    return np.mean(losses), metrics # one loss per epoch and the corresponding metrics        


In [None]:
def evaluate(model, loader, num_classes):
    losses = []
    ground_truth = []
    predictions = []
    for data, label in loader:
        data = data.cuda()
        label = label.squeeze().type(torch.LongTensor).cuda()
        num_frames = data.size(2)
        per_frame_logits = i3d(data).mean(2)
        preds = per_frame_logits.cpu().detach().numpy().argmax(axis=1) # convert logits into predictions for evaluating accuracy
        
        # calculate and save loss
        loss = F.cross_entropy(per_frame_logits, label)
        losses.append(loss.item()) # append to losses
        ground_truth.extend(list(label.cpu().detach().numpy()))
        predictions.extend(preds.tolist())
        
    metrics = eval_metrics(ground_truth, predictions, num_classes)
    return np.mean(losses), metrics # one loss per epoch and the corresponding metrics
    

# Train:

In [None]:
# set up gradient descent params
optimizer = optim.SGD(i3d.parameters(), lr=0.1, momentum=0.9, weight_decay=0.0000001) # weight_decay = l2 regularization
lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])

# set up training variables 
epochs = 10 

train_losses = []
train_accuracies = []
train_precisions = []
val_losses = []
val_accuracies = []
val_precisions = []

# train
for e in range(epochs):
    start_time = time.time()
    
    print("EPOCH", e)
    
    # training
    loss_train, metrics_train = training(model=i3d, optimizer=optimizer, loader=loader_train, num_classes=num_classes)
    train_losses.append(loss_train)
    train_accuracies.append(metrics_train["accuracy"])
    train_precisions.append(metrics_train["precision"])
    
    print("TRAINING")
    print("Loss", loss_train)
    print("Accuracy", metrics_train["accuracy"])
    print("Precision", metrics_train["precision"])
    
    # validation 
    loss_val, metrics_val = evaluate(model=i3d, loader=loader_val, num_classes=num_classes)
    val_losses.append(loss_val)
    val_accuracies.append(metrics_val["accuracy"])
    val_precisions.append(metrics_val["precision"])
    
    print("VALIDATION")
    print("Loss", loss_val)
    print("Accuracy", metrics_val["accuracy"])
    print("Precision", metrics_val["precision"])
        
    print(f"Time taken for epoch {e}: {(time.time()-start_time)/60} mins")
    print("-----------------------------------------------------------------------")

np.savetxt('losses/train/train_10epochs', np.array(train_losses), delimiter=",")
np.savetxt('losses/val/val_10epochs', np.array(val_losses), delimiter=",")

np.savetxt('accuracies/train/train_10epochs', np.array(train_accuracies), delimiter=",")
np.savetxt('accuracies/val/val_10epochs', np.array(val_accuracies), delimiter=",")

np.savetxt('precisions/train/train_10epochs', np.array(train_precisions), delimiter=",")
np.savetxt('precisions/val/val_10epochs', np.array(val_precisions), delimiter=",")


EPOCH 0
TRAINING
Loss 2.1675919307566085
Accuracy 0.27865667499306135
Precision [0.06666667 0.         0.37793852 0.13333333 0.2014218  0.18885449
 0.15602837 0.22886598 0.38219895 0.17241379 0.43773585]
VALIDATION
Loss 1.9157873461069155
Accuracy 0.33277870216306155
Precision [0.10650888 0.         0.53645833 0.         0.29530201 0.22164948
 0.33870968 0.31932773 0.66       0.34782609 0.50526316]
Time taken for epoch 0: 3.6150447646776835 mins
-----------------------------------------------------------------------
EPOCH 1


  _warn_prf(average, modifier, msg_start, len(result))


TRAINING
Loss 1.8293240440189014
Accuracy 0.3807937829586456
Precision [0.13095238 0.         0.48266167 0.23076923 0.30252101 0.25609756
 0.28623188 0.31130064 0.54625551 0.33802817 0.56370656]


  _warn_prf(average, modifier, msg_start, len(result))


VALIDATION
Loss 1.7715953928379973
Accuracy 0.40931780366056575
Precision [0.5        0.         0.51029748 0.33333333 0.37172775 0.232
 0.27950311 0.5        0.7826087  0.34615385 0.56410256]
Time taken for epoch 1: 3.613905648390452 mins
-----------------------------------------------------------------------
EPOCH 2


  _warn_prf(average, modifier, msg_start, len(result))


TRAINING
Loss 1.6888170220845293
Accuracy 0.4279766860949209
Precision [0.2238806  0.         0.51948052 0.34285714 0.34039334 0.35805627
 0.32142857 0.3650108  0.62857143 0.37142857 0.52982456]


  _warn_prf(average, modifier, msg_start, len(result))


VALIDATION
Loss 1.6227360796337285
Accuracy 0.44675540765391014
Precision [0.45       0.16666667 0.53444181 0.2        0.33333333 0.28729282
 0.48888889 0.43103448 0.675      0.16666667 0.58441558]
Time taken for epoch 2: 3.6262500087420144 mins
-----------------------------------------------------------------------
EPOCH 3
TRAINING
Loss 1.5211530218824456
Accuracy 0.49375520399666945
Precision [0.3814433  0.         0.59836901 0.375      0.40758294 0.36117936
 0.47811448 0.40976645 0.68312757 0.40601504 0.61672474]
VALIDATION
Loss 1.610532833524972
Accuracy 0.45757071547420963
Precision [0.1978022  0.         0.61068702 0.23076923 0.40394089 0.35779817
 0.30635838 0.44680851 0.78461538 0.48333333 0.7       ]
Time taken for epoch 3: 3.6009892503420513 mins
-----------------------------------------------------------------------
EPOCH 4
TRAINING
Loss 1.4230163585281108
Accuracy 0.5217873993893978
Precision [0.41322314 0.07692308 0.6130031  0.39622642 0.42767296 0.42931937
 0.46779661 0.4

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
print(train_losses)
print(val_losses)
print(train_accuracies)
print(val_accuracies)

# Save model

In [None]:
model_path = "models/baseline_10epochs"
torch.save(i3d, model_path)

Check saved output

In [None]:
torch.load(model_path)