### Training with augmentation
Code taken from: https://github.com/piergiaj/pytorch-i3d/blob/master/train_i3d.py 

Note: This code was written for PyTorch 0.3. Version 0.4 and newer may cause issues.

# TO CHANGE BEFORE RUNNING

Set `augment = True` below for data to be augmented, and `false` otherwise.

In [1]:
is_augment = False

Set `dropout = True` below for dropout to be included, and `false` otherwise.

In [2]:
is_dropout = False

Dropout details:

In [3]:
dropout_details = "layer1_p0.5"

Learning rate

In [4]:
learning_rate = 0.1

Set `l2 = True` below for L2 Regularization, and `false` for L1 Regularization.

In [5]:
l2 = True

Set weight decay value, `wd`, for L2 Regularization

In [6]:
wd = 1e-10

Set `lambda` for L1 Regularization 

In [7]:
lambda1 = 1e-2

Set the number of epochs in training:

In [8]:
num_epochs = 30

**ALL FILES INCLUDING LOSSES AND THE MODEL WILL BE SAVED WITH THIS NAME:**

In [9]:
# "30epochs_wd_1e-07_dropout__augmented" means the there are 30 training epochs, weight decay is 1e-07, and that there is dropout and augmentation
save_name = f"{num_epochs}epochs"
if (not l2): save_name = save_name + "_l1_lr_" + str(learning_rate) + "_ld_" + str(lambda1) # l1 regularization
if l2: save_name = save_name + "_l2_lr_" + str(learning_rate) + "_wd_"+ str(wd) # l2 regularization
if is_dropout: save_name = save_name + "_dropout_"+dropout_details
if is_augment: save_name = save_name + "_augment"

In [10]:
# check save_name
save_name

'30epochs_l2_lr_0.1_wd_1e-10'

# CODE

Import packages

In [11]:
import os
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]='3'
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:<1024>"
import sys
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.autograd import Variable
import torchvision
from torchvision import datasets, transforms
import numpy as np
from pytorch_i3d import InceptionI3d
import numpy as np
import glob
import random
from tensorboardX import SummaryWriter
from preprocess import run_preprocessing, get_action, holdout_set
import time
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from PIL import Image, ImageSequence

# video augmentation scripts (c) 2018 okankop
from vidaug import *

Construct a dataset class for training the model:

In [12]:
class dataset(torch.utils.data.Dataset):
    
    def __init__(self, paths, v_names, v_labels, num_samples=16, transforms=None): # num_samples cannot be lower than 16
        self.num_samples = num_samples
        self.frames = dict()
        for p in paths:
            self.frames[p] = sorted(glob.glob(p+"/*.jpg"))
        self.data = paths
        self.video_names = v_names
        self.video_labels = v_labels
        self.transforms = transforms
    
    def __getitem__(self, idx):
        # get original video
        p = self.data[idx]
        
        # sample frames uniformly and create newly sampled video 
        num_frames = len(self.frames[p])-1
        sampled_idx = np.linspace(0, num_frames, self.num_samples) #get num_samples frames from the video
        images = []
        index = np.where(self.video_names == p.split('/')[-1]) #index of p's video name in video_names
        label_video = self.video_labels[index] # the labels for the video
        for i in sampled_idx:
            image = torchvision.io.read_image(self.frames[p][int(i)])
            small_dim = min(image.shape[-2:])
            image = torchvision.transforms.functional.center_crop(image, (small_dim, small_dim))
            image = torchvision.transforms.functional.resize(image, (224, 224), antialias=True)
            images.append(image)
        images = torch.stack(images, axis=1)
        
        # data augmentation 
        if (self.transforms is not None):
            images = np.array(self.transforms(images.numpy()))
            # normalize
            images = (images/255)*2 - 1 # values are between -1 and 1
            return torch.from_numpy(images).type(torch.FloatTensor), label_video 
        
        else: 
            images = (images/255)*2 - 1 #values are between -1 and 1
            return images, label_video 

    def __len__(self):
        return len(self.data)

Build transformations for data augmentation

In [13]:
if is_augment:
    sometimes = lambda aug: Sometimes(0.4, aug) # Used to apply augmentor with 40% probability
    rand_aug = SomeOf([ # randomly chooses two of the following augmentation methods 
        RandomRotate(degrees=10), # randomly rotates the video with a degree randomly choosen from [-10, 10] 
        RandomTranslate(x=40,y=20), # randomly shifting video in [-x, +x] and [-y, +y] coordinate
        RandomShear(x=0.2,y=0.1), # randomly shearing video in [-x, +x] and [-y, +y] directions.
        sometimes(HorizontalFlip()), # horizontally flip the video with 50% probability
        sometimes(GaussianBlur(sigma=random.uniform(0.5,4))), # blur images using gaussian kernels with std. dev. = sigma
        sometimes(ElasticTransformation(alpha=random.uniform(0,5), cval=int(random.uniform(0,255)), mode="nearest")), # moving pixels locally around using displacement fields
        sometimes(PiecewiseAffineTransform(displacement=15, displacement_kernel=1, displacement_magnification=1)), # places a regular grid of points on an image and randomly moves the neighbourhood of these point around via affine transformations
        sometimes(Add(value=int(random.uniform(-100,100)))), # add a value to all pixel intesities in an video
        sometimes(Multiply(value=2)), # multiply all pixel intensities with given value
        sometimes(Multiply(value=0.5)), # multiply all pixel intensities with given value
        sometimes(Pepper(ratio=25)), # sets a certain fraction of pixel intensities to 0
        sometimes(Salt(ratio=25)), # sets a certain fraction of pixel intensities to 255
    ], 2) # only select two of the above augmenters each time

Extract data and labels

In [14]:
video_train, video_val, label_train, label_val, unique_labels = holdout_set(0.25) #valid names and videos
batch_size = 10 # batch size in training
num_videos_train = len(video_train)
num_videos_val = len(video_val)
num_classes = len(set(label_train)) #count unique in labels

video_frames_path = "/scratch/network/hishimwe/image" 
# only extract the videos with v_names and v_labels from preprocess.ipynb 
paths = glob.glob(video_frames_path+"/*")
random.seed(0)
random.shuffle(paths)

good_paths_train = list(filter(lambda c: c.split('/')[-1] in video_train, paths)) #should only get path where good video name; not sure if this filtering will work 
good_paths_val = list(filter(lambda c: c.split('/')[-1] in video_val, paths)) # validation video paths 

if is_augment: d_train = dataset(paths=good_paths_train, v_names=video_train, v_labels= label_train, transforms=rand_aug)
else: d_train = dataset(paths=good_paths_train, v_names=video_train, v_labels= label_train)
d_val = dataset(paths=good_paths_val, v_names=video_val, v_labels= label_val)

loader_train = torch.utils.data.DataLoader(d_train, shuffle=True, batch_size=batch_size, drop_last=False, num_workers=4)
loader_val = torch.utils.data.DataLoader(d_val, shuffle=True, batch_size=batch_size, drop_last=False, num_workers=4)

Construct the model:

In [15]:
start_time = time.time() 
i3d = InceptionI3d(400, in_channels=3) # first input is num_classes in kinetics, this is replaced with replace_logits

if is_dropout: i3d.load_state_dict(torch.load('rgb_imagenet.pt'), strict=False) #added strict = false; theoretically this lets us add layers
else: i3d.load_state_dict(torch.load('rgb_imagenet.pt')) 

i3d.replace_logits(num_classes)
i3d.cuda()

print(f"time taken: {time.time()-start_time} seconds")

time taken: 0.9524366855621338 seconds


Function to evaluate model performance:

In [16]:
#returns accuracy, f1 score, average f1, and confusion matrix for the data
def eval_metrics(ground_truth, predictions, num_classes):

    #dictionary containing the accuracy, precision, f1, avg f1, and confusion matrix for the data
    f1 = f1_score(y_true=ground_truth, y_pred=predictions, labels=np.arange(num_classes), average=None)
    metrics = {
        "accuracy": accuracy_score(y_true=ground_truth, y_pred=predictions),
        "f1": f1,
        "average f1": np.mean(f1),
        "confusion matrix": confusion_matrix(y_true=ground_truth, y_pred=predictions, labels=np.arange(num_classes)),
        "precision": precision_score(y_true=ground_truth, y_pred=predictions, labels=np.arange(num_classes), average=None)
        }
    
    return metrics

Function to train and validate:

In [17]:
def training(model, optimizer, loader, num_classes, reg_type, ld=None):
    losses = []
    ground_truth = []
    predictions = []
    for data, label in loader:
        data = data.cuda()
        label = label.squeeze().type(torch.LongTensor).cuda()
        num_frames = data.size(2)
        per_frame_logits = i3d(data).mean(2)
        preds = per_frame_logits.cpu().detach().numpy().argmax(axis=1) # convert logits into predictions for evaluating accuracy
        
        # calculate and save loss
        loss = F.cross_entropy(per_frame_logits, label)
        losses.append(loss.item()) # append to losses
        ground_truth.extend(list(label.cpu().detach().numpy()))
        predictions.extend(preds.tolist())
        
        if (not reg_type): # l1 regularization
            params = torch.cat([p.view(-1) for p in model.parameters()]) # weights
            norm = torch.norm(params, 1)
            loss = loss - (ld * norm) # updating loss
             
        # back propagation    
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    metrics = eval_metrics(ground_truth, predictions, num_classes)   
    return np.mean(losses), metrics # one loss per epoch and the corresponding metrics        


In [18]:
def evaluate(model, loader, num_classes):
    losses = []
    ground_truth = []
    predictions = []
    for data, label in loader:
        data = data.cuda()
        label = label.squeeze().type(torch.LongTensor).cuda()
        num_frames = data.size(2)
        per_frame_logits = i3d(data).mean(2)
        preds = per_frame_logits.cpu().detach().numpy().argmax(axis=1) # convert logits into predictions for evaluating accuracy
        
        # calculate and save loss
        loss = F.cross_entropy(per_frame_logits, label)
        losses.append(loss.item()) # append to losses
        ground_truth.extend(list(label.cpu().detach().numpy()))
        predictions.extend(preds.tolist())
        
    metrics = eval_metrics(ground_truth, predictions, num_classes)
    return np.mean(losses), metrics # one loss per epoch and the corresponding metrics
    

Train

In [None]:
# set up gradient descent params

if (l2): # l2 regularization 
    optimizer = optim.SGD(i3d.parameters(), lr=learning_rate, momentum=0.9, weight_decay=wd) # weight_decay = l2 regularization
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])
else: # l1 regularization
    optimizer = optim.SGD(i3d.parameters(), lr=learning_rate, momentum=0.9) 
    lr_sched = optim.lr_scheduler.MultiStepLR(optimizer, [300, 1000])


# save performance
train_losses = []
train_accuracies = []
train_precisions = []
val_losses = []
val_accuracies = []
val_precisions = []

# train
for e in range(num_epochs):
    start_time = time.time()
    
    print("EPOCH", e)
    
    # training
    loss_train, metrics_train = training(model=i3d, optimizer=optimizer, loader=loader_train, num_classes=num_classes, reg_type=l2, ld=lambda1)
    train_losses.append(loss_train)
    train_accuracies.append(metrics_train["accuracy"])
    train_precisions.append(metrics_train["precision"])
    
    print("TRAINING")
    print("Loss", loss_train)
    print("Accuracy", metrics_train["accuracy"])
    print("Precision", metrics_train["precision"])
    
    # validation 
    loss_val, metrics_val = evaluate(model=i3d, loader=loader_val, num_classes=num_classes)
    val_losses.append(loss_val)
    val_accuracies.append(metrics_val["accuracy"])
    val_precisions.append(metrics_val["precision"])
    
    print("VALIDATION")
    print("Loss", loss_val)
    print("Accuracy", metrics_val["accuracy"])
    print("Precision", metrics_val["precision"])
        
    print(f"Time taken for epoch {e}: {(time.time()-start_time)/60} mins")
    print("-----------------------------------------------------------------------")

np.savetxt('/scratch/network/hishimwe/losses/train/train_'+ save_name, np.array(train_losses), delimiter=",")
np.savetxt('/scratch/network/hishimwe/losses/val/val_' + save_name, np.array(val_losses), delimiter=",")

np.savetxt('/scratch/network/hishimwe/accuracies/train/train_'+save_name, np.array(train_accuracies), delimiter=",")
np.savetxt('/scratch/network/hishimwe/accuracies/val/val_'+save_name, np.array(val_accuracies), delimiter=",")

np.savetxt('/scratch/network/hishimwe/precisions/train/train_'+save_name, np.array(train_precisions), delimiter=",")
np.savetxt('/scratch/network/hishimwe/precisions/val/val_'+save_name, np.array(val_precisions), delimiter=",")

EPOCH 0
TRAINING
Loss 2.177376237602445
Accuracy 0.2875381626422426
Precision [0.07142857 0.         0.37411972 0.         0.22783083 0.21666667
 0.15918367 0.23913043 0.3215859  0.14814815 0.43727599]
VALIDATION
Loss 1.9116446114768666
Accuracy 0.37271214642262895
Precision [0.         0.         0.50377834 0.         0.31111111 0.39393939
 0.16666667 0.25       0.62903226 0.16666667 0.33695652]
Time taken for epoch 0: 3.742797271410624 mins
-----------------------------------------------------------------------
EPOCH 1


  _warn_prf(average, modifier, msg_start, len(result))


TRAINING
Loss 1.8072564646477844
Accuracy 0.3993893977241188
Precision [0.175      0.         0.5075188  0.0625     0.28940217 0.35828877
 0.2519084  0.30460922 0.62443439 0.38961039 0.55925926]
VALIDATION
Loss 1.726477937264876
Accuracy 0.42678868552412647
Precision [0.         0.         0.60479042 0.         0.49462366 0.22741433
 0.30172414 0.35294118 0.6835443  0.32894737 0.73170732]
Time taken for epoch 1: 3.7373080333073934 mins
-----------------------------------------------------------------------
EPOCH 2


  _warn_prf(average, modifier, msg_start, len(result))


TRAINING
Loss 1.6528390119280512
Accuracy 0.4507354981959478
Precision [0.27272727 0.         0.54672897 0.3        0.35873016 0.34951456
 0.37275986 0.34509804 0.69196429 0.33333333 0.65098039]


  _warn_prf(average, modifier, msg_start, len(result))


VALIDATION
Loss 1.645756261900437
Accuracy 0.44093178036605657
Precision [0.33333333 0.         0.45824847 0.35       0.49350649 0.375
 0.35897436 0.32850242 0.64583333 0.66666667 0.62686567]
Time taken for epoch 2: 3.743498770395915 mins
-----------------------------------------------------------------------
EPOCH 3
TRAINING
Loss 1.5251612394140037
Accuracy 0.4893144601720788
Precision [0.27272727 0.125      0.58932238 0.3255814  0.40277778 0.37995338
 0.38636364 0.43531828 0.71493213 0.52991453 0.6394052 ]
VALIDATION
Loss 1.8452082832982717
Accuracy 0.4101497504159734
Precision [0.         0.         0.45348837 0.         0.32046332 0.27748691
 0.46666667 0.57142857 0.88571429 0.30097087 0.67741935]
Time taken for epoch 3: 3.741293517748515 mins
-----------------------------------------------------------------------
EPOCH 4


  _warn_prf(average, modifier, msg_start, len(result))


TRAINING
Loss 1.421807022471177
Accuracy 0.5281709686372468
Precision [0.3539823  0.45454545 0.61706556 0.37142857 0.44992051 0.40566038
 0.46712803 0.4679089  0.76763485 0.50746269 0.64664311]
VALIDATION
Loss 1.579698558188667
Accuracy 0.49417637271214643
Precision [0.33333333 0.25       0.63311688 0.42857143 0.35       0.33939394
 0.54166667 0.4921875  0.69318182 0.44262295 0.49122807]
Time taken for epoch 4: 3.7417317310969036 mins
-----------------------------------------------------------------------
EPOCH 5
TRAINING
Loss 1.2768532132673132
Accuracy 0.5731334998612267
Precision [0.4        0.27777778 0.67429194 0.45098039 0.47474747 0.45475113
 0.56230032 0.54623656 0.77290837 0.55782313 0.6292517 ]
VALIDATION
Loss 1.655060145973174
Accuracy 0.4525790349417637
Precision [0.2826087  0.2        0.68888889 0.6        0.31226766 0.30645161
 0.57377049 0.39920949 0.95238095 0.6875     0.5952381 ]
Time taken for epoch 5: 3.753821531931559 mins
-------------------------------------------

In [None]:
print(f"train_losses: {train_losses}")
print(f"val_losses: {val_losses}")
print(f"train_accuracies: {train_accuracies}")
print(f"val_accuracies: {val_accuracies}")

Save model

In [None]:
model_path = "/scratch/network/hishimwe/models_trained/" + save_name 
torch.save(i3d, model_path)

Check saved output

In [None]:
torch.load(model_path)