In [3]:
import os
import re

import cv2
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
%matplotlib inline

In [4]:
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torchvision 
from torchvision import datasets, models
from torchvision.transforms import transforms
from torchvision.transforms import functional as F

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedShuffleSplit

<i>Adapted from code snippets in the "PyTorch Computer Vision Cookbook" tutorial on video classification.</i> [link](https://github.com/PacktPublishing/PyTorch-Computer-Vision-Cookbook/tree/master/Chapter10)

# Training Pipeline

<i>Important directories to note:</i>

1) <i>Main Folder </i><br>
2) <i>Folder containing videos </i> <br>
3) <i>Folder containing video frames </i>

For the training pipeline, the root directory is the folder containing the <i>train</i>, <i>val</i> and <i>test</i> folders.

e.g.
```
hmdb51/
├── test/
│   ├── brush_hair/
│   │   ├── April_09_brush_hair_u_nm_np1_ba_goo_0/
│   │   │   ├── frame0.jpg
│   │   │   ├── frame1.jpg
│   │   │   ├── frame10.jpg
|   |   |   ......
|   |   |...
│   ├── cartwheel/
│   │   ├── (Rad)Schlag_die_Bank!_cartwheel_f_cm_np1_le_med_0/
│   │   │   ├── frame0.jpg
|   |...
├── train/
│   ├── brush_hair/
│   │   ├── April_09_brush_hair_u_nm_np1_ba_goo_2/
│   │   │   ├── frame0.jpg
│   │   │   ├── frame1.jpg
│   │   │   ├── frame10.jpg
|   |   |   |...
|   |   |...
|   |...
└── val/
    ├── brush_hair/
    │   ├── April_09_brush_hair_u_nm_np1_ba_goo_1/
    │   │   ├── frame0.jpg
    │   │   ├── frame1.jpg
    │   │   ├── frame10.jpg
    │   ├── Aussie_Brunette_Brushing_Hair_II_brush_hair_u_nm_np2_le_goo_1/
    │   │   ├── frame0.jpg
```

In [5]:
# Base folder that holds the dataset folders.
# NOTE(review): machine-specific absolute Windows path — adjust before running elsewhere.
path2data = "C:\\Users\\USER\\Desktop\\Video_data"
# Folder with the original videos (the test clip loaded later in this notebook is an .avi under it)
folder    = "hmdb51_org"
# Folder scanned by _find_classes/get_videos below — presumably the extracted JPEG frames; TODO confirm
folder_jpg = "hmdb51_jpg"
# Full path to the original-video folder (not referenced again in the visible cells)
path2folder = os.path.join(path2data,folder)

In [6]:
def get_videos(root_dir, target_folder):
    """
    Collect every video entry under root_dir/target_folder, grouped by class.

    The target folder is expected to contain one sub-folder per class, and
    each class sub-folder one entry per video.

    Parameters:
        root_dir      - directory that contains target_folder
        target_folder - name of the folder to scan (e.g. train/val)

    returns:
        ids     - paths of all video entries, ordered by class name, then by entry name
        labels  - class name of each entry in ids (same length and order)
        classes - sorted list of the class folder names
    """
    target_folder_dir = os.path.join(root_dir, target_folder)
    # Only sub-folders count as classes (same rule as _find_classes). A stray
    # file next to the class folders would otherwise crash os.listdir below.
    classes = sorted(
        entry for entry in os.listdir(target_folder_dir)
        if os.path.isdir(os.path.join(target_folder_dir, entry))
    )
    labels = []
    ids = []

    for cat in classes:
        cat_dir = os.path.join(target_folder_dir, cat)
        video_path = [os.path.join(cat_dir, loc) for loc in sorted(os.listdir(cat_dir))]
        ids.extend(video_path)
        labels.extend([cat] * len(video_path))
    return ids, labels, classes
def _find_classes(root_dir):
    """
    List the class folders of root_dir and number them.

    Parameters:
        root_dir - directory whose immediate sub-folders are the classes

    returns:
        the sorted sub-folder names, and a dict mapping each name to its
        position in that sorted list
    """
    class_names = sorted(entry.name for entry in os.scandir(root_dir) if entry.is_dir())
    index_by_name = dict(zip(class_names, range(len(class_names))))
    return class_names, index_by_name

In [7]:
# Sorted class folder names under path2data/folder_jpg and their name -> index mapping
classes, class_to_idx = _find_classes(os.path.join(path2data,folder_jpg))

In [8]:
# Every video path with its class name; note that this rebinds `classes` from the previous cell
ids, labels ,classes = get_videos(path2data, folder_jpg)

In [9]:
class VideoDataset(Dataset):
    """
    Dataset of videos stored as folders of extracted frames.

    Expected layout: root_dir/split/<class name>/<video name>/<frame files>

    Parameters:
        root_dir    - folder that contains the split folders
        split       - name of the split folder to load (train/val)
        num_classes - keep only videos whose class index (position of the class
                      folder in sorted order) is below this number
        transform   - callable applied to every frame (a PIL image); when None,
                      a default resize + ToTensor + Normalize pipeline is used
        num_frames  - maximum number of frames loaded per video (default 16)

    __getitem__(idx)

    Parameters:
        idx

    returns:
        a tensor stacking the first num_frames transformed frames of the video
        (in natural frame order) and the integer index of its class.
        Videos with fewer frames yield a shorter tensor; a video folder without
        any frame makes torch.stack raise RuntimeError.

    get_videos(root_dir, target_folder)

    Parameters:
        root_dir
        target_folder - train/val

    returns:
        all video paths, corresponding labels and classes

    """
    def __init__(self, root_dir, split, num_classes, transform=None, num_frames=16):
        self.root_dir = root_dir
        self.num_classes = num_classes
        self.split = split
        self.num_frames = num_frames
        if transform is None:
            transform = transforms.Compose([
                transforms.Resize((112,112), interpolation=3),
                transforms.ToTensor(),
                transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            ])
        self.transform = transform

        all_ids, all_labels, _ = self.get_videos(root_dir, split)
        _, self.labels_dict = self._find_classes(os.path.join(root_dir, split))

        # Single pass so that ids and labels can never get out of step
        kept = [(id_, label) for id_, label in zip(all_ids, all_labels)
                if self.labels_dict[label] < self.num_classes]
        self.unique_ids = [id_ for id_, _ in kept]
        self.unique_labels = [label for _, label in kept]

        # Sorted distinct class names, as plain Python strings
        self.classes = sorted(set(self.unique_labels))

    def __getitem__(self, idx):
        video_dir = self.unique_ids[idx]
        frames = []
        for frame_name in self._list_frames(video_dir):
            # Close the file handle right away; force 3 channels because the
            # default Normalize step uses per-channel RGB statistics.
            with Image.open(os.path.join(video_dir, frame_name)) as img:
                frames.append(self.transform(img.convert("RGB")))

        label = self.labels_dict[self.unique_labels[idx]]
        frames_tr = torch.stack(frames)
        return frames_tr, label

    def __len__(self):
        return len(self.unique_ids)

    def _list_frames(self, video_dir):
        """Return the first num_frames entries of video_dir in natural (numeric-aware) order."""
        # os.listdir gives no ordering guarantee, and a plain sort would put
        # frame10 before frame2; the clip must keep its temporal order.
        frame_names = sorted(os.listdir(video_dir), key=self._natural_key)
        return frame_names[:self.num_frames]

    @staticmethod
    def _natural_key(name):
        """Sort key that compares digit runs as integers: frame2 < frame10."""
        # re.split with a capturing group alternates text, digits, text, ...
        # so odd positions always hold digit runs and types never mix.
        return [int(token) if position % 2 else token
                for position, token in enumerate(re.split(r'(\d+)', name))]

    def get_videos(self, root_dir, target_folder):
        target_folder_dir = os.path.join(root_dir, target_folder)
        # Only sub-folders are classes, matching _find_classes; otherwise a
        # stray file would end up as a label missing from labels_dict.
        classes = sorted(
            entry for entry in os.listdir(target_folder_dir)
            if os.path.isdir(os.path.join(target_folder_dir, entry))
        )
        labels = []
        ids = []

        for cat in classes:
            cat_dir = os.path.join(target_folder_dir, cat)
            video_path = [os.path.join(cat_dir, loc) for loc in sorted(os.listdir(cat_dir))]
            ids.extend(video_path)
            labels.extend([cat] * len(video_path))
        return ids, labels, classes

    def _find_classes(self, root_dir):
        """Return the sorted class folder names of root_dir and a name -> index dict."""
        classes = [d.name for d in os.scandir(root_dir) if d.is_dir()]
        classes.sort()
        class_to_idx = {class_name: i for i, class_name in enumerate(classes)}
        return classes, class_to_idx

In [28]:
# Re-point path2data at the folder that holds the train/val split folders used by VideoDataset.
# This overrides the value assigned in the earlier path-configuration cell.
path2data = "C:\\Users\\USER\\Desktop\\Video_data\\hmdb51"

In [29]:
# Per-split frame preprocessing. Both splits resize to 112x112, convert to a
# tensor and normalise; only the training split adds a random horizontal flip.
# NOTE(review): VideoDataset applies the transform frame by frame, so the flip
# is decided independently for every frame of a clip.
data_transform = {
    split: transforms.Compose(
        [transforms.Resize((112,112), interpolation=3)]
        + ([transforms.RandomHorizontalFlip()] if split == 'train' else [])
        + [
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        ]
    )
    for split in ('train', 'val')
}

In [30]:
def collate_fn_r3d_18(batch):
    """
    Collate (clip, label) samples into one batch for the r3d_18 model.

    Parameters:
        batch - sequence of (clip, label) pairs; a clip of length 0 is dropped
                together with its label

    returns:
        imgs_tensor   - the remaining clips stacked, with axes 1 and 2 swapped.
                        With (T, C, H, W) clips, as VideoDataset produces,
                        the result is (N, C, T, H, W)
        labels_tensor - 1-D tensor with the labels of the remaining clips

    raises:
        RuntimeError (from torch.stack) when no non-empty clip is left
    """
    # Filter clip and label as a pair. Filtering the clips first and zipping
    # the labels against the shortened list would shift labels onto the wrong clips.
    kept = [(imgs, label) for imgs, label in batch if len(imgs) > 0]
    imgs_batch = [imgs for imgs, _ in kept]
    label_batch = [torch.tensor(label) for _, label in kept]

    imgs_tensor = torch.stack(imgs_batch)
    imgs_tensor = torch.transpose(imgs_tensor, 2, 1)
    labels_tensor = torch.stack(label_batch)
    return imgs_tensor, labels_tensor

In [31]:
# One dataset per split, restricted to the first 4 classes, each with its own transform
video_data = {
    split: VideoDataset(path2data, split, 4, transform=data_transform[split])
    for split in ("train", "val")
}
# Matching loaders: shuffled batches of 4 clips, collated into the layout r3d_18 is fed with
data_loader = {
    split: DataLoader(dataset, batch_size=4, shuffle=True, collate_fn=collate_fn_r3d_18)
    for split, dataset in video_data.items()
}
# Number of videos per split
dataset_sizes = {split: len(dataset) for split, dataset in video_data.items()}

In [63]:
# Number of outputs = number of distinct class names kept in the training split
num_classes = len(video_data["train"].classes)
# torchvision's R3D-18 video network, created with pretrained=True
model = models.video.r3d_18(pretrained=True, progress=False)
# Replace the final fully-connected layer with one sized for our classes
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes) 

In [8]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [67]:
# Cross-entropy loss on the model outputs
criterion = nn.CrossEntropyLoss()
# SGD with momentum over all model parameters, so the whole network is fine-tuned
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum = 0.9)
# Multiply the learning rate by 0.1 every 40 scheduler steps. train_model steps the
# scheduler once per epoch, so with its default of 25 epochs the decay never triggers.
exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.1)

In [68]:
# Fine-tune with train_model's default number of epochs (25).
# train_model is defined in the last cell of this notebook, so that cell must be run before this one.
model, loss_history, metric_history = train_model(model, criterion, optimizer, exp_lr_scheduler)

Epoch 0/24
----------
train Loss: 1.0501 Acc: 0.5204
val Loss: 0.1784 Acc: 0.9710

Epoch 1/24
----------
train Loss: 0.4003 Acc: 0.8439
val Loss: 0.1409 Acc: 0.9565

Epoch 2/24
----------
train Loss: 0.2711 Acc: 0.9219
val Loss: 0.1277 Acc: 0.9565

Epoch 3/24
----------
train Loss: 0.3177 Acc: 0.8922
val Loss: 0.0726 Acc: 0.9855

Epoch 4/24
----------
train Loss: 0.3067 Acc: 0.8773
val Loss: 0.0775 Acc: 0.9565

Epoch 5/24
----------
train Loss: 0.1346 Acc: 0.9703
val Loss: 0.0906 Acc: 0.9710

Epoch 6/24
----------
train Loss: 0.1278 Acc: 0.9591
val Loss: 0.0802 Acc: 0.9710

Epoch 7/24
----------
train Loss: 0.1576 Acc: 0.9628
val Loss: 0.0828 Acc: 0.9710

Epoch 8/24
----------
train Loss: 0.1320 Acc: 0.9665
val Loss: 0.1350 Acc: 0.9710

Epoch 9/24
----------
train Loss: 0.1621 Acc: 0.9368
val Loss: 0.1061 Acc: 0.9710

Epoch 10/24
----------
train Loss: 0.1294 Acc: 0.9480
val Loss: 0.1108 Acc: 0.9710

Epoch 11/24
----------
train Loss: 0.1731 Acc: 0.9517
val Loss: 0.0708 Acc: 0.9710

Ep

# Testing your model 

<i> packages needed </i> <br>
<i> - cv2 </i> <br>
<i> - collections </i> <br>
<i> - time </i>

## Pipeline 

1) Load the video

2) Create a deque object with a maximum length of 16 frames

3) Read the video frame by frame and append each preprocessed frame to the deque

4) Once the deque is full, stack its frames into a tensor with the dimensions the model expects and run the prediction

In [6]:
# map_location lets a checkpoint saved on a GPU be opened on a CPU-only machine.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
checkpoint = torch.load("VideoClass.pth", map_location=device)
# The checkpoint overwrites every weight below, so skip downloading the pretrained ones
model = models.video.r3d_18(pretrained=False, progress=False)
num_features = model.fc.in_features
# The head size must match the class count the checkpoint was trained with (4 in the dataset cell)
model.fc = nn.Linear(num_features, 4)
# Inference setup: same device as the inputs, and eval mode so that BatchNorm
# uses its stored running statistics instead of per-batch statistics.
model = model.to(device)
model.eval()
# Only needed when resuming training:
#optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [80]:
from collections import deque
import time

In [78]:
# Test clip to classify, and the capture object the inference loop reads from
path2Video = "C:\\Users\\USER\\Desktop\\Video_data\\hmdb51_org\\catch\\Frisbee_catch_f_cm_np1_ri_med_1.avi"
cap = cv2.VideoCapture(path2Video)
# Frame rate reported by the container; used to pace playback
fps = cap.get(cv2.CAP_PROP_FPS)
# Deterministic preprocessing, identical to the 'val' pipeline used in training.
# No random flip at inference time: it would mirror individual frames of a clip at random.
transform = transforms.Compose([
        transforms.Resize((112,112), interpolation=3),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])

In [79]:
# Sliding window holding the 16 most recent preprocessed frames
frames = deque(maxlen = 16)

# Idempotent safety net: the network must sit on the same device as its inputs
# and run in eval mode (BatchNorm running statistics) for single-clip inference.
model.to(device)
model.eval()

# Some containers report 0 fps; only pace playback when the value is usable
frame_delay = 1 / fps if fps and fps > 0 else 0

try:
    while cap.isOpened():
        # Capture frame-by-frame
        ret, frame = cap.read()
        if not ret:
            break

        time.sleep(frame_delay)
        # OpenCV decodes frames as BGR, while the training frames were read with PIL;
        # convert so that the colour channels match what the network was trained on.
        img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        frames.append(transform(img))

        if len(frames) == frames.maxlen:
            # (T, C, H, W) -> (C, T, H, W) -> (1, C, T, H, W)
            frames_tensor = torch.stack(list(frames))
            frames_tensor = torch.transpose(frames_tensor, 1, 0)
            frames_tensor = frames_tensor.unsqueeze(0)
            # No gradients are needed for prediction
            with torch.no_grad():
                output = model(frames_tensor.to(device))
            action = video_data["train"].classes[torch.argmax(output,dim = 1).cpu().item()]

            frame = cv2.putText(frame,action,(20,20), cv2.FONT_HERSHEY_SIMPLEX ,
                       1, (255,0,0), 1, cv2.LINE_AA)

        # Display the resulting frame (still BGR, which is what imshow expects)
        cv2.imshow('frame',frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture and close the window even if an error interrupted the loop
    cap.release()
    cv2.destroyAllWindows()

In [59]:
import time
import copy
def train_model(model, criterion, optimizer, scheduler, num_epochs = 25,
                dataloaders = None, target_device = None):
    """
    Train `model`, validating after every epoch, and keep the best weights.

    Parameters:
        model         - network to train; moved to the target device
        criterion     - loss called as criterion(outputs, labels)
        optimizer     - optimizer stepped after every training batch
        scheduler     - learning-rate scheduler stepped once per epoch, after the training phase
        num_epochs    - number of epochs to run
        dataloaders   - dict with 'train' and 'val' iterables of (inputs, labels) batches;
                        defaults to the notebook-level `data_loader`
        target_device - device to train on; defaults to the notebook-level `device`

    returns:
        model          - the model loaded with the weights of the best validation accuracy
        loss_history   - {'train': [...], 'val': [...]} mean loss per epoch (floats)
        metric_history - {'train': [...], 'val': [...]} error rate (1 - accuracy) per epoch (floats)

    Per-epoch averages are taken over the samples actually seen, because the
    collate function may drop samples from a batch.
    """
    loaders = data_loader if dataloaders is None else dataloaders
    run_device = device if target_device is None else target_device

    since = time.time()
    loss_history = {"train": [], "val": []}
    metric_history = {"train": [], "val": []}

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    model.to(run_device)

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0
            samples_seen = 0

            # Iterate over data.
            for inputs, labels in loaders[phase]:
                inputs = inputs.to(run_device)
                labels = labels.to(run_device)
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics, kept as plain Python numbers so the histories
                # never hold (possibly CUDA) tensors
                batch_size = inputs.size(0)
                running_loss += loss.item() * batch_size
                running_corrects += torch.sum(preds == labels).item()
                samples_seen += batch_size
            if phase == 'train':
                scheduler.step()

            # Guard against an empty loader instead of dividing by zero
            denominator = max(samples_seen, 1)
            epoch_loss = running_loss / denominator
            epoch_acc = running_corrects / denominator

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            loss_history[phase].append(epoch_loss)
            metric_history[phase].append(1.0 - epoch_acc)

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))

    # Hand back the weights that scored best on validation, not the last ones
    model.load_state_dict(best_model_wts)
    return model, loss_history, metric_history