In [1]:
import numpy as np
import cv2
from PIL import Image
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torchvision 
from torchvision import datasets, models
from torchvision.transforms import transforms
from torchvision.transforms import functional as F

from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedShuffleSplit

This notebook follows the video classification tutorial from the "PyTorch Computer Vision Cookbook" (Chapter 10):
https://github.com/PacktPublishing/PyTorch-Computer-Vision-Cookbook/tree/master/Chapter10 <br> a.ravel()

In [3]:
# Root directory that holds all video data.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
path2data = "C:\\Users\\JUNHAO.KOH\\Desktop\\Video_data"
# Sub-folder of path2data that is passed to preprocess_videos as the source-video folder.
folder    = "hmdb51_org"
# Sub-folder of path2data that get_videos / _find_classes scan for per-class frame folders.
folder_jpg = "hmdb51_jpg"
# Full path to the source-video folder.
path2folder = os.path.join(path2data,folder)

In [133]:
from sklearn.model_selection import train_test_split
def preprocess_videos(root_dir, folder, dataset_name = "hmdb51"):
    """Split the videos of every class into train/val/test and dump their frames.

    The source layout is ``root_dir/folder/<class>/<video>.avi``. For each class
    the ``.avi`` files are split 80/20 into train+val / test and the train+val
    part is split 80/20 again into train / val (both with ``random_state=42``).
    Frames are extracted with ``get_frames`` (``n_frames=16``) and written with
    ``store_frames`` to
    ``root_dir/dataset_name/<split>/<class>/<video name without extension>/``.

    Args:
        root_dir: Directory containing both the source folder and the output.
        folder: Name of the source folder inside ``root_dir``.
        dataset_name: Name of the output folder inside ``root_dir``.

    Notes:
        * Entries of the source folder that are not directories, and classes
          without any ``.avi`` file, are skipped.
        * ``train_test_split`` presumably still raises for classes with too few
          videos to split — verify if such classes exist.
    """
    extension = ".avi"
    n_frames = 16
    data_train_dir = os.path.join(root_dir, dataset_name)
    # Always make sure the split folders exist, even when the dataset folder
    # itself was created by an earlier (possibly interrupted) run.
    for split in ("train", "val", "test"):
        os.makedirs(os.path.join(data_train_dir, split), exist_ok=True)

    target_folder_dir = os.path.join(root_dir, folder)
    classes = sorted(os.listdir(target_folder_dir))

    for class_name in classes:
        class_dir = os.path.join(target_folder_dir, class_name)
        if not os.path.isdir(class_dir):
            # Stray files next to the class folders are not classes.
            continue

        # Sort so that the seeded split does not depend on directory order, and
        # keep only real video files so other files cannot skew the split sizes.
        class_videos = sorted(
            name for name in os.listdir(class_dir) if name.endswith(extension)
        )
        if not class_videos:
            continue

        train_val_videos, test_videos = train_test_split(class_videos, test_size=0.2, random_state=42)
        train_videos, val_videos = train_test_split(train_val_videos, test_size=0.2, random_state=42)

        for split, video_names in (("train", train_videos),
                                   ("val", val_videos),
                                   ("test", test_videos)):
            _process_split(video_names, class_dir,
                           os.path.join(data_train_dir, split, class_name), n_frames)


def _process_split(video_names, class_dir, class_out_dir, n_frames):
    """Extract and store the frames of each listed video under ``class_out_dir``."""
    for video_name in video_names:
        path2vid = os.path.join(class_dir, video_name)
        frames, _ = get_frames(path2vid, n_frames = n_frames)
        # Derive the folder name from the file name only; splitting the full
        # path on "\\" works on Windows alone and breaks elsewhere.
        vid_name = os.path.splitext(video_name)[0]
        path2store = os.path.join(class_out_dir, vid_name)
        os.makedirs(path2store, exist_ok=True)
        store_frames(frames, path2store)


def store_frames(frames, path2store):
    """Save every frame as ``frame<index>.jpg`` inside ``path2store``.

    Each frame is passed through an RGB->BGR colour conversion before being
    handed to ``cv2.imwrite``. Files are numbered in iteration order from 0.
    """
    index = 0
    for rgb_frame in frames:
        bgr_frame = cv2.cvtColor(rgb_frame, cv2.COLOR_RGB2BGR)
        file_name = "frame" + str(index) + ".jpg"
        cv2.imwrite(os.path.join(path2store, file_name), bgr_frame)
        index += 1
        
def get_frames(filename, n_frames= 1):
    """Read evenly spaced frames from a video file.

    ``n_frames + 1`` indices are spread evenly over ``[0, v_len - 1]`` (both
    end points included), so up to ``n_frames + 1`` frames are returned; fewer
    when indices coincide (short videos) or when a frame cannot be read.

    Args:
        filename: Path of the video, handed to ``cv2.VideoCapture``.
        n_frames: Controls the number of sampled positions (see above).

    Returns:
        ``(frames, v_len)`` where ``frames`` is the list of sampled frames after
        a BGR->RGB conversion and ``v_len`` is the frame count reported by the
        capture object.
    """
    frames = []
    v_cap = cv2.VideoCapture(filename)
    try:
        v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # int64 instead of int16: int16 overflows for videos longer than 32767
        # frames, which silently drops every sampled position past that point.
        # A set gives O(1) membership tests inside the read loop.
        wanted = set(np.linspace(0, v_len-1, n_frames+1, dtype=np.int64).tolist())

        for fn in range(v_len):
            success, frame = v_cap.read()
            if not success:
                # Unreadable frame: keep going so the frame counter stays aligned.
                continue
            if fn in wanted:
                frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    finally:
        # Release the capture even if reading/decoding raised.
        v_cap.release()
    return frames, v_len

In [134]:
# Extract frames from every video under path2data/folder into train/val/test folders
# below path2data/"hmdb51" (the default dataset_name of preprocess_videos).
# NOTE(review): later cells read frames from folder_jpg ("hmdb51_jpg"), not from this
# output location — confirm which directory layout is intended.
preprocess_videos(path2data, folder)

In [14]:
def get_videos(root_dir, target_folder):
    """List every per-video entry below ``root_dir/target_folder`` with its class.

    The expected layout is ``<target folder>/<class>/<video entry>``.

    Args:
        root_dir: Directory that contains ``target_folder``.
        target_folder: Name of the folder holding one sub-directory per class.

    Returns:
        ``(ids, labels, classes)``: ``ids`` are the full paths of all entries
        inside the class folders (sorted per class), ``labels`` the class name
        of each id, and ``classes`` the sorted list of class folder names.
    """
    target_folder_dir = os.path.join(root_dir, target_folder)
    # Only sub-directories are classes (same rule as _find_classes); a stray
    # file here would otherwise make os.listdir below fail.
    classes = sorted(
        entry for entry in os.listdir(target_folder_dir)
        if os.path.isdir(os.path.join(target_folder_dir, entry))
    )
    labels = []
    ids = []

    for cat in classes:
        cat_dir = os.path.join(target_folder_dir, cat)
        video_paths = [os.path.join(cat_dir, loc) for loc in sorted(os.listdir(cat_dir))]
        ids.extend(video_paths)
        labels.extend([cat] * len(video_paths))
    return ids, labels, classes
def _find_classes(root_dir):
    classes = [d.name for d in os.scandir(root_dir) if d.is_dir()]
    classes.sort()
    class_to_idx = {class_name: i for i, class_name in enumerate(classes)}
    return classes, class_to_idx

In [15]:
# Sorted class names and a class-name -> integer-index mapping, taken from the
# sub-directories of the frames folder.
classes, class_to_idx = _find_classes(os.path.join(path2data,folder_jpg))

In [16]:
# Collect the path of every per-video entry (ids) together with its class name (labels).
# Note: this rebinds `classes`, which was already assigned from _find_classes.
ids, labels ,classes = get_videos(path2data, folder_jpg)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
# Hold out 20% of the videos for testing, then 20% of the remainder for validation
# (both calls use random_state=42).
# NOTE(review): no `stratify=` argument is passed, so per-class proportions are not
# enforced across the splits — confirm this is intended.
train_val_ids, test_ids, train_val_labels,  test_labels = train_test_split(ids, labels, test_size=0.2, random_state=42)
train_ids, val_ids, train_labels,  val_labels = train_test_split(train_val_ids,train_val_labels, test_size=0.2, random_state=42)

In [77]:
class VideoDataset(Dataset):
    """Dataset of videos stored as folders of frame images.

    Args:
        ids: Sequence of folder paths, one folder of frame images per video.
        labels: Sequence of class names, aligned with ``ids``.
        labels_dict: Mapping from class name to the label value to return.
        transform: Optional callable applied to every opened frame image. When
            omitted, frames are converted with ``F.to_tensor`` (``F`` being
            ``torchvision.transforms.functional`` as imported by this notebook)
            so that they can still be stacked.
        n_frames: Maximum number of frames loaded per video (default 16).

    Each item is ``(frames, label)`` where ``frames`` is the stack of the
    per-frame tensors in frame-number order. A folder without frames yields an
    empty tensor (``len(frames) == 0``) so that a collate function can drop it.
    """
    def __init__(self,ids, labels,labels_dict, transform=None, n_frames=16):
        self.ids = ids
        self.labels = labels
        self.labels_dict = labels_dict
        self.transform = transform
        self.n_frames = n_frames

    @staticmethod
    def _frame_sort_key(frame_name):
        """Order frame files by the number in their name (frame2 before frame10)."""
        stem = os.path.splitext(frame_name)[0]
        digits = "".join(ch for ch in stem if ch.isdigit())
        return (int(digits) if digits else -1, frame_name)

    def __getitem__(self, idx):
        video_dir = self.ids[idx]
        # os.listdir returns entries in arbitrary order; sort numerically so the
        # clip keeps its temporal order before it is truncated to n_frames.
        frame_names = sorted(os.listdir(video_dir), key=self._frame_sort_key)
        frame_names = frame_names[:self.n_frames]

        frames = []
        for frame_name in frame_names:
            # The context manager closes the image file once it has been converted.
            with Image.open(os.path.join(video_dir, frame_name)) as img:
                if self.transform:
                    frame = self.transform(img)
                else:
                    # PIL images cannot be stacked; fall back to a plain tensor.
                    frame = F.to_tensor(img)
            frames.append(frame)

        label= self.labels_dict[self.labels[idx]]
        if not frames:
            # torch.stack rejects an empty list; return an empty tensor instead.
            return torch.empty(0), label
        frames_tr = torch.stack(frames)
        return frames_tr, label

    def __len__(self):
        return len(self.ids)



In [78]:
# Per-phase preprocessing pipelines applied to every frame image.
# Both resize to 112x112, convert to a tensor and normalise per channel; only the
# training pipeline adds a random horizontal flip.
# NOTE(review): `interpolation=3` is presumably PIL's bicubic filter code — confirm;
# newer torchvision versions expect an InterpolationMode value instead of an int.
# NOTE(review): the mean/std values look like the usual ImageNet statistics — confirm
# they match what the pretrained video model expects.
data_transform = {
    'train': transforms.Compose([
        transforms.Resize((112,112), interpolation=3),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    'val': transforms.Compose([
        transforms.Resize((112,112), interpolation=3),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

In [79]:
# Wrap the train / validation splits as datasets. Class names are mapped to integer
# labels through class_to_idx; each split uses its own transform pipeline.
train_ds = VideoDataset(train_ids, train_labels, class_to_idx,transform = data_transform["train"])
val_ds = VideoDataset(val_ids, val_labels, class_to_idx,transform = data_transform["val"])

In [81]:
def collate_fn_r3d_18(batch):
    """Collate ``(frames, label)`` samples into batched tensors.

    Samples whose frame tensor is empty (``len(frames) == 0``) are dropped
    together with their label. The remaining frame tensors are stacked and
    dimensions 1 and 2 are swapped — assuming each sample is laid out as
    (frames, channels, H, W), the batch becomes (batch, channels, frames, H, W).

    Args:
        batch: Iterable of ``(frames, label)`` pairs.

    Returns:
        ``(imgs_tensor, labels_tensor)`` for the kept samples.
    """
    # Filter frames and labels as pairs. Filtering the frames first and then
    # zipping labels against the filtered list shifts labels onto wrong samples.
    kept = [(imgs, label) for imgs, label in batch if len(imgs) > 0]
    imgs_batch = [imgs for imgs, _ in kept]
    label_batch = [torch.tensor(label) for _, label in kept]
    imgs_tensor = torch.stack(imgs_batch)
    imgs_tensor = torch.transpose(imgs_tensor, 2, 1)
    labels_tensor = torch.stack(label_batch)
    return imgs_tensor,labels_tensor

In [82]:
# Batch the datasets with the custom collate function. Only the training loader
# shuffles; validation uses a batch twice as large (2*16).
train_dl = DataLoader(train_ds, batch_size= 16, 
                      shuffle=True, collate_fn = collate_fn_r3d_18)
val_dl = DataLoader(val_ds, batch_size= 2*16, 
                     shuffle=False, collate_fn = collate_fn_r3d_18)


In [83]:
# Load torchvision's r3d_18 video model with pretrained weights and replace its final
# fully connected layer so it produces one output per class.
# NOTE(review): `pretrained=True` is presumably superseded by a `weights=` argument in
# newer torchvision releases — confirm against the installed version.
num_classes = len(classes)
model = models.video.r3d_18(pretrained=True, progress=False)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, num_classes) 

In [57]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [93]:
# Display the selected device.
device

device(type='cuda')

In [96]:
# Look-up tables keyed by phase name. train_model reads `data_loader` and
# `dataset_sizes` as globals, so they must exist before it is called.
video_data = {"train": train_ds,
             "val": val_ds}
data_loader = {"train": train_dl,
              "val":val_dl}
# Number of samples per phase; used to average the loss and accuracy per epoch.
dataset_sizes = {x: len(video_data[x]) for x in ['train', 'val']}

In [97]:
# Display the number of samples in each phase.
dataset_sizes

{'train': 4329, 'val': 1083}

In [98]:
# Cross-entropy loss, SGD with momentum over all model parameters, and a step
# scheduler that multiplies the learning rate by 0.1 every 40 scheduler steps.
# NOTE(review): train_model steps the scheduler once per epoch and defaults to 25
# epochs, so with step_size=40 the decay never triggers in a default run.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr = 0.001, momentum = 0.9)
exp_lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=40, gamma=0.1)

In [99]:
# Fine-tune the model; returns the model with the best validation weights loaded,
# plus the per-epoch loss history and the per-epoch (1 - accuracy) history.
# NOTE(review): train_model is defined in a cell further down this notebook, so this
# call fails on a fresh kernel run top-to-bottom — move the definition above this cell.
model, loss_history, metric_history = train_model(model, criterion, optimizer, exp_lr_scheduler)

Epoch 0/24
----------
train Loss: 2.0309 Acc: 0.5318
val Loss: 1.6530 Acc: 0.5743

Epoch 1/24
----------
train Loss: 1.4849 Acc: 0.6387
val Loss: 1.3981 Acc: 0.6214

Epoch 2/24
----------


KeyboardInterrupt: 

In [94]:
import time
import copy
def train_model(model, criterion, optimizer, scheduler, num_epochs = 25):
    """Train ``model`` and keep the weights with the best validation accuracy.

    Every epoch runs a 'train' phase followed by a 'val' phase. The function
    reads the module-level globals ``device``, ``data_loader`` (dict of
    iterables of ``(inputs, labels)`` batches keyed by phase) and
    ``dataset_sizes`` (dict of sample counts keyed by phase).

    Args:
        model: Network to train; it is moved to ``device``.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer over the model parameters.
        scheduler: LR scheduler, stepped once after each training phase.
        num_epochs: Number of epochs to run.

    Returns:
        ``(model, loss_history, metric_history)``. The model has the best
        validation weights loaded. Both histories are dicts with 'train' and
        'val' lists of Python floats, one entry per epoch; ``metric_history``
        stores ``1 - accuracy``.
    """
    since = time.time()
    loss_history = {"train": [], "val": []}
    metric_history = {"train": [], "val": []}

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    model.to(device)
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in data_loader[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # statistics, kept as plain Python numbers so the histories do
                # not hold device tensors (and an empty loader cannot break
                # the averaging below)
                running_loss += loss.item() * inputs.size(0)
                running_corrects += int(torch.sum(preds == labels))
            if phase == 'train':
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))
            loss_history[phase].append(epoch_loss)
            metric_history[phase].append(1.0 - epoch_acc)

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())

        print()

    # Report the elapsed time (previously computed but never used) and best score.
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:.4f}'.format(best_acc))
    model.load_state_dict(best_model_wts)
    return model, loss_history, metric_history