In [1]:
!wget -q https://git.io/JGc31 -O ucf101_top5.tar.gz # downloading only a sample of ucf101 dataset
!tar xf ucf101_top5.tar.gz

In [1]:
import os
import cv2
import glob
import torch
import numpy as np
import pandas as pd
from torch import nn
from tqdm import tqdm
from IPython.core.display import Video
from torchvision import models, transforms
from torch.utils.data import TensorDataset, DataLoader

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
print(len(train_df), len(test_df))
print(train_df.head())

594 224
                  video_name          tag
0  v_CricketShot_g08_c01.avi  CricketShot
1  v_CricketShot_g08_c02.avi  CricketShot
2  v_CricketShot_g08_c03.avi  CricketShot
3  v_CricketShot_g08_c04.avi  CricketShot
4  v_CricketShot_g08_c05.avi  CricketShot


In [None]:
f = glob.glob('train/*.*')[42]
Video(f)

In [None]:
f = glob.glob('test/*.*')[42]
Video(f)

In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_epochs = 30
img_size = 299
batch_size = 64
max_seq_len = 20
n_features = 2048 # output shape of inception_v3, if the final fc layer is removed
hidden1 = 16
hidden2 = 8
drop_prob = 0.4
lr = 1e-3
print(device)

cuda


In [7]:
inception = models.inception_v3(pretrained=True, aux_logits=False)
inception.fc = nn.Identity()
inception = inception.to(device)
inp = torch.randn(1, 3, 299, 299).to(device)
out = inception(inp)
print(out.shape)
del inp, out

torch.Size([1, 2048])


In [8]:
labels = train_df['tag'].values.tolist()
i2l = list(np.unique(labels))
l2i = {}
for i, l in enumerate(i2l):
    l2i[l] = i
print(i2l, l2i)

['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing'] {'CricketShot': 0, 'PlayingCello': 1, 'Punch': 2, 'ShavingBeard': 3, 'TennisSwing': 4}


In [9]:
def crop_center_square(frame):
    y, x = frame.shape[0:2]
    min_dim = min(y, x)
    start_x = (x // 2) - (min_dim // 2)
    start_y = (y // 2) - (min_dim // 2)
    return frame[start_y : start_y + min_dim, start_x : start_x + min_dim]

def load_video(path, max_frames, resize):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center_square(frame)
            frame = cv2.resize(frame, resize)
            frames.append(frame)
            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return frames # (n_frames, h, w, 3)

In [10]:
@torch.no_grad()
def prepare_all_videos(net, df, root_dir, max_seq_len, resize):
    net.eval()
    num_samples = len(df)
    video_paths = df["video_name"].values.tolist()
    frame_features = []
    frame_lenghts = []
    labels = df["tag"].values
    labels = [l2i[l] for l in labels]

    for idx, path in enumerate(video_paths):
        frames = load_video(os.path.join(root_dir, path), max_seq_len, resize) # (n_frames, h, w, 3)
        curr_frame_featutes = np.zeros((max_seq_len, n_features))
        vid_len = len(frames)
        curr_len = min(max_seq_len, vid_len)

        for i, frame in enumerate(frames):
            frame = transforms.ToTensor()(frame).unsqueeze(0).to(device)
            curr_frame_featutes[i] = net(frame)[0].detach().cpu().numpy()
            if i + 1 == curr_len:
                break

        frame_features.append(curr_frame_featutes)
        frame_lenghts.append(curr_len)
    return torch.FloatTensor(frame_features), torch.IntTensor(frame_lenghts), torch.LongTensor(labels)

In [11]:
train_x, train_y, train_z = prepare_all_videos(inception, train_df, 'train', max_seq_len, (img_size, img_size))
test_x, test_y, test_z = prepare_all_videos(inception, test_df, 'test', max_seq_len, (img_size, img_size))

In [12]:
train_data = TensorDataset(train_x, train_y, train_z)
test_data = TensorDataset(test_x, test_y, test_z)
train_loader = DataLoader(train_data, batch_size=batch_size, num_workers=2, shuffle=True, pin_memory=True)
test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=2, shuffle=True, pin_memory=True)
x, y, z = next(iter(train_loader))
print(len(train_data), x.shape, y.shape, z.shape)

594 torch.Size([64, 20, 2048]) torch.Size([64]) torch.Size([64])


In [13]:
class SequentialModel(nn.Module):
    def __init__(self, input_size, hidden1, hidden2, output_size, drop_prob):
        super().__init__()
        self.gru1 = nn.GRU(input_size, hidden1, batch_first=True)
        self.gru2 = nn.GRU(hidden1, hidden2, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc1 = nn.Linear(hidden2, hidden2)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden2, output_size)

    def forward(self, frames, frame_lengths):
        packed_frames = nn.utils.rnn.pack_padded_sequence(frames, frame_lengths, batch_first=True)
        packed_x, _ = self.gru1(packed_frames)
        packed_x, h = self.gru2(packed_x)
        # x, x_len = nn.utils.rnn.pad_packed_sequence(packed_x)
        h = self.dropout(h.squeeze(0)) # (1, bs, d) -> (bs, d)
        h = self.relu(self.fc1(h))
        return self.fc2(h)

In [14]:
seq_model = SequentialModel(n_features, hidden1, hidden2, len(i2l), drop_prob).to(device)
inp = torch.randn(2, 5, 2048).to(device)
inp_len = torch.tensor([4, 3])
out = seq_model(inp, inp_len)
print(out.shape)
del inp, inp_len, out

torch.Size([2, 5])


In [15]:
optimizer = torch.optim.Adam(seq_model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()
def get_accuracy(preds, y):
    preds = preds.argmax(dim=1, keepdim=True)
    correct = preds.squeeze(1).eq(y)
    acc = correct.sum() / torch.FloatTensor([y.shape[0]]).to(device)
    return acc

In [16]:
def loop(net, loader, is_train):
    net.train(is_train)
    losses = []
    accs = []
    pbar = tqdm(loader, total=len(loader))
    for frames, frame_lengths, labels in pbar:
        frames = frames.to(device)
        frame_lengths = frame_lengths.cpu() # this needs to be on cpu
        labels = labels.to(device)
        with torch.set_grad_enabled(is_train):
            preds = net(frames, frame_lengths)
            loss = loss_fn(preds, labels)
            acc = get_accuracy(preds, labels)
            losses.append(loss.item())
            accs.append(acc.item())
        if is_train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        pbar.set_description(f'epoch={epoch}, train={int(is_train)}, loss={np.mean(losses):.4f}, acc={np.mean(accs):.4f}')

In [17]:
for epoch in range(n_epochs):
    loop(seq_model, train_loader, True)
    loop(seq_model, test_loader, False)

epoch=0, train=1, loss=1.6001, acc=0.2316: 100%|██████████| 10/10 [00:00<00:00, 31.96it/s]
epoch=0, train=0, loss=1.5896, acc=0.2617: 100%|██████████| 4/4 [00:00<00:00, 22.78it/s]
epoch=1, train=1, loss=1.5556, acc=0.3090: 100%|██████████| 10/10 [00:00<00:00, 33.65it/s]
epoch=1, train=0, loss=1.5577, acc=0.3828: 100%|██████████| 4/4 [00:00<00:00, 23.70it/s]
epoch=2, train=1, loss=1.5070, acc=0.3981: 100%|██████████| 10/10 [00:00<00:00, 34.87it/s]
epoch=2, train=0, loss=1.5219, acc=0.3633: 100%|██████████| 4/4 [00:00<00:00, 22.35it/s]
epoch=3, train=1, loss=1.4563, acc=0.4780: 100%|██████████| 10/10 [00:00<00:00, 33.80it/s]
epoch=3, train=0, loss=1.4910, acc=0.4062: 100%|██████████| 4/4 [00:00<00:00, 23.20it/s]
epoch=4, train=1, loss=1.3967, acc=0.5290: 100%|██████████| 10/10 [00:00<00:00, 34.95it/s]
epoch=4, train=0, loss=1.4429, acc=0.4375: 100%|██████████| 4/4 [00:00<00:00, 22.63it/s]
epoch=5, train=1, loss=1.3420, acc=0.5938: 100%|██████████| 10/10 [00:00<00:00, 34.04it/s]
epoch=5, 

In [20]:
@torch.no_grad()
def predict(net, seq_model, path, max_seq_len, resize):
    net.eval()
    seq_model.eval()
    frames = load_video(path, max_seq_len, resize) # (n_frames, h, w, 3)
    frame_features = np.zeros((max_seq_len, n_features))
    vid_len = len(frames)
    frame_len = min(max_seq_len, vid_len)
    for i, frame in enumerate(frames):
        frame = transforms.ToTensor()(frame).unsqueeze(0).to(device)
        frame_features[i] = net(frame)[0].detach().cpu().numpy()
        if i + 1 == frame_len:
            break

    frame_features = torch.FloatTensor([frame_features]).to(device)
    frame_len = torch.IntTensor([frame_len]).cpu()
    preds = seq_model(frame_features, frame_len)
    probs = preds.softmax(-1).squeeze(0)
    probs, idxs = probs.sort(descending=True)
    for prob, idx in zip(probs, idxs):
        print(f'P({i2l[idx]}) = {prob:.4f}')

In [22]:
idx = 42
vid_path = f"test/{test_df['video_name'][idx]}"
label = test_df['tag'][idx]
predict(inception, seq_model, vid_path, max_seq_len, (img_size, img_size))
print(f'label: {label}')
Video(f)

P(CricketShot) = 0.7285
P(Punch) = 0.2045
P(PlayingCello) = 0.0560
P(TennisSwing) = 0.0065
P(ShavingBeard) = 0.0045
label: CricketShot


In [23]:
idx = 69
vid_path = f"test/{test_df['video_name'][idx]}"
label = test_df['tag'][idx]
predict(inception, seq_model, vid_path, max_seq_len, (img_size, img_size))
print(f'label: {label}')
Video(f)

P(PlayingCello) = 0.9240
P(CricketShot) = 0.0508
P(TennisSwing) = 0.0168
P(Punch) = 0.0044
P(ShavingBeard) = 0.0040
label: PlayingCello
