Trying to follow this tutorial
https://www.ai-contentlab.com/2023/03/how-to-implement-timesformer-for-video.html

In [1]:
from TimeSformer.timesformer.models.vit import TimeSformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = TimeSformer(img_size=224, num_classes=3, num_frames=8, attention_type='divided_space_time')

In [3]:
import torch
import os
import cv2
import random
import numpy as np

In [4]:
IMG_SIZE = 224

In [127]:
# create 2 arrays. Append images to 1. Once it reach sequence. Append to another
def frames_from_video_file(video_path, MAX_SEQ_LENGTH, frame_step, output_size = (IMG_SIZE, IMG_SIZE)):
    result = []
  
    src = cv2.VideoCapture(str(video_path))  

    video_length = src.get(cv2.CAP_PROP_FRAME_COUNT)

    need_length = 1 + (MAX_SEQ_LENGTH - 1) * frame_step

    if need_length > video_length:
        start = 0
    else:
        max_start = video_length - need_length
        start = random.randint(0, max_start + 1)

    src.set(cv2.CAP_PROP_POS_FRAMES, start)
    # ret is a boolean indicating whether read was successful, frame is the image itself
    ret, frame = src.read()
    frame = cv2.resize(frame, output_size)
    #cv2.normalize(frame, frame, 0, 255, cv2.NORM_MINMAX)
    #frame = frame.transpose(2,1,0)
    #frame = torch.tensor(frame)
    #frame = frame.to(torch.float32)
    result.append(frame)

    for _ in range(MAX_SEQ_LENGTH - 1):
        for _ in range(frame_step):
            ret, frame = src.read()
        if ret:
            frame = cv2.resize(frame, output_size)
            #cv2.normalize(frame, frame, 0, 255, cv2.NORM_MINMAX)
            #frame = frame.transpose(2,1,0)
            #frame = torch.tensor(frame)
            #frame = frame.to(torch.float32)
            result.append(frame)
        else:
            frame = np.zeros_like(result[0])
            #frame = torch.tensor(frame)
            #frame = frame.to(torch.float32)
            result.append(frame)
    src.release()
    result = np.array(result)[0][..., [2, 1, 0]]
    return result

In [164]:
class VideoDataset(torch.utils.data.Dataset):
    def __init__(self, df, root_dir, MAX_SEQ_LENGTH, frame_step):
        self.video_paths = df["video-name"].values.tolist()
        self.labels = df["label"].values.tolist()
        self.n_frames = MAX_SEQ_LENGTH
        self.root_dir = root_dir
        self.frame_step = frame_step
        
    def __len__(self):
        return len(self.video_paths)
    
    def __getitem__(self, idx):
        path = self.video_paths[idx]
        label = self.labels[idx]
        frames = frames_from_video_file(os.path.join(self.root_dir, path), self.n_frames, self.frame_step)
        frames = np.float32(frames)
        #frames = torch.stack(frames)
        print(frames.shape)
        return frames, label

In [165]:
import pandas as pd

In [166]:
df = pd.read_csv("../data/mirror-data.csv")
df["Action"] = df["Action"].str.rstrip()
df = df[df.Action != "Talking&Yawning"]
df["label"] = df.Action.astype('category').cat.codes

In [167]:
i = 0
dfTrain = pd.DataFrame()
dfTest = pd.DataFrame()

while i<len(df):
    if i%5==0:
        dfTest = pd.concat([dfTest, df.iloc[[i]]])
    else :
        dfTrain = pd.concat([dfTrain, df.iloc[[i]]])
    i+=1

In [168]:
MAX_SEQ_LENGTH = 8
frame_step = 1

In [169]:
train_ds = VideoDataset(dfTrain, "../data/YawDD/YawDD dataset/Mirror/all/", MAX_SEQ_LENGTH, frame_step)
test_ds = VideoDataset(dfTest, "../data/YawDD/YawDD dataset/Mirror/all/", MAX_SEQ_LENGTH, frame_step)

In [170]:
test_ds.__getitem__(0)

(224, 224, 3)


(array([[[253., 253., 253.],
         [253., 253., 253.],
         [253., 253., 253.],
         ...,
         [236., 214., 220.],
         [238., 216., 221.],
         [226., 205., 210.]],
 
        [[253., 253., 253.],
         [253., 253., 253.],
         [253., 253., 253.],
         ...,
         [250., 243., 244.],
         [255., 246., 248.],
         [255., 249., 250.]],
 
        [[253., 253., 253.],
         [253., 253., 253.],
         [253., 253., 253.],
         ...,
         [254., 255., 254.],
         [253., 254., 252.],
         [247., 249., 246.]],
 
        ...,
 
        [[146., 167., 181.],
         [150., 168., 189.],
         [155., 170., 186.],
         ...,
         [252., 250., 251.],
         [250., 250., 253.],
         [249., 250., 253.]],
 
        [[143., 163., 177.],
         [147., 164., 185.],
         [151., 166., 182.],
         ...,
         [253., 246., 245.],
         [254., 249., 248.],
         [253., 251., 252.]],
 
        [[140., 160., 174.],
 

In [171]:
from torch.utils.data import DataLoader

In [172]:
train_loader = DataLoader(train_ds, batch_size=4)
val_loader = DataLoader(test_ds, batch_size=4)

In [173]:
f, l = next(iter(train_loader))

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)


In [174]:
f.size()

torch.Size([4, 224, 224, 3])

In [175]:
import torch.optim as optim
from torch import nn
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [176]:
num_epochs = 2

for epoch in range(num_epochs):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
    # Evaluate on validation set
    correct = 0
    total = 0
    with torch.no_grad():
        for data in val_loader:
            inputs, labels = data
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    val_acc = 100 * correct / total
    
    print('[%d] loss: %.3f, val_acc: %.3f' %
          (epoch + 1, running_loss / len(train_loader), val_acc))

(224, 224, 3)
(224, 224, 3)
(224, 224, 3)
(224, 224, 3)


ValueError: not enough values to unpack (expected 5, got 4)