4-5 second clips

In [1]:
from TimeSformer.timesformer.models.vit import TimeSformer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = TimeSformer(img_size=56, num_classes=2, num_frames=75, attention_type='divided_space_time', pretrained_model='./modelZoo/K400-96.pyth')

In [3]:
import torch
import os
import cv2
import random
import numpy as np
import matplotlib.pyplot as plt

In [4]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [5]:
IMG_SIZE = 56

In [6]:
def frames_from_video_file(video_path, MAX_SEQ_LENGTH, frame_step, output_size = (IMG_SIZE, IMG_SIZE)):
    result = []
  
    src = cv2.VideoCapture(str(video_path))  

    video_length = src.get(cv2.CAP_PROP_FRAME_COUNT)

    need_length = 1 + (MAX_SEQ_LENGTH - 1) * frame_step

    if need_length > video_length:
        start = 0
    else:
        max_start = video_length - need_length
        start = random.randint(0, max_start + 1)

    src.set(cv2.CAP_PROP_POS_FRAMES, start)
    # ret is a boolean indicating whether read was successful, frame is the image itself
    ret, frame = src.read()
    frame = cv2.resize(frame, output_size)
    result.append(frame)

    for _ in range(MAX_SEQ_LENGTH - 1):
        for _ in range(frame_step):
            ret, frame = src.read()
        if ret:
            frame = cv2.resize(frame, output_size)
            result.append(frame)
        else:
            frame = np.zeros_like(result[0])
            result.append(frame)
    src.release()
    result = np.array(result)
    return result

In [7]:
class VideoDataset(torch.utils.data.Dataset):
    def __init__(self, df, root_dir, MAX_SEQ_LENGTH, frame_step):
        self.video_paths = df["video-name"].values.tolist()
        self.labels = df["label"].values.tolist()
        self.n_frames = MAX_SEQ_LENGTH
        self.root_dir = root_dir
        self.frame_step = frame_step
        
    def __len__(self):
        return len(self.video_paths)
    
    def __getitem__(self, idx):
        path = self.video_paths[idx]
        label = self.labels[idx]
        frames = frames_from_video_file(os.path.join(self.root_dir, path), self.n_frames, self.frame_step)
        frames = np.float32(frames) / 255
        frames = np.moveaxis(frames, -1, 0)
        return frames, label

In [8]:
import pandas as pd

In [9]:
df = pd.read_csv("../data/v6.csv")
df["Action"] = df["Action"].str.rstrip()
df["label"] = (df["Action"]=="yawn")
df["label"] = df["label"].astype("int")

In [10]:
df.head()

Unnamed: 0,Action,video-name,label
0,normal,normal/1-FemaleNoGlasses-Normal.avi,0
1,normal,normal/1-MaleSunGlasses-Normal.avi,0
2,normal,normal/2-FemaleNoGlasses-Normal.avi,0
3,normal,normal/2-MaleGlasses-Normal.avi,0
4,normal,normal/20-FemaleNoGlasses-Normal.avi,0


In [11]:
i = 0
dfTrain = pd.DataFrame()
dfTest = pd.DataFrame()

while i<len(df):
    if i%5==0:
        dfTest = pd.concat([dfTest, df.iloc[[i]]])
    else :
        dfTrain = pd.concat([dfTrain, df.iloc[[i]]])
    i+=1

In [12]:
dfTrain = dfTrain.sample(frac=1, random_state=42)
dfTest = dfTest.sample(frac=1, random_state=42)

In [13]:
print(dfTest[dfTest.label == 0].shape[0])
print(dfTest[dfTest.label == 1].shape[0])

6
5


In [14]:
MAX_SEQ_LENGTH = 75
frame_step = 2

In [15]:
train_ds = VideoDataset(dfTrain, "../data/v6/", MAX_SEQ_LENGTH, frame_step)
test_ds = VideoDataset(dfTest, "../data/v6/", MAX_SEQ_LENGTH, frame_step)

In [16]:
x,y = test_ds.__getitem__(8)

In [17]:
x[0][0]

array([[0.28627452, 0.26666668, 0.26666668, ..., 0.23921569, 0.27450982,
        0.28627452],
       [0.28235295, 0.2627451 , 0.2627451 , ..., 0.07843138, 0.23921569,
        0.26666668],
       [0.27450982, 0.27058825, 0.26666668, ..., 0.12156863, 0.1882353 ,
        0.23921569],
       ...,
       [0.03137255, 0.03529412, 0.03529412, ..., 0.23921569, 0.26666668,
        0.36078432],
       [0.03137255, 0.03529412, 0.03921569, ..., 0.23529412, 0.23921569,
        0.2509804 ],
       [0.03529412, 0.03921569, 0.03921569, ..., 0.22745098, 0.23137255,
        0.23529412]], dtype=float32)

In [18]:
from torch.utils.data import DataLoader

In [19]:
train_loader = DataLoader(train_ds, batch_size=1)
val_loader = DataLoader(test_ds, batch_size=1)

In [20]:
import torch.optim as optim
from torch import nn
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [21]:
model.to(device)

TimeSformer(
  (model): VisionTransformer(
    (dropout): Dropout(p=0.0, inplace=False)
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (time_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0): Block(
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
        )
        (temporal_norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (temporal_attn): Attention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
   

In [22]:
num_epochs = 10
train_accuracies = []
val_accuracies = []

def accuracy(predictions, labels):
    classes = torch.argmax(predictions, dim=1)
    return torch.mean((classes == labels).float())

for epoch in range(num_epochs):
    running_loss = 0.0
    running_acc = 0.0

    for i, data in enumerate(train_loader, 0):
        inputs, labels = data[0].to(device), data[1].to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        running_acc += accuracy(outputs, labels)

    # Calculate training accuracy for the epoch
    train_acc_epoch = 100 * running_acc / len(train_loader)
    train_accuracies.append(train_acc_epoch)

    # Evaluate on validation set
    correct = 0

    with torch.no_grad():
        for data in val_loader:
            inputs, labels = data[0].to(device), data[1].to(device)
            correct += accuracy(outputs, labels)
    val_acc = 100 * correct / len(val_loader)
    val_accuracies.append(val_acc)

    print('[%d] loss: %.3f, train_acc: %.3f, val_acc: %.3f' %
          (epoch + 1, running_loss / len(train_loader), train_acc_epoch, val_acc))

[1] loss: 1.470, train_acc: 55.814, val_acc: 54.545
[2] loss: 0.922, train_acc: 48.837, val_acc: 54.545
[3] loss: 0.835, train_acc: 34.884, val_acc: 54.545
[4] loss: 0.795, train_acc: 51.163, val_acc: 54.545
[5] loss: 0.741, train_acc: 44.186, val_acc: 54.545
[6] loss: 0.727, train_acc: 37.209, val_acc: 54.545
[7] loss: 0.723, train_acc: 37.209, val_acc: 54.545
[8] loss: 0.723, train_acc: 34.884, val_acc: 54.545
[9] loss: 0.712, train_acc: 34.884, val_acc: 54.545
[10] loss: 0.709, train_acc: 34.884, val_acc: 54.545
