In [1]:
import os

import av
import cv2
import numpy as np
import pandas as pd
from pathlib import Path

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import albumentations as A

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import AutoProcessor, AutoModel

In [2]:
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [3]:
batch_size = 1
root_dir = '../data/sibur_data/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [4]:
def apply_video_augmentations(video, transform):
    targets={'image': video[0]}
    for i in range(1, video.shape[0]):
        targets[f'image{i}'] = video[i]
    transformed = transform(**targets)
    transformed = np.concatenate(
        [np.expand_dims(transformed['image'], axis=0)] 
        + [np.expand_dims(transformed[f'image{i}'], axis=0) for i in range(1, video.shape[0])]
    )
    return transformed

In [5]:
def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, seg_len):
    start_idx, end_idx = 0, seg_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

# Dataset preparation

In [6]:
id2label = {0: "bridge_down", 1: "bridge_up", 2: "no_action", 3: "train_in_out"}
label2id = {l:i for i, l in id2label.items()}
labels = list(id2label.values())

In [7]:
video_paths = list(Path(root_dir).rglob("*.mp4"))
targets = [vp.parent.name for vp in video_paths]
train = pd.DataFrame({
    "video_path": [v.as_posix() for v in video_paths],
    "label": targets,
})

In [8]:
train.label.value_counts()

bridge_down     306
bridge_up        75
train_in_out     66
no_action        49
Name: label, dtype: int64

In [9]:
train['label_id'] = train.label.map(label2id)

In [10]:
X_train, X_val, _, _ = train_test_split(train, train['label'], test_size=0.2)

# Load model

In [11]:
processor = AutoProcessor.from_pretrained("../submit_transformer/xclip")
model = AutoModel.from_pretrained("../submit_transformer/xclip")
model.to(device)

OSError: Error no file named pytorch_model.bin, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ../submit_transformer/xclip.

# Zero-shot example

In [12]:
idx = np.random.randint(0, len(video_paths))
print(idx)

file_path = video_paths[idx].as_posix()
container = av.open(file_path)
indices = sample_frame_indices(clip_len=8, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)

inputs = processor(
    text=labels,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)
inputs.to(device)

# forward pass
with torch.no_grad():
    outputs = model(**inputs)

logits_per_video = outputs.logits_per_video
probs = logits_per_video.softmax(dim=1)
print(labels[probs.argmax()], targets[idx])

79


  return torch.tensor(value)


train_in_out train_in_out


# Validate zero-shot

In [13]:
model.eval()  

val_targets = []
val_preds = []
for line in tqdm(X_val.itertuples()):
    
    file_path = line.video_path
    container = av.open(file_path)
    indices = sample_frame_indices(clip_len=8, seg_len=container.streams.video[0].frames)
    video = read_video_pyav(container, indices)
    
    inputs = processor(
        text=labels,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )
    
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)

    val_targets.append(line.label_id)
    val_preds.append(probs.argmax(axis=1).cpu().numpy()[0])

print('F1:', f1_score(val_targets, val_preds, average='macro'))

0it [00:00, ?it/s]

F1: 0.9848484848484849


# Train

In [23]:
transform = A.Compose([
    A.ShiftScaleRotate(
        shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.5
    ),
    A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.5),
    A.RandomBrightnessContrast(p=0.5),
], additional_targets={
    f'image{i}': 'image'
    for i in range(1, 8)
})

In [24]:
class ActionDataset(Dataset):

    def __init__(self, meta, transform=None):
        self.meta = meta
        self.transform = transform

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()


        file_path = self.meta['video_path'].iloc[idx]
        container = av.open(file_path)
        indices = sample_frame_indices(clip_len=8, seg_len=container.streams.video[0].frames)
            
        video = read_video_pyav(container, indices)
        while video.shape[0] < 8:
            video = np.vstack([video, video[-1:]])

        if self.transform:
            video = apply_video_augmentations(video, self.transform)
            

        inputs = processor(
            text=[self.meta['label'].iloc[idx]],
            videos=list(video),
            return_tensors="pt",
            padding='max_length',
            max_length=8
        )
        for i in inputs:
            inputs[i] = inputs[i][0]

        return inputs

In [25]:
train_dataset = ActionDataset(meta=X_train, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)

In [26]:
epochs = 7
lr = 1e-5

optimizer = optim.AdamW(model.parameters(), lr)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-8)

In [17]:
for epoch in range(epochs):

    model.train()    

    train_loss = []
    for i, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch: {epoch}")):
        optimizer.zero_grad()

        batch = batch.to(device)

        outputs = model(**batch, return_loss=True)

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        train_loss.append(loss.item())
    
    model.eval()  

    val_targets = []
    val_preds = []
    for line in tqdm(X_val.itertuples(), total=len(X_val)):

        file_path = line.video_path
        container = av.open(file_path)
        indices = sample_frame_indices(clip_len=8, seg_len=container.streams.video[0].frames)
        video = read_video_pyav(container, indices)

        inputs = processor(
            text=labels,
            videos=list(video),
            return_tensors="pt",
            padding=True,
        )

        inputs = inputs.to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        logits_per_video = outputs.logits_per_video
        probs = logits_per_video.softmax(dim=1)

        val_targets.append(line.label_id)
        val_preds.append(probs.argmax(axis=1).cpu().numpy()[0])

    print('Training loss:', np.mean(train_loss))
    print('F1:', round(f1_score(val_targets, val_preds, average='macro'), 4))

Epoch: 0:   0%|          | 0/50 [00:00<?, ?it/s]

  return torch.tensor(value)


Training loss: 1.9783422470092773


  0%|          | 0/100 [00:00<?, ?it/s]

F1: 0.7819415354357561


Epoch: 1:   0%|          | 0/50 [00:00<?, ?it/s]

Training loss: 1.4345078206062316


  0%|          | 0/100 [00:00<?, ?it/s]

F1: 0.8783245739767478


Epoch: 2:   0%|          | 0/50 [00:00<?, ?it/s]

Training loss: 1.3049543356895448


  0%|          | 0/100 [00:00<?, ?it/s]

F1: 0.870887665410992


Epoch: 3:   0%|          | 0/50 [00:00<?, ?it/s]

Training loss: 1.2740910565853119


  0%|          | 0/100 [00:00<?, ?it/s]

F1: 0.9644607843137255


Epoch: 4:   0%|          | 0/50 [00:00<?, ?it/s]

Training loss: 1.2728268337249755


  0%|          | 0/100 [00:00<?, ?it/s]

F1: 0.9601346925407352


Epoch: 5:   0%|          | 0/50 [00:00<?, ?it/s]

Training loss: 1.2614843332767487


  0%|          | 0/100 [00:00<?, ?it/s]

F1: 0.9911424029071088


Epoch: 6:   0%|          | 0/50 [00:00<?, ?it/s]

Training loss: 1.212795640230179


  0%|          | 0/100 [00:00<?, ?it/s]

F1: 0.9723584367271882


In [18]:
model.save_pretrained("xclip")

# Convert to onnx

In [14]:
file_path = X_val.iloc[0].video_path
container = av.open(file_path)
indices = sample_frame_indices(clip_len=8, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)
inputs = processor(
    text=labels,
    videos=list(video),
    return_tensors="pt",
    padding=True,
)
for i in ["input_ids", "attention_mask"]:
    inputs[i] = inputs[i].type(torch.int32)

outputs = model(**inputs.to(device))
input_names = list(inputs.keys())
output_names = list(outputs.keys())
# ['input_ids', 'attention_mask', 'pixel_values']
# ['logits_per_video', 'logits_per_text', 'text_embeds', 'video_embeds', 'text_model_output', 'vision_model_output', 'mit_output']
print(input_names)
print(output_names)

  return torch.tensor(value)


NameError: name 'model' is not defined

In [16]:
import onnx
import onnxruntime as ort

onnx_path = "xclip.onnx"

model = model.float()
model.eval()

torch.onnx.export(
    model,
    dict(inputs),
    onnx_path,  # where to save the model
    opset_version=14,  # the ONNX version to export the model to
    input_names=["input_ids", "pixel_values", "attention_mask"],  # the model's input names
    output_names=['logits_per_video', 'logits_per_text', 'text_embeds', 'video_embeds'],  # the model's output names
    dynamic_axes={  # variable length axes
        "input_ids": {0: "batch", 1: "sequence"},
        "pixel_values": {0: "batch", 1: "num_frames", 2: "num_channels", 3: "height", 4: "width"},
        "attention_mask": {0: "batch", 1: "sequence"},
        "logits_per_video": {0: "batch"},
        "logits_per_text": {0: "batch"},
        "text_embeds": {0: "batch"},
        "video_embeds": {0: "batch"}
    }
)

In [26]:
from onnxsim import simplify
onnx_simple_path = "xclip_simplified.onnx"
# run checks
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)

# run additional checks and simplify
model_simp, check = simplify(onnx_model, skip_fuse_bn=True)
assert check, "Simplified ONNX model could not be validated"
onnx.save(model_simp, onnx_simple_path)


In [22]:
dummy_inputs = inputs.to("cpu").copy()

for i in dummy_inputs:
    dummy_inputs[i] = dummy_inputs[i].numpy()

ort_session = ort.InferenceSession("xclip.onnx")

# compute ONNX Runtime output prediction
ort_outs = ort_session.run(None, dict(dummy_inputs))[0]

# compute pytorch model outputs
with torch.no_grad():
    model.cpu()
    torch_model_outs = model(**inputs).logits_per_video.numpy()
    model.cuda()

np.testing.assert_allclose(
    torch_model_outs,
    ort_outs,
    rtol=1e-03,
    atol=1e-05,
)

NameError: name 'model' is not defined

In [25]:
ort_outs.argmax(1)[0]

0