In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import av
import numpy as np
import pandas as pd
from pathlib import Path

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import albumentations as A
from albumentations.pytorch import ToTensorV2

import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import models
from timm.data import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from torch.cuda.amp import GradScaler

os.environ['TOKENIZERS_PARALLELISM'] = 'false'

scaler = GradScaler()

N_CLASSES = 4
batch_size = 16
root_dir = '../data/sibur_data/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
n_frames = 32
MIXUP_ALPHA = 0.2
device

'cuda'

In [3]:
def apply_video_augmentations(video, transform):
    targets={'image': video[0]}
    for i in range(1, video.shape[0]):
        targets[f'image{i}'] = video[i]
    transformed = transform(**targets)
    transformed = np.concatenate(
        [np.expand_dims(transformed['image'], axis=0)] 
        + [np.expand_dims(transformed[f'image{i}'], axis=0) for i in range(1, video.shape[0])]
    )
    return transformed

def apply_video_augmentations_torch(video, transform):
    targets={'image': video[0]}
    for i in range(1, video.shape[0]):
        targets[f'image{i}'] = video[i]
    transformed = transform(**targets)
    transformed = torch.cat(
        [transformed['image'][None]] 
        + [transformed[f'image{i}'][None] for i in range(1, video.shape[0])]
    )
    transformed = transformed.permute(1, 0, 2, 3) # (batch,seq,ch,w,h) -> (batch,ch,seq,w,h)
    return transformed


def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, seg_len):
    start_idx, end_idx = 0, seg_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

# Dataset preparation

In [4]:
id2label = {0: "bridge_down", 1: "bridge_up", 2: "no_action", 3: "train_in_out"}
label2id = {l:i for i, l in id2label.items()}
labels = list(id2label.values())

video_paths = list(Path(root_dir).rglob("*.mp4"))
targets = [vp.parent.name for vp in video_paths]
train = pd.DataFrame({
    "video_path": [v.as_posix() for v in video_paths],
    "label": targets,
})

In [5]:
train.label.value_counts()

bridge_down     306
bridge_up        75
train_in_out     66
no_action        49
Name: label, dtype: int64

In [6]:
train['label_id'] = train.label.map(label2id)

In [7]:
# X_train, X_val, _, _ = train_test_split(train, train['label'], test_size=0.1, random_state=42)

# X_train.to_csv("train.csv", index=False)
# X_val.to_csv("test.csv", index=False)
# X_train = X_val = train

X_train = pd.read_csv("train.csv")
X_val = pd.read_csv("test.csv")

# Load model

In [8]:
# model = models.video.mvit_v2_s("DEFAULT") # .swin3d_t("DEFAULT") #
# model.head[1] = torch.nn.Linear(new_head[1].in_features, 4)
# model.to(device)

# model = models.video.swin3d_t("DEFAULT")
# model.head = torch.nn.Linear(model.head.in_features, N_CLASSES)
# model.to(device)

model = models.video.s3d("DEFAULT")
model.classifier[1] = torch.nn.Conv3d(model.classifier[1].in_channels, N_CLASSES, kernel_size=1, stride=1)
model.to(device)



S3D(
  (features): Sequential(
    (0): TemporalSeparableConv(
      (0): Conv3dNormActivation(
        (0): Conv3d(3, 64, kernel_size=(1, 7, 7), stride=(1, 2, 2), padding=(0, 3, 3), bias=False)
        (1): BatchNorm3d(64, eps=0.001, momentum=0.001, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
      (1): Conv3dNormActivation(
        (0): Conv3d(64, 64, kernel_size=(7, 1, 1), stride=(2, 1, 1), padding=(3, 0, 0), bias=False)
        (1): BatchNorm3d(64, eps=0.001, momentum=0.001, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
      )
    )
    (1): MaxPool3d(kernel_size=(1, 3, 3), stride=(1, 2, 2), padding=(0, 1, 1), dilation=1, ceil_mode=False)
    (2): Conv3dNormActivation(
      (0): Conv3d(64, 64, kernel_size=(1, 1, 1), stride=(1, 1, 1), bias=False)
      (1): BatchNorm3d(64, eps=0.001, momentum=0.001, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
    )
    (3): TemporalSeparableConv(
      (0): Conv3d

In [9]:
# t = torch.zeros((1, 3, 16, 224, 224)).cuda()
# model(t)

# Train

In [10]:
train_transform = A.Compose([
    A.Resize(256, 256),
    A.CenterCrop(224, 224),
    A.HorizontalFlip(p=0.5),
    A.Cutout(p=0.5),
    A.ShiftScaleRotate(
        shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.5
    ),
    A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.OneOf([  # One of blur or adding gauss noise
        A.Blur(p=0.5),  # Blurs the image
        A.GaussNoise(var_limit=5.0 / 255.0, p=0.50)  # Adds Gauss noise to image
    ], p=0.5),
    A.HueSaturationValue(p=0.5),
    A.Normalize(OPENAI_CLIP_MEAN, OPENAI_CLIP_STD),
    ToTensorV2(),
], additional_targets={
    f'image{i}': 'image'
    for i in range(1, n_frames)
})

transform = A.Compose([
    A.Resize(232, 232),
    A.CenterCrop(224, 224),
    A.Normalize(OPENAI_CLIP_MEAN, OPENAI_CLIP_STD),
    ToTensorV2(),
], additional_targets={
    f'image{i}': 'image'
    for i in range(1, n_frames)
})



In [11]:
def mixup_transform(batch1, target1, batch2, target2, alpha=0.4):
    lambda_ = np.random.beta(alpha, alpha)
    batch = lambda_ * batch1 + (1 - lambda_) * batch2
    target = lambda_ * target1 + (1 - lambda_) * target2
    return batch, target

In [12]:
class ActionDataset(Dataset):

    def __init__(self, meta, stage, transform=None, n_frames=16):
        self.meta = meta
        self.transform = transform
        self.n_frames = n_frames
        self.stage = stage

    def __len__(self):
        return len(self.meta)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        file_path = self.meta['video_path'].iloc[idx]
        container = av.open(file_path)
        # indices = sample_frame_indices(clip_len=self.n_frames, seg_len=container.streams.video[0].frames)

        seg_len=container.streams.video[0].frames
        if self.stage == "train":
            # mask indices
            # в тесте 5/6 всех видео замаскированы
            if np.random.random() < 0.8: 
                first_idxs = np.random.choice(range(0, seg_len), int(seg_len*0.75), replace=False).astype(int)
                first_idxs.sort()
            else:
                first_idxs = np.arange(seg_len)
            # n_frames indices
            start_idx = np.random.randint(0, len(first_idxs) // 2)
            end_idx = min(np.random.randint(len(first_idxs) // 2, len(first_idxs)) + self.n_frames, len(first_idxs))
            indices = np.linspace(start_idx, end_idx, num=self.n_frames)
            indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
            indices = first_idxs[indices]
        else:
            indices = sample_frame_indices(clip_len=self.n_frames, seg_len=container.streams.video[0].frames)

        video = read_video_pyav(container, indices)
                    
        while video.shape[0] < self.n_frames:
            video = np.vstack([video, video[-1:]])

        if self.transform:
            video = apply_video_augmentations_torch(video, self.transform)

        target = np.zeros(N_CLASSES)
        target[self.meta.iloc[idx].label_id] = 1
            
        return video, target

In [13]:
train_dataset1 = ActionDataset(meta=X_train, stage="train", transform=transform, n_frames=n_frames)
train_dataloader1 = DataLoader(train_dataset1, batch_size=batch_size, shuffle=True, num_workers=0)

train_dataset2 = ActionDataset(meta=X_train, stage="train", transform=transform, n_frames=n_frames)
train_dataloader2 = DataLoader(train_dataset2, batch_size=batch_size, shuffle=True, num_workers=0)

test_dataset = ActionDataset(meta=X_val, stage="test", transform=transform, n_frames=n_frames)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

In [14]:
epochs = 8
lr = 1e-4 #5e-5

criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.05)
optimizer = optim.AdamW(model.parameters(), lr)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-7)

In [15]:
best_score = 0
best_loss = np.inf

for epoch in range(epochs):

    model.train()    

    train_loss = []
    for i, batch in enumerate(tqdm(zip(train_dataloader1, train_dataloader2), desc=f"Epoch: {epoch}", total=len(train_dataloader1))):
        (batch1, target1), (batch2, target2) = batch
        optimizer.zero_grad()

        # mixup transform
        batch, target = mixup_transform(batch1, target1, batch2, target2, MIXUP_ALPHA)
        
        with torch.autocast("cuda"):
            batch = batch.to(device)
            target = target.to(device)

            logits = model(batch)

            loss = criterion(logits, target)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # loss.backward()
        # optimizer.step()

        train_loss.append(loss.item())
    
    model.eval()  

    val_targets = []
    val_preds = []
    val_loss = 0
    for i, (batch, target) in enumerate(tqdm(test_dataloader, desc=f"Epoch: {epoch}")):

        with torch.autocast("cuda"):
            batch = batch.to(device)
            target = target.to(device)

            with torch.no_grad():
                logits = model(batch)
                loss = criterion(logits, target)

        val_loss += loss.item()
        val_targets.extend(target.argmax(1).cpu().tolist())
        val_preds.extend(logits.argmax(1).cpu().tolist())

    val_loss /= len(test_dataloader)
    score = round(f1_score(val_targets, val_preds, average='macro'), 4)
    print(f'Training loss: {np.mean(train_loss):.4f}')
    print(f'Valindation loss: {val_loss:.4f}')
    print('F1:', score)

    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model, "best.pt")
    torch.save(model, "last.pt")

Epoch: 0:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 0:   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 1.1119
Valindation loss: 0.8531
F1: 0.83


Epoch: 1:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 1:   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.5027
Valindation loss: 0.4944
F1: 0.9407


Epoch: 2:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 2:   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.3918
Valindation loss: 0.3724
F1: 0.9726


Epoch: 3:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 3:   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.4261
Valindation loss: 0.3468
F1: 1.0


Epoch: 4:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 4:   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.4255
Valindation loss: 0.3963
F1: 0.9577


Epoch: 5:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 5:   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.3782
Valindation loss: 0.4026
F1: 0.9487


Epoch: 6:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 6:   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.4034
Valindation loss: 0.3896
F1: 1.0


Epoch: 7:   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 7:   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.4590
Valindation loss: 0.4373
F1: 1.0


# Convert to onnx & openvino

In [16]:
X_val = pd.read_csv("test.csv")

In [16]:
model = torch.load("last.pt")
model.eval()

file_path = X_val.iloc[0].video_path
container = av.open(file_path)
indices = sample_frame_indices(clip_len=n_frames, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)
inputs = apply_video_augmentations_torch(video, transform).unsqueeze(0)

outputs = model(inputs.to(device)).cpu()
input_names = ["input"]
output_names = ["output"]

In [17]:
import onnx
import onnxruntime as ort

onnx_path = "model.onnx"

model = model.float().cpu()
model.eval()

torch.onnx.export(
    model,
    inputs,
    onnx_path,  # where to save the model
    opset_version=14,  # the ONNX version to export the model to
    input_names=["input"],  # the model's input names
    output_names=['output'],  # the model's output names
    dynamic_axes={  # variable length axes
        "input": {0: "batch", 1: "channels", 2: "sequence"},
        "output": {0: "batch"},
    }
)

verbose: False, log level: Level.ERROR



In [18]:
from openvino.runtime import serialize
from openvino.tools import mo

ov_model = mo.convert_model('model.onnx', compress_to_fp16=True)
serialize(ov_model, 'model.xml')

In [19]:
# dummy_inputs = {"input": inputs.cpu().numpy()}

# ort_session = ort.InferenceSession("model.onnx")

# # compute ONNX Runtime output prediction
# ort_outs = ort_session.run(None, dummy_inputs)[0]

# # compute pytorch model outputs
# with torch.no_grad():
#     model.cpu()
#     torch_model_outs = model(inputs).numpy()
#     model.cuda()

# np.testing.assert_allclose(
#     torch_model_outs,
#     ort_outs,
#     rtol=1e-03,
#     atol=1e-05,
# )

# Test

In [20]:
# X_val.to_csv("test.csv", index=False)
X_val = pd.read_csv("test.csv")
len(X_val)

50

In [19]:
from dataset import get_frames
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import random


test_videos_paths = [Path(p) for p in X_val.video_path.values][:]
test_targets = X_val.label_id[:]
test_targets_masked = [t for t in test_targets for _ in range(6)]

clips = [
    get_frames(vp)
    for vp in tqdm(test_videos_paths[:])
]

masks = [
    np.ones(len(c), dtype=bool)
    for c in clips
]

new_masks = []
for m in masks:
    new_masks.append(m)
    for _ in range(5):
        new_m = m.copy()
        new_m[np.random.choice(range(0, len(m)), int(len(m)*0.2), replace=False).astype(int)] = False
        new_masks.append(new_m)

masked_clips = []
for i, c in enumerate(clips):
    for m in new_masks[i*6:(i+1)*6]:
        masked_clips.append(c[m])

  0%|          | 0/50 [00:00<?, ?it/s]

## onnx/openvino model

In [22]:
import sys
sys.path.append("../")
# from submit_transformer.predict import predict
# from submit_videorec.predict_openvino import predict as predict_openvino
from submit_videorec.predict_onnx import predict as predict_onnx

In [23]:
BUFFER = {}
# 32 frames = 1.38 min
# 16 frames = 

preds = [
    predict_onnx(clip, BUFFER)
    for clip in tqdm(masked_clips)
]
preds_ids = [label2id[i] for i in preds]
round(f1_score(test_targets_masked, preds_ids, average='macro'), 4)

  0%|          | 0/300 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [29]:
BUFFER = {}

preds = [
    predict_openvino(clip, BUFFER)
    for clip in tqdm(masked_clips)
]
preds_ids = [label2id[i] for i in preds]
round(f1_score(test_targets_masked, preds_ids, average='macro'), 4)

  0%|          | 0/300 [00:00<?, ?it/s]

1.0

In [41]:
# 0.748 window=2

## torch model

In [42]:
model = torch.load("best.pt", map_location="cpu")
model.cuda()
model.eval()

preds_torch = []
for clip in tqdm(masked_clips):
    # start_idx, end_idx = 0, len(clip)
    # indices = np.linspace(start_idx, end_idx, num=n_frames)
    # indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int32)
    step = 2
    s = 0
    e = (s + n_frames) * step
    clip = clip[s:e:step]
    clip = apply_video_augmentations_torch(clip, transform).unsqueeze(0)

    with torch.autocast("cuda"):
        clip = clip.to(device)
        with torch.no_grad():
            logits = model(clip).cpu()
        preds_torch.append(logits.argmax(1)[0].item())

  0%|          | 0/300 [00:00<?, ?it/s]

In [43]:
round(f1_score(test_targets_masked, preds_torch, average='macro'), 4)

0.9142

In [27]:
# for i, j in zip(preds_torch, test_targets):
#     print(i, j)