In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

import av
import numpy as np
import pandas as pd
from pathlib import Path

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import albumentations as A
from albumentations.pytorch import ToTensorV2

import torch
import cv2
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import models
from timm.data import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from torch.cuda.amp import GradScaler
from torch.utils.data.dataloader import default_collate
from torch.utils.tensorboard import SummaryWriter
from vidaug import augmentors as va

from transforms import RandomCutmix, RandomMixup


# Presumably silences the HuggingFace `tokenizers` parallelism warning; no
# tokenizer is used in the visible code -- TODO confirm this is still needed.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Gradient scaler used by the autocast (mixed-precision) training loop.
scaler = GradScaler()
# TensorBoard writer; every scalar of this run is logged under runs/exp.
tb_writer = SummaryWriter(log_dir="runs/exp")

# Folder searched recursively for *.mp4 files; the name of each video's parent
# folder is used as its class label.
root_dir = '../data/sibur_data/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

batch_size = 48
# Number of frames per clip fed to the model (also sizes the per-frame
# albumentations targets).
n_frames = 4
# Size of the classification head and of the one-hot targets.
N_CLASSES = 4

# Exponential-moving-average settings; model_ema is only built when USE_EMA is True.
USE_EMA = False
world_size = 1
# The EMA copy is updated every `model_ema_steps` training iterations.
model_ema_steps=32
model_ema_decay = 0.99998

# Beta-distribution parameters handed to RandomMixup / RandomCutmix; a falsy
# value disables the corresponding transform.
MIXUP_ALPHA = 0.2
CUTMIX_ALPHA = 1.0

# Last expression of the cell: displays the selected device.
device

In [None]:
def get_frames(video_path: Path, img_size=None, n_frames=None):
    """Decode every frame of a video file into one RGB array.

    Args:
        video_path: Location of the video; opened with ``cv2.VideoCapture``.
        img_size: Optional size passed verbatim to ``cv2.resize`` for every
            frame. Frames keep their decoded size when this is falsy.
        n_frames: Accepted for interface compatibility only; it is not used,
            all frames are always returned.

    Returns:
        ``np.ndarray`` holding the decoded frames along the first axis, in
        RGB channel order. The array is empty when no frame could be read.
    """
    capture = cv2.VideoCapture(video_path.as_posix())
    frames = []
    try:
        while True:
            has_frame, frame = capture.read()
            if not has_frame:
                break
            # OpenCV decodes to BGR; downstream code works with RGB.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            if img_size:
                frame = cv2.resize(frame, img_size)
            frames.append(frame)
    finally:
        # Release the decoder even when colour conversion or resizing raises.
        capture.release()
    return np.array(frames)

In [None]:
class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel):
    """Exponentially decayed running average of a model's weights.

    Every update blends the stored average with the live weights:
    ``ema_avg = decay * avg_model_param + (1 - decay) * model_param``
    The bookkeeping is delegated to
    `torch.optim.swa_utils.AveragedModel <https://pytorch.org/docs/stable/optim.html#custom-averaging-strategies>`_,
    which is configured with ``use_buffers=True``.
    """

    def __init__(self, model, decay, device="cpu"):
        keep_weight = decay
        new_weight = 1 - decay

        def _blend(running, current, _num_averaged):
            # The update count is part of the avg_fn signature but is not
            # needed for a fixed-decay average.
            return keep_weight * running + new_weight * current

        super().__init__(model, device=device, avg_fn=_blend, use_buffers=True)

In [None]:
def apply_video_augmentations(video, transform):
    """Apply a multi-target image transform to every frame of a clip.

    The first frame is passed to ``transform`` as ``image`` and frame ``i``
    (``i >= 1``) as ``image{i}``; ``transform`` must return a mapping with the
    same keys. The transformed frames are stacked back, in order, into one
    numpy array along a new leading axis.
    """
    frame_keys = ['image'] + [f'image{i}' for i in range(1, video.shape[0])]
    result = transform(**{key: video[pos] for pos, key in enumerate(frame_keys)})
    return np.stack([result[key] for key in frame_keys], axis=0)

def apply_video_augmentations_torch(video, transform):
    """Apply a multi-target image transform to a clip and return a tensor.

    Frames are passed to ``transform`` as ``image``, ``image1``, ... and the
    returned tensors are stacked along a new leading axis. The first two axes
    of the stacked tensor are then swapped.
    """
    frame_keys = ['image'] + [f'image{i}' for i in range(1, video.shape[0])]
    result = transform(**{key: video[pos] for pos, key in enumerate(frame_keys)})
    clip = torch.stack([result[key] for key in frame_keys], dim=0)
    # (seq, ch, h, w) -> (ch, seq, h, w); there is no batch axis at this point.
    # Assumes the transform yields channel-first frames (e.g. via ToTensorV2).
    return clip.permute(1, 0, 2, 3)


def read_video_pyav(container, indices):
    """Decode the frames of ``container`` whose positions appear in ``indices``.

    The container is rewound, then its first video stream is decoded from the
    start; decoding stops once the position exceeds ``indices[-1]``. Each
    position is kept at most once, so repeated entries in ``indices`` do not
    produce repeated frames.

    Returns:
        The selected frames converted with ``to_ndarray(format="rgb24")`` and
        stacked along the first axis.
    """
    container.seek(0)
    first_wanted, last_wanted = indices[0], indices[-1]
    picked = []
    for position, frame in enumerate(container.decode(video=0)):
        if position > last_wanted:
            break
        if position < first_wanted or position not in indices:
            continue
        picked.append(frame)
    return np.stack([frame.to_ndarray(format="rgb24") for frame in picked])


def sample_frame_indices(clip_len, seg_len):
    """Return ``clip_len`` evenly spaced frame positions for a ``seg_len``-frame video.

    Positions are spread linearly over ``[0, seg_len]``, clipped to the last
    valid position ``seg_len - 1`` and truncated to ``int64``.
    """
    evenly_spaced = np.linspace(0, seg_len, num=clip_len)
    return np.clip(evenly_spaced, 0, seg_len - 1).astype(np.int64)



# Dataset preparation

In [None]:
# Class index <-> class name. The names must match the folder names that
# contain the videos, because the label of a video is its parent folder name.
id2label = {0: "bridge_down", 1: "bridge_up", 2: "no_action", 3: "train_in_out"}
label2id = {label: idx for idx, label in id2label.items()}
labels = list(id2label.values())

# rglob yields files in filesystem order, which differs between machines;
# sorting keeps the row order (and therefore any seeded split) reproducible.
video_paths = sorted(Path(root_dir).rglob("*.mp4"))
targets = [vp.parent.name for vp in video_paths]
train = pd.DataFrame({
    "video_path": [v.as_posix() for v in video_paths],
    "label": targets,
})

In [None]:
# Number of videos per label (label = name of the video's parent folder).
train.label.value_counts()

In [None]:
# Translate label names into integer class ids.
train['label_id'] = train.label.map(label2id)
# A folder name missing from label2id maps to NaN, which silently turns the
# column into floats and later breaks indexing of the one-hot target. Fail
# here with a readable message instead.
_unknown_labels = sorted(set(train.loc[train['label_id'].isna(), 'label']))
if _unknown_labels:
    raise ValueError(f"Videos found under folders that are not known classes: {_unknown_labels}")
train['label_id'] = train['label_id'].astype(int)

In [None]:
# Option 1: create a 90/10 hold-out split and persist it to disk.
# X_train, X_val, _, _ = train_test_split(train, train['label'], test_size=0.1, random_state=42)

# X_train.to_csv("train.csv", index=False)
# X_val.to_csv("test.csv", index=False)

# Option 2: reload a previously saved split.
# X_train = pd.read_csv("train.csv")
# X_val = pd.read_csv("test.csv")

# Active choice: train on every video and "validate" on the very same frame.
# NOTE(review): with this setting the validation loss/F1 computed during
# training are measured on training data and say nothing about generalisation.
X_train = X_val = train

# Load model

In [None]:
# Alternative torchvision video backbones are kept commented out below; each
# one swaps its classification layer for an N_CLASSES-way layer.
# `weights` is passed by keyword: torchvision deprecated passing it positionally.

# model = models.video.mvit_v2_s(weights="DEFAULT")
# model.head[1] = torch.nn.Linear(model.head[1].in_features, N_CLASSES)
# model.to(device)

# Active backbone: Swin3D-T with its default pretrained weights and a fresh
# N_CLASSES-way linear head.
model = models.video.swin3d_t(weights="DEFAULT")
model.head = torch.nn.Linear(model.head.in_features, N_CLASSES)
model.to(device)

# model = models.video.s3d(weights="DEFAULT")
# model.classifier[1] = torch.nn.Conv3d(model.classifier[1].in_channels, N_CLASSES, kernel_size=1, stride=1)
# model.to(device)

# model = models.video.r2plus1d_18(weights="DEFAULT")
# model.fc = torch.nn.Linear(model.fc.in_features, N_CLASSES)
# model.to(device)

# model = models.video.swin3d_s(weights="DEFAULT")
# model.head = torch.nn.Linear(model.head.in_features, N_CLASSES)
# model.to(device)

# model = models.video.mc3_18(weights="DEFAULT")
# model.fc = torch.nn.Linear(model.fc.in_features, N_CLASSES)
# model.to(device)

# Train

In [None]:
def _frame_targets():
    """Extra albumentations targets ('image1', 'image2', ...) for frames 1..n_frames-1.

    Each extra frame is registered as an 'image' target so that it goes
    through the same pipeline as the first frame of the clip.
    """
    return {f'image{i}': 'image' for i in range(1, n_frames)}


# Random augmentation pipeline for training clips.
_train_steps = [
    A.RandomResizedCrop(224, 224, scale=(0.4, 1.0)),
    A.HorizontalFlip(p=0.5),
    A.ShiftScaleRotate(
        shift_limit=0.0625, scale_limit=0.1, rotate_limit=15, p=0.5
    ),
    A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    # At most one of: blur, Gaussian noise.
    A.OneOf([
        A.Blur(blur_limit=4, p=0.5),
        A.GaussNoise(var_limit=5.0 / 255.0, p=0.5),
    ], p=0.5),
    A.HueSaturationValue(p=0.5),
    A.Cutout(num_holes=8, p=0.15),
    A.Normalize(OPENAI_CLIP_MEAN, OPENAI_CLIP_STD),
    ToTensorV2(),
]
train_transform = A.Compose(_train_steps, additional_targets=_frame_targets())

# Deterministic pipeline: resize, centre crop, normalise, convert to tensor.
_eval_steps = [
    A.Resize(256, 256),
    A.CenterCrop(224, 224),
    A.Normalize(OPENAI_CLIP_MEAN, OPENAI_CLIP_STD),
    ToTensorV2(),
]
transform = A.Compose(_eval_steps, additional_targets=_frame_targets())

# VIDEO AUGS
def sometimes(aug):
    """Wrap ``aug`` in ``va.Sometimes`` with probability 0.5."""
    return va.Sometimes(0.5, aug)


# Clip-level augmentation: one of up/down-sampling is always chosen, then
# pepper-or-salt noise and a random shear are each wrapped in `sometimes`.
_resample = va.OneOf([
    va.Upsample(1.3),
    va.Downsample(0.7),
])
_pepper_or_salt = va.OneOf([
    va.Pepper(),
    va.Salt(),
])
video_aug = va.Sequential([
    _resample,
    sometimes(_pepper_or_salt),
    sometimes(va.RandomShear(0.1, 0.1)),
])


# MIXUP CUTMIX
# Batch-level mixing transforms; a falsy alpha switches the transform off.
mix_transforms = []
if MIXUP_ALPHA:
    mix_transforms.append(RandomMixup(N_CLASSES, p=1.0, alpha=MIXUP_ALPHA))
if CUTMIX_ALPHA:
    mix_transforms.append(RandomCutmix(N_CLASSES, p=1.0, alpha=CUTMIX_ALPHA))

# RandomChoice over an empty list fails when called, so keep None when both
# transforms are disabled and let collate_fn fall back to plain collation.
mixup_cutmix = torchvision.transforms.RandomChoice(mix_transforms) if mix_transforms else None


def collate_fn(batch):
    """Collate samples into a batch and apply one randomly chosen mixing transform.

    Args:
        batch: List of ``(video, target)`` samples produced by the dataset.

    Returns:
        The ``(videos, targets)`` pair returned by the chosen mixing
        transform, or the plain collated batch when mixup and cutmix are
        both disabled.
    """
    collated = default_collate(batch)
    if mixup_cutmix is None:
        return collated
    return mixup_cutmix(*collated)

In [None]:
class ActionDataset(Dataset):
    """Video clips with one-hot action targets.

    Relies on module-level helpers defined elsewhere in the notebook:
    ``read_video_pyav``, ``sample_frame_indices``, ``video_aug``,
    ``apply_video_augmentations_torch`` and ``N_CLASSES``.

    Args:
        meta: Frame-like table with a ``video_path`` and a ``label_id`` column.
        stage: ``"train"`` enables random temporal sampling, clip-level
            augmentation and frame masking; any other value reads
            ``n_frames`` evenly spaced frames.
        transform: Optional multi-target frame transform applied to the clip
            through ``apply_video_augmentations_torch``.
        n_frames: Number of frames in every returned clip.
    """

    def __init__(self, meta, stage, transform=None, n_frames=16):
        self.meta = meta
        self.transform = transform
        self.n_frames = n_frames
        self.stage = stage

    def __len__(self):
        return len(self.meta)

    @staticmethod
    def _sample_frame_indices_with_frame_rate(clip_len, frame_sample_rate, seg_len):
        """Pick ``clip_len`` frame positions from a random window of the video.

        The window spans ``clip_len * frame_sample_rate`` frames and ends at a
        random position. When the window does not fit strictly inside the
        video it is anchored to the last frame and starts no earlier than
        frame 0.
        """
        converted_len = int(clip_len * frame_sample_rate)
        try:
            end_idx = np.random.randint(converted_len, seg_len)
        except ValueError:
            # randint rejects an empty range, i.e. converted_len >= seg_len.
            end_idx = seg_len - 1
        # Clamp at 0: in the fallback above end_idx - converted_len can be
        # negative, and a negative position never matches a decoded frame.
        start_idx = max(end_idx - converted_len, 0)
        indices = np.linspace(start_idx, end_idx, num=clip_len)
        return np.clip(indices, start_idx, end_idx - 1).astype(np.int64)

    def _read_train_clip(self, container):
        """Decode, augment, randomly mask and subsample a training clip."""
        total_frames = container.streams.video[0].frames
        frame_sample_rate = np.random.choice([5, 10, 15])
        indices = self._sample_frame_indices_with_frame_rate(
            clip_len=int(total_frames / frame_sample_rate),
            frame_sample_rate=frame_sample_rate,
            seg_len=total_frames,
        )
        video = read_video_pyav(container, indices)

        # Clip-level augmentation is applied exactly once and only to clips
        # longer than 8 frames (it used to run a second time unconditionally).
        if len(video) > 8:
            video = np.array(video_aug(video))
        seg_len = len(video)

        # Frame masking: in the test data 5/6 of all videos are masked, so most
        # training clips randomly keep only 50-75% of their frames.
        if np.random.random() < 0.8 and seg_len > 8:
            kept = np.random.choice(
                seg_len,
                int(seg_len * np.random.uniform(0.5, 0.75)),
                replace=False,
            ).astype(int)
            kept.sort()
        else:
            kept = np.arange(seg_len)

        # Random temporal window over the kept frames, then n_frames evenly
        # spaced positions inside it.
        half = len(kept) // 2
        # randint(0, 0) is invalid, which happens for clips with a single kept frame.
        start_idx = np.random.randint(0, half) if half > 0 else 0
        end_idx = min(np.random.randint(half, len(kept)) + self.n_frames, len(kept))
        positions = np.linspace(start_idx, end_idx, num=self.n_frames)
        positions = np.clip(positions, start_idx, end_idx - 1).astype(np.int64)
        return video[kept[positions]]

    def __getitem__(self, idx):
        """Return ``(video, target)`` for row ``idx`` of ``meta``.

        ``target`` is a one-hot numpy vector of length ``N_CLASSES``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        file_path = self.meta['video_path'].iloc[idx]
        # The context manager closes the container (and its file handle).
        with av.open(file_path) as container:
            if self.stage == "train":
                video = self._read_train_clip(container)
            else:
                indices = sample_frame_indices(
                    clip_len=self.n_frames,
                    seg_len=container.streams.video[0].frames,
                )
                video = read_video_pyav(container, indices)

        # Pad short clips by repeating the last frame.
        missing = self.n_frames - video.shape[0]
        if missing > 0:
            video = np.concatenate([video, np.repeat(video[-1:], missing, axis=0)])

        if self.transform:
            video = apply_video_augmentations_torch(video, self.transform)

        target = np.zeros(N_CLASSES)
        target[self.meta['label_id'].iloc[idx]] = 1

        return video, target

In [None]:
def eval_collate_fn(batch):
    """Collate evaluation samples without mixup/cutmix.

    Targets are cast to float32 so that they match the dtype of the logits.
    """
    videos, targets = default_collate(batch)
    return videos, targets.float()


# NOTE(review): the training set is built with the deterministic `transform`;
# the random `train_transform` pipeline defined earlier is not used anywhere in
# the visible code -- confirm this is intended.
train_dataset = ActionDataset(meta=X_train, stage="train", transform=transform, n_frames=n_frames)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn, pin_memory=True)

test_dataset = ActionDataset(meta=X_val, stage="test", transform=transform, n_frames=n_frames)
# Evaluation must see unmodified clips and their true one-hot targets, so it
# does not go through the mixup/cutmix collate function.
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=eval_collate_fn, pin_memory=True)

In [None]:
epochs = 12
# Number of initial epochs in which only `model.head` is trainable;
# 0 means the whole network is trained from the first epoch.
warm_epochs = 0
# The EMA copy is neither evaluated nor checkpointed before this epoch.
ema_warm_epochs = 4
lr = 1e-4

# Targets are supplied as per-class vectors of length N_CLASSES rather than
# class indices.
criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.08)
optimizer = optim.AdamW(model.parameters(), lr, weight_decay=2e-05)
# Cosine decay from `lr` to 0 over `epochs` scheduler steps; it only has an
# effect if scheduler.step() is called once per epoch.
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=0)

In [None]:
# The EMA copy is optional: model_ema stays None unless USE_EMA is switched on.
model_ema = None
if USE_EMA:
    # Scale the per-update blend factor (1 - decay) by the adjustment term and
    # cap it at 1.0 before turning it back into a decay.
    adjust = world_size * batch_size * model_ema_steps / epochs
    alpha = min(1.0, (1.0 - model_ema_decay) * adjust)
    model_ema = ExponentialMovingAverage(model, device=device, decay=1.0 - alpha)

In [None]:
best_score = 0
best_loss = np.inf


def _evaluate(net, loader, desc):
    """Run ``net`` over ``loader`` and return ``(mean loss, macro-F1)``.

    The true class of a sample is the argmax of its target vector; the F1
    score is rounded to four decimals.
    """
    net.eval()
    seen_targets, seen_preds = [], []
    total_loss = 0
    with torch.no_grad():
        for batch, target in tqdm(loader, desc=desc):
            with torch.autocast("cuda"):
                batch = batch.to(device)
                target = target.to(device)
                logits = net(batch)
                loss = criterion(logits, target)
            total_loss += loss.item()
            seen_targets.extend(target.argmax(1).cpu().tolist())
            seen_preds.extend(logits.argmax(1).cpu().tolist())
    mean_loss = total_loss / len(loader)
    score = round(f1_score(seen_targets, seen_preds, average='macro'), 4)
    return mean_loss, score


for epoch in range(epochs):
    # The EMA copy is evaluated, logged and saved only after its warm-up.
    ema_ready = model_ema is not None and epoch >= ema_warm_epochs

    # TRAIN MODEL
    model.train()

    # Head-only warm-up: freeze everything but the head before `warm_epochs`,
    # unfreeze the whole network when that epoch is reached.
    if epoch < warm_epochs:
        for param in model.parameters():
            param.requires_grad = False
        for param in model.head.parameters():
            param.requires_grad = True
    elif epoch == warm_epochs:
        for param in model.parameters():
            param.requires_grad = True

    train_loss = []
    for i, (batch, target) in enumerate(tqdm(train_dataloader, desc=f"Epoch: {epoch} (train)")):
        optimizer.zero_grad()

        with torch.autocast("cuda"):
            batch = batch.to(device)
            target = target.to(device)

            logits = model(batch)

            loss = criterion(logits, target)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss.append(loss.item())

        # update ema model
        if model_ema is not None and i % model_ema_steps == 0:
            model_ema.update_parameters(model)
            if epoch < ema_warm_epochs:
                # Reset the update counter during warm-up.
                model_ema.n_averaged.fill_(0)

    # Advance the cosine schedule once per epoch; without this call the
    # learning rate never changes.
    scheduler.step()

    # EVAL MODEL
    val_loss, val_score = _evaluate(model, test_dataloader, f"Epoch: {epoch} (eval)")

    # EVAL EMA MODEL
    if ema_ready:
        ema_val_loss, ema_val_score = _evaluate(model_ema, test_dataloader, f"Epoch: {epoch} (ema)")

    # LOG RESULTS
    mean_train_loss = np.mean(train_loss)
    print(f'Training loss: {mean_train_loss:.4f}')
    print(f'Valindation loss: {val_loss:.4f}')
    tb_writer.add_scalar("loss/train", mean_train_loss, epoch)
    tb_writer.add_scalar("loss/val", val_loss, epoch)
    if ema_ready:
        print(f'Valindation loss (EMA): {ema_val_loss:.4f}')
        tb_writer.add_scalar("loss/ema", ema_val_loss, epoch)
    print('F1:', val_score)

    tb_writer.add_scalar("f1/val", val_score, epoch)
    if ema_ready:
        print('F1 (EMA):', ema_val_score)
        # Log the EMA score itself (the plain model's score used to be logged here).
        tb_writer.add_scalar("f1/ema", ema_val_score, epoch)

    # CHECKPOINTS: "best" follows the lowest validation loss of the plain model.
    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model, "best.pt")
        if ema_ready:
            torch.save(model_ema.state_dict(), "best_ema.pt")

    torch.save(model, "last.pt")
    if ema_ready:
        torch.save(model_ema.state_dict(), "last_ema.pt")

# Make sure every logged scalar reaches disk.
tb_writer.flush()

# Convert to ONNX & OpenVINO

In [None]:
# Pasted log output, presumably from an earlier training run:
# Training loss: 0.3523
# Valindation loss: 0.3076
# Valindation loss (EMA): 0.3470
# F1: 1.0
# F1 (EMA): 0.9871

# Reload the hold-out split. "test.csv" is only written by the commented-out
# split code in the dataset-preparation section, so the file must already
# exist on disk for this cell to run.
X_val = pd.read_csv("test.csv")

In [None]:
# Checkpoint prefix: "last" or "best" (files written by the training loop).
mt = "last"
# The checkpoint is a whole pickled nn.Module (torch.save(model, ...)), which
# recent PyTorch versions refuse to load unless weights_only is switched off.
# Unpickling can execute arbitrary code: only load checkpoints you created.
model = torch.load(f"{mt}.pt", map_location=device, weights_only=False)

if USE_EMA:
    # Rebuild the EMA wrapper exactly as for training, then load its weights.
    adjust = world_size * batch_size * model_ema_steps / epochs
    alpha = 1.0 - model_ema_decay
    alpha = min(1.0, alpha * adjust)
    model_ema = ExponentialMovingAverage(model, device=device, decay=1.0 - alpha)
    ema_weights = torch.load(f"{mt}_ema.pt", map_location="cpu")
    model_ema.load_state_dict(ema_weights)
    model = model_ema

model.to(device)
model.eval()

# Build one real input clip; it is used as the example input for the export.
file_path = X_val.iloc[0].video_path
with av.open(file_path) as container:
    indices = sample_frame_indices(clip_len=n_frames, seg_len=container.streams.video[0].frames)
    video = read_video_pyav(container, indices)
inputs = apply_video_augmentations_torch(video, transform).unsqueeze(0)

# Plain forward pass as a smoke test; no autograd graph is needed.
with torch.no_grad():
    outputs = model(inputs.to(device)).cpu()
input_names = ["input"]
output_names = ["output"]

In [None]:
import onnx
import onnxruntime as ort

# Destination of the exported ONNX graph.
onnx_path = "../submit_videorec/model/model.onnx"
# Destination of the OpenVINO model produced from the ONNX graph.
openvino_path = "../submit_videorec/model/model.xml"

In [None]:
# Export from a float32 copy of the weights on the CPU, in inference mode.
model = model.float().cpu().eval()

# Axes that are declared variable-length in the exported graph.
_dynamic_axes = {
    "input": {0: "batch", 1: "channels", 2: "sequence"},
    "output": {0: "batch"},
}

torch.onnx.export(
    model,
    inputs,
    onnx_path,
    opset_version=14,
    input_names=["input"],
    output_names=['output'],
    dynamic_axes=_dynamic_axes,
)

In [None]:
from openvino.runtime import serialize
from openvino.tools import mo

# Convert the exported ONNX file into an OpenVINO model, requesting fp16
# compression, and write it to `openvino_path`.
ov_model = mo.convert_model(onnx_path, compress_to_fp16=True)
serialize(ov_model, openvino_path)

# Test

In [None]:
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np


def _drop_random_frames(full_mask):
    """Return a copy of ``full_mask`` with a random 40% of its entries set to False."""
    mask = full_mask.copy()
    dropped = np.random.choice(range(0, len(full_mask)), int(len(full_mask)*0.4), replace=False).astype(int)
    mask[dropped] = False
    return mask


test_videos_paths = list(map(Path, X_val.video_path.values))
test_targets = X_val.label_id[:]
# Every clip is scored six times (once unmasked, five times masked), so each
# target is repeated six times in the same order.
test_targets_masked = [target for target in test_targets for _ in range(6)]

# All frames of every validation video.
clips = [get_frames(video_path) for video_path in tqdm(test_videos_paths)]

# One all-True frame mask per clip.
masks = [np.ones(len(clip), dtype=bool) for clip in clips]

# Per clip: the full mask followed by five randomly thinned copies.
new_masks = []
for full_mask in masks:
    new_masks.append(full_mask)
    new_masks.extend(_drop_random_frames(full_mask) for _ in range(5))

# Apply the six masks of each clip, keeping clip order.
masked_clips = [
    clip[mask]
    for pos, clip in enumerate(clips)
    for mask in new_masks[pos * 6:(pos + 1) * 6]
]

## ONNX / OpenVINO model

In [None]:
import sys
sys.path.append("../")
from submit_videorec.predict_openvino import predict as predict_openvino
from submit_videorec.predict_onnx import predict as predict_onnx

In [None]:
# Predict a label name for every masked clip with the ONNX predictor, map the
# names to class ids and display the macro-F1 against the repeated targets.
preds = list(map(predict_onnx, tqdm(masked_clips)))
preds_ids = [label2id[name] for name in preds]
round(f1_score(test_targets_masked, preds_ids, average='macro'), 4)

In [None]:
# Same evaluation as for ONNX, this time with the OpenVINO predictor.
preds = list(map(predict_openvino, tqdm(masked_clips)))
preds_ids = [label2id[name] for name in preds]
round(f1_score(test_targets_masked, preds_ids, average='macro'), 4)