In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os

import av
import numpy as np
import pandas as pd
from pathlib import Path

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import albumentations as A
from albumentations.pytorch import ToTensorV2

import torch
import torch.optim as optim
import torchvision
from torch.utils.data import Dataset, DataLoader, RandomSampler
from torchvision import models
from timm.data import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
from torch.cuda.amp import GradScaler
from torch.utils.data.dataloader import default_collate
from torch.utils.tensorboard import SummaryWriter

from transforms import RandomCutmix, RandomMixup
from sampler import RASampler


os.environ['TOKENIZERS_PARALLELISM'] = 'false'

scaler = GradScaler()
tb_writer = SummaryWriter(log_dir="runs/exp")

root_dir = '../data/sibur_data/'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

batch_size = 32
n_frames = 4
N_CLASSES = 4

USE_EMA = True
world_size = 1
model_ema_steps=32
model_ema_decay = 0.99998

MIXUP_ALPHA = 0.2
CUTMIX_ALPHA = 1.0

RA_REPS = 3

device

'cuda'

In [3]:
class ExponentialMovingAverage(torch.optim.swa_utils.AveragedModel):
    """Maintains moving averages of model parameters using an exponential decay.
    ``ema_avg = decay * avg_model_param + (1 - decay) * model_param``
    `torch.optim.swa_utils.AveragedModel <https://pytorch.org/docs/stable/optim.html#custom-averaging-strategies>`_
    is used to compute the EMA.
    """

    def __init__(self, model, decay, device="cpu"):
        def ema_avg(avg_model_param, model_param, num_averaged):
            return decay * avg_model_param + (1 - decay) * model_param

        super().__init__(model, device, ema_avg, use_buffers=True)

In [4]:
def apply_video_augmentations(video, transform):
    targets={'image': video[0]}
    for i in range(1, video.shape[0]):
        targets[f'image{i}'] = video[i]
    transformed = transform(**targets)
    transformed = np.concatenate(
        [np.expand_dims(transformed['image'], axis=0)] 
        + [np.expand_dims(transformed[f'image{i}'], axis=0) for i in range(1, video.shape[0])]
    )
    return transformed

def apply_video_augmentations_torch(video, transform):
    targets={'image': video[0]}
    for i in range(1, video.shape[0]):
        targets[f'image{i}'] = video[i]
    transformed = transform(**targets)
    transformed = torch.cat(
        [transformed['image'][None]] 
        + [transformed[f'image{i}'][None] for i in range(1, video.shape[0])]
    )
    transformed = transformed.permute(1, 0, 2, 3) # (batch,seq,ch,w,h) -> (batch,ch,seq,w,h)
    return transformed


def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, seg_len):
    start_idx, end_idx = 0, seg_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

# Dataset preparation

In [5]:
id2label = {0: "bridge_down", 1: "bridge_up", 2: "no_action", 3: "train_in_out"}
label2id = {l:i for i, l in id2label.items()}
labels = list(id2label.values())

video_paths = list(Path(root_dir).rglob("*.mp4"))
targets = [vp.parent.name for vp in video_paths]
train = pd.DataFrame({
    "video_path": [v.as_posix() for v in video_paths],
    "label": targets,
})

In [6]:
train.label.value_counts()

bridge_down     306
bridge_up        75
train_in_out     66
no_action        49
Name: label, dtype: int64

In [7]:
train['label_id'] = train.label.map(label2id)

In [8]:
# X_train, X_val, _, _ = train_test_split(train, train['label'], test_size=0.1, random_state=42)

# X_train.to_csv("train.csv", index=False)
# X_val.to_csv("test.csv", index=False)
# X_train = X_val = train

X_train = pd.read_csv("train.csv")
X_val = pd.read_csv("test.csv")

# Load model

In [9]:
# model = models.video.mvit_v2_s("DEFAULT")
# model.head[1] = torch.nn.Linear(model.head[1].in_features, 4)
# model.to(device)

model = models.video.swin3d_t("DEFAULT")
model.head = torch.nn.Linear(model.head.in_features, N_CLASSES)
model.to(device)

# model = models.video.s3d("DEFAULT")
# model.classifier[1] = torch.nn.Conv3d(model.classifier[1].in_channels, N_CLASSES, kernel_size=1, stride=1)
# model.to(device)

# model = models.video.r2plus1d_18("DEFAULT")
# model.fc = torch.nn.Linear(model.fc.in_features, N_CLASSES)
# model.to(device)

# model = models.video.swin3d_s("DEFAULT")
# model.head = torch.nn.Linear(model.head.in_features, N_CLASSES)
# model.to(device)



SwinTransformer3d(
  (patch_embed): PatchEmbed3d(
    (proj): Conv3d(3, 96, kernel_size=(2, 4, 4), stride=(2, 4, 4))
    (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (features): Sequential(
    (0): Sequential(
      (0): SwinTransformerBlock(
        (norm1): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
        (attn): ShiftedWindowAttention3d(
          (qkv): Linear(in_features=96, out_features=288, bias=True)
          (proj): Linear(in_features=96, out_features=96, bias=True)
        )
        (stochastic_depth): StochasticDepth(p=0.0, mode=row)
        (norm2): LayerNorm((96,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (0): Linear(in_features=96, out_features=384, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=384, out_features=96, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
     

In [10]:
# t = torch.zeros((1, 3, 16, 224, 224)).cuda()
# model(t)

# Train

In [11]:
train_transform = A.Compose([
    A.Resize(256, 256),
    # A.CenterCrop(224, 224),
    A.RandomResizedCrop(224, 224),
    A.HorizontalFlip(p=0.5),
    A.Cutout(p=0.15),
    A.ShiftScaleRotate(
        shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.5
    ),
    A.RGBShift(r_shift_limit=15, g_shift_limit=15, b_shift_limit=15, p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.OneOf([  # One of blur or adding gauss noise
        A.Blur(p=0.5),  # Blurs the image
        A.GaussNoise(var_limit=5.0 / 255.0, p=0.50)  # Adds Gauss noise to image
    ], p=0.5),
    # A.HueSaturationValue(p=0.5),
    A.Normalize(OPENAI_CLIP_MEAN, OPENAI_CLIP_STD),
    ToTensorV2(),
], additional_targets={
    f'image{i}': 'image'
    for i in range(1, n_frames)
})

transform = A.Compose([
    A.Resize(232, 232),
    A.CenterCrop(224, 224),
    A.Normalize(OPENAI_CLIP_MEAN, OPENAI_CLIP_STD),
    ToTensorV2(),
], additional_targets={
    f'image{i}': 'image'
    for i in range(1, n_frames)
})


# MIXUP CUTMIX
mix_transforms = []
if MIXUP_ALPHA:
    mix_transforms.append(RandomMixup(N_CLASSES, p=1.0, alpha=MIXUP_ALPHA))
if CUTMIX_ALPHA:
    mix_transforms.append(RandomCutmix(N_CLASSES, p=1.0, alpha=CUTMIX_ALPHA))
mixup_cutmix = torchvision.transforms.RandomChoice(mix_transforms)
def collate_fn(batch):
    return mixup_cutmix(*default_collate(batch))



In [12]:
class ActionDataset(Dataset):

    def __init__(self, meta, stage, transform=None, n_frames=16):
        self.meta = meta
        self.transform = transform
        self.n_frames = n_frames
        self.stage = stage

    def __len__(self):
        return len(self.meta)

    @staticmethod
    def _sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        converted_len = int(clip_len * frame_sample_rate)
        end_idx = np.random.randint(converted_len, seg_len+1)
        start_idx = end_idx - converted_len
        indices = np.linspace(start_idx, end_idx, num=clip_len)
        indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        return indices

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        file_path = self.meta['video_path'].iloc[idx]
        container = av.open(file_path)

        seg_len=container.streams.video[0].frames
        if self.stage == "train":
            # mask indices
            # в тесте 5/6 всех видео замаскированы
            if np.random.random() < 0.8: 
                first_idxs = np.random.choice(
                    range(0, seg_len), 
                    int(seg_len*np.random.choice([0.75])), 
                    replace=False
                ).astype(int)
                first_idxs.sort()
            else:
                first_idxs = np.arange(seg_len)

            # get frames
            start_idx = np.random.randint(0, len(first_idxs) // 2)
            end_idx = min(np.random.randint(len(first_idxs) // 2, len(first_idxs)) + self.n_frames, len(first_idxs))
            indices = np.linspace(start_idx, end_idx, num=self.n_frames)
            indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)

            indices = first_idxs[indices]
        else:
            indices = sample_frame_indices(clip_len=self.n_frames, seg_len=container.streams.video[0].frames)

        video = read_video_pyav(container, indices)
                    
        while video.shape[0] < self.n_frames:
            video = np.vstack([video, video[-1:]])

        if self.transform:
            video = apply_video_augmentations_torch(video, self.transform)

        target = np.zeros(N_CLASSES)
        target[self.meta.iloc[idx].label_id] = 1
            
        return video, target

In [13]:
train_dataset = ActionDataset(meta=X_train, stage="train", transform=transform, n_frames=n_frames)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, collate_fn=collate_fn, pin_memory=True)

test_dataset = ActionDataset(meta=X_val, stage="test", transform=transform, n_frames=n_frames)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0, collate_fn=collate_fn, pin_memory=True)

In [14]:
epochs = 50
warm_epochs = 10
ema_warm_epochs = 14
lr = 2e-4 #5e-5

criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.AdamW(model.parameters(), lr, weight_decay=2e-05)
scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=5e-8)

In [15]:
model_ema = None
if USE_EMA:
    adjust = world_size * batch_size * model_ema_steps / epochs
    alpha = 1.0 - model_ema_decay
    alpha = min(1.0, alpha * adjust)
    model_ema = ExponentialMovingAverage(model, device=device, decay=1.0 - alpha)

In [16]:
best_score = 0
best_loss = np.inf

for epoch in range(epochs):

    # TRAIN MODEL
    model.train()    

    if epoch <= warm_epochs:
        if epoch < warm_epochs:
            for param in model.parameters():
                param.requires_grad = False
            for param in model.head.parameters():
                param.requires_grad = True
        if epoch == warm_epochs:
            for param in model.parameters():
                param.requires_grad = True

    train_loss = []
    for i, (batch, target) in enumerate(tqdm(train_dataloader, desc=f"Epoch: {epoch} (train)")):
        optimizer.zero_grad()
        
        with torch.autocast("cuda"):
            batch = batch.to(device)
            target = target.to(device)

            logits = model(batch)

            loss = criterion(logits, target)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # loss.backward()
        # optimizer.step()

        train_loss.append(loss.item())

        # update ema model
        if model_ema and i % model_ema_steps == 0:
            model_ema.update_parameters(model)
            if epoch <= ema_warm_epochs:
                model_ema.n_averaged.fill_(0)
    
    # EVAL MODEL 
    model.eval()  

    val_targets = []
    val_preds = []
    val_loss = 0
    for i, (batch, target) in enumerate(tqdm(test_dataloader, desc=f"Epoch: {epoch} (eval)")):

        with torch.autocast("cuda"):
            batch = batch.to(device)
            target = target.to(device)

            with torch.no_grad():
                logits = model(batch)
                loss = criterion(logits, target)

        val_loss += loss.item()
        val_targets.extend(target.argmax(1).cpu().tolist())
        val_preds.extend(logits.argmax(1).cpu().tolist())

    val_loss /= len(test_dataloader)
    val_score = round(f1_score(val_targets, val_preds, average='macro'), 4)

    # EVAL EMA MODEL
    if epoch > ema_warm_epochs and model_ema is not None:
        model_ema.eval()  

        val_targets = []
        val_preds = []
        ema_val_loss = 0
        for i, (batch, target) in enumerate(tqdm(test_dataloader, desc=f"Epoch: {epoch} (ema)")):

            with torch.autocast("cuda"):
                batch = batch.to(device)
                target = target.to(device)

                with torch.no_grad():
                    logits = model_ema(batch)
                    loss = criterion(logits, target)

            ema_val_loss += loss.item()
            val_targets.extend(target.argmax(1).cpu().tolist())
            val_preds.extend(logits.argmax(1).cpu().tolist())

        ema_val_loss /= len(test_dataloader)
        ema_val_score = round(f1_score(val_targets, val_preds, average='macro'), 4)

    # LOG RESULTS
    print(f'Training loss: {np.mean(train_loss):.4f}')
    print(f'Valindation loss: {val_loss:.4f}')
    tb_writer.add_scalar("loss/train", np.mean(train_loss))
    tb_writer.add_scalar("loss/val", val_loss)
    if epoch > ema_warm_epochs and model_ema is not None:
        print(f'Valindation loss (EMA): {ema_val_loss:.4f}')
        tb_writer.add_scalar("loss/ema", ema_val_loss)

    print('F1:', val_score)
    tb_writer.add_scalar("f1/val", val_score)
    if epoch > ema_warm_epochs and model_ema is not None:
        print('F1 (EMA):', ema_val_score)
        tb_writer.add_scalar("f1/ema", val_score)

    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model, "best.pt")

    torch.save(model, "last.pt")
    if epoch > ema_warm_epochs and model_ema is not None:
        torch.save(model_ema.state_dict(), "last_ema.pt")

Epoch: 0 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 0 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 1.3284
Valindation loss: 1.2331
F1: 0.4088


Epoch: 1 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 1 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 1.1183
Valindation loss: 1.1608
F1: 0.1711


Epoch: 2 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 2 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 1.0503
Valindation loss: 1.2094
F1: 0.1711


Epoch: 3 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 3 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 1.0171
Valindation loss: 1.0500
F1: 0.2781


Epoch: 4 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 4 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.9861
Valindation loss: 1.1109
F1: 0.4042


Epoch: 5 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 5 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.9448
Valindation loss: 1.1781
F1: 0.2222


Epoch: 6 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 6 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.9486
Valindation loss: 0.9432
F1: 0.5274


Epoch: 7 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 7 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.9351
Valindation loss: 1.1139
F1: 0.4561


Epoch: 8 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 8 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.8939
Valindation loss: 0.9838
F1: 0.5032


Epoch: 9 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 9 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.9274
Valindation loss: 0.8168
F1: 0.6336


Epoch: 10 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 10 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.9505
Valindation loss: 0.9901
F1: 0.6865


Epoch: 11 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 11 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.8057
Valindation loss: 0.7994
F1: 0.78


Epoch: 12 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 12 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.7231
Valindation loss: 1.0729
F1: 0.4095


Epoch: 13 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 13 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6656
Valindation loss: 0.4737
F1: 1.0


Epoch: 14 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 14 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.5696
Valindation loss: 0.5877
F1: 0.9759


Epoch: 15 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 15 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 15 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.5854
Valindation loss: 1.1374
Valindation loss (EMA): 0.6809
F1: 0.6678
F1 (EMA): 1.0


Epoch: 16 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 16 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 16 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6530
Valindation loss: 0.7286
Valindation loss (EMA): 1.0117
F1: 0.9726
F1 (EMA): 0.5593


Epoch: 17 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 17 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 17 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6075
Valindation loss: 0.5763
Valindation loss (EMA): 1.1217
F1: 1.0
F1 (EMA): 0.8134


Epoch: 18 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 18 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 18 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.5893
Valindation loss: 1.2551
Valindation loss (EMA): 0.8432
F1: 0.5221
F1 (EMA): 0.7605


Epoch: 19 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 19 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 19 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.8358
Valindation loss: 0.6159
Valindation loss (EMA): 0.9635
F1: 1.0
F1 (EMA): 1.0


Epoch: 20 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 20 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 20 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6297
Valindation loss: 0.6575
Valindation loss (EMA): 0.4238
F1: 0.9216
F1 (EMA): 1.0


Epoch: 21 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 21 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 21 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6557
Valindation loss: 0.5726
Valindation loss (EMA): 1.3477
F1: 0.916
F1 (EMA): 0.3592


Epoch: 22 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 22 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 22 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.7587
Valindation loss: 0.6497
Valindation loss (EMA): 0.4826
F1: 1.0
F1 (EMA): 1.0


Epoch: 23 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 23 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 23 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.5852
Valindation loss: 0.7638
Valindation loss (EMA): 0.6971
F1: 1.0
F1 (EMA): 1.0


Epoch: 24 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 24 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 24 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.7254
Valindation loss: 0.6421
Valindation loss (EMA): 0.7257
F1: 1.0
F1 (EMA): 0.8439


Epoch: 25 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 25 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 25 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.5943
Valindation loss: 0.8427
Valindation loss (EMA): 0.9849
F1: 0.5593
F1 (EMA): 0.9673


Epoch: 26 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 26 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 26 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6222
Valindation loss: 0.6652
Valindation loss (EMA): 0.4187
F1: 1.0
F1 (EMA): 1.0


Epoch: 27 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 27 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 27 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6763
Valindation loss: 0.5728
Valindation loss (EMA): 1.1499
F1: 0.8842
F1 (EMA): 0.7073


Epoch: 28 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 28 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 28 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6376
Valindation loss: 0.9240
Valindation loss (EMA): 1.0362
F1: 1.0
F1 (EMA): 1.0


Epoch: 29 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 29 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 29 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.5476
Valindation loss: 0.5861
Valindation loss (EMA): 0.5132
F1: 1.0
F1 (EMA): 1.0


Epoch: 30 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 30 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 30 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.7354
Valindation loss: 1.0501
Valindation loss (EMA): 0.7121
F1: 0.9726
F1 (EMA): 0.9726


Epoch: 31 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 31 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 31 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.5282
Valindation loss: 0.6698
Valindation loss (EMA): 0.8644
F1: 1.0
F1 (EMA): 1.0


Epoch: 32 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 32 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 32 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.7107
Valindation loss: 0.6836
Valindation loss (EMA): 1.1551
F1: 0.839
F1 (EMA): 1.0


Epoch: 33 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 33 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 33 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6548
Valindation loss: 0.9006
Valindation loss (EMA): 1.1956
F1: 0.5593
F1 (EMA): 0.6737


Epoch: 34 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 34 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 34 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6555
Valindation loss: 0.8968
Valindation loss (EMA): 0.5022
F1: 0.757
F1 (EMA): 1.0


Epoch: 35 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 35 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 35 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6858
Valindation loss: 0.4626
Valindation loss (EMA): 0.8230
F1: 1.0
F1 (EMA): 0.8102


Epoch: 36 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 36 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 36 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6776
Valindation loss: 0.7518
Valindation loss (EMA): 0.3927
F1: 1.0
F1 (EMA): 1.0


Epoch: 37 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 37 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 37 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6183
Valindation loss: 0.7381
Valindation loss (EMA): 0.6462
F1: 0.7828
F1 (EMA): 1.0


Epoch: 38 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 38 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 38 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.5461
Valindation loss: 0.6867
Valindation loss (EMA): 1.1177
F1: 0.7005
F1 (EMA): 0.2891


Epoch: 39 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 39 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 39 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.7062
Valindation loss: 0.4176
Valindation loss (EMA): 0.6253
F1: 1.0
F1 (EMA): 0.9409


Epoch: 40 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 40 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 40 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6307
Valindation loss: 0.7500
Valindation loss (EMA): 1.0264
F1: 0.9453
F1 (EMA): 0.5499


Epoch: 41 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 41 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 41 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6702
Valindation loss: 0.5414
Valindation loss (EMA): 0.5402
F1: 1.0
F1 (EMA): 1.0


Epoch: 42 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 42 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 42 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6758
Valindation loss: 0.6450
Valindation loss (EMA): 1.1738
F1: 1.0
F1 (EMA): 0.5593


Epoch: 43 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 43 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 43 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6479
Valindation loss: 0.6167
Valindation loss (EMA): 0.9470
F1: 1.0
F1 (EMA): 0.6737


Epoch: 44 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 44 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 44 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.7284
Valindation loss: 0.8430
Valindation loss (EMA): 0.6332
F1: 0.8401
F1 (EMA): 1.0


Epoch: 45 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 45 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 45 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.5857
Valindation loss: 0.3739
Valindation loss (EMA): 0.6564
F1: 1.0
F1 (EMA): 1.0


Epoch: 46 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 46 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 46 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.7124
Valindation loss: 0.7992
Valindation loss (EMA): 0.3797
F1: 0.8761
F1 (EMA): 1.0


Epoch: 47 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 47 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 47 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.5875
Valindation loss: 0.5583
Valindation loss (EMA): 1.0409
F1: 1.0
F1 (EMA): 0.4611


Epoch: 48 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 48 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 48 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6231
Valindation loss: 0.5278
Valindation loss (EMA): 0.4345
F1: 1.0
F1 (EMA): 1.0


Epoch: 49 (train):   0%|          | 0/14 [00:00<?, ?it/s]

Epoch: 49 (eval):   0%|          | 0/2 [00:00<?, ?it/s]

Epoch: 49 (ema):   0%|          | 0/2 [00:00<?, ?it/s]

Training loss: 0.6931
Valindation loss: 0.6768
Valindation loss (EMA): 0.4021
F1: 0.8241
F1 (EMA): 1.0


# Convert to onnx & openvino

In [17]:
X_val = pd.read_csv("test.csv")

In [18]:
model = torch.load("last.pt")

if USE_EMA:
    adjust = world_size * batch_size * model_ema_steps / epochs
    alpha = 1.0 - model_ema_decay
    alpha = min(1.0, alpha * adjust)
    model_ema = ExponentialMovingAverage(model, device=device, decay=1.0 - alpha)
    ema_weights = torch.load("last_ema.pt", map_location="cpu")
    model_ema.load_state_dict(ema_weights)
    model = model_ema

model.to(device)
model.eval()

file_path = X_val.iloc[0].video_path
container = av.open(file_path)
indices = sample_frame_indices(clip_len=n_frames, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)
inputs = apply_video_augmentations_torch(video, transform).unsqueeze(0)

outputs = model(inputs.to(device)).cpu()
input_names = ["input"]
output_names = ["output"]

In [19]:
import onnx
import onnxruntime as ort

onnx_path = "../submit_videorec/model/model.onnx"

model = model.float().cpu()
model.eval()

torch.onnx.export(
    model,
    inputs,
    onnx_path,  # where to save the model
    opset_version=14,  # the ONNX version to export the model to
    input_names=["input"],  # the model's input names
    output_names=['output'],  # the model's output names
    dynamic_axes={  # variable length axes
        "input": {0: "batch", 1: "channels", 2: "sequence"},
        "output": {0: "batch"},
    }
)

verbose: False, log level: Level.ERROR



In [20]:
# from onnxsim import simplify

# model = onnx.load(onnx_path)
# model_simp, check = simplify(model)
# assert check, "Simplified ONNX model could not be validated"

# onnx.save(model_simp, "model_simplified.onnx")

In [21]:
from openvino.runtime import serialize
from openvino.tools import mo

ov_model = mo.convert_model('../submit_videorec/model/model.onnx', compress_to_fp16=True)
serialize(ov_model, '../submit_videorec/model/model.xml')

In [20]:
# dummy_inputs = {"input": inputs.cpu().numpy()}

# ort_session = ort.InferenceSession("model.onnx")

# # compute ONNX Runtime output prediction
# ort_outs = ort_session.run(None, dummy_inputs)[0]

# # compute pytorch model outputs
# with torch.no_grad():
#     model.cpu()
#     torch_model_outs = model(inputs).numpy()
#     model.cuda()

# np.testing.assert_allclose(
#     torch_model_outs,
#     ort_outs,
#     rtol=1e-03,
#     atol=1e-05,
# )

# Test

In [22]:
# X_val.to_csv("test.csv", index=False)
X_val = pd.read_csv("test.csv")
len(X_val)

50

In [23]:
from dataset import get_frames
from pathlib import Path
from tqdm.auto import tqdm
import numpy as np
import random


test_videos_paths = [Path(p) for p in X_val.video_path.values][:]
test_targets = X_val.label_id[:]
test_targets_masked = [t for t in test_targets for _ in range(6)]

clips = [
    get_frames(vp)
    for vp in tqdm(test_videos_paths[:])
]

masks = [
    np.ones(len(c), dtype=bool)
    for c in clips
]

new_masks = []
for m in masks:
    new_masks.append(m)
    for _ in range(5):
        new_m = m.copy()
        new_m[np.random.choice(range(0, len(m)), int(len(m)*0.2), replace=False).astype(int)] = False
        new_masks.append(new_m)

masked_clips = []
for i, c in enumerate(clips):
    for m in new_masks[i*6:(i+1)*6]:
        masked_clips.append(c[m])

  0%|          | 0/50 [00:00<?, ?it/s]

## onnx/openvino model

In [24]:
import sys
sys.path.append("../")
# from submit_transformer.predict import predict
# from submit_videorec.predict_openvino import predict as predict_openvino
from submit_videorec.predict_onnx import predict as predict_onnx

In [25]:
BUFFER = {}
# 32 frames = 1.38 min
# 16 frames = 

preds = [
    predict_onnx(clip, BUFFER)
    for clip in tqdm(masked_clips)
]
preds_ids = [label2id[i] for i in preds]
round(f1_score(test_targets_masked, preds_ids, average='macro'), 4)

  0%|          | 0/300 [00:00<?, ?it/s]

1.0

In [29]:
BUFFER = {}

preds = [
    predict_openvino(clip, BUFFER)
    for clip in tqdm(masked_clips)
]
preds_ids = [label2id[i] for i in preds]
round(f1_score(test_targets_masked, preds_ids, average='macro'), 4)

  0%|          | 0/300 [00:00<?, ?it/s]

1.0

In [41]:
# 0.748 window=2

## torch model

In [42]:
model = torch.load("best.pt", map_location="cpu")
model.cuda()
model.eval()

preds_torch = []
for clip in tqdm(masked_clips):
    # start_idx, end_idx = 0, len(clip)
    # indices = np.linspace(start_idx, end_idx, num=n_frames)
    # indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int32)
    step = 2
    s = 0
    e = (s + n_frames) * step
    clip = clip[s:e:step]
    clip = apply_video_augmentations_torch(clip, transform).unsqueeze(0)

    with torch.autocast("cuda"):
        clip = clip.to(device)
        with torch.no_grad():
            logits = model(clip).cpu()
        preds_torch.append(logits.argmax(1)[0].item())

  0%|          | 0/300 [00:00<?, ?it/s]

In [43]:
round(f1_score(test_targets_masked, preds_torch, average='macro'), 4)

0.9142

In [27]:
# for i, j in zip(preds_torch, test_targets):
#     print(i, j)