In [2]:
from src.dataset.datasets import VideoDataset
from src.backbone.videomaev2 import vit_small_patch16_224
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm
import torch.nn as nn
from math import ceil
import torch.nn.functional as F
from torch.optim import Adam
import numpy as np

In [2]:
# Extract feature
cp_model = vit_small_patch16_224()
cp = torch.load("../vit_s_k710_dl_from_giant.pth", map_location="cpu")

for model_key in ["model", "module"]:
    if model_key in cp:
        cp = cp[model_key]
        break

cp_model.load_state_dict(cp)
cp_model.eval()
cp_model.cuda()

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv3d(3, 384, kernel_size=(2, 16, 16), stride=(2, 16, 16))
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (blocks): ModuleList(
    (0-11): 12 x Block(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=384, out_features=1152, bias=False)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (drop_path): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (norm): Identity()
  (fc_norm): LayerNorm((384,), eps=1e-06, elementwise_affine=

In [6]:
# Model 
class HNeRVMAE(nn.Module):
    def __init__(self, embedding=None):
        super().__init__()
        self.embedding = embedding

        self.conv1 = nn.Conv2d(
            192,
            96 * 4**2,
            kernel_size=3,
            stride=(1, 1),
            padding=ceil((3 - 1) // 2),
        )

        self.px1 = nn.PixelShuffle(4)

        self.conv2 = nn.Conv2d(
            96,
            48 * 2**2,
            kernel_size=3,
            stride=(1, 1),
            padding=ceil((3 - 1) // 2),
        )

        self.px2 = nn.PixelShuffle(2)

        self.conv3 = nn.Conv2d(
            48,
            24 * 2**2,
            kernel_size=3,
            stride=(1, 1),
            padding=ceil((3 - 1) // 2),
        )

        self.px3 = nn.PixelShuffle(2)

        self.conv4 = nn.Conv2d(
            24, 3, kernel_size=3, stride=(1, 1), padding=ceil((3 - 1) // 2)
        )

        self.act = nn.ReLU()

    def forward(self, x):
        x = self.act(self.px1(self.conv1(x)))
        x = self.act(self.px2(self.conv2(x)))
        x = self.act(self.px3(self.conv3(x)))
        x = self.act(self.conv4(x))

        x = x.permute(0, 2, 3, 1) * 255
        return x


model = HNeRVMAE()
print(model)

HNeRVMAE(
  (conv1): Conv2d(192, 1536, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (px1): PixelShuffle(upscale_factor=4)
  (conv2): Conv2d(96, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (px2): PixelShuffle(upscale_factor=2)
  (conv3): Conv2d(48, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (px3): PixelShuffle(upscale_factor=2)
  (conv4): Conv2d(24, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (act): ReLU()
)


In [4]:
from src.dataset.loader import get_video_loader
video_loader = get_video_loader()

vr = video_loader("data/uvghd30/uvghd30.mp4")

dataloader = DataLoader(vr, batch_size=12)

x = next(iter(dataloader))
print(x.shape)

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'decord.ndarray.NDArray'>

: 

In [4]:
len(vr)

3900

In [None]:
# DataLoader
dataset = VideoDataset(mode="validation")

In [20]:
len(dataset)

1

In [12]:
# DataLoader
dataset = VideoDataset(
    anno_path="data/uvghd30/uvghd30.csv", data_root="data", mode="validation"
)

dataloader = DataLoader(dataset, batch_size=12)

x = next(iter(dataloader))
print(x.shape)

torch.Size([1, 600, 224, 224, 3])


In [7]:
from pytorch_msssim import ssim

def loss_fn(pred, target, loss_type="L2", batch_average=True):
    target = target.detach()

    if loss_type == "L2":
        loss = F.mse_loss(pred, target, reduction="none").flatten(1).mean(1)
    elif loss_type == "L1":
        loss = F.l1_loss(pred, target, reduction="none").flatten(1).mean(1)
    elif loss_type == "SSIM":
        loss = 1 - ssim(pred, target, data_range=1, size_average=False)
        
    return loss.mean() if batch_average else loss

In [2]:
def psnr_fn_single(output, gt):
    l2_loss = F.mse_loss(output.detach(), gt.detach(), reduction="none")
    psnr = -10 * torch.log10(l2_loss.flatten(start_dim=1).mean(1) + 1e-9)
    return psnr.cpu()

In [10]:
SEED = 42
torch.manual_seed(SEED)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
np.random.seed(SEED)

optimizer = Adam(model.parameters(), lr=0.01, betas=(0.9, 0.99))

# Training

for ep in range(100):
    
    tqdm_batch = tqdm(
        iterable=dataloader,
        desc="Epoch {}".format(ep),
        total=len(dataloader),
        unit="it",
    )
    
    for batch_idx, data in enumerate(tqdm_batch):
        data = data.permute(3, 0, 1, 2).unsqueeze(0).cuda()

        # Forward feature ckpt
        feature = cp_model.forward_features(data)
        input = feature.reshape(12, 192, 14, 14)

        optimizer.zero_grad()

        # HNeRV MAE
        model = HNeRVMAE().cuda()

        output = model(input)

        loss = loss_fn(output.reshape(1, 3, 12, 224, 224), data, loss_type="L2", batch_average=True)
        loss.backward()

        optimizer.step()
        
        psnr = psnr_fn_single(output.reshape(1, 3, 12, 224, 224), data)

        tqdm_batch.set_postfix(loss=loss.item(), psnr=psnr)
        
    if ep % 50 ==0:
        torch.save({
            'epoch': ep,
            'model_state_dict': model.state_dict(),
        }, "model.pt")

Epoch 0:   0%|          | 0/1 [00:18<?, ?it/s]


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 5 is not equal to len(dims) = 4

In [38]:
ckt = torch.load("model.pt", map_location="cpu")

In [40]:
print(ckt.items())

dict_items([('epoch', 0), ('model_state_dict', OrderedDict([('conv1.weight', tensor([[[[ 0.0127, -0.0156, -0.0156],
          [-0.0003,  0.0141,  0.0089],
          [-0.0024,  0.0207,  0.0215]],

         [[ 0.0116, -0.0106, -0.0233],
          [-0.0221,  0.0163, -0.0028],
          [ 0.0119, -0.0224, -0.0045]],

         [[ 0.0189, -0.0068, -0.0090],
          [ 0.0226, -0.0215, -0.0040],
          [ 0.0122,  0.0068, -0.0233]],

         ...,

         [[-0.0170, -0.0195, -0.0017],
          [ 0.0028,  0.0061,  0.0069],
          [-0.0225,  0.0108,  0.0118]],

         [[ 0.0143,  0.0007,  0.0037],
          [-0.0138,  0.0058, -0.0116],
          [-0.0175, -0.0226,  0.0026]],

         [[ 0.0154, -0.0178,  0.0082],
          [-0.0182, -0.0085,  0.0165],
          [ 0.0116,  0.0203,  0.0239]]],


        [[[-0.0199,  0.0159, -0.0013],
          [ 0.0122, -0.0102,  0.0206],
          [ 0.0063,  0.0196,  0.0205]],

         [[ 0.0235, -0.0009, -0.0176],
          [ 0.0068,  0.0122, -0.01