In [170]:
from abc import ABC

from gulpio2 import GulpDirectory
from pathlib import Path

from collections import defaultdict

import pickle
import pandas as pd
from pathlib import Path

import torch as t
import torch.nn as nn
import numpy as np
from omegaconf import OmegaConf
from typing import Any, Dict, List, Sequence, Union

from systems import EpicActionRecognitionSystem
from systems import EpicActionRecogintionDataModule

from utils.metrics import compute_metrics
from utils.actions import action_id_from_verb_noun
from scipy.special import softmax

from GPUtil import showUtilization as gpu_usage
from tqdm import tqdm

from frame_sampling import RandomSampler
from torchvideo.samplers import FrameSampler
from torchvideo.samplers import frame_idx_to_list

from torch.utils.data import DataLoader
import torch.optim as optim
import torch.utils.data as data_utils

In [157]:
class PickleDataset:
    
    def __init__(self, pkl_path: Path, frame_sampler: FrameSampler, features_dim: int = 256):
        self.pkl_path = pkl_path
        self.frame_sampler = frame_sampler
        self.features_dim = features_dim
        self.pkl_dict = Dict[str, Any]
        
        self.frame_cumsum = np.array([0.])
        self.sampled_frame_idx = []
        self.sampled_frames = []
        
    def load(self):
        with open(self.pkl_path, 'rb') as f:
            self.pkl_dict = pickle.load(f)
            frame_counts = [label['num_frames'] for label in self.pkl_dict['labels']]
            self.frame_cumsum = np.cumsum(np.concatenate([self.frame_cumsum, frame_counts]), dtype=int)
    
    def video_from_narration_id(self, narration_id: str):
        video_no = self.pkl_dict['narration_id'].index(narration_id)
        l = self.frame_cumsum[video_no]
        r = self.frame_cumsum[video_no+1]
        return video_no, self.pkl_dict['features'][l:r]
    
    def sample_frame_features_from_narration_id(self, narration_id: str):
        video_no, features = self.video_from_narration_id(narration_id)
        video_length = features.shape[0]
        
        assert video_length == self.pkl_dict['labels'][video_no]['num_frames']
        if video_length < frame_sampler.frame_count:
            raise ValueError(f"Video too short to sample {n_frames} from")
        
        sample_idxs = np.array(frame_idx_to_list(frame_sampler.sample(video_length)))
        return sample_idxs, features[sample_idxs]
    
    def sample_features(self):
        for i, narration_id in tqdm(
            enumerate(self.pkl_dict['narration_id']),
            unit=" video",
            total=len(self.pkl_dict['narration_id']),
            dynamic_ncols=True
        ):
            sample_idx, sample_features = self.sample_frame_features_from_narration_id(narration_id)
            self.sampled_frame_idx.append(sample_idx)
            self.sampled_frames.append(sample_features)
            
#         self.sampled_frames = np.concatenate(self.sampled_frames)
        
    def train_dataloader(self):
        dataset = data_utils.TensorDataset(t.from_numpy(self.sampled_frames))
        return DataLoader(
            (self.sampled_frames, self.pkl_dict['labels']),
            shuffle=True
        )

In [158]:
n_frames = 8
frame_sampler = RandomSampler(frame_count=n_frames, snippet_length=1, test=False)

In [159]:
dataset = PickleDataset('../datasets/epic/features/p01_01_1_features.pkl', frame_sampler)
dataset.load()

In [160]:
xd, xs = dataset.video_from_narration_id(dataset.pkl_dict['narration_id'][1])

In [161]:
dataset.sample_features()
dataloader = dataset.train_dataloader()

100%|██████████| 100/100 [00:00<00:00, 19060.69 video/s]


In [169]:
# dataset.pkl_dict['labels']
# for inp, lables in :
#     print(inp, labels)

xd = iter(dataloader)

pff = xd.next()

pff

[tensor([[[-0.4094,  3.0647,  3.3857,  ..., -0.4687,  1.1552, -0.4135],
          [-0.4299,  3.6703,  3.0355,  ..., -0.4901,  1.3125, -0.4301],
          [-0.4106,  3.9602,  2.4782,  ..., -0.4711,  0.9978, -0.4192],
          ...,
          [-0.4070,  3.0495,  2.8970,  ..., -0.4557,  0.0784, -0.4151],
          [-0.6840,  8.4899,  3.7129,  ..., -0.8701,  9.8667, -0.7156],
          [-0.6715,  8.8838,  3.3958,  ..., -0.8666,  9.0637, -0.7172]]]),
 tensor([[[-4.3644, 21.2263, -3.9591,  ..., -5.9617, 65.5009, -3.8685],
          [-4.3553, 21.0767, -4.1348,  ..., -5.9373, 65.3950, -3.8584],
          [-4.3861, 20.4751, -4.2398,  ..., -5.9349, 65.3287, -3.8865],
          ...,
          [-4.5009, 23.5898, -2.7861,  ..., -6.2264, 68.0189, -4.1037],
          [-4.2791, 23.6527, -1.4918,  ..., -6.0396, 64.4510, -3.9073],
          [-4.1482, 26.2707, -0.0975,  ..., -5.8804, 61.1179, -3.7940]]]),
 tensor([[[-4.1957e+00,  1.1207e+01,  1.0053e+00,  ..., -5.1269e+00,
            5.9829e+01, -3.5413

In [135]:
class Net(nn.Module):
    
    def __init__(self, frame_count: int):
        super().__init__()
        self.frame_count = frame_count
        self.fc1 = nn.Linear(256 * frame_count, 512)
        self.fc2 = nn.Linear(512, 397)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        
        return x

In [156]:
net = Net(frame_count=8)
criterion = nn.CrossEntropyLoss()
optimiser = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

for epoch in range(2):
    running_loss_v = 0.0
    running_loss_n = 0.0
    
    for i, data in tqdm(
        enumerate(dataloader)
    ):
        print(data)
        
        labels = np.concatenate([labels['verb_class'],labels['noun_class']])
        optimizer.zero_grad()
        
        out = net(inputs)
        scores = {
            'verb': out[:,:97].cpu().numpy(),
            'noun': out[:,97:].cpu().numpy(),
            'narration_id': labels['narration_id']
        }
    
        verb_top_n = scores['verb'][0].argsort()[::-1][0]
        noun_top_n = scores['noun'][0].argsort()[::-1][0]
        
        output = np.concatenate([verb_top_n,noun_top_n])
        
        loss = criterion(output, labels)
        
        optimiser.step()
        
        running_loss += loss.item()
        if i % 50 == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0

0it [00:00, ?it/s]

[tensor([[[-0.3532,  2.1416,  2.6242,  ..., -0.3933,  0.3866, -0.3648],
         [-0.4333,  2.1779,  3.4238,  ..., -0.4774,  0.6675, -0.4179],
         [-0.6608,  8.5129,  3.6061,  ..., -0.8634, 10.5095, -0.7057],
         ...,
         [-0.8650, 14.7795,  6.5477,  ..., -1.2052, 14.5274, -0.9835],
         [-1.1211, 15.5162,  6.0061,  ..., -1.5773, 16.6292, -1.2428],
         [-1.2168,  8.3154,  4.9403,  ..., -1.3084, 11.7712, -0.9494]]]), tensor([[[-4.3915, 21.4051, -2.5160,  ..., -5.9400, 65.4343, -3.8452],
         [-4.3655, 20.2588, -4.2245,  ..., -5.9274, 64.9303, -3.8721],
         [-4.3690, 20.4209, -4.1050,  ..., -5.9450, 65.3720, -3.8796],
         ...,
         [-4.4603, 23.2763, -1.5238,  ..., -6.1288, 66.3579, -4.0150],
         [-4.3183, 22.9296, -2.9331,  ..., -6.0787, 65.9999, -3.9516],
         [-4.0629, 26.2378,  1.2861,  ..., -5.7781, 60.2500, -3.7188]]]), tensor([[[-4.1617, 10.2909,  0.8511,  ..., -5.0744, 59.1806, -3.4645],
         [-4.2475, 11.2500,  0.9742,  ...,




TypeError: list indices must be integers or slices, not str