In [1]:
import torch
from totensor import ToTensor
from PIL import Image
from torchvision import get_image_backend
import json
import os
import math
import copy

# Utilities

In [2]:
def video_loader(video_dir_path, frame_indices):
    """Load the requested frames of one video as RGB PIL images.

    Each frame is read from ``<video_dir_path>/image_XXXXX.jpg`` where
    ``XXXXX`` is the frame index zero-padded to five digits.

    Args:
        video_dir_path: Directory holding the extracted JPEG frames.
        frame_indices: Iterable of integer frame numbers to load, in order.

    Returns:
        A list of RGB images. Loading stops at the first index whose file
        does not exist, so the list may be shorter than ``frame_indices``
        (or empty).
    """
    frames = []
    for frame_idx in frame_indices:
        frame_file = os.path.join(
            video_dir_path, 'image_{:05d}.jpg'.format(frame_idx))
        # A missing frame ends the clip early instead of raising.
        if not os.path.exists(frame_file):
            break
        with open(frame_file, 'rb') as handle:
            with Image.open(handle) as picture:
                # convert() returns a new image, so it outlives the file handle.
                frames.append(picture.convert('RGB'))
    return frames
        
def load_value_file(file_path):
    """Read a text file that contains a single number and return it as a float.

    Trailing newline and carriage-return characters are removed before the
    text is parsed; a ``ValueError`` from ``float`` propagates if the
    remaining text is not a number.
    """
    with open(file_path, 'r') as handle:
        raw_text = handle.read()
    return float(raw_text.rstrip('\n\r'))


# Build the dataset index from extracted video frames

In [3]:
def make_dataset(root_path, annotation_path, subset,
                 n_samples_for_each_video, sample_duration):
    """Build the list of clip samples for one subset of the dataset.

    The annotation file is JSON with two top-level keys used here:
    ``'labels'`` (a list of class names) and ``'database'`` (a mapping from
    video key to a dict with a ``'subset'`` entry and, for non-test videos,
    ``'annotations': {'label': ...}``). Frames of a video are expected under
    ``<root_path>/<label>/<key>`` (or ``<root_path>/test/<key>`` when
    ``subset == 'test'``), together with an ``n_frames`` file holding the
    frame count.

    Args:
        root_path: Root directory of the extracted frames.
        annotation_path: Path to the annotation JSON file.
        subset: Only database entries whose ``'subset'`` equals this value
            are used.
        n_samples_for_each_video: ``1`` yields one sample per video covering
            all frames; a value greater than 1 spreads clip start positions
            over the video; any other value tiles the video with a stride of
            ``sample_duration``.
        sample_duration: Maximum number of frames per clip when the video is
            split into several clips.

    Returns:
        ``(dataset, idx_to_class)`` where ``dataset`` is a list of dicts with
        keys ``'video'``, ``'segment'``, ``'n_frames'``, ``'video_id'``,
        ``'label'`` and ``'frame_indices'``, and ``idx_to_class`` maps class
        index to class name. ``'label'`` is ``-1`` when no annotations were
        collected (the ``'test'`` subset).

    Videos whose directory is missing or whose frame count is not positive
    are skipped. A progress line is printed every 1000 videos.
    """
    # Load annotation data
    with open(annotation_path, 'r') as data_file:
        data = json.load(data_file)

    # Collect the relative directory and annotation of every video in `subset`.
    video_names = []
    annotations = []
    for key, value in data['database'].items():
        if value['subset'] != subset:
            continue
        if subset == 'test':
            video_names.append('test/{}'.format(key))
        else:
            label = value['annotations']['label']
            video_names.append('{}/{}'.format(label, key))
            annotations.append(value['annotations'])

    # Class name <-> integer id, in the order given by the annotation file.
    class_to_idx = {name: idx for idx, name in enumerate(data['labels'])}
    idx_to_class = {idx: name for name, idx in class_to_idx.items()}

    dataset = []
    n_videos = len(video_names)
    for i, video_name in enumerate(video_names):
        if i % 1000 == 0:
            print('dataset loading [{}/{}]'.format(i, n_videos))

        video_path = os.path.join(root_path, video_name)
        if not os.path.exists(video_path):
            continue

        n_frames = int(load_value_file(os.path.join(video_path, 'n_frames')))
        if n_frames <= 0:
            continue

        sample = {
            'video': video_path,
            'segment': [1, n_frames],
            'n_frames': n_frames,
            # Drops the last 14 characters of the name, presumably a
            # `_<start>_<end>` time suffix -- TODO confirm against the data.
            'video_id': video_name[:-14].split('/')[1],
            # `annotations` stays empty for the 'test' subset -> label -1.
            'label': class_to_idx[annotations[i]['label']] if annotations else -1,
        }

        if n_samples_for_each_video == 1:
            # One sample spanning the whole video.
            sample['frame_indices'] = list(range(1, n_frames + 1))
            dataset.append(sample)
            continue

        if n_samples_for_each_video > 1:
            # Spread the clip start positions over the video; never below 1
            # so that range() always advances.
            step = max(1,
                       math.ceil((n_frames - 1 - sample_duration) /
                                 (n_samples_for_each_video - 1)))
        else:
            step = sample_duration

        for start in range(1, n_frames, step):
            # Deep copy so every clip owns its own 'segment' list.
            clip = copy.deepcopy(sample)
            # Clamp the clip so it never runs past the last frame.
            clip['frame_indices'] = list(
                range(start, min(n_frames + 1, start + sample_duration)))
            dataset.append(clip)

    return dataset, idx_to_class

# Kinetics Dataset Class

In [4]:
class Kinetics(torch.utils.data.Dataset):
    """Map-style dataset that serves clips of pre-extracted video frames.

    The sample list is produced once by ``make_dataset``; each item is
    loaded lazily from disk when it is indexed.
    """

    def __init__(self,
                 root_path,
                 annotation_path,
                 subset,
                 n_samples_for_each_video=1,
                 sample_duration=16,
                 get_loader = video_loader):
        """Index the requested subset.

        Args:
            root_path: Root directory of the extracted frames.
            annotation_path: Path to the annotation JSON file.
            subset: Name of the subset to index.
            n_samples_for_each_video: Forwarded to ``make_dataset``.
            sample_duration: Forwarded to ``make_dataset``.
            get_loader: Callable ``(video_dir, frame_indices) -> list of
                images`` used to read the frames of one clip.
        """
        samples, idx_to_class = make_dataset(
            root_path, annotation_path, subset,
            n_samples_for_each_video, sample_duration)
        self.data = samples
        self.class_names = idx_to_class
        self.loader = get_loader

    def __len__(self):
        """Return the number of indexed samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(clip, target)`` for the sample at ``index``.

        ``clip`` is the stack of per-frame tensors with its first two axes
        swapped (presumably frames-first -> channels-first; confirm against
        what ``ToTensor`` returns). ``target`` is the full sample dict, not
        just the label.
        """
        sample = self.data[index]
        frames = self.loader(sample['video'], sample['frame_indices'])

        # Convert every frame with the project-local ToTensor transform.
        tensors = []
        for frame in frames:
            tensors.append(ToTensor(1)(frame))

        stacked = torch.stack(tensors, 0)
        return stacked.permute(1, 0, 2, 3), sample
        
        
            

# Usage 

In [5]:

# Index the 'training' subset with the default settings (one sample per video).
training_data = Kinetics(
    root_path='/mnt/hdd/Kinetics_jpg/',
    annotation_path='3drKinetics.json',
    subset='training')

dataset loading [0/1135]
dataset loading [1000/1135]


In [6]:
# Shuffled loader serving one sample per batch from four worker processes.
train_loader = torch.utils.data.DataLoader(
    dataset=training_data,
    batch_size=1, shuffle=True,
    num_workers=4, pin_memory=True)

In [7]:
# Sanity check: show only the first batch, then stop iterating.
for inputs, targets in train_loader:
    print(inputs, targets, sep='\n')
    break

tensor([[[[[  1.,   1.,   1.,  ...,   1.,   1.,   1.],
           [  1.,   1.,   1.,  ...,   1.,   1.,   1.],
           [  1.,   1.,   1.,  ...,   1.,   1.,   1.],
           ...,
           [100., 101., 101.,  ...,   4.,   4.,   4.],
           [ 99.,  97.,  96.,  ...,   4.,   4.,   4.],
           [ 99.,  95.,  92.,  ...,   4.,   4.,   4.]],

          [[  1.,   1.,   1.,  ...,   1.,   1.,   1.],
           [  1.,   1.,   1.,  ...,   1.,   1.,   1.],
           [  1.,   1.,   1.,  ...,   1.,   1.,   1.],
           ...,
           [ 98.,  98.,  95.,  ...,   4.,   4.,   4.],
           [ 96.,  94.,  93.,  ...,   3.,   3.,   3.],
           [ 94.,  95.,  96.,  ...,   3.,   3.,   3.]],

          [[  1.,   1.,   1.,  ...,   1.,   1.,   1.],
           [  1.,   1.,   1.,  ...,   1.,   1.,   1.],
           [  1.,   1.,   1.,  ...,   1.,   1.,   1.],
           ...,
           [ 93.,  92.,  92.,  ...,   4.,   4.,   4.],
           [ 92.,  92.,  95.,  ...,   3.,   3.,   3.],
           [ 