## Step 0: directory structure

```bash
program_root_dir/
    video_data_loader.ipynb
    read_datasetBreakfast.py
    test_segment.txt
    training_segment.txt
    
    splits/
        mapping_bf.txt
        train.split1.bundle
        test.split1.bundle
        
    groundTruth/
        P16_cam01_P16_cereals.txt
            ...
        P54_webcam02_P54_tea.txt
        
    data/
        P03_cam01_P03_cereals.gz
            ...
        P54_webcam02_P54_tea.gz
```

In [1]:
import os
import torch
import numpy as np
import os.path 
from read_datasetBreakfast import read_mapping_dict
from torch.utils.data import Dataset, DataLoader


## Step 1: define data path

In [2]:
# Split bundles listing the videos that belong to the train / test partition.
train_split = 'splits/train.split1.bundle'
test_split = 'splits/test.split1.bundle'

# Folders with the per-video ground-truth label files (training videos)
# and the frame-level I3D features (all videos).
GT_folder = 'groundTruth/'
DATA_folder = 'data/'

# Mapping file from string label to int label, parsed once into a dict.
mapping_loc = 'splits/mapping_bf.txt'
actions_dict = read_mapping_dict(mapping_loc)

## Step 2: define dataset (only load a video when you need it)

In [3]:
class VideoDataset(Dataset):
    """Dataset that loads one video (frame features and, for train, labels) per index.

    Only the list of video file names is read at construction time; the label
    and feature files of a video are read from disk each time it is indexed.
    File locations come from the module-level settings ``train_split``,
    ``test_split``, ``GT_folder`` and ``DATA_folder``; string labels are
    converted to integers with the module-level ``actions_dict``.
    """

    def __init__(self, train=True, deleteSIL=True):
        """
            train: -> True to load train videos, False to load test videos
            deleteSIL -> only works for train videos: delete background frames,
                i.e. frames whose integer label is 0
                (assumes the background label maps to 0 in actions_dict -- TODO confirm)
        """
        self.train = train
        self.deleteSIL = deleteSIL
        self.split_file = train_split if train else test_split

        with open(self.split_file, 'r') as file:
            # splitlines() copes with a missing trailing newline and with CRLF
            # line endings. The first line is skipped exactly as before
            # (presumably a header line -- TODO confirm against the bundle files).
            entries = file.read().splitlines()[1:]

        # Keep only the file name of each entry. basename() removes the directory
        # prefix as a prefix; str.strip() would remove a *set of characters* from
        # both ends and corrupt names that start or end with one of them.
        self.videofiles = [
            os.path.basename(entry.strip()) for entry in entries if entry.strip()
        ]

    def __len__(self):
        """Number of videos listed in the split file."""
        return len(self.videofiles)

    def __getitem__(self, index):
        """
            If train is True, return a tuple
                (video_feature_tensor, shape: [N, D], label_tensor, shape: [N])
                -> N is the number of frames kept for this video, D the feature
                   size (400 in the outputs recorded in this notebook)
            If train is False (test dataset), only return video_feature_tensor
        """
        content = self.videofiles[index]

        labels_breakfast = None

        # For train videos, load one label per frame and map it to its int id.
        if self.train:
            with open(os.path.join(GT_folder, content), 'r') as label_file:
                # splitlines() keeps the last frame label even when the file
                # does not end with a newline.
                frame_labels = label_file.read().splitlines()
            labels_breakfast = torch.tensor(
                [actions_dict[frame_label] for frame_label in frame_labels]
            )

        # Read the features of this video: same base name, '.gz' extension.
        loc_curr_data = os.path.join(
            DATA_folder, os.path.splitext(content)[0] + '.gz'
        )
        # ndmin=2 keeps the [N, D] layout even for a single-frame video.
        curr_data = np.loadtxt(loc_curr_data, dtype='float32', ndmin=2)
        # NOTE(review): the values are parsed as float32 and then widened to
        # float64; the float64 return dtype is kept so existing callers are not
        # affected, but float32 may be what downstream models expect -- confirm.
        data_breakfast = torch.tensor(curr_data, dtype=torch.float64)

        # Delete background frames (label 0) from both features and labels.
        if self.train and self.deleteSIL:
            mask = labels_breakfast != 0
            data_breakfast = data_breakfast[mask]
            labels_breakfast = labels_breakfast[mask]

        return (data_breakfast, labels_breakfast) if self.train else data_breakfast


## Step 3: instantiate the train/test datasets and run a quick check

In [4]:
# Train split: features + labels, background frames removed (the defaults).
train_dataset = VideoDataset(train=True, deleteSIL=True)
print("len(train_dataset)", len(train_dataset))

# Test split: features only.
test_dataset = VideoDataset(train=False)
print("len(test_dataset)", len(test_dataset))

# Load the first video of each split and report the tensor shapes.
train_video0, label_train_video0 = train_dataset[0]
print(
    "train feature shape:", train_video0.shape,
    "; lable shape:", label_train_video0.shape,
)
test_video0 = test_dataset[0]
print("test feature shape:", test_video0.shape)

len(train_dataset) 1460
len(test_dataset) 252
train feature shape: torch.Size([465, 400]) ; lable shape: torch.Size([465])
test feature shape: torch.Size([832, 400])


## Step 4: define dataloader

In [5]:
# Note: every video has a different number of frames, so the default
# collation raises an error when batch_size > 1.
trainloader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
)
train_batch_video, train_batch_label = next(iter(trainloader))
train_batch_video.shape, train_batch_label.shape

(torch.Size([1, 682, 400]), torch.Size([1, 682]))