## Step 0: directory structure

```bash
program_root_dir/
    video_data_loader.ipynb
    read_datasetBreakfast.py
    test_segment.txt
    training_segment.txt
    
    splits/
        mapping_bf.txt
        train.split1.bundle
        test.split1.bundle
        
    groundTruth/
        P16_cam01_P16_cereals.txt
            ...
        P54_webcam02_P54_tea.txt
        
    data/
        P03_cam01_P03_cereals.gz
            ...
        P54_webcam02_P54_tea.gz
```

In [1]:
import os
import torch
import numpy as np
import os.path 
from read_datasetBreakfast import read_mapping_dict
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import time
import math
import matplotlib.pyplot as plt

## Step 1: define data path

In [2]:
# --- Split bundles: each lists the videos that belong to a partition ---
train_split = "splits/train.split1.bundle"
test_split = "splits/test.split1.bundle"

# --- Per-video inputs (both folder names keep their trailing slash because
#     file names are appended by plain string concatenation) ---
GT_folder = "groundTruth/"   # ground-truth labels for each training video
DATA_folder = "data/"        # frame-level I3D features for all videos

# --- Segment files for the train and test partitions ---
train_segment = "training_segment.txt"
test_segment = "test_segment.txt"

# Number of videos taken from the front of the training split for validation
# (must be smaller than 1460).
validation_videos = 60

# Mapping from string action label to integer label.
mapping_loc = "splits/mapping_bf.txt"
actions_dict = read_mapping_dict(mapping_loc)

## Step 2: define dataset (only load a video when you need it)

In [3]:
class VideoDataset(Dataset):
    """Dataset that loads one video (features, labels, segments) on demand.

    Only the list of video file names and the per-video segment tensors are
    read at construction time; frame features and labels are read from disk
    inside ``__getitem__``.

    The class relies on the notebook-level settings ``train_split``,
    ``test_split``, ``train_segment``, ``test_segment``, ``GT_folder``,
    ``DATA_folder``, ``actions_dict`` and ``validation_videos``.
    """

    _VALID_TYPES = ("train", "validation", "test")

    def __init__(self, type="train",  deleteSIL=False):
        """
            type: "train", "validation" or "test". "validation" is the first
                `validation_videos` entries of the training split and "train"
                is the remainder of that split.
            deleteSIL: only works for train or validation videos: delete
                background frames (label 0) based on the labels.

            Raises ValueError if `type` is not one of the three names above.
        """
        if type not in self._VALID_TYPES:
            raise ValueError(
                "type must be one of %s, got %r" % (", ".join(self._VALID_TYPES), type)
            )

        self.type = type
        self.deleteSIL = deleteSIL
        self.videofiles = []
        self.videosegments = []
        self.split_file = test_split if type == "test" else train_split
        self.segment_file = test_segment if type == "test" else train_segment

        with open(self.split_file, 'r') as file:
            # The first line of the bundle is skipped (presumably a header).
            # splitlines() keeps the last entry even when the file does not
            # end with a newline.
            entries = file.read().splitlines()[1:]
        # Keep only the file name of each listed path. basename() is used
        # instead of str.strip(), which removes a *set of characters* and
        # would mangle names that begin or end with any of them.
        self.videofiles = [
            os.path.basename(entry.strip()) for entry in entries if entry.strip()
        ]

        # Read the segments: one line of whitespace-separated integers per
        # video, in the same order as the split file. Empty lines are kept so
        # that line i still belongs to video i.
        with open(self.segment_file, 'r') as file:
            segment_lines = file.read().splitlines()
        self.videosegments = [
            torch.tensor(list(map(int, line.split()))) for line in segment_lines
        ]

        # Validation takes the head of the training split, training the rest.
        if type == "train":
            self.videofiles = self.videofiles[validation_videos:]
            self.videosegments = self.videosegments[validation_videos:]
        elif type == "validation":
            self.videofiles = self.videofiles[:validation_videos]
            self.videosegments = self.videosegments[:validation_videos]

    def __len__(self):
        """Return the number of videos in this partition."""
        return len(self.videofiles)

    def __getitem__(self, index):
        """
            :If train or validation, return a tuple
                (video_feature_tensor, label_tensor, video_segments) where the
                feature tensor has one row per frame (N rows), the label
                tensor has shape [N] and video_segments has shape [S].
                N is the number of frames in the video, S is the number of
                segment boundaries.
            :If test, only return (video_feature_tensor, video_segments).

            NOTE(review): when deleteSIL removes frames, the segment tensor is
            returned unchanged, so its indices presumably still refer to the
            original frame numbering - confirm before combining the two.
        """
        content = self.videofiles[index]

        labels_breakfast = None

        # For train or validation videos, load the per-frame labels and
        # convert each text label to its integer id via actions_dict.
        if self.type != "test":
            with open(GT_folder + content, 'r') as label_file:
                # splitlines() keeps the last frame's label even when the
                # file has no trailing newline.
                frame_labels = label_file.read().splitlines()
            labels_breakfast = torch.tensor(
                [actions_dict[frame_label] for frame_label in frame_labels]
            )

        # Read the frame features of this video (same base name, '.gz').
        loc_curr_data = DATA_folder + os.path.splitext(content)[0] + '.gz'
        curr_data = np.loadtxt(loc_curr_data, dtype='float32')
        # Features are parsed as float32 and then widened to float64; the
        # output dtype is kept because downstream code may depend on it.
        data_breakfast = torch.tensor(curr_data, dtype=torch.float64)

        # Delete background frames, i.e. frames whose integer label is 0.
        if self.type != "test" and self.deleteSIL:
            mask = labels_breakfast != 0
            data_breakfast = data_breakfast[mask]
            labels_breakfast = labels_breakfast[mask]

        if self.type != "test":
            return (data_breakfast, labels_breakfast, self.videosegments[index])
        return (data_breakfast, self.videosegments[index])


## Step 3: instantiate the datasets and test them

In [4]:
# Build one dataset per partition, then report how many videos each holds.
train_dataset = VideoDataset(type="train")
valid_dataset = VideoDataset(type="validation")
test_dataset = VideoDataset(type="test")
for caption, dataset in (
    ("len(train_dataset)", train_dataset),
    ("len(valid_dataset)", valid_dataset),
    ("len(test_dataset)", test_dataset),
):
    print(caption, len(dataset))

# Load the first video of each partition and show the tensor shapes.
# Train/validation items carry labels; test items do not.
train_video0, label_train_video0, train_seg = train_dataset[0]
valid_video0, label_valid_video0, valid_seg = valid_dataset[0]
test_video0, test_seg = test_dataset[0]

print("train feature shape:", train_video0.shape, "; lable shape:", label_train_video0.shape)
print("validation feature shape:", valid_video0.shape, "; lable shape:", label_valid_video0.shape)
print("test feature shape:", test_video0.shape)

len(train_dataset) 1400
len(valid_dataset) 60
len(test_dataset) 252
train feature shape: torch.Size([7910, 400]) ; lable shape: torch.Size([7910])
validation feature shape: torch.Size([544, 400]) ; lable shape: torch.Size([544])
test feature shape: torch.Size([832, 400])


In [5]:
# Check the segment information of the first validation video: for every
# segment index, print the labels of the frame before it, the frame at it
# and the frame after it.
print(valid_seg, valid_seg.shape)
for boundary in valid_seg:
    print(boundary)
    before, at, after = (label_valid_video0[boundary + offset] for offset in (-1, 0, 1))
    print(before, at, after)

tensor([  9, 269, 474]) torch.Size([3])
tensor(9)
tensor(0) tensor(1) tensor(1)
tensor(269)
tensor(1) tensor(2) tensor(2)
tensor(474)
tensor(2) tensor(0) tensor(0)


## Step 4: define dataloader

In [7]:
# Note: every video has a different number of frames, so batch_size > 1
# raises an error.
trainloader = DataLoader(dataset=train_dataset, batch_size=1, shuffle=True)
first_batch = next(iter(trainloader))
train_batch_video, train_batch_label, train_seg = first_batch
train_batch_video.shape, train_batch_label.shape, train_seg.shape

(torch.Size([1, 731, 400]), torch.Size([1, 731]), torch.Size([1, 4]))