In [1]:
from frames import Frames
from audio import Audio
from label_map import label_map

import torch
# import torchvision.transforms as transforms
# import torchaudio
from torch.utils.data import Dataset

import pandas as pd
from omegaconf import OmegaConf
from pathlib import Path


class MultimodalDataset(Dataset):
    """
    Dataset that loads raw multimodal samples.

    Each item is a dict with the keys:
        'title'         -- value of the 'title' column of the master table
        'emotion_name'  -- value of the 'emotion' column (the label name)
        'emotion'       -- ``label_map[emotion_name]``
        'scene'         -- result of ``Frames.get_frame_image_by_time()``
        'subtitle'      -- result of ``Frames.get_frame_subtitle_by_time()``
    and, when ``is_need_audio`` is true, additionally:
        'audio_waveform', 'audio_sample_rate' -- the pair returned by
        ``Audio.load_audio()``.

    Args:
        is_need_audio: whether ``__getitem__`` should also load the audio track.
        path_config_path_str: path of the YAML file holding the dataset paths
            (read with ``OmegaConf.load``).
    """
    def __init__(self, is_need_audio=True, path_config_path_str='../configs/path.yaml'):
        # Load the path configuration.
        self.path_config = OmegaConf.load(path_config_path_str)
        datasets_cfg = self.path_config['datasets']

        # Base directories for the dataset, videos, subtitles and audio.
        self.base_dir = Path(datasets_cfg['base_dir'])
        self.base_video_dir = Path(datasets_cfg['base_video_dir'])
        self.base_subtitle_dir = Path(datasets_cfg['base_subtitle_dir'])
        self.base_audio_dir = Path(datasets_cfg['base_audio_dir'])

        # Load the master table. 'video_id' is forced to str so pandas does not
        # convert it to a number.
        self.is_need_audio = is_need_audio
        self.all_data = pd.read_json(datasets_cfg['base_all_data'], dtype={'video_id': str})

    def __len__(self):
        """Return the number of rows in the master table."""
        return len(self.all_data)

    def __getitem__(self, idx):
        """
        Build the sample stored at position ``idx`` of the master table.

        The lookup is positional (``iloc``), so it does not depend on the
        DataFrame's index labels being 0..n-1, and an out-of-range position
        raises ``IndexError``. That is the exception Python's ``__getitem__``
        based iteration relies on to stop, so ``for sample in dataset`` ends
        cleanly.
        """
        row = self.all_data.iloc[idx]
        video_id = row['video_id']
        emotion_name = row['emotion']

        frames_data = self.get_frames_data(video_id)
        result = {
            'title': row['title'],
            'emotion_name': emotion_name,
            'emotion': label_map[emotion_name],
            'scene': frames_data['images'],
            'subtitle': frames_data['subtitles'],
        }
        if self.is_need_audio:
            # Merge the audio keys into the sample dict.
            result.update(self.get_audio_data(video_id))
        return result

    def get_frames_data(self, video_id):
        """
        Load the frame images and frame subtitles of one video.

        Returns:
            dict with 'images' and 'subtitles', holding the results of
            ``Frames.get_frame_image_by_time()`` and
            ``Frames.get_frame_subtitle_by_time()`` respectively.
        """
        frames = Frames(video_id)
        # The return value is not used here; the call is kept in case
        # Frames relies on it to initialise internal state -- TODO confirm.
        frames.get_video_info()
        frames_image = frames.get_frame_image_by_time()
        frames_subtitle = frames.get_frame_subtitle_by_time()
        return {
            'images': frames_image,
            'subtitles': frames_subtitle
        }

    def get_audio_data(self, video_id):
        """
        Load the audio of one video.

        Returns:
            dict with 'audio_waveform' and 'audio_sample_rate', the two values
            returned by ``Audio.load_audio()``.
        """
        audio = Audio(video_id)
        waveform, sample_rate = audio.load_audio()
        return {
            'audio_waveform': waveform,
            'audio_sample_rate': sample_rate
        }



In [2]:
# Build the dataset with its defaults: audio loading enabled and paths read from '../configs/path.yaml'.
dataset = MultimodalDataset()

In [3]:
# Show the emotion-name -> integer mapping that the dataset uses for the 'emotion' field.
print(label_map)

{'喜悦': 0, '信任': 1, '害怕': 2, '惊讶': 3, '难过': 4, '厌恶': 5, '生气': 6, '盼望': 7}


In [6]:
# Fetch one sample to inspect the returned dict. NOTE: this displays the full frame arrays, so the output is large.
dataset[2]

{'title': '美导弹驱逐舰过航台湾海峡，东部战区：一切动向尽在掌握。',
 'emotion_name': '生气',
 'emotion': 6,
 'scene': [array([[[195, 175,  91],
          [195, 175,  91],
          [195, 175,  91],
          ...,
          [178, 195, 226],
          [178, 195, 226],
          [178, 195, 226]],
  
         [[195, 175,  91],
          [195, 175,  91],
          [195, 175,  91],
          ...,
          [178, 195, 226],
          [178, 195, 226],
          [178, 195, 226]],
  
         [[198, 173,  90],
          [198, 173,  90],
          [198, 173,  90],
          ...,
          [178, 195, 226],
          [178, 195, 226],
          [178, 195, 226]],
  
         ...,
  
         [[213, 193, 207],
          [211, 191, 205],
          [211, 191, 205],
          ...,
          [217, 195, 209],
          [217, 195, 209],
          [218, 196, 210]],
  
         [[218, 198, 212],
          [218, 198, 212],
          [218, 198, 212],
          ...,
          [203, 181, 195],
          [203, 181, 195],
          [203, 181, 