In [1]:
from frames import Frames
from audio import Audio
from label_map import label_map

import torch
# import torchvision.transforms as transforms
from torch.utils.data import Dataset

import pandas as pd
from omegaconf import OmegaConf
from pathlib import Path


class OriginalMultimodalDataset(Dataset):
    """
    Load the original (raw) multimodal data.

    Each sample is a dict built from one row of the master data file:

    - ``'title'``: the row's ``title`` value.
    - ``'emotion_name'``: the row's ``emotion`` value.
    - ``'emotion'``: ``label_map[emotion_name]``.
    - ``'scene'``: result of ``Frames(video_id).get_frame_image_by_time()``.
    - ``'subtitle'``: result of ``Frames(video_id).get_frame_subtitle_by_time()``.
    - ``'audio_waveform'`` / ``'audio_sample_rate'``: only when
      ``is_need_audio`` is true, from ``Audio(video_id).load_audio()``.

    Args:
        is_need_audio: Whether each sample should also carry the audio
            waveform and its sample rate.
        path_config_path_str: Path of the YAML file holding the dataset
            paths; it is read with ``OmegaConf.load``.
    """
    def __init__(self, is_need_audio=True, path_config_path_str='../configs/path.yaml'):
        # Load the path configuration.
        self.path_config = OmegaConf.load(path_config_path_str)
        datasets_config = self.path_config['datasets']

        # Base locations of the videos, subtitles and audio files.
        self.base_dir = Path(datasets_config['base_dir'])
        self.base_video_dir = Path(datasets_config['base_video_dir'])
        self.base_subtitle_dir = Path(datasets_config['base_subtitle_dir'])
        self.base_audio_dir = Path(datasets_config['base_audio_dir'])

        # Load the master data file. video_id is forced to str so pandas
        # does not convert id-like values to numbers.
        self.is_need_audio = is_need_audio
        self.all_data = pd.read_json(datasets_config['base_all_data'], dtype={'video_id': str})

    def __len__(self):
        """Return the number of rows in the master data file."""
        return len(self.all_data)

    def __getitem__(self, idx):
        """
        Build the sample stored at position ``idx``.

        The lookup is positional (``.iloc``), so it does not depend on the
        DataFrame's index labels, negative positions work, and an
        out-of-range position raises ``IndexError``. The latter is what
        Python's sequence iteration protocol needs in order to end
        ``for sample in dataset`` cleanly.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
            KeyError: If the row's emotion is missing from ``label_map``.
        """
        record = self.all_data.iloc[idx]
        video_id = record['video_id']
        emotion_name = record['emotion']

        frames_data = self.get_frames_data(video_id)
        result = {
            'title': record['title'],
            'emotion_name': emotion_name,
            'emotion': label_map[emotion_name],
            'scene': frames_data['images'],
            'subtitle': frames_data['subtitles'],
        }
        if self.is_need_audio:
            # dict.update instead of the `|` operator keeps this usable on
            # Python versions older than 3.9.
            result.update(self.get_audio_data(video_id))
        return result

    def get_frames_data(self, video_id):
        """
        Collect the frame images and frame subtitles of one video.

        Returns:
            dict with keys ``'images'`` and ``'subtitles'``.
        """
        frames = Frames(video_id)
        # The return value is not used here; the call is kept in case
        # Frames relies on it having been made -- TODO confirm and remove.
        frames.get_video_info()
        return {
            'images': frames.get_frame_image_by_time(),
            'subtitles': frames.get_frame_subtitle_by_time()
        }

    def get_audio_data(self, video_id):
        """
        Load the audio of one video.

        Returns:
            dict with keys ``'audio_waveform'`` and ``'audio_sample_rate'``.
        """
        waveform, sample_rate = Audio(video_id).load_audio()
        return {
            'audio_waveform': waveform,
            'audio_sample_rate': sample_rate
        }


In [2]:
# Build the dataset with its defaults: audio is included in every sample and
# the path configuration is read from '../configs/path.yaml'.
dataset = OriginalMultimodalDataset()

In [3]:
# Display the whole first sample (title, emotion, scene frames, subtitles and,
# with the default settings, audio). The repr is dominated by raw pixel arrays.
dataset[0]

{'title': '新年快乐，皆得所愿！#你好2023',
 'emotion_name': '盼望',
 'emotion': 7,
 'scene': [array([[[192,  63,  29],
          [191,  62,  28],
          [203,  58,  40],
          ...,
          [191,  59,  26],
          [181,  63,  20],
          [186,  68,  25]],
  
         [[193,  64,  30],
          [190,  61,  27],
          [204,  59,  41],
          ...,
          [191,  59,  26],
          [180,  62,  19],
          [174,  56,  13]],
  
         [[196,  59,  27],
          [199,  62,  30],
          [183,  65,  10],
          ...,
          [189,  61,  18],
          [201,  57,  27],
          [208,  64,  34]],
  
         ...,
  
         [[201,  62,  17],
          [202,  63,  18],
          [181,  68,  24],
          ...,
          [183,  64,  23],
          [188,  60,  35],
          [190,  62,  37]],
  
         [[201,  61,  39],
          [197,  57,  35],
          [206,  60,  32],
          ...,
          [200,  59,  35],
          [197,  57,  31],
          [198,  58,  32]],
  


In [4]:
# Keep the frame images ('scene' entry) of the sample at index 2 for inspection.
scene = dataset[2]['scene']

In [5]:
# Container type used for the frame images of one sample.
type(scene)

list

In [6]:
# Number of frame images in sample 2.
len(scene)

18

In [7]:
# Number of frame images in sample 0, for comparison with sample 2: the count
# differs between samples. Note that this rebuilds the full sample (frames,
# subtitles and audio) just to read one length.
len(dataset[0]['scene'])

8

In [8]:
# Type of a single frame image.
type(scene[0])

numpy.ndarray

In [9]:
# Array shape of the first frame image of sample 2.
scene[0].shape

(1024, 576, 3)

In [10]:
# Show the raw pixel values of the first frame image of sample 2.
scene[0]

array([[[195, 175,  91],
        [195, 175,  91],
        [195, 175,  91],
        ...,
        [178, 195, 226],
        [178, 195, 226],
        [178, 195, 226]],

       [[195, 175,  91],
        [195, 175,  91],
        [195, 175,  91],
        ...,
        [178, 195, 226],
        [178, 195, 226],
        [178, 195, 226]],

       [[198, 173,  90],
        [198, 173,  90],
        [198, 173,  90],
        ...,
        [178, 195, 226],
        [178, 195, 226],
        [178, 195, 226]],

       ...,

       [[213, 193, 207],
        [211, 191, 205],
        [211, 191, 205],
        ...,
        [217, 195, 209],
        [217, 195, 209],
        [218, 196, 210]],

       [[218, 198, 212],
        [218, 198, 212],
        [218, 198, 212],
        ...,
        [203, 181, 195],
        [203, 181, 195],
        [203, 181, 195]],

       [[221, 201, 215],
        [221, 201, 215],
        [222, 202, 216],
        ...,
        [190, 168, 182],
        [191, 169, 183],
        [191, 169, 183]]

In [11]:
from embedding import Captioner

In [12]:
# Create a Captioner with its default settings (class comes from the
# project-local `embedding` module, not shown here).
captioner = Captioner()

In [13]:
# Generate a caption from the first frame image of sample 2.
caption = captioner.generate(scene[0])

In [14]:
# Display the generated caption.
caption

'a woman in a red suit sitting at a desk with a microphone'