In [22]:
from frames import Frames
from audio import Audio
from label_map import label_map

import torch
# import torchvision.transforms as transforms
from torch.utils.data import Dataset

import pandas as pd
from omegaconf import OmegaConf
from pathlib import Path


class OriginalMultimodalDataset(Dataset):
    """Load the raw multimodal data (frames, subtitles and optionally audio).

    Each item is a dict with the keys ``title``, ``emotion_name``,
    ``emotion``, ``scenes`` and ``subtitles``; when ``is_need_audio`` is true
    it additionally carries ``audio_waveform`` and ``audio_sample_rate``.

    Args:
        is_need_audio: Whether ``__getitem__`` should also load the audio.
        path_config_path_str: Path of the YAML file describing dataset paths.
    """

    def __init__(self, is_need_audio=True, path_config_path_str='../configs/path.yaml'):
        # Load the path configuration.
        self.path_config = OmegaConf.load(path_config_path_str)
        datasets_config = self.path_config['datasets']

        # Base directories for the videos, subtitles and audio files.
        self.base_dir = Path(datasets_config['base_dir'])
        self.base_video_dir = Path(datasets_config['base_video_dir'])
        self.base_subtitle_dir = Path(datasets_config['base_subtitle_dir'])
        self.base_audio_dir = Path(datasets_config['base_audio_dir'])

        # Load the master index file; video ids are kept as strings so they
        # are not reinterpreted as numbers by pandas.
        self.is_need_audio = is_need_audio
        self.all_data = pd.read_json(datasets_config['base_all_data'], dtype={'video_id': str})

    def __len__(self):
        """Return the number of rows in the master index."""
        return len(self.all_data)

    def _value_at(self, idx, column):
        """Return the value of ``column`` at *position* ``idx``.

        Positional (``iloc``) access is used on purpose: ``__len__`` and
        samplers hand out positions 0..len-1, which only coincide with index
        labels when the DataFrame has a default RangeIndex.  It also raises
        ``IndexError`` when ``idx`` is out of range, which is what Python's
        sequence iteration protocol relies on to stop ``for x in dataset``.
        """
        return self.all_data[column].iloc[idx]

    def __getitem__(self, idx):
        """Build the sample dict for the row at position ``idx``.

        Raises:
            IndexError: If ``idx`` is outside ``range(len(self))``.
            KeyError: If the row's emotion is missing from ``label_map``.
        """
        video_id = self._value_at(idx, 'video_id')
        emotion_name = self._value_at(idx, 'emotion')

        frames_data = self.get_frames_data(video_id)
        result = {
            'title': self._value_at(idx, 'title'),
            'emotion_name': emotion_name,
            'emotion': label_map[emotion_name],
            'scenes': frames_data['images'],
            'subtitles': frames_data['subtitles'],
        }
        if self.is_need_audio:
            result.update(self.get_audio_data(video_id))
        return result

    def get_frames_data(self, video_id):
        """Return the frame images and frame subtitles of ``video_id``.

        Returns:
            A dict with the keys ``images`` and ``subtitles``.
        """
        frames = Frames(video_id)
        # The return value is not used here; the call is kept because it may
        # initialise state the following calls rely on — TODO confirm and
        # drop it if `Frames.get_video_info` is side-effect free.
        frames.get_video_info()
        return {
            'images': frames.get_frame_image_by_time(),
            'subtitles': frames.get_frame_subtitle_by_time(),
        }

    def get_audio_data(self, video_id):
        """Return the audio waveform and sample rate of ``video_id``.

        Returns:
            A dict with the keys ``audio_waveform`` and ``audio_sample_rate``.
        """
        waveform, sample_rate = Audio(video_id).load_audio()
        return {
            'audio_waveform': waveform,
            'audio_sample_rate': sample_rate,
        }


In [23]:
# Build the dataset with its defaults: audio loading enabled and the path
# configuration read from '../configs/path.yaml'.
dataset = OriginalMultimodalDataset()

In [24]:
# Display the full sample dict at index 2.
dataset[2]

In [25]:
# NOTE: every `dataset[...]` access re-runs `__getitem__`, i.e. frames (and,
# with the default settings, audio) are loaded again for this cell.
scenes = dataset[2]['scenes']

In [26]:
# Inspect the container type returned for the 'scenes' entry.
type(scenes)

In [27]:
# Number of scene entries for sample 2.
len(scenes)

In [28]:
# Number of scene entries for sample 0, for comparison with sample 2.
len(dataset[0]['scenes'])

In [29]:
# Inspect the type of a single scene entry.
type(scenes[0])

In [30]:
# The scene entry exposes `.shape` — presumably an array/tensor; TODO confirm.
scenes[0].shape

In [31]:
# Display the first scene entry itself.
scenes[0]

In [32]:
# Captioning experiment, currently disabled (cells 32-35).
# from embedding import Captioner

In [33]:
# captioner = Captioner()

In [34]:
# NOTE(review): `scene` is never defined in this session; this probably
# should read `scenes[0]` if the cell is re-enabled.
# caption = captioner.generate(scene[0])

In [35]:
# caption

In [36]:
# Face extraction experiment.
from embedding import FaceExtractor

In [37]:
face_extractor = FaceExtractor()

In [38]:
# Run face extraction on the first scene entry of sample 2.
faces = face_extractor.extract_face(scenes[0])

In [39]:
# Display the first element of the extraction result.
faces[0]

In [40]:
len(faces)

In [55]:
# NOTE: execution count 55 — this cell was run later than its neighbours.
# `faces` is indexed two levels deep; the exact layout of the result is not
# visible here.
faces[0][0]

In [41]:
# `.size` is read as an attribute — presumably a PIL image; TODO confirm.
faces[0][0].size

In [42]:
# Image embedding experiment.
from embedding import ImageEncoder

In [43]:
image_encoder = ImageEncoder()

In [44]:
# Encode the whole first scene entry.
image_embedding = image_encoder.encode(scenes[0])

In [45]:
image_embedding

In [49]:
# Dump the attributes of the encoder output object.
vars(image_embedding)

In [51]:
# The output exposes `last_hidden_state` and `pooler_output` attributes.
image_embedding.last_hidden_state.shape

In [52]:
image_embedding.pooler_output.shape

In [46]:
# Encode one extracted face with the same encoder as the full scene.
face_embedding = image_encoder.encode(faces[0][0])

In [47]:
face_embedding

In [53]:
# Compare the face embedding shapes with the scene embedding shapes above.
face_embedding.last_hidden_state.shape

In [54]:
face_embedding.pooler_output.shape