In [1]:
import pandas as pd
from pathlib import Path
from omegaconf import OmegaConf, DictConfig

In [2]:
from moviepy.editor import VideoFileClip
import pysrt
from datetime import timedelta

In [3]:
def get_video_info(video_file):
    """Return basic metadata for a video file.

    Args:
        video_file: Path of the video, passed straight to ``VideoFileClip``.

    Returns:
        tuple: ``(duration, fps, resolution)`` taken from the clip's
        ``duration`` (seconds), ``fps`` (frame rate) and ``size`` attributes.
    """
    clip = VideoFileClip(video_file)
    try:
        # Read all attributes while the clip is still open.
        return clip.duration, clip.fps, clip.size
    finally:
        # Always release the clip; without this every call (e.g. once per
        # file when scanning a directory) leaves a reader open.
        clip.close()

In [4]:
# Load the demo path configuration once; every path below is read from it.
demo_data_path = OmegaConf.load('../configs/demo_data_path.yaml')

# Dataset-level locations for each modality.
video_path_str, audio_path_str, subtitle_path_str = (
    demo_data_path[key]
    for key in ('video_path_str', 'audio_path_str', 'subtitle_path_str')
)

# One example file per modality, used by the exploratory cells.
video_eg_path_str, subtitle_eg_path_str, audio_eg_path_str = (
    demo_data_path[key]
    for key in ('video_eg_path_str', 'subtitle_eg_path_str', 'audio_eg_path_str')
)

In [8]:
# Show the loaded configuration.
# NOTE(review): the saved output contains machine-specific absolute paths;
# consider clearing it before sharing the notebook.
print(demo_data_path)

{'video_path_str': 'D:\\dcmt\\code\\python\\playground\\sample_videos\\renminribao', 'subtitle_path_str': 'D:\\dcmt\\code\\python\\playground\\sample_videos\\subtitle\\renminribao', 'audio_path_str': 'D:\\dcmt\\dataset\\news_emotion\\audio', 'video_eg_path_str': 'D:\\dcmt\\code\\python\\playground\\sample_videos\\renminribao\\7183343454993485115.mp4', 'subtitle_eg_path_str': 'D:\\dcmt\\code\\python\\playground\\sample_videos\\subtitle\\renminribao\\7183343454993485115.srt', 'audio_eg_path_str': 'D:\\dcmt\\dataset\\news_emotion\\audio\\7183343024657747258.wav'}

In [7]:
# Display the configured example subtitle path.
subtitle_eg_path_str

'D:\\dcmt\\code\\python\\playground\\sample_videos\\subtitle\\renminribao\\7183343454993485115.srt'

In [6]:
def print_fps(video_path_str):
    """Print the name and video info of every ``*.mp4`` file in a directory.

    For each matching file this prints the file name followed by the value
    returned by ``get_video_info`` (the whole result, not only the fps).

    Args:
        video_path_str: Directory to scan (non-recursive) for ``*.mp4`` files.
    """
    video_dir = Path(video_path_str)
    # glob() yields entries in filesystem order, which differs between
    # platforms; sort so the printed listing is reproducible.
    for video_file in sorted(video_dir.glob("*.mp4")):
        print(video_file.name)
        print(get_video_info(str(video_file)))

# List info for every mp4 under the configured video directory.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, i.e.
# the run was interrupted before all files were processed.
print_fps(video_path_str)

7183343454993485115.mp4
(7.89, 25.0, [576, 1024])
7185030947190967555.mp4
(18.37, 25.0, [540, 960])
7185400788041108796.mp4
(17.13, 25.0, [576, 1024])
7186896342118124855.mp4
(20.65, 25.0, [576, 1024])
7187017932747148581.mp4
(7.57, 25.0, [576, 1024])
7187268299481255224.mp4


KeyboardInterrupt: 

In [None]:
# Parse the example subtitle file with pysrt; the following cells iterate over `subs`.
subs = pysrt.open(subtitle_eg_path_str)

In [None]:
# Print every subtitle entry of the example file.
for sub in subs:
    print(sub)

In [None]:
# Inspect only the first subtitle entry: its printed form, then its
# start, end and text attributes.
for sub in subs:
    print(sub)
    print(sub.start)
    print(sub.end)
    print(sub.text)
    break  # stop after the first entry

In [None]:
def get_subtitle_at_time(srt_file, target_time):
    """Look up the subtitle text that is active at a given moment.

    Args:
        srt_file: Path of the subtitle file, opened with ``pysrt.open``.
        target_time: Moment to look up; it is compared directly against each
            entry's ``start`` and ``end``.

    Returns:
        The ``text`` of the first entry whose ``[start, end]`` range
        (both ends inclusive) contains ``target_time``, or ``None`` when no
        entry matches.
    """
    matching_texts = (
        entry.text
        for entry in pysrt.open(srt_file)
        if entry.start <= target_time <= entry.end
    )
    # First hit wins; fall back to None when nothing covers the moment.
    return next(matching_texts, None)

In [None]:
# Example lookup: the subtitle active 5 seconds into the example file.
get_subtitle_at_time(subtitle_eg_path_str, pysrt.SubRipTime(seconds=5))

In [None]:
def extract_frames(video_path, timestamps):
    """Grab one frame per timestamp from a video.

    Args:
        video_path: Path of the video, passed straight to ``VideoFileClip``.
        timestamps: Iterable of positions handed one by one to
            ``clip.get_frame``.

    Returns:
        list: The frames (numpy arrays, as returned by ``get_frame``) in the
        same order as ``timestamps``; an empty list for no timestamps.
    """
    clip = VideoFileClip(video_path)
    try:
        # Materialize all frames before the clip is closed.
        return [clip.get_frame(timestamp) for timestamp in timestamps]
    finally:
        # Always release the clip, even if get_frame raises; the original
        # left it open on every call.
        clip.close()

In [None]:
# Sample frames from the example video at timestamps 1, 2 and 3
# (presumably seconds — confirm against moviepy's get_frame).
eg_frames = extract_frames(video_eg_path_str, [1, 2, 3])

In [None]:
# Shape of the first extracted frame.
eg_frames[0].shape

In [None]:
def get_subtitles_at_times(srt_file, target_times):
    """Look up the active subtitle text for several moments at once.

    The subtitle file is opened a single time and then searched once per
    requested moment.

    Args:
        srt_file: Path of the subtitle file, opened with ``pysrt.open``.
        target_times: Iterable of moments; each is compared directly against
            every entry's ``start`` and ``end``.

    Returns:
        list: One item per moment, in order — the ``text`` of the first entry
        whose ``[start, end]`` range (inclusive) contains it, or ``None``
        when no entry matches.
    """
    entries = pysrt.open(srt_file)
    return [
        next(
            (entry.text for entry in entries if entry.start <= moment <= entry.end),
            None,
        )
        for moment in target_times
    ]


In [None]:
# Example batch lookup: subtitles active at 5 s and 10 s of the example file.
get_subtitles_at_times(subtitle_eg_path_str, [pysrt.SubRipTime(seconds=5), pysrt.SubRipTime(seconds=10)])

In [None]:
import torchaudio

In [None]:
# Load the example audio from the path configured in demo_data_path.yaml
# (audio_eg_path_str) instead of repeating a machine-specific absolute path.
waveform, sample_rate = torchaudio.load(audio_eg_path_str, format="wav")

In [None]:
# Sample rate returned by torchaudio.load above.
sample_rate

In [None]:
# Waveform returned by torchaudio.load above.
waveform

In [None]:
from embedding import AudioEncoder

In [None]:
# Instantiate the AudioEncoder imported from `embedding`.
audio_encoder = AudioEncoder()

In [None]:
from audio import Audio

In [None]:
# Build an Audio object for '7184773290962013477'
# (presumably an audio/video id — verify against audio.Audio).
audio = Audio('7184773290962013477')

In [None]:
# Display what Audio.load_audio() returns.
audio.load_audio()

In [None]:
# Encode the audio; load_audio() is called again here rather than reusing
# the result displayed in the previous cell.
outputs = audio_encoder.encode(audio.load_audio())

In [None]:
# Inspect the raw encoder output.
outputs

In [None]:
# NOTE(review): later cells read outputs.last_hidden_state and
# outputs.extract_features, which suggests `outputs` is a structured result
# rather than a single array — confirm that `.shape` exists on it; this cell
# may raise on a fresh run.
outputs.shape

In [None]:
# Shape of the audio encoder's last_hidden_state.
outputs.last_hidden_state.shape

In [None]:
# Values of the audio encoder's last_hidden_state.
outputs.last_hidden_state

In [None]:
# Shape of the audio encoder's extract_features output.
outputs.extract_features.shape

In [None]:
from embedding import TextEncoder

In [None]:
# Instantiate the TextEncoder imported from `embedding`.
text_encoder = TextEncoder()

In [None]:
# Encode a sample sentence that mixes Chinese and English text.
text_outputs = text_encoder.encode("喂喂喂，你是谁？ How are you?今天是星期五")

In [None]:
# Inspect the raw text encoder output.
text_outputs

In [None]:
# Shape of the text encoder's last_hidden_state.
text_outputs.last_hidden_state.shape

In [None]:
# Shape of the text encoder's pooler_output.
text_outputs.pooler_output.shape