<a href="https://colab.research.google.com/github/zman2013/tts/blob/main/tts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Step 1: Install all required dependencies
!pip install pytube requests git+https://github.com/openai/whisper.git transformers torch torchaudio unidecode
!apt-get install espeak
!pip install git+https://github.com/pytorch/audio.git

# Clone MeloTTS repository (not executable in Python cell, run in a separate shell cell)
!git clone https://github.com/myshell-ai/MeloTTS.git
%cd /content/MeloTTS
!pip install -e .
!python -m unidic download

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-14_dorhx
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-14_dorhx
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting tiktoken (from openai-whisper==20231117)
  Downloading tiktoken-0.6.0-cp31

In [4]:
# Import necessary libraries
import os
import subprocess
from pytube import YouTube
import whisper
from IPython.display import Audio

def download_youtube_video(url, save_path='./'):
    """Download a YouTube video with pytube and return the path of the saved file.

    The stream is the one selected by pytube's ``get_highest_resolution()``.

    Args:
        url: URL of the YouTube video.
        save_path: Directory the file is written to; created if it does not exist.

    Returns:
        The file path returned by the stream's ``download`` call.

    Raises:
        ValueError: If pytube finds no stream to download for ``url``.
    """
    yt = YouTube(url)
    video = yt.streams.get_highest_resolution()
    if video is None:
        # Without this guard the failure would surface as an opaque AttributeError on None.
        raise ValueError(f"No downloadable stream found for: {url}")
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(save_path, exist_ok=True)
    video_file_path = video.download(save_path)
    print(f"Video saved to: {video_file_path}")
    return video_file_path

def extract_audio_from_video(video_file_path, audio_format="mp3"):
    """Extract the audio track of a video into a sibling file using ffmpeg.

    The output path is the input path with its extension replaced by ``audio_format``.

    Args:
        video_file_path: Path of the input video.
        audio_format: Extension (and therefore container/codec) of the output file.

    Returns:
        The path of the extracted audio file.

    Raises:
        RuntimeError: If ffmpeg exits with a non-zero status; the message carries the
            tail of ffmpeg's stderr.
    """
    base, _ = os.path.splitext(video_file_path)
    audio_file_path = f"{base}.{audio_format}"
    # -y (overwrite without asking) is a global option, so it is placed before the input.
    command = ['ffmpeg', '-y', '-i', video_file_path, '-q:a', '0', '-map', 'a', audio_file_path]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        # Fail loudly here; otherwise the missing file only shows up later, in transcription.
        stderr_tail = (result.stderr or b"").decode("utf-8", errors="replace")[-2000:]
        raise RuntimeError(
            f"ffmpeg failed with exit code {result.returncode} for {video_file_path!r}:\n{stderr_tail}"
        )
    print(f"Audio extracted to: {audio_file_path}")
    return audio_file_path

def audio_to_text_whisper_with_timestamps(audio_file_path):
    """Transcribe an audio file with Whisper's "small" model.

    Returns a single string holding one newline-terminated line per transcription
    segment, formatted as ``<start>-<end>: <text>`` with both timestamps printed
    to one decimal place.
    """
    transcription = whisper.load_model("small").transcribe(audio_file_path)

    # One formatted line per segment, joined without any extra separator.
    return "".join(
        f"{piece.get('start'):.1f}-{piece.get('end'):.1f}: {piece.get('text')}\n"
        for piece in transcription.get("segments", [])
    )

# Example usage: download the video, extract its audio track, then transcribe it.
# `video_file_path`, `audio_file_path` and `transcribed_text` are module-level names
# that cells further down read.
url = "https://www.youtube.com/watch?v=5OSP5DNAozU"
video_file_path = download_youtube_video(url)
audio_file_path = extract_audio_from_video(video_file_path)
transcribed_text = audio_to_text_whisper_with_timestamps(audio_file_path)
print(f'transcribed_text: {transcribed_text}')

def merge_sentences(input_text):
    """Merge consecutive timestamped lines until a sentence terminator is reached.

    Each input line is expected to look like ``<start>-<end>: <text>``. Lines whose
    text does not end a sentence are joined with the following lines; the merged
    line spans from the first fragment's start time to the last fragment's end time.

    Blank lines, lines without a ``": "`` separator, lines whose time range is not
    exactly ``<start>-<end>``, and lines with empty text are skipped. Text is
    stripped of surrounding whitespace, so fragments are joined by a single space.

    Args:
        input_text: Newline-separated timestamped lines.

    Returns:
        The merged lines joined by newlines (an empty string if nothing was usable).
    """
    # ASCII terminators plus their full-width counterparts used in CJK text.
    terminators = ('.', '?', '!', '。', '？', '！')

    merged_lines = []
    pending_parts = []  # text fragments of the sentence currently being assembled
    start_time = ""
    end_time = ""

    for line in input_text.strip().split("\n"):
        if not line.strip():
            continue  # skip blank lines
        parts = line.split(": ", 1)
        if len(parts) != 2:
            continue  # ignore lines without a "<time>: <text>" shape
        time_range, sentence = parts
        bounds = time_range.split('-')
        if len(bounds) != 2:
            continue  # ignore lines whose time range is malformed
        start, end = bounds[0].strip(), bounds[1].strip()

        # Strip so that leading/trailing spaces neither hide a terminator from
        # endswith() nor produce doubled spaces in the merged text.
        sentence = sentence.strip()
        if not sentence:
            continue

        if not pending_parts:
            start_time = start  # first fragment fixes the start of the merged span
        pending_parts.append(sentence)
        end_time = end  # the latest fragment always defines the end of the span

        if sentence.endswith(terminators):
            merged_text = " ".join(pending_parts)
            merged_lines.append(f"{start_time}-{end_time}: {merged_text}")
            pending_parts = []

    # Flush a trailing fragment that never reached a terminator.
    if pending_parts:
        merged_text = " ".join(pending_parts)
        merged_lines.append(f"{start_time}-{end_time}: {merged_text}")

    return "\n".join(merged_lines)

# Merge the per-segment transcript into sentence-level lines and print the result.
merged_transcribed_text = merge_sentences(transcribed_text)
print(f'merged transcribed_text: {merged_transcribed_text}')

Video saved to: /content/MeloTTS/./Building domain-specific compilers quickly with MLIR compiler infrastructure  Chris Lattner.mp4
Audio extracted to: /content/MeloTTS/./Building domain-specific compilers quickly with MLIR compiler infrastructure  Chris Lattner.mp3
transcribed_text: 0.0-5.4:  We built this new compiler framework called MLIR.
5.4-5.9:  Yes.
5.9-8.4:  MLIR is a whole new framework.
8.4-11.4:  It's not many people think it's about machine learning.
11.4-14.6:  The ML stands for multi-level because compiler people
14.6-16.1:  can't name things very well, I guess.
16.1-19.5:  Can we dig into what MLIR is?
19.5-22.1:  Yeah. So when you look at compilers,
22.1-26.0:  compilers have historically been solutions for a given space.
26.0-31.8:  So LLVM is really good for dealing with CPUs, let's just say,
31.8-32.5:  at a high level.
32.5-34.4:  You look at Java.
34.4-36.0:  Java has a JVM.
36.0-38.6:  The JVM is very good for garbage collected languages
38.6-39.9:  that need dyna

In [5]:
import os
import torch
from pydub import AudioSegment
from IPython.display import Audio
import hashlib
from melo.api import TTS
from pydub import AudioSegment


# Hypothetical text content.
# This rebinds `transcribed_text` to a fixed sample; each line has the form
# "<start>-<end>: <text>", which is the shape parse_text() below splits on.
transcribed_text = """
0.0-5.4: 我们开发了一种全新的编译器框架,称为 MLIR(Multi-Level Intermediate Representation)。
5.4-5.9: 是的。
5.9-8.4: MLIR 是一个全新的编译器基础架构。
8.4-11.4: 这与很多人误解它与机器学习有关不同。
11.4-16.1: 其中 ML 代表多级(Multi-Level),因为编译器人员在命名方面可能不太擅长。
16.1-19.5: 能否深入解释一下 MLIR 是什么?
19.5-26.0: 好的。对于编译器而言, 传统上它们都是为特定领域量身定制的解决方案。
"""

# Loaded MeloTTS models keyed by device string, so the checkpoint is loaded once per
# kernel session instead of once per synthesized segment.
_MELO_TTS_MODELS = {}


def _get_melotts_model(device):
    """Return the cached Chinese MeloTTS model for `device`, loading it on first use."""
    if device not in _MELO_TTS_MODELS:
        _MELO_TTS_MODELS[device] = TTS(language='ZH', device=device)
    return _MELO_TTS_MODELS[device]


def text_to_speech_melotts(text, audio_file_path, target_duration, duration_tolerance=0.05):
    """Synthesize `text` with MeloTTS (Chinese) and try to match a target duration.

    The text is first synthesized at speed 1.0. If the resulting duration differs
    from `target_duration` by more than `duration_tolerance`, it is synthesized a
    second time with speed scaled by ``generated_duration / target_duration``.
    The second pass is a single correction, not an iterative fit.
    NOTE(review): this assumes duration scales inversely with `speed` - confirm
    against MeloTTS.

    Args:
        text: Text to speak.
        audio_file_path: Output file path; overwritten by the second pass if one runs.
        target_duration: Desired duration in seconds. If it is None or not positive,
            the speed-1.0 audio is kept as is.
        duration_tolerance: Allowed absolute difference in seconds before the text
            is re-synthesized.

    Returns:
        `audio_file_path`.
    """
    initial_speed = 1.0  # speed used for the first pass
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    model = _get_melotts_model(device)
    speaker_id = model.hps.data.spk2id['ZH']

    # First pass at the initial speed.
    model.tts_to_file(text, speaker_id, audio_file_path, speed=initial_speed)

    # A missing or non-positive target cannot be matched and would otherwise
    # divide by zero or produce a negative speed.
    if target_duration is None or target_duration <= 0:
        return audio_file_path

    # len() of an AudioSegment is in milliseconds; convert to seconds.
    generated_duration = len(AudioSegment.from_file(audio_file_path)) / 1000.0

    # Compare with a tolerance: exact float equality is practically never true and
    # would force a second synthesis for every segment.
    needs_adjustment = abs(generated_duration - target_duration) > duration_tolerance
    if needs_adjustment and generated_duration > 0:
        adjusted_speed = initial_speed * (generated_duration / target_duration)
        # Second pass with the corrected speed to approach the target duration.
        model.tts_to_file(text, speaker_id, audio_file_path, speed=adjusted_speed)

    return audio_file_path

# Step 1: parse the text
def parse_text(text):
    """Parse timestamped lines into ``(begin_time, end_time, content)`` tuples.

    Each non-blank line must look like ``<start>-<end>: <text>``; the two times are
    converted to float and the text is stripped of surrounding whitespace. Blank
    lines are skipped.

    Args:
        text: Newline-separated timestamped lines.

    Returns:
        A list of ``(float, float, str)`` tuples in input order; empty for blank input.

    Raises:
        ValueError: If a non-blank line lacks the ``": "`` separator, or its time
            range is not two dash-separated numbers.
    """
    segments = []
    for line_number, raw_line in enumerate(text.strip().split('\n'), start=1):
        if not raw_line.strip():
            continue  # tolerate blank lines anywhere in the input
        time_range, separator, content = raw_line.partition(': ')
        if not separator:
            raise ValueError(
                f"Line {line_number} is not in '<start>-<end>: <text>' format: {raw_line!r}"
            )
        begin_time, end_time = map(float, time_range.split('-'))
        segments.append((begin_time, end_time, content.strip()))
    return segments

# Step 2 (revised): text-to-speech conversion with a unique file name per segment
def generate_tts_audio(segments, output_dir):
    """Synthesize one audio file per segment inside `output_dir`.

    `segments` yields ``(begin_time, end_time, content)`` tuples. Each file is named
    ``<begin_time>-<end_time>.mp3`` and is produced by text_to_speech_melotts() with
    a target duration of ``end_time - begin_time``.

    Returns a list of ``(begin_time, end_time, audio_path)`` tuples in input order.
    """
    os.makedirs(output_dir, exist_ok=True)

    def _synthesize(segment):
        start, stop, sentence = segment
        destination = os.path.join(output_dir, f"{start}-{stop}.mp3")
        # Pass the text, the destination path and the time slot the speech should fill.
        produced_path = text_to_speech_melotts(sentence, destination, stop - start)
        return (start, stop, produced_path)

    return [_synthesize(segment) for segment in segments]

# Step 3: concatenate the audio
def concatenate_audio_segments(tts_audio_segments):
    """Load every audio file listed in `tts_audio_segments` and join them back to back.

    Only the third element (the file path) of each tuple is used. The result starts
    from a zero-length silent segment, so an empty input yields that empty segment.
    """
    loaded_clips = (AudioSegment.from_file(clip_path) for _, _, clip_path in tts_audio_segments)
    # sum() adds each clip onto the zero-length starting segment in order.
    return sum(loaded_clips, AudioSegment.silent(duration=0))

# Step 4: save and play the audio
def save_and_play_audio(combined_audio, output_path="combined_tts_audio.mp3"):
    """Export `combined_audio` as MP3 to `output_path` and return a player for it.

    Args:
        combined_audio: Audio object exposing pydub's ``export`` method.
        output_path: Destination file; a relative path resolves against the
            current working directory.

    Returns:
        An ``IPython.display.Audio`` object for the exported file.
    """
    # export() hands back the output file object still open; close it so the handle
    # is released and all bytes are on disk before the player reads the file.
    exported_file = combined_audio.export(output_path, format="mp3")
    exported_file.close()
    return Audio(output_path)

# Run the pipeline: parse the text, synthesize each segment, concatenate, export.
# `tts_audio_segments` is a module-level name that later cells read.
tts_output_dir = '/content/tts_output_dir'
segments = parse_text(transcribed_text)
tts_audio_segments = generate_tts_audio(segments, tts_output_dir)
combined_audio = concatenate_audio_segments(tts_audio_segments)
audio_player = save_and_play_audio(combined_audio)

# Display the player (a bare last expression is rendered as the cell output)
audio_player


tokenizer_config.json:   0%|          | 0.00/251 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/231k [00:00<?, ?B/s]

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package cmudict to /root/nltk_data...
[nltk_data]   Unzipping corpora/cmudict.zip.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/344k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/83.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/227k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/310 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/486k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.30k [00:00<?, ?B/s]

checkpoint.pth:   0%|          | 0.00/208M [00:00<?, ?B/s]

 > Text split to sentences.
我们开发了一种全新的编译器框架,
称为 MLIR(Multi-Level Intermediate Representation).


  0%|          | 0/2 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.075 seconds.
DEBUG:jieba:Loading model cost 1.075 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


model.safetensors:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 2/2 [00:08<00:00,  4.22s/it]


 > Text split to sentences.
我们开发了一种全新的编译器框架,
称为 MLIR(Multi-Level Intermediate Representation).


100%|██████████| 2/2 [00:00<00:00,  6.62it/s]


 > Text split to sentences.
是的.


100%|██████████| 1/1 [00:00<00:00,  5.84it/s]


 > Text split to sentences.
是的.


100%|██████████| 1/1 [00:00<00:00,  6.02it/s]


 > Text split to sentences.
MLIR 是一个全新的编译器基础架构.


100%|██████████| 1/1 [00:00<00:00,  5.24it/s]


 > Text split to sentences.
MLIR 是一个全新的编译器基础架构.


100%|██████████| 1/1 [00:00<00:00,  5.70it/s]


 > Text split to sentences.
这与很多人误解它与机器学习有关不同.


100%|██████████| 1/1 [00:00<00:00,  5.92it/s]


 > Text split to sentences.
这与很多人误解它与机器学习有关不同.


100%|██████████| 1/1 [00:00<00:00,  6.43it/s]


 > Text split to sentences.
其中 ML 代表多级(Multi-Level),
因为编译器人员在命名方面可能不太擅长.


100%|██████████| 2/2 [00:00<00:00,  5.89it/s]


 > Text split to sentences.
其中 ML 代表多级(Multi-Level),
因为编译器人员在命名方面可能不太擅长.


100%|██████████| 2/2 [00:00<00:00,  7.34it/s]


 > Text split to sentences.
能否深入解释一下 MLIR 是什么?


100%|██████████| 1/1 [00:00<00:00,  6.03it/s]


 > Text split to sentences.
能否深入解释一下 MLIR 是什么?


100%|██████████| 1/1 [00:00<00:00,  4.68it/s]


 > Text split to sentences.
好的. 对于编译器而言,
传统上它们都是为特定领域量身定制的解决方案.


100%|██████████| 2/2 [00:00<00:00,  5.72it/s]


 > Text split to sentences.
好的. 对于编译器而言,
传统上它们都是为特定领域量身定制的解决方案.


100%|██████████| 2/2 [00:00<00:00,  5.58it/s]


In [None]:
from pydub import AudioSegment
import hashlib
import os

origin_audio_dir = '/content/origin_audio_dir'
def split_audio_segments(audio_file_path, segments, output_dir):
    """Cut an audio file into one MP3 per time range.

    Args:
        audio_file_path: Path of the audio file to cut.
        segments: Iterable of 3-tuples whose first two items are the begin and end
            times in seconds; the third item is ignored.
        output_dir: Directory for the pieces; created if missing.

    Returns:
        A list of ``(begin_time, end_time, file_path)`` tuples, one per segment, where
        each file is named ``<begin_time>-<end_time>.mp3``.
    """
    # Load the original audio file
    audio = AudioSegment.from_file(audio_file_path)

    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Named differently from the function so the function name is not shadowed.
    exported_segments = []
    for begin_time, end_time, _ in segments:
        unique_filename = f"{begin_time}-{end_time}.mp3"
        unique_filepath = os.path.join(output_dir, unique_filename)

        # pydub slices are expressed in milliseconds
        start_ms = begin_time * 1000
        end_ms = end_time * 1000

        # Cut the slice and export it; export() returns the output file object still
        # open, so close it to avoid leaking one handle per segment.
        segment_audio = audio[start_ms:end_ms]
        segment_audio.export(unique_filepath, format="mp3").close()
        exported_segments.append((begin_time, end_time, unique_filepath))

    return exported_segments

# Cut the source audio using the time ranges carried by `tts_audio_segments`
# (split_audio_segments only reads the first two items of each tuple).
origin_audio_segments=split_audio_segments(audio_file_path, tts_audio_segments, origin_audio_dir)

In [None]:
!git clone https://github.com/myshell-ai/OpenVoice.git
%cd /content/OpenVoice
!ls
!pip install -e .
!wget https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_1226.zip
!unzip checkpoints_1226.zip -d .

Cloning into 'OpenVoice'...
remote: Enumerating objects: 376, done.[K
remote: Counting objects: 100% (193/193), done.[K
remote: Compressing objects: 100% (95/95), done.[K
remote: Total 376 (delta 132), reused 126 (delta 98), pack-reused 183[K
Receiving objects: 100% (376/376), 2.92 MiB | 19.42 MiB/s, done.
Resolving deltas: 100% (185/185), done.
/content/OpenVoice
demo_part1.ipynb  docs	   openvoice  requirements.txt	setup.py
demo_part2.ipynb  LICENSE  README.md  resources
Obtaining file:///content/OpenVoice
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: MyShell-OpenVoice
  Running setup.py develop for MyShell-OpenVoice
Successfully installed MyShell-OpenVoice-0.0.0
--2024-03-10 14:02:16--  https://myshell-public-repo-hosting.s3.amazonaws.com/openvoice/checkpoints_1226.zip
Resolving myshell-public-repo-hosting.s3.amazonaws.com (myshell-public-repo-hosting.s3.amazonaws.com)... 3.5.10.188, 52.216.211.233, 52.217.71.172, ...
Connecting to myshell-p

In [None]:
import os
import torch
from openvoice import se_extractor
from openvoice.api import ToneColorConverter

# Checkpoint/config location. The path is relative, so it resolves against the current
# working directory (the `%cd /content/OpenVoice` cell must have run first).
ckpt_converter = 'checkpoints/converter'
device = "cuda:0" if torch.cuda.is_available() else "cpu"
output_dir = '/content/converted_audio'
os.makedirs(output_dir, exist_ok=True)

# Initialise the tone colour converter
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Source embedding: extracted from the combined TTS audio, i.e. the voice being converted.
source_se, _ = se_extractor.get_se('/content/MeloTTS/combined_tts_audio.mp3', tone_color_converter, vad=True)
# Target embedding: extracted from `audio_file_path`, i.e. the voice to convert to.
target_se, _ = se_extractor.get_se(audio_file_path, tone_color_converter, vad=True)

# Convert every synthesized segment. Only the TTS file path is needed, so iterate
# over `tts_audio_segments` directly instead of zipping in a second, unused list.
for _, _, tts_path in tts_audio_segments:

    # Keep the "<begin>-<end>.mp3" file name so later steps can recover the timing
    output_path = os.path.join(output_dir, os.path.basename(tts_path))

    # Apply the tone colour conversion
    tone_color_converter.convert(
        audio_src_path=tts_path,
        src_se=source_se,
        tgt_se=target_se,
        output_path=output_path,
        message="@MyShell"  # watermark message; change or omit as needed
    )

    print(f"Converted audio saved to: {output_path}")


Loaded checkpoint 'checkpoints/converter/checkpoint.pth'
missing/unexpected keys: [] []
Converted audio saved to: /content/converted_audio/0.0-5.4.mp3
Audio too short, fail to add watermark
Converted audio saved to: /content/converted_audio/5.4-5.9.mp3
Converted audio saved to: /content/converted_audio/5.9-8.4.mp3
Converted audio saved to: /content/converted_audio/8.4-11.4.mp3
Converted audio saved to: /content/converted_audio/11.4-16.1.mp3
Converted audio saved to: /content/converted_audio/16.1-19.5.mp3
Converted audio saved to: /content/converted_audio/19.5-26.0.mp3


In [None]:
!pip install moviepy



In [None]:
from moviepy.editor import VideoFileClip

# Path to your original video
# NOTE(review): the download cell's recorded output shows the file saved under
# /content/MeloTTS/ - confirm this path matches where the video actually is.
original_video_path = '/content/Building domain-specific compilers quickly with MLIR compiler infrastructure  Chris Lattner.mp4'
# Path where the trimmed video will be saved
trimmed_video_path = '/content/trimmed_video.mp4'

# Load the original video
video_clip = VideoFileClip(original_video_path)
try:
    # Keep only the first 30 seconds, clamped to the clip length so that a shorter
    # video does not fail.
    trim_end = min(30, video_clip.duration)
    trimmed_clip = video_clip.subclip(0, trim_end)
    try:
        # Write the trimmed clip to a new video file
        trimmed_clip.write_videofile(trimmed_video_path, codec='libx264', audio_codec='aac')
    finally:
        trimmed_clip.close()
finally:
    # Cleanup runs even when loading or writing fails
    video_clip.close()


t:   0%|          | 2/900 [00:14<1:51:14,  7.43s/it, now=None]

Moviepy - Building video /content/trimmed_video.mp4.
MoviePy - Writing audio in trimmed_videoTEMP_MPY_wvf_snd.mp4



chunk:   0%|          | 0/662 [00:00<?, ?it/s, now=None][A
chunk:   8%|▊         | 54/662 [00:00<00:01, 529.38it/s, now=None][A
chunk:  16%|█▌        | 107/662 [00:00<00:01, 424.58it/s, now=None][A
chunk:  23%|██▎       | 152/662 [00:00<00:01, 431.58it/s, now=None][A
chunk:  30%|██▉       | 196/662 [00:00<00:01, 434.56it/s, now=None][A
chunk:  37%|███▋      | 244/662 [00:00<00:00, 450.01it/s, now=None][A
chunk:  47%|████▋     | 312/662 [00:00<00:00, 522.72it/s, now=None][A
chunk:  57%|█████▋    | 377/662 [00:00<00:00, 561.50it/s, now=None][A
chunk:  69%|██████▉   | 459/662 [00:00<00:00, 636.51it/s, now=None][A
chunk:  81%|████████▏ | 538/662 [00:00<00:00, 681.11it/s, now=None][A
chunk:  94%|█████████▎| 620/662 [00:01<00:00, 722.37it/s, now=None][A
t:   0%|          | 2/900 [00:16<2:00:09,  8.03s/it, now=None]

MoviePy - Done.
Moviepy - Writing video /content/trimmed_video.mp4




t:   0%|          | 0/900 [00:00<?, ?it/s, now=None][A
t:   1%|          | 7/900 [00:00<00:14, 60.83it/s, now=None][A
t:   2%|▏         | 14/900 [00:00<00:15, 58.09it/s, now=None][A
t:   2%|▏         | 20/900 [00:00<00:15, 57.12it/s, now=None][A
t:   3%|▎         | 27/900 [00:00<00:15, 57.97it/s, now=None][A
t:   4%|▍         | 34/900 [00:00<00:14, 61.68it/s, now=None][A
t:   5%|▍         | 41/900 [00:00<00:13, 62.97it/s, now=None][A
t:   5%|▌         | 48/900 [00:00<00:13, 62.46it/s, now=None][A
t:   6%|▌         | 55/900 [00:00<00:14, 58.94it/s, now=None][A
t:   7%|▋         | 61/900 [00:01<00:26, 31.30it/s, now=None][A
t:   7%|▋         | 66/900 [00:01<00:29, 28.33it/s, now=None][A
t:   8%|▊         | 70/900 [00:01<00:31, 26.10it/s, now=None][A
t:   8%|▊         | 74/900 [00:01<00:32, 25.18it/s, now=None][A
t:   9%|▊         | 77/900 [00:02<00:33, 24.86it/s, now=None][A
t:   9%|▉         | 80/900 [00:02<00:33, 24.65it/s, now=None][A
t:   9%|▉         | 83/900 [00:02<

Moviepy - Done !
Moviepy - video ready /content/trimmed_video.mp4


In [None]:
from moviepy.editor import VideoFileClip, concatenate_audioclips, AudioFileClip, CompositeAudioClip
from moviepy.audio.AudioClip import AudioClip
import os

video_file_path = '/content/trimmed_video.mp4'
audio_segments_dir = '/content/converted_audio'  # Directory containing your audio segments
output_video_path = '/content/new_video_with_audio.mp4'  # Output video file path

# Load the original video
video = VideoFileClip(video_file_path)

audio_clips = []  # Audio clips, each positioned at its own start time
video_with_audio = None
try:
    # Silent audio clip with the same duration as the video; it is the base layer
    silent_audio = AudioClip(lambda t: 0, duration=video.duration).set_fps(44100)

    # Iterate over audio segment files in the directory
    for segment_file in sorted(os.listdir(audio_segments_dir)):
        if not segment_file.endswith(('.mp3', '.wav')):
            continue

        # Parse start and end times from the "<start>-<end>" file name
        file_name = os.path.splitext(segment_file)[0]
        try:
            start_time, end_time = map(float, file_name.split('-'))
        except ValueError:
            # A stray audio file must not abort the whole cell
            print(f"Skipping {segment_file}: file name is not in '<start>-<end>' format")
            continue

        # Load the audio segment and place it at its start time on the timeline
        segment = AudioFileClip(os.path.join(audio_segments_dir, segment_file)).set_start(start_time)
        audio_clips.append(segment)

    # Combine silent audio with the segments
    if audio_clips:
        final_audio = CompositeAudioClip([silent_audio] + audio_clips)
    else:
        final_audio = silent_audio

    # Set the composite audio as the audio of the video clip
    video_with_audio = video.set_audio(final_audio)

    # Export the video with the new audio
    video_with_audio.write_videofile(output_video_path, codec='libx264', audio_codec='aac')
finally:
    # Cleanup runs even when loading, compositing or writing fails
    video.close()
    if video_with_audio is not None:
        video_with_audio.close()
    for clip in audio_clips:
        clip.close()


t:   0%|          | 2/900 [01:54<14:19:59, 57.46s/it, now=None]

Moviepy - Building video /content/new_video_with_audio.mp4.
MoviePy - Writing audio in new_video_with_audioTEMP_MPY_wvf_snd.mp4



chunk:   0%|          | 0/662 [00:00<?, ?it/s, now=None][A
chunk:   9%|▉         | 59/662 [00:00<00:01, 588.12it/s, now=None][A
chunk:  18%|█▊        | 119/662 [00:00<00:00, 592.28it/s, now=None][A
chunk:  30%|██▉       | 196/662 [00:00<00:00, 672.49it/s, now=None][A
chunk:  40%|████      | 268/662 [00:00<00:00, 689.67it/s, now=None][A
chunk:  51%|█████     | 337/662 [00:00<00:00, 664.13it/s, now=None][A
chunk:  61%|██████    | 404/662 [00:00<00:00, 662.83it/s, now=None][A
chunk:  71%|███████   | 471/662 [00:00<00:00, 644.62it/s, now=None][A
chunk:  81%|████████  | 536/662 [00:00<00:00, 621.06it/s, now=None][A
t:   0%|          | 2/900 [01:55<14:27:11, 57.94s/it, now=None]

MoviePy - Done.
Moviepy - Writing video /content/new_video_with_audio.mp4




t:   0%|          | 0/901 [00:00<?, ?it/s, now=None][A
t:   1%|          | 8/901 [00:00<00:11, 79.10it/s, now=None][A
t:   2%|▏         | 16/901 [00:00<00:12, 73.69it/s, now=None][A
t:   3%|▎         | 24/901 [00:00<00:12, 72.17it/s, now=None][A
t:   4%|▎         | 32/901 [00:00<00:12, 71.81it/s, now=None][A
t:   4%|▍         | 40/901 [00:00<00:12, 70.05it/s, now=None][A
t:   5%|▌         | 48/901 [00:00<00:12, 69.39it/s, now=None][A
t:   6%|▌         | 55/901 [00:00<00:12, 67.32it/s, now=None][A
t:   7%|▋         | 62/901 [00:01<00:26, 31.52it/s, now=None][A
t:   8%|▊         | 68/901 [00:01<00:28, 29.62it/s, now=None][A
t:   8%|▊         | 73/901 [00:01<00:29, 27.93it/s, now=None][A
t:   9%|▊         | 77/901 [00:01<00:32, 25.74it/s, now=None][A
t:   9%|▉         | 81/901 [00:02<00:32, 25.36it/s, now=None][A
t:   9%|▉         | 84/901 [00:02<00:33, 24.40it/s, now=None][A
t:  10%|▉         | 87/901 [00:02<00:42, 19.13it/s, now=None][A
t:  10%|▉         | 90/901 [00:02<

Moviepy - Done !
Moviepy - video ready /content/new_video_with_audio.mp4


In [None]:
from IPython.display import Video

# Path of the video file
video_file_path = '/content/new_video_with_audio.mp4'

# Display the video in Colab. embed=True inlines the file's bytes in the cell output;
# without it the player only references the local path, which the browser would have
# to fetch itself and cannot for a file on the Colab VM.
Video(video_file_path, embed=True)
