In [1]:
import os
import pandas as pd
import torchaudio
import shutil  # 用于复制文件

# Source directory holding the task CSV and the reference audio files.
fold_root = "./AISumerCamp_audio_generation_fight/aigc_speech_generation_tasks/"
# Destination for the processed reference audio (truncated, or copied as-is).
output_root = "pre_deal/cut_data/"
os.makedirs(output_root, exist_ok=True)

# Task table; this cell reads its `reference_speech` and `utt` columns.
task = pd.read_csv(fold_root + "aigc_speech_generation_tasks.csv")

# Maximum allowed reference length in seconds; longer audio is cut to this.
# (An older note mentioned "5000*0.02=100 s"; the limit enforced here is 30 s.)
MAX_LEN_SECONDS = 30

# utt IDs whose reference audio was truncated, kept for later tracing.
# A previous run reported:
# [13, 17, 26, 42, 44, 57, 70, 79, 82, 86, 98, 104, 105, 117, 126, 128, 177, 180, 188, 193]
truncated_ids = []

for index, row in task.iterrows():
    # Reference audio file name, e.g. reference_1.wav.
    voice_filename = str(row.reference_speech)
    voice = fold_root + voice_filename
    # os.path.join avoids the doubled separator that plain string formatting
    # produced, because output_root already ends with "/".
    output_path = os.path.join(output_root, voice_filename)

    # The whole file is decoded so that its duration can be measured.
    try:
        waveform, sample_rate = torchaudio.load(voice)
    except Exception as e:
        print(f"❌ 无法加载音频文件 {voice}: {e}")
        continue  # skip this task row

    # waveform.shape[1] is used as the number of frames.
    duration = waveform.shape[1] / sample_rate

    if duration > MAX_LEN_SECONDS:
        print(f"⚠️  音频 {voice_filename} 过长 ({duration:.1f}s > {MAX_LEN_SECONDS}s)，将截断")
        max_samples = int(MAX_LEN_SECONDS * sample_rate)

        # Keep only the first MAX_LEN_SECONDS. The save is guarded like the
        # copy below so that one unwritable file does not abort the batch.
        try:
            torchaudio.save(output_path, waveform[:, :max_samples], sample_rate)
        except Exception as e:
            print(f"❌ 保存失败 {output_path}: {e}")
            continue

        truncated_ids.append(row.utt)
    else:
        # Short enough: copy the original file unchanged.
        print(f"📁 复制音频 {voice_filename} 到 {output_root} 目录")
        try:
            shutil.copy(voice, output_path)
        except Exception as e:
            print(f"❌ 复制失败 {voice} -> {output_path}: {e}")
            continue

print("✅ 所有音频已处理完毕")
print("📌 被截断的音频 utt ID 列表：", truncated_ids)

📁 复制音频 reference_1.wav 到 result/ 目录
📁 复制音频 reference_2.wav 到 result/ 目录
📁 复制音频 reference_3.wav 到 result/ 目录
📁 复制音频 reference_4.wav 到 result/ 目录
📁 复制音频 reference_5.wav 到 result/ 目录
📁 复制音频 reference_6.wav 到 result/ 目录
📁 复制音频 reference_7.wav 到 result/ 目录
📁 复制音频 reference_8.wav 到 result/ 目录
📁 复制音频 reference_9.wav 到 result/ 目录
📁 复制音频 reference_10.wav 到 result/ 目录
📁 复制音频 reference_11.wav 到 result/ 目录
📁 复制音频 reference_12.wav 到 result/ 目录
⚠️  音频 reference_13.wav 过长 (61.3s > 30s)，将截断
📁 复制音频 reference_14.wav 到 result/ 目录
📁 复制音频 reference_15.wav 到 result/ 目录
📁 复制音频 reference_16.wav 到 result/ 目录
⚠️  音频 reference_17.wav 过长 (643.2s > 30s)，将截断
📁 复制音频 reference_18.wav 到 result/ 目录
📁 复制音频 reference_19.wav 到 result/ 目录
📁 复制音频 reference_20.wav 到 result/ 目录
📁 复制音频 reference_21.wav 到 result/ 目录
📁 复制音频 reference_22.wav 到 result/ 目录
📁 复制音频 reference_23.wav 到 result/ 目录
📁 复制音频 reference_24.wav 到 result/ 目录
📁 复制音频 reference_25.wav 到 result/ 目录
⚠️  音频 reference_26.wav 过长 (681.0s > 30s)，将截断
📁 复制音频 reference_27.w

In [2]:
#将多人声音频剪裁为首个人声目标

import os
import pandas as pd
import numpy as np
import librosa
import soundfile as sf

# -------------------------------
# User configuration
# -------------------------------
# Root directory of the original data


audio_dir = "./AISumerCamp_audio_generation_fight/aigc_speech_generation_tasks/"                # directory of the original audio
csv_path = f"{audio_dir}aigc_speech_generation_tasks.csv"           # CSV containing the utt and reference_speech columns

output_dir = "./pre_deal/multi_voice_deal"          # output directory (audio after masking)
replacement_duration = 0.5  # every cut is replaced by 0.5 s of silence

# utt ID -> list of [start, end] intervals to replace with silence.
# The processing function multiplies these values by the sample rate, i.e. it
# treats them as decimal seconds.
# NOTE(review): every fractional part below is <= .23, which looks like
# "second.frame" timecodes copied from an editor rather than decimal seconds.
# If so, the cut positions are slightly off — confirm how they were measured.
# NOTE(review): 129 is not in the truncated-ID list recorded in the first cell
# (128 is) — confirm that 129 is the intended utt.
cuts_dict = {
    42: [[2.23,33.18],[38.07,45.00],[51.04,62.13]],
    44: [[1.17,5.07],[6.20,10.00],[17.01,20.10],[21.18,29.01],[30.17,32.08],[35.09,61.00]],
    98: [[3.12,6.05],[8.18,11.05],[32.07,63.0]],
    126: [[6.07,15.22],[27.17,63.08]],
    129: [[1.13,4.08],[5.10,7.12],[8.18,10.03]]
}

# Make sure the output directory exists before any file is written to it.
os.makedirs(output_dir, exist_ok=True)

# -------------------------------
# 主函数：将每个 cut 区间替换为固定时长的静音（如 0.5s）
# -------------------------------
def replace_segments_with_fixed_silence(audio_path, output_path, cuts, replacement_sec=0.5):
    """
    Replace each cut interval of an audio file with a fixed-length silence.

    The file is loaded with librosa as mono at its native sample rate. Every
    interval in ``cuts`` is removed and ``replacement_sec`` seconds of zeros
    are put in its place, then the result is written to ``output_path`` as
    16-bit PCM. The total duration changes whenever a cut is not exactly
    ``replacement_sec`` long.

    Cut handling:
      * cuts are processed in ascending order, whatever the input order;
      * each cut is clamped to the bounds of the audio;
      * cuts that are empty after clamping (inverted, zero-length, or lying
        entirely past the end of the audio) are ignored;
      * overlapping cuts are merged and replaced by a single silence, so
        audio removed by one cut is never re-inserted by a later one.
        Cuts that merely touch are kept separate (one silence each).

    Args:
        audio_path: path of the audio file to read.
        output_path: path of the audio file to write.
        cuts: iterable of ``(start_sec, end_sec)`` pairs, in seconds.
        replacement_sec: length in seconds of the silence inserted per cut.

    Returns:
        None. Any exception is caught and reported with ``print``; nothing
        is raised to the caller.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None, mono=True)
        total = len(y)

        # Built once, in the audio's own dtype so concatenation does not
        # silently upcast the whole signal.
        silence = np.zeros(int(replacement_sec * sr), dtype=y.dtype)

        # Convert the cuts to clamped sample ranges and merge overlaps.
        merged = []
        for start_sec, end_sec in sorted(cuts):
            start_sample = min(max(0, int(start_sec * sr)), total)
            end_sample = min(max(0, int(end_sec * sr)), total)
            if end_sample <= start_sample:
                continue  # nothing to remove for this cut
            if merged and start_sample < merged[-1][1]:
                # Overlaps the previous cut: extend it instead of rewinding.
                merged[-1][1] = max(merged[-1][1], end_sample)
            else:
                merged.append([start_sample, end_sample])

        # Interleave the kept audio with one silence per merged cut.
        segments = []
        current_pos = 0
        for start_sample, end_sample in merged:
            if current_pos < start_sample:
                segments.append(y[current_pos:start_sample])
            segments.append(silence)
            current_pos = end_sample

        # Audio after the last cut.
        if current_pos < total:
            segments.append(y[current_pos:])

        # Empty audio with no usable cut still yields one block of silence.
        y_final = np.concatenate(segments) if segments else silence

        sf.write(output_path, y_final, sr, subtype='PCM_16')
        original_duration = total / sr
        new_duration = len(y_final) / sr
        print(f"✅ {os.path.basename(audio_path)}: {original_duration:.2f}s → {new_duration:.2f}s (replaced with {replacement_sec}s silence)")

    except Exception as e:
        print(f"❌ Error processing {audio_path}: {str(e)}")

# -------------------------------
# 主流程
# -------------------------------
if __name__ == "__main__":
    # Stop early when the task table is missing or lacks the needed columns.
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"CSV not found: {csv_path}")

    df = pd.read_csv(csv_path)
    if not {'utt', 'reference_speech'}.issubset(df.columns):
        raise ValueError("CSV must have 'utt' and 'reference_speech' columns")

    # utt ID -> reference audio file name, taken row by row from the CSV.
    utt_to_ref = {utt: ref for utt, ref in zip(df['utt'], df['reference_speech'])}
    print(utt_to_ref)

    for utt_id, cuts in cuts_dict.items():
        if utt_id in utt_to_ref:
            ref_name = utt_to_ref[utt_id]
            stem, suffix = os.path.splitext(ref_name)
            # Names without an extension are assumed to be .wav files.
            file_name = ref_name if suffix else f"{stem}.wav"
            src_path = os.path.join(audio_dir, file_name)
            dst_path = os.path.join(output_dir, file_name)

            if os.path.exists(src_path):
                replace_segments_with_fixed_silence(
                    audio_path=src_path,
                    output_path=dst_path,
                    cuts=cuts,
                    replacement_sec=replacement_duration,
                )
            else:
                print(f"⚠️  File not found: {src_path}")
        else:
            print(f"⚠️  utt '{utt_id}' not found in CSV")

    print(f"\n✅ 所有音频已处理：每个 cut 段替换为 {replacement_duration}s 固定静音")
    print(f"输出目录: {output_dir}")

{1: 'reference_1.wav', 2: 'reference_2.wav', 3: 'reference_3.wav', 4: 'reference_4.wav', 5: 'reference_5.wav', 6: 'reference_6.wav', 7: 'reference_7.wav', 8: 'reference_8.wav', 9: 'reference_9.wav', 10: 'reference_10.wav', 11: 'reference_11.wav', 12: 'reference_12.wav', 13: 'reference_13.wav', 14: 'reference_14.wav', 15: 'reference_15.wav', 16: 'reference_16.wav', 17: 'reference_17.wav', 18: 'reference_18.wav', 19: 'reference_19.wav', 20: 'reference_20.wav', 21: 'reference_21.wav', 22: 'reference_22.wav', 23: 'reference_23.wav', 24: 'reference_24.wav', 25: 'reference_25.wav', 26: 'reference_26.wav', 27: 'reference_27.wav', 28: 'reference_28.wav', 29: 'reference_29.wav', 30: 'reference_30.wav', 31: 'reference_31.wav', 32: 'reference_32.wav', 33: 'reference_33.wav', 34: 'reference_34.wav', 35: 'reference_35.wav', 36: 'reference_36.wav', 37: 'reference_37.wav', 38: 'reference_38.wav', 39: 'reference_39.wav', 40: 'reference_40.wav', 41: 'reference_41.wav', 42: 'reference_42.wav', 43: 'refe

In [3]:
!mkdir ./pre_deal/fix
!cp -r ./pre_deal/cut_data/* ./pre_deal/fix
!cp -rf ./pre_deal/multi_voice_deal/* ./pre_deal/fix