In [1]:
import pandas as pd
import numpy as np
import shutil
import os
from pathlib import Path

def debug_csv_structure(csv_file):
    """Print a diagnostic overview of a CSV file.

    The file is loaded with ``pandas.read_csv`` and the following is printed:
    row count, column names, the first five rows, the column dtypes and the
    ``describe()`` summary.  For every column other than ``index``,
    ``reference_file`` and ``synthesized_file`` the non-null count, up to
    five unique values and the dtype are printed, followed by the non-null
    count and min/max range obtained after coercing the column to numeric.

    Args:
        csv_file: Path of the CSV file (anything ``pandas.read_csv`` accepts).

    Returns:
        None. Everything is written to standard output.

    Raises:
        Whatever ``pandas.read_csv`` raises when the file cannot be read.
    """
    print("=" * 60)
    print("CSV文件结构调试")
    print("=" * 60)

    df = pd.read_csv(csv_file)
    print(f"文件: {csv_file}")
    print(f"总行数: {len(df)}")
    print(f"列名: {list(df.columns)}")
    print("\n前5行数据:")
    print(df.head())
    print("\n数据类型:")
    print(df.dtypes)
    print("\n数据统计:")
    print(df.describe())

    # Bookkeeping columns are not scores, so they get no per-column report.
    skipped_columns = ('index', 'reference_file', 'synthesized_file')

    for col in df.columns:
        if col in skipped_columns:
            continue

        print(f"\n列 '{col}' 的详细信息:")
        print(f"  非空值数量: {df[col].count()}")
        print(f"  唯一值示例: {df[col].dropna().unique()[:5]}")
        print(f"  数据类型: {df[col].dtype}")

        # Try a numeric conversion; unparseable entries become NaN.
        try:
            numeric_col = pd.to_numeric(df[col], errors='coerce')
            print(f"  数值转换后非空值: {numeric_col.count()}")
            print(f"  数值范围: {numeric_col.min():.4f} - {numeric_col.max():.4f}")
        except (TypeError, ValueError):
            # Only conversion/formatting failures are reported here; a bare
            # ``except`` would also swallow KeyboardInterrupt and SystemExit.
            print("  数值转换失败")

def copy_highest_score_audio_fixed(csv_file, folder_mapping, output_dir, start_idx=1, end_idx=200):
    """Copy, for each CSV row, the audio file of the best-scoring method.

    Each key of ``folder_mapping`` names a score column in the CSV and maps
    to the folder that holds that method's audio.  For every index in
    ``start_idx..end_idx`` (1-based, inclusive) the row at position
    ``idx - 1`` is read, the column with the highest valid score is chosen,
    and ``synthesized_speech_{idx}.wav`` is copied from that column's folder
    into ``output_dir`` under the same file name.  When a row has no valid
    score at all, the first valid column is used as a fallback.

    Args:
        csv_file: Path of the CSV file containing the score columns.
        folder_mapping: Dict of ``{csv_column_name: source_folder}``.
        output_dir: Destination folder; created if it does not exist.
        start_idx: First 1-based row index to process.
        end_idx: Last 1-based row index to process (inclusive).

    Returns:
        None. Progress and final statistics are printed.  The function
        returns early (after printing a message) if the CSV cannot be read
        or none of the mapped columns exists in it.
    """
    # Load the CSV; any read failure is reported and ends the run.
    try:
        df = pd.read_csv(csv_file)
    except Exception as e:
        print(f"读取CSV文件失败: {e}")
        return
    print(f"成功读取CSV文件: {csv_file}")
    print(f"总行数: {len(df)}")
    print(f"列名: {list(df.columns)}")

    # Keep only the mapped columns that actually exist in the CSV.
    valid_columns = []
    for col in folder_mapping:
        if col in df.columns:
            valid_columns.append(col)
        else:
            print(f"警告: 列 '{col}' 不在CSV文件中，跳过")

    if not valid_columns:
        print("错误: 没有有效的列名可供比较")
        return

    print(f"有效的对比列: {valid_columns}")

    # Force the score columns to numeric; unparseable cells become NaN.
    for col in valid_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')
        print(f"列 '{col}' 转换后有效值数量: {df[col].notna().sum()}")

    os.makedirs(output_dir, exist_ok=True)
    print(f"输出目录: {output_dir}")

    copy_count = 0
    error_count = 0
    skip_count = 0
    invalid_count = 0

    for idx in range(start_idx, end_idx + 1):
        if idx > len(df):
            print(f"警告: 索引 {idx} 超出CSV文件范围")
            break

        # Rows are addressed by position: index 1 is the first data row.
        row_idx = idx - 1
        if row_idx < 0:
            continue

        row = df.iloc[row_idx]

        # "+ 2" converts the position to the line number in the CSV file
        # (1-based, plus the header line).
        print(f"\n处理索引 {idx} (行 {row_idx + 2}):")

        # Collect the usable scores of this row.  The columns were already
        # coerced to numeric, so "not NaN" is the only check needed.  An
        # isinstance(value, (int, float)) test must not be used here: a row
        # taken from a mixed-dtype frame holds NumPy scalars, and
        # numpy.int64 is not a subclass of int, so integer-valued score
        # columns would be rejected as invalid.
        valid_scores = {}
        for col in valid_columns:
            value = row[col]
            if pd.notna(value):
                valid_scores[col] = float(value)
                print(f"  {col}: {valid_scores[col]:.4f}")
            else:
                print(f"  {col}: 无效值")

        if valid_scores:
            # Highest score wins; on ties the first column in mapping order.
            chosen_col = max(valid_scores, key=valid_scores.get)
            source_method = chosen_col
            print(f"  最高分方法: {chosen_col}, 分数: {valid_scores[chosen_col]:.4f}")
        else:
            # No usable score in this row: fall back to the first column.
            invalid_count += 1
            print("  所有值都无效!")
            chosen_col = valid_columns[0]
            source_method = f"{chosen_col} (默认)"
            print(f"  所有值无效，使用默认方法: {chosen_col}")

        file_name = f"synthesized_speech_{idx}.wav"
        source_file = os.path.join(folder_mapping[chosen_col], file_name)

        if not os.path.exists(source_file):
            skip_count += 1
            print(f"  ⚠ 源文件不存在: {os.path.basename(source_file)}")
            continue

        target_file = os.path.join(output_dir, file_name)
        try:
            shutil.copy2(source_file, target_file)
        except Exception as e:
            # One failed copy must not abort the remaining rows.
            error_count += 1
            print(f"  ✗ 复制失败: {e}")
        else:
            copy_count += 1
            print(f"  ✓ 复制成功: {os.path.basename(source_file)} -> {os.path.basename(target_file)}")
            print(f"  使用方法: {source_method}")

    # Final statistics.
    print(f"\n{'='*50}")
    print("处理完成!")
    print(f"{'='*50}")
    print(f"成功复制: {copy_count} 个文件")
    print(f"复制失败: {error_count} 个文件")
    print(f"文件不存在: {skip_count} 个文件")
    print(f"所有值无效: {invalid_count} 个文件")
    print(f"输出目录: {output_dir}")

def check_audio_files_exist(folder_mapping, start_idx=1, end_idx=200):
    """Report how many expected audio files each method folder contains.

    For every ``method -> folder`` entry of ``folder_mapping`` the files
    ``synthesized_speech_{idx}.wav`` for ``idx`` in ``start_idx..end_idx``
    (inclusive) are looked up in ``folder``.  The first five missing file
    names are printed, followed by the present/missing totals.

    Args:
        folder_mapping: Dict of ``{method_name: folder_path}``.
        start_idx: First index to check.
        end_idx: Last index to check (inclusive).

    Returns:
        None. The report is printed.
    """
    print("检查音频文件是否存在...")

    for method in folder_mapping:
        folder = folder_mapping[method]
        print(f"\n检查方法: {method}")
        print(f"文件夹: {folder}")

        indices = range(start_idx, end_idx + 1)
        absent = [
            number
            for number in indices
            if not os.path.exists(os.path.join(folder, f"synthesized_speech_{number}.wav"))
        ]

        # Only the first five missing files are listed to keep output short.
        for number in absent[:5]:
            print(f"  缺失: synthesized_speech_{number}.wav")

        present_total = len(indices) - len(absent)
        print(f"  存在: {present_total}, 缺失: {len(absent)}")

# Example usage
if __name__ == "__main__":
    # [Configurable] run parameters.
    CSV_FILE = "./output_similarity.csv"
    # Alternative input: "./similarity_results.csv"

    # Folder mapping: {CSV column name: folder holding that method's audio}.
    # Entries currently disabled:
    #   inputs
    #     'data_deal_816': "../index-tts/data_deal_new/data_deal_816/"
    #     '820_enhanced_d_normalized': "../index-tts/data_deal_new/820_enhanced_d_normalized/"
    #   outputs
    #     'data_deal_816_output': "./old_data/compare/data_deal_816_output/"
    FOLDER_MAPPING = {
        f"{n}_output": f"./new_data/compare/{n}_output" for n in range(1, 7)
    }
    OUTPUT_DIR = "./best_audio_results/"
    # Alternative output: "./best_audio_enhanced/"

    # Step 1: inspect the CSV structure.
    print("第一步: 调试CSV文件结构")
    debug_csv_structure(CSV_FILE)

    # Step 2: spot-check that the audio files exist (first 10 only).
    print(f"\n{'=' * 60}")
    print("第二步: 检查音频文件是否存在")
    check_audio_files_exist(FOLDER_MAPPING, 1, 10)

    # Step 3: copy the best-scoring audio for rows 1..200.
    print(f"\n{'=' * 60}")
    print("第三步: 开始复制音频文件")
    copy_highest_score_audio_fixed(CSV_FILE, FOLDER_MAPPING, OUTPUT_DIR, 1, 200)

第一步: 调试CSV文件结构
CSV文件结构调试
文件: ./similarity_results.csv
总行数: 200
列名: ['index', 'reference_file', 'synthesized_file', 'test_output_16k', 'data_deal_816', '820_enhanced_d_normalized', 'data_deal_816_output', '1_output', '2_output', '3_output', '4_output', '5_output', '6_output', '7_output', '8_output', '9_output']

前5行数据:
   index   reference_file            synthesized_file  test_output_16k  \
0      1  reference_1.wav  synthesized_speech_1_1.wav           0.4214   
1      2  reference_2.wav  synthesized_speech_1_2.wav           0.5038   
2      3  reference_3.wav  synthesized_speech_1_3.wav           0.7536   
3      4  reference_4.wav  synthesized_speech_1_4.wav           0.7525   
4      5  reference_5.wav  synthesized_speech_1_5.wav           0.7950   

  data_deal_816 820_enhanced_d_normalized data_deal_816_output 1_output  \
0        0.9409                    0.9409               0.2136   0.3536   
1        0.9593                    0.9593               0.4932   0.5078   
2        0