In [25]:
import numpy as np
import os
import librosa
import soundfile as sf
import noisereduce as nr
from pydub import AudioSegment
from audiomentations import Compose, TimeStretch, PitchShift
import matplotlib.pyplot as plt
from tqdm import tqdm
import glob

In [26]:
from pydub import AudioSegment

FFMPEG_PATH = r"D:\ffmpeg\bin\ffmpeg.exe"  # 修改为你的实际路径
FFPROBE_PATH = r"D:\ffmpeg\bin\ffprobe.exe"  # 修改为你的实际路径

# 设置 FFmpeg 路径
AudioSegment.converter = FFMPEG_PATH
AudioSegment.ffprobe = FFPROBE_PATH

In [27]:
# 配置参数
TARGET_SR = 22050         # 目标采样率
SEGMENT_DURATION = 2.0    # 片段时长(秒)
MIN_ENERGY_RATIO = 1.2    # 显著片段的最小能量倍数
MAX_SEGMENTS_PER_FILE = 3 # 每个音频文件最多提取的片段数
AUGMENTATIONS_PER_SEGMENT = 3  # 每个片段的数据增强次数

In [28]:
def process_audio_directory(input_dir, output_dir):
    """
    处理音频目录中的所有文件
    :param input_dir: 输入目录路径
    :param output_dir: 输出目录路径
    """
    # 确保输出目录存在
    os.makedirs(output_dir, exist_ok=True)
    
    # 获取所有音频文件
    audio_files = glob.glob(os.path.join(input_dir, "*.flac"))
    
    # 按物种ID分组文件
    species_files = {}
    for file_path in audio_files:
        species_id = os.path.basename(file_path).split("_")[0]
        if species_id not in species_files:
            species_files[species_id] = []
        species_files[species_id].append(file_path)
    
    # 计数器：记录每个物种的处理片段数
    species_counters = {species_id: 1 for species_id in species_files.keys()}
    
    # 处理每个物种的文件
    for species_id, files in tqdm(species_files.items(), desc="处理物种"):
        for file_path in tqdm(files, desc=f"物种 {species_id}", leave=False):
            # 加载音频
            try:
                y, sr = librosa.load(file_path, sr=TARGET_SR)
            except Exception as e:
                print(f"加载 {file_path} 失败: {str(e)}")
                continue
            
            # 降噪处理
            y_denoised = nr.reduce_noise(y=y, sr=TARGET_SR)
            
            # 提取显著片段
            segments = extract_significant_segments(y_denoised, TARGET_SR)
            
            # 处理每个片段
            for segment in segments[:MAX_SEGMENTS_PER_FILE]:
                # 数据增强
                augmented_segments = augment_segment(segment, TARGET_SR)
                
                # 保存增强后的片段
                for aug_segment in augmented_segments:
                    # 生成输出文件名
                    counter = species_counters[species_id]
                    output_filename = f"{species_id}_{counter:04d}.mp3"
                    output_path = os.path.join(output_dir, output_filename)
                    
                    # 保存为MP3
                    save_as_mp3(aug_segment, TARGET_SR, output_path)
                    
                    # 更新计数器
                    species_counters[species_id] += 1

def extract_significant_segments(y, sr, min_duration=0.1):
    """
    提取音频中的显著片段
    :param y: 音频数据
    :param sr: 采样率
    :param min_duration: 最小持续时间(秒)
    :return: 显著片段列表
    """
    segments = []
    
    # 计算能量
    frame_length = int(sr * 0.1)  # 100ms帧
    hop_length = frame_length // 2
    energy = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]
    
    # 计算能量阈值
    energy_threshold = np.mean(energy) * MIN_ENERGY_RATIO
    
    # 找出超过阈值的区域
    above_threshold = energy > energy_threshold
    indices = np.where(above_threshold)[0]
    
    if len(indices) == 0:
        return segments
    
    # 将连续区域分组
    segments_indices = []
    start_idx = indices[0]
    for i in range(1, len(indices)):
        if indices[i] - indices[i-1] > 1:  # 不连续
            segments_indices.append((start_idx, indices[i-1]))
            start_idx = indices[i]
    segments_indices.append((start_idx, indices[-1]))
    
    # 将索引转换为样本位置
    for start_frame, end_frame in segments_indices:
        start_sample = start_frame * hop_length
        end_sample = end_frame * hop_length + frame_length
        duration = (end_sample - start_sample) / sr
        
        # 跳过太短的片段
        if duration < min_duration:
            continue
        
        # 提取片段
        segment = y[start_sample:end_sample]
        
        # 将片段分割为2秒的子片段
        segment_duration = len(segment) / sr
        num_subsegments = max(1, int(segment_duration // SEGMENT_DURATION))
        
        for i in range(num_subsegments):
            start = int(i * SEGMENT_DURATION * sr)
            end = int((i + 1) * SEGMENT_DURATION * sr)
            
            if end > len(segment):
                # 填充不足的片段
                padding = np.zeros(end - len(segment))
                subsegment = np.concatenate([segment[start:], padding])
            else:
                subsegment = segment[start:end]
            
            segments.append(subsegment)
    
    return segments

def augment_segment(segment, sr):
    """
    对音频片段进行数据增强
    :param segment: 音频片段
    :param sr: 采样率
    :return: 增强后的片段列表
    """
    augmented_segments = [segment]  # 包含原始片段
    
    # 创建增强器
    augmenter = Compose([
        TimeStretch(min_rate=0.8, max_rate=1.2, p=1.0),
        PitchShift(min_semitones=-3, max_semitones=3, p=1.0)
    ])
    
    # 生成增强版本
    for _ in range(AUGMENTATIONS_PER_SEGMENT):
        augmented = augmenter(samples=segment, sample_rate=sr)
        augmented_segments.append(augmented)
    
    return augmented_segments

def save_as_mp3(y, sr, output_path):
    """
    将音频保存为MP3格式
    :param y: 音频数据
    :param sr: 采样率
    :param output_path: 输出路径
    """
    # 转换为16位PCM格式
    y_int = np.int16(y * 32767)
    
    # 创建临时WAV文件
    temp_wav = "temp.wav"
    sf.write(temp_wav, y_int, sr)
    
    # 转换为MP3
    audio = AudioSegment.from_wav(temp_wav)
    audio.export(output_path, format="mp3", bitrate="128k")
    
    # 删除临时文件
    os.remove(temp_wav)

def plot_audio_segments(input_dir, output_dir, num_examples=3):
    """
    绘制音频处理前后的对比图（用于调试和验证）
    :param input_dir: 输入目录
    :param output_dir: 输出目录
    :param num_examples: 示例数量
    """
    # 获取示例文件
    input_files = glob.glob(os.path.join(input_dir, "*.flac"))[:num_examples]
    
    for input_file in input_files:
        # 加载原始音频
        y_orig, sr = librosa.load(input_file, sr=TARGET_SR)
        
        # 处理音频
        y_denoised = nr.reduce_noise(y=y_orig, sr=sr)
        segments = extract_significant_segments(y_denoised, sr)
        
        if not segments:
            continue
            
        # 获取输出文件名
        species_id = os.path.basename(input_file).split("_")[0]
        output_files = glob.glob(os.path.join(output_dir, f"{species_id}_*.mp3"))
        
        if not output_files:
            continue
            
        # 加载处理后的音频
        output_file = output_files[0]
        y_processed, _ = librosa.load(output_file, sr=TARGET_SR)
        
        # 创建图形
        plt.figure(figsize=(15, 10))
        
        # 原始音频波形
        plt.subplot(3, 1, 1)
        plt.plot(y_orig)
        plt.title(f"原始音频: {os.path.basename(input_file)}")
        plt.xlabel("样本")
        plt.ylabel("幅度")
        
        # 降噪后音频波形
        plt.subplot(3, 1, 2)
        plt.plot(y_denoised)
        plt.title("降噪后音频")
        plt.xlabel("样本")
        plt.ylabel("幅度")
        
        # 处理后音频波形
        plt.subplot(3, 1, 3)
        plt.plot(y_processed)
        plt.title(f"处理后音频: {os.path.basename(output_file)}")
        plt.xlabel("样本")
        plt.ylabel("幅度")
        
        plt.tight_layout()
        
        # 保存图像
        plot_path = os.path.join(output_dir, f"comparison_{os.path.basename(input_file).split('.')[0]}.png")
        plt.savefig(plot_path)
        plt.close()


In [29]:
INPUT_DIR = "Bird_Song/renamed"
OUTPUT_DIR = "Bird_Song/all_data"

# 检查输入目录是否存在
if not os.path.exists(INPUT_DIR):
    print(f"错误: 输入目录 '{INPUT_DIR}' 不存在")
    exit(1)

# 处理音频
print("开始处理音频文件...")
process_audio_directory(INPUT_DIR, OUTPUT_DIR)
print(f"处理完成! 结果保存在: {OUTPUT_DIR}")

# 生成对比图（可选）
print("生成处理前后对比图...")
plot_audio_segments(INPUT_DIR, OUTPUT_DIR)
print("对比图生成完成!")

开始处理音频文件...


处理物种:   0%|          | 0/85 [00:00<?, ?it/s]
物种 10:   0%|          | 0/3 [00:00<?, ?it/s][A
物种 10:  33%|███▎      | 1/3 [00:01<00:03,  1.58s/it][A
物种 10:  67%|██████▋   | 2/3 [00:02<00:01,  1.25s/it][A
物种 10: 100%|██████████| 3/3 [00:03<00:00,  1.32s/it][A
处理物种:   1%|          | 1/85 [00:03<05:35,  4.00s/it]  [A
物种 11:   0%|          | 0/3 [00:00<?, ?it/s][A
物种 11:  33%|███▎      | 1/3 [00:00<00:00,  2.91it/s][A
物种 11:  67%|██████▋   | 2/3 [00:00<00:00,  1.90it/s][A
物种 11: 100%|██████████| 3/3 [00:02<00:00,  1.11it/s][A
处理物种:   2%|▏         | 2/85 [00:06<04:11,  3.03s/it]  [A
物种 12:   0%|          | 0/3 [00:00<?, ?it/s][A
物种 12:  33%|███▎      | 1/3 [00:01<00:02,  1.40s/it][A
物种 12:  67%|██████▋   | 2/3 [00:02<00:01,  1.36s/it][A
物种 12: 100%|██████████| 3/3 [00:04<00:00,  1.43s/it][A
处理物种:   4%|▎         | 3/85 [00:10<04:54,  3.59s/it]  [A
物种 13:   0%|          | 0/3 [00:00<?, ?it/s][A
物种 13:  33%|███▎      | 1/3 [00:00<00:01,  1.00it/s][A
物种 13:  67%|██████▋   | 2/3 [

处理完成! 结果保存在: Bird_Song/all_data
生成处理前后对比图...


  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.tight_layout()
  plt.savefig(plot_path)
  plt.savefig(plot_path)
  plt.savefig(plot_path)
  plt.savefig(plot_path)
  plt.savefig(plot_path)
  plt.savefig(plot_path)
  plt.savefig(plot_path)
  plt.savefig(plot_path)
  plt.savefig(plot_path)
  plt.savefig(plot_path)
  plt.savefig(plot_path)
  plt.savefig(plot_path)
  plt.savefig(plot_path)


对比图生成完成!
