In [1]:
import os
import numpy as np
import webrtcvad
import noisereduce as nr
from scipy.io import wavfile
from pydub import AudioSegment
import librosa
from tqdm import tqdm

class AudioPreprocessor:
    """Helpers for loading, denoising, VAD-masking and normalizing audio.

    Instance attributes (all set in ``__init__``):
        vad: ``webrtcvad.Vad`` instance created with mode ``2``.
        target_sr: sample rate in Hz that audio is converted to (16000).
        frame_duration_ms: length in milliseconds of each frame handed to
            the VAD (30).
    """

    def __init__(self):
        self.vad = webrtcvad.Vad(2)
        self.target_sr = 16000
        self.frame_duration_ms = 30

    def load_audio(self, input_path):
        """Load a WAV file and convert it to mono at ``self.target_sr``.

        Args:
            input_path: path of the WAV file to read.

        Returns:
            Tuple ``(samples, rate)``: a NumPy array built from pydub's
            sample array, and the frame rate of the converted segment.
        """
        audio = (
            AudioSegment.from_wav(input_path)
            .set_channels(1)
            .set_frame_rate(self.target_sr)
        )
        raw_audio = np.array(audio.get_array_of_samples())
        rate = audio.frame_rate
        return raw_audio, rate

    def denoise(self, audio, sr):
        """Apply noise reduction and resample to ``self.target_sr`` if needed.

        Args:
            audio: NumPy array of samples; it is cast to float32 before
                being passed to ``noisereduce``.
            sr: sample rate of ``audio`` in Hz.

        Returns:
            Tuple ``(denoised, self.target_sr)``.
        """
        denoised = nr.reduce_noise(y=audio.astype(np.float32), sr=sr)
        if sr != self.target_sr:
            denoised = librosa.resample(denoised, orig_sr=sr, target_sr=self.target_sr)
        return denoised, self.target_sr

    def apply_vad(self, audio, sr):
        """Zero out every frame that the VAD does not classify as speech.

        The signal is cut into consecutive, non-overlapping frames of
        ``self.frame_duration_ms`` milliseconds. Samples after the last
        complete frame are never classified and are always zeroed.

        Args:
            audio: NumPy array of samples. The samples are scaled by 32768
                before being sent to the VAD, so floats in [-1.0, 1.0] are
                assumed -- TODO confirm against callers.
            sr: sample rate of ``audio`` in Hz.

        Returns:
            Array of the same length as ``audio`` with non-speech samples
            set to zero.
        """
        frame_length = int(sr * self.frame_duration_ms / 1000)
        # "+ 1" keeps the final frame when it ends exactly on the last
        # sample; without it that frame was silently dropped and zeroed.
        frame_starts = range(0, len(audio) - frame_length + 1, frame_length)

        def is_speech(frame):
            # Clip before casting: +1.0 * 32768 does not fit in int16 and
            # would otherwise overflow instead of saturating.
            pcm = np.clip(frame * 32768, -32768, 32767).astype(np.int16)
            return self.vad.is_speech(pcm.tobytes(), sr)

        # Explicit bool dtype so the mask never promotes the audio dtype,
        # even when there are no complete frames at all.
        flags = np.array(
            [is_speech(audio[start:start + frame_length]) for start in frame_starts],
            dtype=bool,
        )
        speech_mask = np.repeat(flags, frame_length)
        # Pad with False so the trailing partial frame is muted.
        speech_mask = np.pad(
            speech_mask, (0, len(audio) - len(speech_mask)), mode='constant'
        )
        return audio * speech_mask

    def normalize_audio(self, audio):
        """Peak-normalize ``audio`` to 0.99 and convert it to int16.

        Args:
            audio: array-like of samples.

        Returns:
            int16 NumPy array. If the input is non-silent its peak is scaled
            to 0.99 of full scale; an all-zero input is only multiplied by
            32767 (and so stays zero); an empty input yields an empty array.
        """
        audio = np.asarray(audio)
        if audio.size == 0:
            # np.max raises on empty input; there is nothing to scale.
            return audio.astype(np.int16)
        max_val = np.max(np.abs(audio))
        if max_val > 0:
            audio = audio / max_val * 0.99
        return (audio * 32767).astype(np.int16)

# Configuration
background_noise_dir = '../data/_background_noise_'
output_dir = '../data/processed/unknown'
os.makedirs(output_dir, exist_ok=True)

# Number of output files to generate
num_output_files = 150

# Initialise the preprocessor
processor = AudioPreprocessor()

# Background-noise recordings available as source material
background_files = [f for f in os.listdir(background_noise_dir) if f.endswith('.wav')]


def _random_one_second_segment(samples, sr):
    """Return a peak-normalized 1-second slice of ``samples`` at a random offset.

    ``samples`` must hold at least ``sr`` samples.
    """
    max_start = len(samples) - sr
    start_idx = np.random.randint(0, max_start + 1)
    return processor.normalize_audio(samples[start_idx:start_idx + sr])


# Load every background recording once. Recordings that are at least one
# second long are cached so the top-up loop below never re-reads the disk.
usable_clips = []
background_segments = []
for fname in tqdm(background_files, desc="Processing background files"):
    path = os.path.join(background_noise_dir, fname)
    samples, sr = librosa.load(path, sr=processor.target_sr)

    # Skip recordings that cannot supply a full 1-second segment
    if len(samples) < sr:
        continue

    usable_clips.append((samples, sr))
    # Take one random 1-second segment from each recording
    background_segments.append(_random_one_second_segment(samples, sr))

# Without a usable recording the top-up loop could never make progress.
if len(background_segments) < num_output_files and not usable_clips:
    raise RuntimeError(
        f"No .wav file of at least 1 second found in {background_noise_dir}"
    )

# If there are fewer segments than num_output_files, keep drawing random
# segments from randomly chosen recordings until there are enough.
while len(background_segments) < num_output_files:
    samples, sr = usable_clips[np.random.randint(len(usable_clips))]
    background_segments.append(_random_one_second_segment(samples, sr))

# Write the segments to disk
for idx, segment in enumerate(tqdm(background_segments, desc="Saving unknown segments")):
    output_path = os.path.join(output_dir, f"noise_{idx}.wav")
    wavfile.write(output_path, processor.target_sr, segment)

print(f"✅ Đã tạo {len(background_segments)} file noise trong {output_dir}")

Processing background files: 100%|██████████| 6/6 [00:00<00:00,  9.00it/s]
Saving unknown segments: 100%|██████████| 150/150 [00:00<00:00, 33470.53it/s]

✅ Đã tạo 150 file noise trong ../data/processed/unknown



