In [1]:
import os
import numpy as np
import webrtcvad
import noisereduce as nr
from scipy.io import wavfile
from pydub import AudioSegment
import librosa

def _loudest_window_start(signal, window_len, stride):
    """Return ``(start, energy)`` of the highest-energy window in ``signal``.

    Windows of ``window_len`` samples are evaluated every ``stride`` samples,
    including the last position where a full window still fits. Ties keep the
    earliest window. When no full window fits, or every window is silent,
    ``(0, 0.0)`` is returned.
    """
    best_start = 0
    max_energy = 0.0
    for start in range(0, len(signal) - window_len + 1, stride):
        window = signal[start:start + window_len].astype(np.float64)
        energy = float(np.dot(window, window))
        if energy > max_energy:
            max_energy = energy
            best_start = start
    return best_start, max_energy


def process_audio_file(input_path, output_path):
    """Extract a fixed-length, peak-normalised speech clip from a WAV file.

    Steps:
      1. Load the file as mono, 16 kHz, 16-bit PCM and denoise it.
      2. Zero every 30 ms frame that WebRTC VAD does not classify as speech.
      3. Keep the 1.7 s window with the most energy (the whole clip when it
         is not longer than 1.7 s).
      4. Drop 0.2 s from the tail when the loudest 0.3 s starts within the
         first 0.3 s of that window, otherwise drop 0.2 s from the head.
      5. Keep the 0.8 s window with the most energy, add 0.1 s of silence on
         each side, peak-normalise to 0.99 and write the result as int16.

    The output always holds 0.8 s + 2 * 0.1 s of audio; clips too short to
    fill the 0.8 s window are centred and zero-padded.

    Args:
        input_path: Path of the WAV file to read.
        output_path: Path of the WAV file to write. Missing parent
            directories are created.

    Returns:
        ``(samples, rate)`` where ``samples`` is the int16 array that was
        written and ``rate`` is its sample rate in Hz.

    Raises:
        ValueError: If the file contains no samples or no speech energy is
            left after voice-activity detection.
    """
    target_rate = 16000

    # Load as mono / 16 kHz / 16-bit so every later step can rely on int16
    # scaled samples at a rate that WebRTC VAD accepts.
    audio = (
        AudioSegment.from_wav(input_path)
        .set_channels(1)
        .set_frame_rate(target_rate)
        .set_sample_width(2)
    )
    rate = audio.frame_rate
    raw_audio = np.array(audio.get_array_of_samples())
    if raw_audio.size == 0:
        raise ValueError(f"No audio samples in {input_path}")

    # Noise reduction. No resampling step is needed afterwards: the segment
    # was already converted to the target rate when it was loaded.
    denoised_audio = np.asarray(
        nr.reduce_noise(y=raw_audio.astype(np.float32), sr=rate),
        dtype=np.float32,
    )

    # Voice-activity detection on 30 ms frames. The samples are already on
    # the int16 scale, so they are only rounded and clipped before being
    # handed to the VAD (rescaling them again would overflow int16).
    vad = webrtcvad.Vad(2)
    frame_duration_ms = 30
    frame_length = int(rate * frame_duration_ms / 1000)
    pcm = np.rint(np.clip(denoised_audio, -32768, 32767)).astype(np.int16)

    # Samples of a trailing partial frame stay masked out.
    speech_mask = np.zeros(len(denoised_audio), dtype=bool)
    for start in range(0, len(pcm) - frame_length + 1, frame_length):
        stop = start + frame_length
        speech_mask[start:stop] = vad.is_speech(pcm[start:stop].tobytes(), rate)
    speech_audio = denoised_audio * speech_mask

    # Coarse selection: the 1.7 s window with the most speech energy.
    window_len = int(1.7 * rate)
    if len(speech_audio) > window_len:
        coarse_start, _ = _loudest_window_start(speech_audio, window_len, int(0.2 * rate))
        best_segment = speech_audio[coarse_start:coarse_start + window_len]
    else:
        best_segment = speech_audio
    if not np.any(best_segment):
        raise ValueError(f"No speech detected in {input_path}")

    # Trim 0.2 s from whichever end is farther from the loudest 0.3 s, but
    # only when a full 0.8 s window still fits afterwards.
    cad_len = int(0.8 * rate)
    cut_len = int(0.2 * rate)
    peak_start, _ = _loudest_window_start(best_segment, int(0.3 * rate), int(0.01 * rate))
    if len(best_segment) - cut_len >= cad_len:
        if peak_start / rate < 0.3:
            final_segment = best_segment[:-cut_len]
        else:
            final_segment = best_segment[cut_len:]
    else:
        final_segment = best_segment

    # Fine selection: the 0.8 s window with the most energy.
    cad_start, _ = _loudest_window_start(final_segment, cad_len, int(0.02 * rate))
    cad_segment = final_segment[cad_start:cad_start + cad_len]

    # 0.1 s of silence on each side; any shortfall of a short clip is split
    # between both sides so the output length stays constant.
    pad_len = int(0.1 * rate)
    shortfall = cad_len - len(cad_segment)
    left_fill = shortfall // 2
    right_fill = shortfall - left_fill
    padded_segment = np.pad(
        cad_segment,
        (pad_len + left_fill, pad_len + right_fill),
        mode='constant',
    )

    # Peak normalisation.
    max_val = np.max(np.abs(padded_segment))
    if max_val > 0:
        padded_segment = padded_segment / max_val * 0.99

    # Back to int16 for the WAV writer.
    final_output = (padded_segment * 32767).astype(np.int16)

    # A bare file name has no directory part, and os.makedirs('') would fail.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    wavfile.write(output_path, rate, final_output)

    return final_output, rate



In [2]:
from tqdm import tqdm

# Mirror the raw dataset layout (one sub-folder per command) into the
# processed folder, running process_audio_file on every WAV file.
raw_dir = "../data/raw"
processed_dir = "../data/processed"

# Create the processed folder if it does not exist yet.
os.makedirs(processed_dir, exist_ok=True)

# Inputs that raised during processing, reported once the run finishes.
failed_files = []

# Sorted listings keep the processing order (and the log) reproducible;
# os.listdir returns entries in arbitrary order.
for command_dir in tqdm(sorted(os.listdir(raw_dir)), desc="Processing commands"):
    command_path = os.path.join(raw_dir, command_dir)
    if not os.path.isdir(command_path):
        continue

    # Matching output folder for this command.
    processed_command_dir = os.path.join(processed_dir, command_dir)
    os.makedirs(processed_command_dir, exist_ok=True)

    # Process every WAV file of this command; the extension check ignores
    # case so files named *.WAV are not skipped.
    for wav_file in tqdm(sorted(os.listdir(command_path)), desc=f"Processing {command_dir}", leave=False):
        if not wav_file.lower().endswith('.wav'):
            continue

        input_path = os.path.join(command_path, wav_file)
        output_path = os.path.join(processed_command_dir, wav_file)

        try:
            process_audio_file(input_path, output_path)
        except Exception as e:
            # Best effort: one bad recording must not abort the whole run.
            # tqdm.write prints without breaking the active progress bars.
            failed_files.append(input_path)
            tqdm.write(f"Error processing {input_path}: {str(e)}")

if failed_files:
    tqdm.write(f"{len(failed_files)} file(s) could not be processed")

Processing commands: 100%|██████████| 12/12 [00:18<00:00,  1.56s/it]


In [3]:
# import random
# import matplotlib.pyplot as plt
# from scipy.io import wavfile
# from IPython.display import Audio

# # Pick a random processed file
# processed_dir = "../data/processed"
# command_dirs = [d for d in os.listdir(processed_dir) if os.path.isdir(os.path.join(processed_dir, d))]
# command_dir = random.choice(command_dirs)
# command_path = os.path.join(processed_dir, command_dir)


# wav_files = [f for f in os.listdir(command_path) if f.endswith('.wav')]

# wav_file = random.choice(wav_files)
# processed_path = os.path.join(command_path, wav_file)

# # Read the file and display its waveform
# rate, processed_samples = wavfile.read(processed_path)

# # Plot the waveform
# time = np.linspace(0, len(processed_samples)/rate, num=len(processed_samples))
# plt.figure(figsize=(15, 4))
# plt.plot(time, processed_samples, label="Processed Audio", color='blue')
# plt.title(f"Processed Waveform: {command_dir}/{wav_file}")
# plt.xlabel("Time (s)")
# plt.ylabel("Amplitude")
# plt.legend()
# plt.show()

# # Play the audio
# Audio(processed_samples, rate=rate)