In [8]:
import whisper
import torchaudio
import numpy as np
import scipy.signal
import librosa
import subprocess
import os
import torch
import soundfile as sf


In [2]:
result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [16]:

model = whisper.load_model("base")

  4%|█▋                                     | 6.18M/139M [00:22<07:54, 292kiB/s]


KeyboardInterrupt: 

In [14]:
# 加载音频
def load_audio(path):
    wav, sr = sf.read(path)
    if wav.ndim > 1:
        wav = wav.mean(axis=1)  # 转 mono
    return torch.tensor(wav, dtype=torch.float32)

# 提取 embedding，自动切片为多个 30s 段
def get_embeddings(wav, sample_rate=16000):
    CHUNK_LEN = 30 * sample_rate
    chunks = []
    for i in range(0, len(wav), CHUNK_LEN):
        chunk = wav[i:i + CHUNK_LEN]
        mel = whisper.log_mel_spectrogram(chunk).to(model.device)
        mel = mel.unsqueeze(0)  # [1, 80, T]
        with torch.no_grad():
            features = model.encoder(mel)
        chunks.append(features.squeeze(0).cpu().numpy())
    return np.concatenate(chunks, axis=0)  # [T', dim]

# 简单边界检测（使用 embedding 差分）
def compute_change_score(embeddings):
    diff = np.linalg.norm(embeddings[1:] - embeddings[:-1], axis=1)
    return np.concatenate([[0], diff])

# 使用峰值检测定位边界（可调参数）
def detect_boundaries(change_scores, threshold=1.0, distance=20):
    peaks, _ = find_peaks(change_scores, height=threshold, distance=distance)
    return peaks

# 分割音频并保存
def save_segments(wav, boundaries, output_dir, sample_rate=16000):
    os.makedirs(output_dir, exist_ok=True)
    segments = []
    boundaries = [0] + list(boundaries) + [len(wav)]
    for i in range(len(boundaries) - 1):
        start, end = boundaries[i], boundaries[i+1]
        segment = wav[start:end]
        path = os.path.join(output_dir, f"seg_{i:03d}.wav")
        sf.write(path, segment.numpy(), samplerate=sample_rate)
        segments.append((start / sample_rate, end / sample_rate))
    return segments

In [15]:
# 主函数
def whisperseg(path, output_dir):
    wav = load_audio(path)
    embeddings = get_embeddings(wav)
    change_scores = compute_change_score(embeddings)
    boundary_frames = detect_boundaries(change_scores)
    sample_per_frame = len(wav) // len(change_scores)
    boundaries = boundary_frames * sample_per_frame
    return save_segments(wav, boundaries, output_dir)

# 示例调用
if __name__ == "__main__":
    audio_path = "../data/processed_clean/deep_clean.wav"
    output_dir = "../data/processed_cliped/deep_clips"
    segments = whisperseg(audio_path, output_dir)
    print("Detected Segments (in seconds):")
    for s, e in segments:
        print(f"{s:.2f} - {e:.2f}")

AssertionError: incorrect audio shape