In [1]:
!pip install pydub



In [8]:
from pydub import AudioSegment, silence
import os

# --- Configuration -----------------------------------------------------------
INPUT_FILE = "all_queries.wav"
OUTPUT_DIR = "eval/queries_audio"
os.makedirs(OUTPUT_DIR, exist_ok=True)

EXPECTED_CHUNKS = 40    # number of chunks the split is expected to produce
CHUNK_TOLERANCE = 5     # warn only when the count is off by more than this

# 🎚️ Updated parameters
MIN_SILENCE_LEN = 1200  # Only split if silence > 1.2s
SILENCE_THRESH = -35    # dBFS; anything quieter than this counts as silence
KEEP_SILENCE = 300      # ms of silence kept at the edges of every chunk

# --- Load and split ----------------------------------------------------------
# Load full audio
audio = AudioSegment.from_file(INPUT_FILE)

# Optional: Normalize volume so the dBFS threshold behaves consistently
audio = audio.normalize()

# Split by silence
chunks = silence.split_on_silence(
    audio,
    min_silence_len=MIN_SILENCE_LEN,
    silence_thresh=SILENCE_THRESH,
    keep_silence=KEEP_SILENCE,
)

print(f"🧩 Detected {len(chunks)} chunks. Expected {EXPECTED_CHUNKS}.")

# Warn if over/under split.
# A lower (more negative) SILENCE_THRESH makes less audio count as silence and
# therefore yields FEWER splits; a higher threshold yields MORE splits.
if len(chunks) > EXPECTED_CHUNKS + CHUNK_TOLERANCE:
    print("⚠️ Too many splits — try increasing MIN_SILENCE_LEN or decreasing SILENCE_THRESH.")
elif len(chunks) < EXPECTED_CHUNKS - CHUNK_TOLERANCE:
    print("⚠️ Too few splits — try decreasing MIN_SILENCE_LEN or raising SILENCE_THRESH.")

# --- Save to files -----------------------------------------------------------
# Files are numbered from q01 in the order the chunks were detected.
for i, chunk in enumerate(chunks, start=1):
    filename = os.path.join(OUTPUT_DIR, f"q{i:02d}.wav")
    chunk.export(filename, format="wav")
    print(f"💾 Saved: {filename}")


🧩 Detected 39 chunks. Expected 40.
💾 Saved: eval/queries_audio\q01.wav
💾 Saved: eval/queries_audio\q02.wav
💾 Saved: eval/queries_audio\q03.wav
💾 Saved: eval/queries_audio\q04.wav
💾 Saved: eval/queries_audio\q05.wav
💾 Saved: eval/queries_audio\q06.wav
💾 Saved: eval/queries_audio\q07.wav
💾 Saved: eval/queries_audio\q08.wav
💾 Saved: eval/queries_audio\q09.wav
💾 Saved: eval/queries_audio\q10.wav
💾 Saved: eval/queries_audio\q11.wav
💾 Saved: eval/queries_audio\q12.wav
💾 Saved: eval/queries_audio\q13.wav
💾 Saved: eval/queries_audio\q14.wav
💾 Saved: eval/queries_audio\q15.wav
💾 Saved: eval/queries_audio\q16.wav
💾 Saved: eval/queries_audio\q17.wav
💾 Saved: eval/queries_audio\q18.wav
💾 Saved: eval/queries_audio\q19.wav
💾 Saved: eval/queries_audio\q20.wav
💾 Saved: eval/queries_audio\q21.wav
💾 Saved: eval/queries_audio\q22.wav
💾 Saved: eval/queries_audio\q23.wav
💾 Saved: eval/queries_audio\q24.wav
💾 Saved: eval/queries_audio\q25.wav
💾 Saved: eval/queries_audio\q26.wav
💾 Saved: eval/queries_audio\q

In [9]:
from pydub import AudioSegment
import os

chunk_dir = "eval/queries_audio"
chunks = sorted(f for f in os.listdir(chunk_dir) if f.endswith(".wav"))

# Decode every file exactly once and remember its duration, so the ranking and
# the report below do not have to load the same audio again.
durations = {
    name: AudioSegment.from_wav(os.path.join(chunk_dir, name)).duration_seconds
    for name in chunks
}

# Find longest 2 chunks (likely suspects)
longest_chunks = sorted(chunks, key=durations.__getitem__, reverse=True)[:2]

print("🎯 Suspect chunks that may contain double questions:")
for f in longest_chunks:
    dur = durations[f]
    print(f" - {f}: {dur:.2f} seconds")


🎯 Suspect chunks that may contain double questions:
 - q22.wav: 8.01 seconds
 - q35.wav: 7.52 seconds


In [10]:
import os
from pathlib import Path

from pydub import AudioSegment

# Index of the chunk that is split into two consecutive chunks.
SPLIT_INDEX = 35

audio_dir = Path("eval/queries_audio")
split_path = audio_dir / f"q{SPLIT_INDEX:02d}.wav"
second_path = audio_dir / f"q{SPLIT_INDEX + 1:02d}.wav"

# Read the chunk into memory before any file on disk is moved or rewritten.
original = AudioSegment.from_wav(str(split_path))

# Step 1: free the slot right after the split chunk.
# Every later chunk is shifted up by one BEFORE the second half is written;
# exporting first would overwrite the chunk that currently lives in that slot.
numbered = {
    int(p.stem[1:]): p
    for p in audio_dir.glob("q*.wav")
    if p.stem[1:].isdecimal()
}
# Start renaming from the highest index down to avoid overwriting
for i in sorted(numbered, reverse=True):
    if i <= SPLIT_INDEX:
        break  # indices are descending: nothing further needs to move
    src = numbered[i]
    dst = audio_dir / f"q{i + 1:02d}.wav"
    os.rename(src, dst)
    print(f"🔁 Renamed {src.name} → {dst.name}")

# The slot must be empty now; refuse to clobber anything unexpected.
if second_path.exists():
    raise FileExistsError(f"{second_path} still exists after renumbering")

# Step 2: split the chunk and write both halves.
midpoint = len(original) // 2  # crude but effective split (len() is in ms)

first_half = original[:midpoint]
second_half = original[midpoint:]

first_half.export(str(split_path), format="wav")
second_half.export(str(second_path), format="wav")

# NOTE: this cell is not idempotent; run it once per fresh set of chunks.
print(f"✅ Split {split_path.name} into {split_path.stem} and {second_path.stem}")


🔁 Renamed q39.wav → q40.wav
🔁 Renamed q38.wav → q39.wav
🔁 Renamed q37.wav → q38.wav
🔁 Renamed q36.wav → q37.wav
