In [None]:
Construction of the MarmAudio dataset

In [1]:
# 1. Imports
import os
import glob
import soundfile as sf
import librosa
import numpy as np

# 2. Define source and target directories
pos_dir = 'XXXX' # Marmoset USV
neg_dir = 'XXXX' # Marmoset Noise
output_dir = 'XXXX'

os.makedirs(output_dir, exist_ok=True)

# 3. Collect all WAV files (recursively) from both source directories.
patterns = [
    os.path.join(pos_dir, '**', '*.wav'),
    os.path.join(neg_dir, '**', '*.wav')
]
all_files = []
for p in patterns:
    # glob returns paths in filesystem order; sort so every run visits the
    # files in the same order.
    all_files.extend(sorted(glob.glob(p, recursive=True)))

# Fail early with a readable message instead of letting np.percentile choke
# on an empty array further down.
if not all_files:
    raise FileNotFoundError(
        f'No .wav files found under {pos_dir!r} or {neg_dir!r}'
    )

# 4. Compute durations for IQR-based outlier detection.
# Only the header is read here (sf.info), not the audio samples.
durations = []
for fp in all_files:
    info = sf.info(fp)
    durations.append(info.frames / info.samplerate)
dur_arr = np.array(durations)

# 5. Compute IQR thresholds (Tukey fences at 1.5 * IQR).
# The fences are derived from the pooled durations of both directories.
q1, q3 = np.percentile(dur_arr, [25, 75])
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
print(f'Duration IQR: Q1={q1:.3f}, Q3={q3:.3f}, lower={lower_fence:.3f}, upper={upper_fence:.3f}')

# 6. Process each file: skip outliers, resample, and save
TARGET_SR = 250000  # Hz; every output file is written at this rate

processed = 0
skipped = 0
renamed = 0
used_names = set()  # output file names already written during this run

for fp, dur in zip(all_files, dur_arr):
    if dur < lower_fence or dur > upper_fence:
        skipped += 1
        continue

    # load & resample to TARGET_SR
    # NOTE(review): librosa.load defaults to mono=True, so multi-channel
    # recordings would be downmixed here -- confirm that is intended.
    y, _ = librosa.load(fp, sr=TARGET_SR)

    # The output directory is flat, so two inputs with the same basename in
    # different subdirectories would overwrite each other. Disambiguate with
    # a numeric suffix placed before the extension; the file-name prefix
    # (e.g. "NOISE") is left untouched.
    stem, ext = os.path.splitext(os.path.basename(fp))
    out_name = stem + ext
    suffix = 0
    while out_name in used_names:
        suffix += 1
        out_name = f'{stem}_{suffix}{ext}'
    if suffix:
        renamed += 1
        print(f'Name collision: {fp} saved as {out_name}')
    used_names.add(out_name)

    out_fp = os.path.join(output_dir, out_name)
    sf.write(out_fp, y, TARGET_SR)
    processed += 1

print(f'\nSkipped {skipped} outliers')
print(f'Processed {processed} files ({renamed} renamed to avoid name collisions)')

# 7. Final statistics on the output directory
# NOTE: this scans every .wav currently in output_dir (non-recursive), so
# files left there by earlier runs are included in the statistics.
out_files = glob.glob(os.path.join(output_dir, '*.wav'))
sr_list = []
dur_list = []

for fp in out_files:
    info = sf.info(fp)
    sr_list.append(info.samplerate)
    dur_list.append(info.frames / info.samplerate)

sr_arr = np.array(sr_list)
dur_arr = np.array(dur_list)

if dur_arr.size == 0:
    # mean/percentile of an empty array would warn or raise; report instead.
    print(f'\nNo .wav files found in {output_dir!r}; nothing to summarise.')
else:
    # 8. Print sampling rate distribution (rate -> number of files)
    u_sr, cnt_sr = np.unique(sr_arr, return_counts=True)
    print('\nSampling Rate Distribution (after resample):')
    for sr, cnt in zip(u_sr, cnt_sr):
        print(f'  {sr} Hz: {cnt} files')

    # 9. Print duration statistics and any remaining outliers
    # The fences are recomputed from the output files themselves.
    mean_d = dur_arr.mean()
    med_d = np.median(dur_arr)
    std_d = dur_arr.std()
    q1_d, q3_d = np.percentile(dur_arr, [25, 75])
    iqr_d = q3_d - q1_d
    lf_d = q1_d - 1.5*iqr_d
    uf_d = q3_d + 1.5*iqr_d
    outliers_d = dur_arr[(dur_arr<lf_d)|(dur_arr>uf_d)]

    print('\nDuration Statistics (seconds) after processing:')
    print(f'  Mean  : {mean_d:.4f}')
    print(f'  Median: {med_d:.4f}')
    print(f'  Std   : {std_d:.4f}')
    print(f'  Q1    : {q1_d:.4f}, Q3: {q3_d:.4f}, IQR: {iqr_d:.4f}')
    print(f'  Lower : {lf_d:.4f}, Upper: {uf_d:.4f}')
    print(f'  Remaining outliers: {outliers_d.size} of {dur_arr.size} files')

Duration IQR: Q1=0.850, Q3=1.350, lower=0.100, upper=2.100

Skipped 27 outliers

Sampling Rate Distribution (after resample):
  250000 Hz

Duration Statistics (seconds) after processing:
  Mean  : 2.2228
  Median: 2.0152
  Std   : 1.2115
  Q1    : 1.5500, Q3: 2.4417, IQR: 0.8917
  Lower : 0.2125, Upper: 3.7792


In [2]:
import os
import glob
import soundfile as sf

# 1. Define directory
audio_dir = 'XXXX'


def _total_duration_sec(paths):
    """Return the summed duration, in seconds, of the given audio files.

    Each file's header is read exactly once via ``sf.info``; the audio
    samples themselves are not decoded.
    """
    total = 0.0
    for fp in paths:
        info = sf.info(fp)
        total += info.frames / info.samplerate
    return total


# 2. Collect all .wav files
audio_files = glob.glob(os.path.join(audio_dir, '**', '*.wav'), recursive=True)

# 3. Classify by filename: files starting with "NOISE" are negative, the rest are positive
negative_files = [fp for fp in audio_files if os.path.basename(fp).startswith('NOISE')]
positive_files = [fp for fp in audio_files if not os.path.basename(fp).startswith('NOISE')]

# 4. Sum durations
total_negative = _total_duration_sec(negative_files)
total_positive = _total_duration_sec(positive_files)
total_duration = total_positive + total_negative

# 5. Compute ratio negative/positive (infinite when there is no positive audio)
ratio = total_negative / total_positive if total_positive > 0 else float('inf')

# 6. Print summary
print("=== Summary by Class ===")
print(f"Total audio duration:                      {total_duration:.2f} sec")
print(f"Total positive (USV) audio duration:       {total_positive:.2f} sec")
print(f"Total negative (noise) audio duration:     {total_negative:.2f} sec")
print(f"Noise-to-positive duration ratio:          {ratio:.1f}×")


=== Summary by Class ===
Total audio duration:                      5216.86 sec
Total positive (USV) audio duration:       723.39 sec
Total negative (noise) audio duration:     4493.47 sec
Noise-to-positive duration ratio:          6.2×


In [3]:
# === Configuration ===
ROOT_DIR             = 'XXXX'
WAV_PATTERN          = ROOT_DIR + '/**/*.wav'   # recursive glob pattern
DATASET_CSV          = 'MarmAudio_dataset.csv'
DATASET_NAME         = 'MarmAudio'

WINDOW_LENGTH_SEC    = 0.220    # 220 ms windows
STEP_SEC             = 0.110    # 50% overlap
TRAIN_FRAC           = 0.70
VAL_FRAC             = 0.20
TEST_FRAC            = 0.10

# Heuristic target class fractions (for sampling).
# The target is the midpoint between the observed class share and a
# perfectly balanced 50/50 split.
# NOTE(review): 0.138 appears to be the duration-based positive share rather
# than a window-count share -- confirm which one is intended.
ACTUAL_POS_FRAC      = 0.138   # ≈ 13.8% positive
ACTUAL_NEG_FRAC      = 0.862   # ≈ 86.2% negative
TARGET_POS_FRAC      = (ACTUAL_POS_FRAC + 0.50) / 2   # ≈ 31.9%
TARGET_NEG_FRAC      = (ACTUAL_NEG_FRAC + 0.50) / 2   # ≈ 68.1%
NEG_PER_POS          = TARGET_NEG_FRAC / TARGET_POS_FRAC   # negatives kept per positive

RANDOM_SEED          = 42

# Sanity checks: catch inconsistent edits to the values above before any
# audio is touched. A non-positive step would make windowing never advance.
assert WINDOW_LENGTH_SEC > 0, "WINDOW_LENGTH_SEC must be positive"
assert STEP_SEC > 0, "STEP_SEC must be positive"
assert abs(TRAIN_FRAC + VAL_FRAC + TEST_FRAC - 1.0) < 1e-9, \
    "TRAIN_FRAC + VAL_FRAC + TEST_FRAC must sum to 1"
assert abs(ACTUAL_POS_FRAC + ACTUAL_NEG_FRAC - 1.0) < 1e-9, \
    "ACTUAL_POS_FRAC + ACTUAL_NEG_FRAC must sum to 1"
assert 0 < TARGET_POS_FRAC < 1, "TARGET_POS_FRAC must lie strictly between 0 and 1"

# === 1. Imports ===
import glob
import os
import random
import pandas as pd
import soundfile as sf
from sklearn.model_selection import train_test_split

# === 2. Reproducibility ===
random.seed(RANDOM_SEED)

# === 3. Gather all WAV files ===
# glob returns paths in filesystem order. Sort them so the row order of the
# window table -- and therefore any seeded sampling done on it -- is the same
# on every machine.
all_wavs = sorted(glob.glob(WAV_PATTERN, recursive=True))

# === 4. Generate fixed-length windows and assign label by filename prefix ===
rows = []
for wav_path in all_wavs:
    try:
        # Only the length and sample rate are needed, so read the header
        # instead of decoding the whole file.
        info = sf.info(wav_path)
    except Exception as e:
        print(f"Skipping {wav_path}: {e}")
        continue
    sr = info.samplerate
    n_frames = info.frames

    # Work in whole frames: repeatedly adding STEP_SEC as a float accumulates
    # rounding error and can drop the final window of a file whose length is
    # an exact multiple of the step.
    win_frames = int(round(WINDOW_LENGTH_SEC * sr))
    step_frames = max(1, int(round(STEP_SEC * sr)))

    # Label: files starting with "NOISE" are negative (0), all others are positive (1)
    basename = os.path.basename(wav_path)
    label = 0 if basename.startswith('NOISE') else 1

    # Every window that fits entirely inside the file; files shorter than one
    # window yield an empty range and contribute no rows.
    for start_frame in range(0, n_frames - win_frames + 1, step_frames):
        t = start_frame / sr
        rows.append({
            'file_name': basename,
            'start'    : t,
            'duration' : WINDOW_LENGTH_SEC,
            'end'      : t + WINDOW_LENGTH_SEC,
            'usv'      : label,
            'dataset'  : DATASET_NAME
        })

# An empty table has no 'usv' column; fail with a clear message instead.
if not rows:
    raise RuntimeError(f"No windows generated from pattern {WAV_PATTERN!r}")

df = pd.DataFrame(rows)
print("Window generation complete:")
print(df['usv'].value_counts(), "windows (0=noise, 1=usv)")

# === 5. Window-count–based negative sampling ===
# Keep every USV window; draw noise windows at a rate of NEG_PER_POS per USV
# window, capped by how many noise windows actually exist.
usv_mask = df['usv'] == 1
noise_mask = df['usv'] == 0

pos_df = df[usv_mask]
n_neg_needed = min(int(len(pos_df) * NEG_PER_POS), int(noise_mask.sum()))
neg_df = df[noise_mask].sample(n=n_neg_needed, random_state=RANDOM_SEED)

# === 6. Combine and shuffle ===
# Stack both classes, permute the rows with the fixed seed, then renumber.
stacked = pd.concat([pos_df, neg_df])
shuffled = stacked.sample(frac=1, random_state=RANDOM_SEED)
combined = shuffled.reset_index(drop=True)

# === 7. Split train/val/test with stratification ===
# NOTE(review): the split is made per row of `combined`, stratified only on
# 'usv'. If rows are overlapping windows cut from the same recording, windows
# of one file can end up in different splits (possible leakage). Consider a
# group-aware split keyed on 'file_name' -- confirm with the dataset owner.
train_df, temp_df = train_test_split(
    combined, train_size=TRAIN_FRAC,
    stratify=combined['usv'], random_state=RANDOM_SEED
)
# Share of the remaining (val + test) rows that goes to validation.
val_frac_adjusted = VAL_FRAC / (VAL_FRAC + TEST_FRAC)
val_df, test_df = train_test_split(
    temp_df, train_size=val_frac_adjusted,
    stratify=temp_df['usv'], random_state=RANDOM_SEED
)

# Tag each subset with its split name. `.assign` returns a new frame, which
# avoids writing into the frames returned by train_test_split (a common
# source of pandas' SettingWithCopyWarning).
train_df = train_df.assign(type='train')
val_df = val_df.assign(type='val')
test_df = test_df.assign(type='test')

# === 8. Save dataset to CSV ===
dataset = pd.concat([train_df, val_df, test_df], ignore_index=True)
dataset = dataset[['file_name', 'start', 'duration', 'end', 'usv', 'type', 'dataset']]

# DATASET_CSV may be a bare file name, in which case dirname() is '' and
# os.makedirs('') raises FileNotFoundError. Only create a directory when the
# path actually has one.
csv_dir = os.path.dirname(DATASET_CSV)
if csv_dir:
    os.makedirs(csv_dir, exist_ok=True)
dataset.to_csv(DATASET_CSV, index=False)

# === 9. Report ===
print("Split proportions (train/val/test):")
print(dataset['type'].value_counts(normalize=True))
print("Class balance (0=noise, 1=usv):")
print(dataset['usv'].value_counts(normalize=True))


Window generation complete:
usv
0    38447
1     5565
Name: count, dtype: int64 windows (0=noise, 1=usv)
Split proportions (train/val/test):
type
train    0.699971
val      0.200000
test     0.100029
Name: proportion, dtype: float64
Class balance (0=noise, 1=usv):
usv
0    0.680997
1    0.319003
Name: proportion, dtype: float64
