Construction of the NABat dataset

In [1]:
# 1. Imports
import os
import glob
import soundfile as sf
import numpy as np

# 2. Define positive and negative directories
pos_dir = 'XXXX'   # Bat USV
neg_dir = 'XXXX'    # Bat noise

# 3. Collect all .wav files
patterns = [
    os.path.join(pos_dir, '**', '*.wav'),
    os.path.join(neg_dir, '**', '*.wav')
]
file_paths = []
for pattern in patterns:
    file_paths.extend(glob.glob(pattern, recursive=True))

# 4. Read samplerate and duration for each file
durations = []
samplerates = []

for fp in file_paths:
    try:
        info = sf.info(fp)
        sr = info.samplerate
        duration = info.frames / sr
        samplerates.append(sr)
        durations.append(duration)
    except Exception as e:
        print(f'Error reading {fp}: {e}')

dur_arr = np.array(durations)
sr_arr = np.array(samplerates)

# 5. Print sampling rate distribution
unique_srs, counts = np.unique(sr_arr, return_counts=True)
print('Sampling Rate Distribution:')
for sr, cnt in zip(unique_srs, counts):
    print(f'  {sr} Hz')

# 6. Compute duration statistics and IQR-based outliers
mean_dur = dur_arr.mean()
median_dur = np.median(dur_arr)
std_dur = dur_arr.std()
q1, q3 = np.percentile(dur_arr, [25, 75])
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
outliers = dur_arr[(dur_arr < lower_fence) | (dur_arr > upper_fence)]

print('\nDuration Statistics (seconds):')
print(f'  Mean    : {mean_dur:.4f}')
print(f'  Median  : {median_dur:.4f}')
print(f'  Std Dev : {std_dur:.4f}')
print(f'  Q1      : {q1:.4f}')
print(f'  Q3      : {q3:.4f}')
print(f'  IQR     : {iqr:.4f}')
print(f'  Lower   : {lower_fence:.4f}')
print(f'  Upper   : {upper_fence:.4f}')

print('\nOutlier Durations:')
print(f'  Number of outliers: {len(outliers)}')
if len(outliers) > 0:
    print(f'  Min outlier: {outliers.min():.4f} sec')
    print(f'  Max outlier: {outliers.max():.4f} sec')
    print('  First 10 outliers:')
    for val in outliers[:10]:
        print(f'    {val:.4f} sec')


Sampling Rate Distribution:
  192000 Hz
  250000 Hz
  256000 Hz
  300000 Hz
  320000 Hz
  384000 Hz
  500000 Hz

Duration Statistics (seconds):
  Mean    : 4.4721
  Median  : 4.9990
  Std Dev : 2.2880
  Q1      : 2.8940
  Q3      : 5.0000
  IQR     : 2.1060
  Lower   : -0.2650
  Upper   : 8.1590

Outlier Durations:
  Number of outliers: 92
  Min outlier: 8.1800 sec
  Max outlier: 15.0040 sec
  First 10 outliers:
    10.2760 sec
    13.5433 sec
    8.4300 sec
    12.4980 sec
    15.0000 sec
    9.3960 sec
    15.0000 sec
    9.7860 sec
    15.0000 sec
    15.0000 sec


In [2]:
# 1. Imports
import os
import glob
import soundfile as sf
import librosa
import numpy as np

# 2. Define source and target directories
pos_dir = 'XXXX' # Bat USV
neg_dir = 'XXXX' # Bat Noise
output_dir = 'XXXX'

os.makedirs(output_dir, exist_ok=True)

# 3. Collect all WAV files
patterns = [
    os.path.join(pos_dir, '**', '*.wav'),
    os.path.join(neg_dir, '**', '*.wav')
]
all_files = []
for p in patterns:
    all_files.extend(glob.glob(p, recursive=True))

# 4. Compute durations for IQR-based outlier detection
durations = []
for fp in all_files:
    info = sf.info(fp)
    durations.append(info.frames / info.samplerate)
dur_arr = np.array(durations)

# 5. Compute IQR thresholds
q1, q3 = np.percentile(dur_arr, [25, 75])
iqr = q3 - q1
lower_fence = q1 - 1.5 * iqr
upper_fence = q3 + 1.5 * iqr
print(f'Duration IQR: Q1={q1:.3f}, Q3={q3:.3f}, lower={lower_fence:.3f}, upper={upper_fence:.3f}')

# 6. Process each file: skip outliers, resample, and save
processed = 0
skipped = 0

for fp, dur in zip(all_files, dur_arr):
    if dur < lower_fence or dur > upper_fence:
        skipped += 1
        continue

    # load & resample to 250000 Hz
    y, _ = librosa.load(fp, sr=250000)
    out_fp = os.path.join(output_dir, os.path.basename(fp))
    sf.write(out_fp, y, 250000)
    processed += 1

# 7. Final statistics on the output directory
out_files = glob.glob(os.path.join(output_dir, '*.wav'))
sr_list = []
dur_list = []

for fp in out_files:
    info = sf.info(fp)
    sr_list.append(info.samplerate)
    dur_list.append(info.frames / info.samplerate)

sr_arr = np.array(sr_list)
dur_arr = np.array(dur_list)

# 8. Print sampling rate distribution
u_sr, cnt_sr = np.unique(sr_arr, return_counts=True)
print('\nSampling Rate Distribution (after resample):')
for sr, cnt in zip(u_sr, cnt_sr):
    print(f'  {sr} Hz')

# 9. Print duration statistics and any remaining outliers
mean_d = dur_arr.mean()
med_d = np.median(dur_arr)
std_d = dur_arr.std()
q1_d, q3_d = np.percentile(dur_arr, [25, 75])
iqr_d = q3_d - q1_d
lf_d = q1_d - 1.5*iqr_d
uf_d = q3_d + 1.5*iqr_d
outliers_d = dur_arr[(dur_arr<lf_d)|(dur_arr>uf_d)]

print('\nDuration Statistics (seconds) after processing:')
print(f'  Mean  : {mean_d:.4f}')
print(f'  Median: {med_d:.4f}')
print(f'  Std   : {std_d:.4f}')
print(f'  Q1    : {q1_d:.4f}, Q3: {q3_d:.4f}, IQR: {iqr_d:.4f}')
print(f'  Lower : {lf_d:.4f}, Upper: {uf_d:.4f}')

print(f'\nRemaining outliers: {len(outliers_d)}')
if len(outliers_d)>0:
    print(f'  Min: {outliers_d.min():.4f}, Max: {outliers_d.max():.4f}')
    print('  Sample outlier durations:')
    print(outliers_d[:10])


Duration IQR: Q1=2.894, Q3=5.000, lower=-0.265, upper=8.159

Sampling Rate Distribution (after resample):
  250000 Hz

Duration Statistics (seconds) after processing:
  Mean  : 4.0527
  Median: 4.9987
  Std   : 1.4206
  Q1    : 2.6320, Q3: 5.0000, IQR: 2.3680
  Lower : -0.9200, Upper: 8.5520

Remaining outliers: 0


In [3]:
# === Summary by Class (bat vs noise) ===
import os
import glob
import soundfile as sf

# 1. Define the output directory
output_dir = 'XXXX'

# 2. Statistical positive and negative events
bat_files   = glob.glob(os.path.join(output_dir, 'MYLU*.wav'))
noise_files = glob.glob(os.path.join(output_dir, 'NOISE*.wav'))

total_bat   = sum(sf.info(fp).frames / sf.info(fp).samplerate for fp in bat_files)
total_noise = sum(sf.info(fp).frames / sf.info(fp).samplerate for fp in noise_files)
total_audio = total_bat + total_noise

ratio = total_noise / total_bat if total_bat > 0 else float('inf')

# 3. Print outcome
print("=== Summary by Class ===")
print(f"Total bat USV audio duration:   {total_bat:.2f} sec")
print(f"Total noise audio duration: {total_noise:.2f} sec")
print(f"Bat non-USV is {ratio:.1f}× USV duration")


=== Summary by Class ===
Total bat USV audio duration:   4969.00 sec
Total noise audio duration: 1486.87 sec
Bat non-USV is 0.3× USV duration


In [4]:
# === Configuration ===
ROOT_DIR             = 'XXXX'
WAV_PATTERN          = ROOT_DIR + '/**/*.wav'
DATASET_CSV          = 'NABat_dateset.csv'
DATASET_NAME         = 'NABat'

WINDOW_LENGTH_SEC    = 0.220    # 220 ms windows
STEP_SEC             = 0.110    # 50% overlap
TRAIN_FRAC           = 0.70
VAL_FRAC             = 0.20
TEST_FRAC            = 0.10

# Heuristic target class fractions (for sampling)
ACTUAL_POS_FRAC      = 0.77   # ≈ 77% positive files
ACTUAL_NEG_FRAC      = 0.23   # ≈ 23% negative files
TARGET_POS_FRAC      = (ACTUAL_POS_FRAC + 0.50) / 2   # ≈ 63.5%
TARGET_NEG_FRAC      = (ACTUAL_NEG_FRAC + 0.50) / 2   # ≈ 36.5%
NEG_PER_POS          = TARGET_NEG_FRAC / TARGET_POS_FRAC

RANDOM_SEED          = 42

# === 1. Imports ===
import glob
import os
import random
import pandas as pd
import soundfile as sf
from sklearn.model_selection import train_test_split

# === 2. Reproducibility ===
random.seed(RANDOM_SEED)

# === 3. Gather all WAV files ===
all_wavs = glob.glob(WAV_PATTERN, recursive=True)

# === 4. Generate fixed-length windows and assign label by filename prefix ===
rows = []
for wav_path in all_wavs:
    try:
        data, sr = sf.read(wav_path, dtype='float32')
    except Exception as e:
        print(f"Skipping {wav_path}: {e}")
        continue
    total_dur = len(data) / sr
    label = 1 if os.path.basename(wav_path).startswith('MYLU') else 0
    t = 0.0
    while t + WINDOW_LENGTH_SEC <= total_dur:
        rows.append({
            'file_name': os.path.basename(wav_path),
            'start'    : t,
            'duration' : WINDOW_LENGTH_SEC,
            'end'      : t + WINDOW_LENGTH_SEC,
            'usv'      : label,
            'dataset'  : DATASET_NAME
        })
        t += STEP_SEC

df = pd.DataFrame(rows)
print("Window generation complete:")
print(df['usv'].value_counts(), "windows (0=noise, 1=bat)")

# === 5. Window-count–based negative sampling ===
pos_df = df[df['usv'] == 1]
neg_df = df[df['usv'] == 0]

n_neg_needed = int(len(pos_df) * NEG_PER_POS)
n_neg_needed = min(n_neg_needed, len(neg_df))
neg_df = neg_df.sample(n=n_neg_needed, random_state=RANDOM_SEED)

# === 6. Combine and shuffle ===
combined = pd.concat([pos_df, neg_df]).sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

# === 7. Split train/val/test with stratification ===
train_df, temp_df = train_test_split(
    combined, train_size=TRAIN_FRAC,
    stratify=combined['usv'], random_state=RANDOM_SEED
)
val_frac_adjusted = VAL_FRAC / (VAL_FRAC + TEST_FRAC)
val_df, test_df = train_test_split(
    temp_df, train_size=val_frac_adjusted,
    stratify=temp_df['usv'], random_state=RANDOM_SEED
)

for subset, name in [(train_df, 'train'), (val_df, 'val'), (test_df, 'test')]:
    subset['type'] = name

# === 8. Save dataset to CSV ===
dataset = pd.concat([train_df, val_df, test_df], ignore_index=True)
dataset = dataset[['file_name', 'start', 'duration', 'end', 'usv', 'type', 'dataset']]

os.makedirs(os.path.dirname(DATASET_CSV), exist_ok=True)
dataset.to_csv(DATASET_CSV, index=False)

# === 9. Report ===
print("Split proportions (train/val/test):")
print(dataset['type'].value_counts(normalize=True))
print("Class balance (0=noise, 1=bat):")
print(dataset['usv'].value_counts(normalize=True))


Window generation complete:
usv
1    43660
0    12735
Name: count, dtype: int64 windows (0=noise, 1=bat)
Split proportions (train/val/test):
type
train    0.699991
val      0.200000
test     0.100009
Name: proportion, dtype: float64
Class balance (0=noise, 1=bat):
usv
1    0.774182
0    0.225818
Name: proportion, dtype: float64
