In [None]:
import json
import math
import os
import tempfile
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
from skimage.transform import resize

# ──────────────────────────────────────────────────────────────────────
# constants (unchanged)
# ──────────────────────────────────────────────────────────────────────
SR            = 22_050          # target sample rate in Hz; load_audio resamples to this
SEGMENT_LEN   = 4 * SR          # samples per segment (4 seconds at SR)
N_MELS        = 128             # number of Mel bands (spectrogram height)
N_FFT         = 1024            # FFT window size passed to melspectrogram
HOP_LENGTH    = 512             # hop between STFT frames, in samples
TARGET_FRAMES = 128             # spectrograms are resized to this many time frames
MAX_PAD_RATIO = 0.65            # discard if >65 % padding

# ──────────────────────────────────────────────────────────────────────
# helper functions (unchanged)
# ──────────────────────────────────────────────────────────────────────
def load_audio(path: Path) -> np.ndarray:
    """Read an audio file and return it as a mono float32 waveform at ``SR`` Hz.

    Multi-channel input is down-mixed by averaging over the channel axis; the
    signal is resampled with librosa only when the file's native rate is not ``SR``.
    """
    samples, native_sr = sf.read(str(path))

    is_multichannel = samples.ndim > 1
    if is_multichannel:
        samples = samples.mean(axis=1)

    waveform = (
        samples
        if native_sr == SR
        else librosa.resample(samples, orig_sr=native_sr, target_sr=SR)
    )
    return waveform.astype(np.float32)

def wav_to_logmel(wav: np.ndarray) -> np.ndarray:
    """Turn a waveform into a float32 log-Mel patch of shape (N_MELS, TARGET_FRAMES).

    The power Mel spectrogram is converted to dB relative to its own maximum
    and is resized only when its frame count differs from TARGET_FRAMES.
    """
    power_mel = librosa.feature.melspectrogram(
        y=wav,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        power=2.0,
    )
    db_mel = librosa.power_to_db(power_mel, ref=np.max)

    # Fast path: the spectrogram already has the requested number of frames.
    if db_mel.shape[1] == TARGET_FRAMES:
        return db_mel.astype(np.float32)

    resized = resize(
        db_mel,
        (N_MELS, TARGET_FRAMES),
        mode="reflect",
        preserve_range=True,
        anti_aliasing=True,
    )
    return resized.astype(np.float32)

# ──────────────────────────────────────────────────────────────────────
# NEW – end-to-end preprocessing with safe atomic write + fixed dB range
# ──────────────────────────────────────────────────────────────────────
def preprocess_logmel_to_npy_normalised(
    wav_dir: str | Path,
    out_npy_file: str | Path,
    out_stats_json: str | Path | None = None,
) -> np.ndarray:
    """
    Convert every ``*.wav`` under `wav_dir` into normalised log-Mel patches.

    Each file is cut into SEGMENT_LEN-sample chunks; a short tail is zero-padded,
    or skipped when more than MAX_PAD_RATIO of it would be padding. Every chunk
    becomes a (1, N_MELS, TARGET_FRAMES) log-Mel patch. The stacked array is
    clamped to [-80, 0] dB, mapped linearly to [-1, 1] and saved atomically.

    Parameters
    ----------
    wav_dir : str | Path
        Directory searched recursively for ``*.wav`` files.
    out_npy_file : str | Path
        Destination ``.npy`` file; parent directories are created if missing.
    out_stats_json : str | Path | None
        Optional path for a JSON file describing the fixed scaling range.
        Nothing is written when this is None (the default).

    Returns
    -------
    np.ndarray
        The normalised array that was saved, stacked along a new first axis.

    Raises
    ------
    RuntimeError
        If no usable segment was found under `wav_dir`.
    """
    wav_dir = Path(wav_dir)
    specs: list[np.ndarray] = []

    # sorted(): rglob order is filesystem-dependent; a fixed order keeps the
    # saved array (and any seeded split made from it) reproducible.
    for wav_path in sorted(wav_dir.rglob("*.wav")):
        audio = load_audio(wav_path)
        n_chunks = math.ceil(len(audio) / SEGMENT_LEN)

        for i in range(n_chunks):
            chunk = audio[i * SEGMENT_LEN:(i + 1) * SEGMENT_LEN]
            missing = SEGMENT_LEN - len(chunk)

            if missing > 0:
                if missing / SEGMENT_LEN > MAX_PAD_RATIO:
                    continue  # skip overly short tail
                chunk = np.pad(chunk, (0, missing), mode="constant")

            specs.append(wav_to_logmel(chunk)[None, ...])  # add channel axis

    if not specs:
        raise RuntimeError(f"No usable segments found in {wav_dir}")

    data = np.stack(specs, axis=0)

    # ── Fixed dB-range normalisation to [-1,1] ────────────────────
    floor_db = -80.0
    ceil_db = 0.0
    data = np.clip(data, floor_db, ceil_db)
    data = (data - floor_db) / (ceil_db - floor_db)   # → [0,1]
    data = data * 2.0 - 1.0                           # → [-1,1]

    # ── Atomic save – write to tmp then rename ─────────────────────
    out_npy_file = Path(out_npy_file)
    out_npy_file.parent.mkdir(parents=True, exist_ok=True)
    tmp_name = None
    try:
        with tempfile.NamedTemporaryFile(
            dir=out_npy_file.parent, suffix=".npy", delete=False
        ) as tmp:
            tmp_name = tmp.name
            # Write through the open handle so flush/fsync act on the bytes
            # just written (re-opening by name fails on Windows).
            np.save(tmp, data)
            tmp.flush()
            os.fsync(tmp.fileno())
        os.replace(tmp_name, out_npy_file)  # atomic on POSIX
    except BaseException:
        # Do not leave a partial temp file next to the real outputs.
        if tmp_name is not None and os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise

    # ── Save fixed-range stats only when the caller asked for them ─
    if out_stats_json is not None:
        out_stats_json = Path(out_stats_json)
        out_stats_json.parent.mkdir(parents=True, exist_ok=True)
        with open(out_stats_json, "w") as f:
            json.dump({
                "db_floor": floor_db,
                "db_ceil":  ceil_db,
                "scaled_min": -1.0,
                "scaled_max":  1.0,
                "shape":      list(data.shape),
                "dtype":      str(data.dtype),
            }, f, indent=2)

    print(f"✅ Saved {data.shape[0]} segments → {out_npy_file}")
    # Only mention the JSON file when one was actually written.
    stats_note = f"  JSON→ {out_stats_json}" if out_stats_json is not None else ""
    print(f"   Stats  dB range=[{floor_db},{ceil_db}]{stats_note}")
    return data


In [None]:
# Build the normalised log-Mel array for the WAVs under fake/ljspeech_full_band_melgan.
# The call is the cell's last expression, so the notebook also echoes the returned array.
preprocess_logmel_to_npy_normalised(
    wav_dir="/home/ml/Documents/voice/data_wavefake/fake/ljspeech_full_band_melgan",
    out_npy_file="/home/ml/Documents/voice/ResData/wavefake128/fake/normalized_ljspeech_full_band_melgan.npy"
)


In [None]:
# Build the normalised log-Mel array for the WAVs under fake/ljspeech_hifiGAN.
preprocess_logmel_to_npy_normalised(
    wav_dir="/home/ml/Documents/voice/data_wavefake/fake/ljspeech_hifiGAN",
    out_npy_file="/home/ml/Documents/voice/ResData/wavefake128/fake/normalized_ljspeech_hifiGAN.npy"
)


In [None]:
# Build the normalised log-Mel array for the WAVs under fake/ljspeech_melgan.
preprocess_logmel_to_npy_normalised(
    wav_dir="/home/ml/Documents/voice/data_wavefake/fake/ljspeech_melgan",
    out_npy_file="/home/ml/Documents/voice/ResData/wavefake128/fake/normalized_ljspeech_melgan.npy"
)


In [None]:
# Build the normalised log-Mel array for the WAVs under fake/ljspeech_melgan_large.
preprocess_logmel_to_npy_normalised(
    wav_dir="/home/ml/Documents/voice/data_wavefake/fake/ljspeech_melgan_large",
    out_npy_file="/home/ml/Documents/voice/ResData/wavefake128/fake/normalized_ljspeech_melgan_large.npy"
)


In [None]:
# Build the normalised log-Mel array for the WAVs under fake/ljspeech_multi_band_melgan.
preprocess_logmel_to_npy_normalised(
    wav_dir="/home/ml/Documents/voice/data_wavefake/fake/ljspeech_multi_band_melgan",
    out_npy_file="/home/ml/Documents/voice/ResData/wavefake128/fake/normalized_ljspeech_multi_band_melgan.npy"
)


In [None]:
# Build the normalised log-Mel array for the WAVs under fake/ljspeech_parallel_wavegan.
preprocess_logmel_to_npy_normalised(
    wav_dir="/home/ml/Documents/voice/data_wavefake/fake/ljspeech_parallel_wavegan",
    out_npy_file="/home/ml/Documents/voice/ResData/wavefake128/fake/normalized_ljspeech_parallel_wavegan.npy"
)

In [None]:
# Build the normalised log-Mel array for the WAVs under fake/ljspeech_waveglow.
preprocess_logmel_to_npy_normalised(
    wav_dir="/home/ml/Documents/voice/data_wavefake/fake/ljspeech_waveglow",
    out_npy_file="/home/ml/Documents/voice/ResData/wavefake128/fake/normalized_ljspeech_waveglow.npy"
)

In [None]:
# Build the normalised log-Mel array for the real recordings under real/wavs.
# Unlike the fake subsets, the output goes directly into wavefake128/ (no fake/ sub-folder).
preprocess_logmel_to_npy_normalised(
    wav_dir="/home/ml/Documents/voice/data_wavefake/real/wavs",
    out_npy_file="/home/ml/Documents/voice/ResData/wavefake128/normalized_real.npy"
)

In [None]:
import numpy as np
# Sanity check: memory-map a saved split (read-only) and report its shape,
# dtype and value range. min()/max() scan the whole mapped file.
arr = np.load("/home/ml/Documents/voice/ResData/wavefake128_split/test/real.npy",
              mmap_mode='r')
print(arr.shape, arr.dtype, arr.min(), arr.max())
# Recorded output of an earlier run: (21537, 1, 128, 128) float32 0.0 1.0
# NOTE(review): a [0, 1] range differs from the [-1, 1] scaling applied by
# preprocess_logmel_to_npy_normalised — confirm which pipeline wrote this file.


# Min-Max Normalization

In [None]:
# ─────────────────────────────────────────────────────────────
# CONFIG – change only these three lines
# ─────────────────────────────────────────────────────────────
IN_FILE    = "/home/ml/Documents/voice/ResData/wavefake/real.npy"
OUT_FILE   = "/home/ml/Documents/voice/ResData/wavefake/normalized_real.npy"
# NOTE(review): relative path – the file lands in the kernel's working
# directory; cells that load the params must point at the same location.
PARAMS_NPZ = "train_raw_norm_params.npz"
# ─────────────────────────────────────────────────────────────

import numpy as np
from numpy.lib.format import open_memmap      # writes a real .npy header
from pathlib import Path

# 1) mem-map the big array (no RAM spike)
x_map = np.load(IN_FILE, mmap_mode="r")
print("Shape :", x_map.shape, "dtype :", x_map.dtype)

# 2) global extrema on the *training* split
x_min, x_max = float(x_map.min()), float(x_map.max())
print(f"global min {x_min:.3f}   max {x_max:.3f}")
denom = x_max - x_min
assert denom > 1e-12, "min and max are equal – check the data!"

# 3) create **proper .npy file with header** that is also a mem-map
out_shape = x_map.shape
out_map   = open_memmap(
    OUT_FILE,
    mode="w+",                 # create for read/write
    dtype=np.float32,
    shape=out_shape
)
print("Writing to", OUT_FILE, "…")

# 4) normalise in mini-batches
BATCH = 512
N = out_shape[0]

for start in range(0, N, BATCH):
    end = min(start + BATCH, N)
    # Named `batch`, not `slice`: a module-level `slice` would shadow the
    # builtin for every later cell in this kernel session.
    batch = x_map[start:end].astype(np.float32)
    batch = 2.0 * (batch - x_min) / denom - 1.0     # → (-1 … +1)
    out_map[start:end] = batch
    if (start // BATCH) % 50 == 0:
        print(f"processed {end}/{N}")

out_map.flush()          # ensure data hits disk
print("✓ finished writing", OUT_FILE)

# 5) store scale params so other splits can reuse the training range
np.savez(PARAMS_NPZ, xmin=x_min, xmax=x_max)
print("✓ saved scale params to", PARAMS_NPZ)



In [None]:
# ─────────────────────────────────────────────────────────────
# CONFIG  –  adjust the three paths below
# ─────────────────────────────────────────────────────────────
VAL_IN   = "/home/ml/Documents/voice/ResData/log-mel-eval-aggregate/real.npy"             # raw   (N,1,H,W) log-Mel values
VAL_OUT  = "/home/ml/Documents/voice/ResData/log-mel-eval-aggregate/normalized_real.npy"  # scaled copy  –1…+1  (same shape)
PARAMS   = "train_raw_norm_params.npz"  # the file you saved earlier
# ─────────────────────────────────────────────────────────────

import numpy as np
from numpy.lib.format import open_memmap
from pathlib import Path

# 1) load the two scale parameters learned on *training*
params = np.load(PARAMS)
xmin, xmax = float(params["xmin"]), float(params["xmax"])
denom      = xmax - xmin
assert denom > 1e-12, "xmin == xmax – scale parameters look wrong"

print(f"using xmin={xmin:.3f}  xmax={xmax:.3f}  (denom={denom:.3f})")

# 2) memory-map the big validation array (read-only)
x_val = np.load(VAL_IN, mmap_mode="r")           # shape (N,1,H,W)
print("val shape :", x_val.shape, "dtype :", x_val.dtype)

# 3) create the output as a real .npy file (header + data) that is also a
#    mem-map. A bare np.memmap would write header-less raw bytes, which
#    np.load cannot read back even though the file is named *.npy.
val_norm = open_memmap(
    VAL_OUT,
    mode="w+",
    dtype=np.float32,
    shape=x_val.shape,
)

# 4) stream-normalise in mini-batches to keep RAM low
BATCH = 512                                       # tune per GPU/RAM
N      = x_val.shape[0]

for start in range(0, N, BATCH):
    end = min(start + BATCH, N)
    chunk = x_val[start:end].astype(np.float32)    # load small slice

    # Min-max transform → (-1 … +1); values outside the training range
    # fall outside [-1, 1] because no clipping is applied here.
    chunk = 2.0 * (chunk - xmin) / denom - 1.0

    val_norm[start:end] = chunk                    # write back
    if (start // BATCH) % 50 == 0:
        print(f"processed {end}/{N}")

val_norm.flush()   # make sure data is written to disk
print("✓ saved normalised validation file to", VAL_OUT)


# Normalized Directory

In [None]:
# Normalise every .npy file under SRC_DIR with the saved global min/max and
# write the results to a sibling folder that mirrors the directory layout.
from pathlib import Path
import numpy as np
from tqdm.auto import tqdm

# ───────────────────────── CONFIG ──────────────────────────
# 1) Where are the raw log-Mel .npy files?
SRC_DIR = Path('/home/ml/Documents/voice/ResData/log-mel-data-train')      # <── change me

# 2) Where is the file that holds xmin / xmax?
PARAMS_FILE = Path('/home/ml/Documents/voice/ResPreprocess/train_raw_norm_params.npz')

# (No need to edit anything below unless you want to)
# ───────────────────────────────────────────────────────────

# → Destination folder:  normalized_<old-folder-name>, created next to SRC_DIR
DST_DIR = SRC_DIR.parent / f'normalized_{SRC_DIR.name}'
DST_DIR.mkdir(parents=True, exist_ok=True)
print(f'Normalised files will be saved to:  {DST_DIR}\n')

# ── Load global min / max ─────────────────────────────────
with np.load(PARAMS_FILE) as f:
    xmin = f['xmin'].item()   # scalar, e.g. -80.0
    xmax = f['xmax'].item()   # scalar, e.g. -1.1e-5

denom = xmax - xmin
if np.isclose(denom, 0):
    raise ValueError('xmax and xmin are identical — cannot normalise.')

print(f'Using xmin = {xmin}, xmax = {xmax}  ➜  scale = {denom}\n')

# ── Walk through every .npy (sub-folders included) ─────────
# list(...) materialises the file list up front so tqdm can show a total.
for npy_path in tqdm(list(SRC_DIR.rglob('*.npy')), desc='Normalising'):
    x = np.load(npy_path)   # each file is loaded fully into memory

    # min-max:   x' = (x − xmin) / (xmax − xmin)
    # NOTE(review): this maps to [0, 1] with clipping, whereas the single-file
    # min-max cells map to [-1, 1] without clipping — confirm this is intended.
    x_norm = (x - xmin) / denom
    x_norm = np.clip(x_norm, 0.0, 1.0).astype(np.float32)   # keep in [0,1]

    # Mirror the original folder structure inside DST_DIR
    out_path = DST_DIR / npy_path.relative_to(SRC_DIR)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    np.save(out_path, x_norm)

print('\n✅  All files normalised.')


In [None]:
# Same directory normalisation as for the training folder, applied to the
# evaluation "fake" folder; only SRC_DIR differs.
from pathlib import Path
import numpy as np
from tqdm.auto import tqdm

# ───────────────────────── CONFIG ──────────────────────────
# 1) Where are the raw log-Mel .npy files?
SRC_DIR = Path('/home/ml/Documents/voice/ResData/log-mel-eval/fake')      # <── change me

# 2) Where is the file that holds xmin / xmax?
PARAMS_FILE = Path('/home/ml/Documents/voice/ResPreprocess/train_raw_norm_params.npz')

# (No need to edit anything below unless you want to)
# ───────────────────────────────────────────────────────────

# → Destination folder:  normalized_<old-folder-name>, created next to SRC_DIR
DST_DIR = SRC_DIR.parent / f'normalized_{SRC_DIR.name}'
DST_DIR.mkdir(parents=True, exist_ok=True)
print(f'Normalised files will be saved to:  {DST_DIR}\n')

# ── Load global min / max ─────────────────────────────────
with np.load(PARAMS_FILE) as f:
    xmin = f['xmin'].item()   # scalar, e.g. -80.0
    xmax = f['xmax'].item()   # scalar, e.g. -1.1e-5

denom = xmax - xmin
if np.isclose(denom, 0):
    raise ValueError('xmax and xmin are identical — cannot normalise.')

print(f'Using xmin = {xmin}, xmax = {xmax}  ➜  scale = {denom}\n')

# ── Walk through every .npy (sub-folders included) ─────────
# list(...) materialises the file list up front so tqdm can show a total.
for npy_path in tqdm(list(SRC_DIR.rglob('*.npy')), desc='Normalising'):
    x = np.load(npy_path)   # each file is loaded fully into memory

    # min-max:   x' = (x − xmin) / (xmax − xmin)
    # Values outside the stored range are clipped, so the output stays in [0, 1].
    x_norm = (x - xmin) / denom
    x_norm = np.clip(x_norm, 0.0, 1.0).astype(np.float32)   # keep in [0,1]

    # Mirror the original folder structure inside DST_DIR
    out_path = DST_DIR / npy_path.relative_to(SRC_DIR)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    np.save(out_path, x_norm)

print('\n✅  All files normalised.')


# Split Dataset

In [None]:
import numpy as np

def split_npy_file(
    input_path: str,
    train_path: str,
    test_path: str,
    train_ratio: float = 0.8,
    seed: int | None = None
) -> tuple[tuple[int,...], tuple[int,...]]:
    """
    Split a .npy dataset into train/test and save them.

    Parameters
    ----------
    input_path : str
        Path to the original .npy file (shape: N×1×128×128).
    train_path : str
        Where to save the training split (will be .npy).
    test_path : str
        Where to save the test split (will be .npy).
    train_ratio : float, default=0.8
        Fraction of samples to go into the training set.
    seed : int or None
        RNG seed for reproducible shuffling.

    Returns
    -------
    train_shape : tuple
        Shape of the saved training array.
    test_shape : tuple
        Shape of the saved test array.
    """
    data = np.load(input_path)
    N = data.shape[0]
    m = int(N * train_ratio)

    # shuffle indices
    rng = np.random.default_rng(seed)
    idx = rng.permutation(N)

    train_idx = idx[:m]
    test_idx  = idx[m:]

    train_data = data[train_idx]
    test_data  = data[test_idx]

    np.save(train_path, train_data)
    np.save(test_path,  test_data)

    return train_data.shape, test_data.shape

# Example usage:
# train_shape, test_shape = split_npy_file(
#     'all_data.npy',
#     'train_data.npy',
#     'test_data.npy',
#     train_ratio=0.8,
#     seed=42
# )
# print(f"Train: {train_shape}, Test: {test_shape}")


In [17]:
# 80/20 split (seed 42) of the full-band MelGAN array into train/ and test/ files.
train_shape, test_shape = split_npy_file(
    '/home/ml/Documents/voice/ResData/wavefake32/fake/normalized_ljspeech_full_band_melgan.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/train/fake/ljspeech_full_band_melgan.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/test/fake/ljspeech_full_band_melgan.npy',
    train_ratio=0.8,
    seed=42
)
print(f"Train: {train_shape}, Test: {test_shape}")

Train: (35369, 1, 64, 64), Test: (8843, 1, 64, 64)


In [18]:
# 80/20 split (seed 42) of the HiFi-GAN array into train/ and test/ files.
train_shape, test_shape = split_npy_file(
    '/home/ml/Documents/voice/ResData/wavefake32/fake/normalized_ljspeech_hifiGAN.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/train/fake/ljspeech_hifiGAN.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/test/fake/ljspeech_hifiGAN.npy',
    train_ratio=0.8,
    seed=42
)
print(f"Train: {train_shape}, Test: {test_shape}")

Train: (35311, 1, 64, 64), Test: (8828, 1, 64, 64)


In [19]:
# 80/20 split (seed 42) of the MelGAN array into train/ and test/ files.
train_shape, test_shape = split_npy_file(
    '/home/ml/Documents/voice/ResData/wavefake32/fake/normalized_ljspeech_melgan.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/train/fake/ljspeech_melgan.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/test/fake/ljspeech_melgan.npy',
    train_ratio=0.8,
    seed=42
)
print(f"Train: {train_shape}, Test: {test_shape}")

Train: (35369, 1, 64, 64), Test: (8843, 1, 64, 64)


In [20]:
# 80/20 split (seed 42) of the Parallel WaveGAN array into train/ and test/ files.
train_shape, test_shape = split_npy_file(
    '/home/ml/Documents/voice/ResData/wavefake32/fake/normalized_ljspeech_parallel_wavegan.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/train/fake/ljspeech_parallel_wavegan.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/test/fake/ljspeech_parallel_wavegan.npy',
    train_ratio=0.8,
    seed=42
)
print(f"Train: {train_shape}, Test: {test_shape}")

Train: (35369, 1, 64, 64), Test: (8843, 1, 64, 64)


In [21]:
# 80/20 split (seed 42) of the MelGAN-large array into train/ and test/ files.
train_shape, test_shape = split_npy_file(
    '/home/ml/Documents/voice/ResData/wavefake32/fake/normalized_ljspeech_melgan_large.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/train/fake/ljspeech_melgan_large.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/test/fake/ljspeech_melgan_large.npy',
    train_ratio=0.8,
    seed=42
)
print(f"Train: {train_shape}, Test: {test_shape}")

Train: (35369, 1, 64, 64), Test: (8843, 1, 64, 64)


In [22]:
# 80/20 split (seed 42) of the multi-band MelGAN array into train/ and test/ files.
train_shape, test_shape = split_npy_file(
    '/home/ml/Documents/voice/ResData/wavefake32/fake/normalized_ljspeech_multi_band_melgan.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/train/fake/ljspeech_multi_band_melgan.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/test/fake/ljspeech_multi_band_melgan.npy',
    train_ratio=0.8,
    seed=42
)
print(f"Train: {train_shape}, Test: {test_shape}")

Train: (35369, 1, 64, 64), Test: (8843, 1, 64, 64)


In [23]:
# 80/20 split (seed 42) of the WaveGlow array into train/ and test/ files.
train_shape, test_shape = split_npy_file(
    '/home/ml/Documents/voice/ResData/wavefake32/fake/normalized_ljspeech_waveglow.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/train/fake/ljspeech_waveglow.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/test/fake/ljspeech_waveglow.npy',
    train_ratio=0.8,
    seed=42
)
print(f"Train: {train_shape}, Test: {test_shape}")

Train: (35369, 1, 64, 64), Test: (8843, 1, 64, 64)


In [24]:
# 80/20 split (seed 42) of the real-speech array; outputs go directly under
# train/ and test/ rather than into a fake/ sub-folder.
train_shape, test_shape = split_npy_file(
    '/home/ml/Documents/voice/ResData/wavefake32/normalized_real.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/train/real.npy',
    '/home/ml/Documents/voice/ResData/wavefake32_2048split/test/real.npy',
    train_ratio=0.8,
    seed=42
)
print(f"Train: {train_shape}, Test: {test_shape}")

Train: (35349, 1, 64, 64), Test: (8838, 1, 64, 64)


# Alternative Preprocess

- Goal: higher frequency-domain resolution — `N_FFT` is raised from 1024 to 2048 and audio is resampled to 16 kHz.

In [1]:
import math
import os
import tempfile
from pathlib import Path

import librosa
import numpy as np
import soundfile as sf
from skimage.transform import resize

# ──────────────────────────────────────────────────────────────────────
# constants  (changed vs. the 22.05 kHz pipeline: SR → 16 kHz, hence SEGMENT_LEN, and N_FFT 1024 → 2048)
# ──────────────────────────────────────────────────────────────────────
SR            = 16_000          # ← resample target is now 16 kHz
SEGMENT_LEN   = 4 * SR          # 4-second chunks = 64 000 samples
N_MELS        = 128             # number of Mel bands (spectrogram height)
N_FFT         = 2048            # FFT window size passed to melspectrogram
HOP_LENGTH    = 512             # hop between STFT frames, in samples
TARGET_FRAMES = 128             # frames per tile / resize target
MAX_PAD_RATIO = 0.65            # tails needing more than 65 % padding are skipped


def load_audio(path: Path) -> np.ndarray:
    """Load an audio file as a mono float32 signal sampled at ``SR`` Hz."""
    signal, file_sr = sf.read(str(path))

    # Down-mix: average across the channel axis when more than one is present.
    mono = signal.mean(axis=1) if signal.ndim > 1 else signal

    # Nothing to resample when the file already uses the target rate.
    if file_sr == SR:
        return mono.astype(np.float32)

    resampled = librosa.resample(mono, orig_sr=file_sr, target_sr=SR)
    return resampled.astype(np.float32)


def wav_to_logmel(wav: np.ndarray) -> np.ndarray:
    """Compute a float32 log-Mel spectrogram with exactly TARGET_FRAMES columns.

    The dB values are relative to the spectrogram's own maximum; the result is
    resized to (N_MELS, TARGET_FRAMES) whenever the frame count differs.
    """
    mel_kwargs = dict(
        y=wav,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        power=2.0,
    )
    spec_db = librosa.power_to_db(
        librosa.feature.melspectrogram(**mel_kwargs), ref=np.max
    )

    frame_count = spec_db.shape[1]
    if frame_count != TARGET_FRAMES:
        target_shape = (N_MELS, TARGET_FRAMES)
        spec_db = resize(
            spec_db,
            target_shape,
            mode="reflect",
            preserve_range=True,
            anti_aliasing=True,
        )

    return spec_db.astype(np.float32)

def full_logmel(wav: np.ndarray) -> np.ndarray:
    """
    Compute the log-Mel spectrogram of a whole waveform without resizing.

    Returns a float32 array of shape (N_MELS, T_frames) in dB, measured
    relative to the spectrogram's own maximum.
    """
    power_spec = librosa.feature.melspectrogram(
        y=wav,
        sr=SR,
        n_fft=N_FFT,
        hop_length=HOP_LENGTH,
        n_mels=N_MELS,
        power=2.0,
    )
    db_spec = librosa.power_to_db(power_spec, ref=np.max)
    return db_spec.astype(np.float32)


def preprocess_logmel_to_npy_normalised(
    wav_dir: str | Path,
    out_npy_file: str | Path,
    out_stats_json: str | Path | None = None,
) -> np.ndarray:
    """
    Tile whole-file log-Mel spectrograms into normalised patches and save them.

    For every ``*.wav`` under `wav_dir` the full log-Mel spectrogram is computed
    once and cut along time into tiles of TARGET_FRAMES frames. A short last
    tile is padded with the floor dB value, or skipped when more than
    MAX_PAD_RATIO of it would be padding. The stacked tiles are clamped to
    [-80, 0] dB, mapped linearly to [-1, 1] and saved atomically.

    Parameters
    ----------
    wav_dir : str | Path
        Directory searched recursively for ``*.wav`` files.
    out_npy_file : str | Path
        Destination ``.npy`` file; parent directories are created if missing.
    out_stats_json : str | Path | None
        Optional path for a JSON file describing the fixed scaling range.
        Nothing is written when this is None (the default).

    Returns
    -------
    np.ndarray
        The normalised array that was saved; each entry along the first axis
        is one (1, N_MELS, TARGET_FRAMES) tile.

    Raises
    ------
    RuntimeError
        If no usable tile was found under `wav_dir`.
    """
    # One definition of the dB range, shared by padding and scaling.
    floor_db, ceil_db = -80.0, 0.0

    wav_dir = Path(wav_dir)
    tiles: list[np.ndarray] = []

    # sorted(): rglob order is filesystem-dependent; a fixed order keeps the
    # saved array (and any seeded split made from it) reproducible.
    for wav_path in sorted(wav_dir.rglob("*.wav")):
        wav = load_audio(wav_path)
        logmel = full_logmel(wav)          # (N_MELS, T)
        n_frames = logmel.shape[1]
        n_chunks = math.ceil(n_frames / TARGET_FRAMES)

        for i in range(n_chunks):
            tile = logmel[:, i * TARGET_FRAMES:(i + 1) * TARGET_FRAMES]
            missing = TARGET_FRAMES - tile.shape[1]

            if missing > 0:
                if missing / TARGET_FRAMES > MAX_PAD_RATIO:
                    continue  # skip overly short tail
                # pad with the floor dB value so “silence” looks right
                tile = np.pad(
                    tile,
                    pad_width=((0, 0), (0, missing)),
                    mode="constant",
                    constant_values=floor_db,
                )
            tiles.append(tile[None, ...])   # add channel axis

    if not tiles:
        raise RuntimeError(f"No usable segments found in {wav_dir}")

    data = np.stack(tiles, axis=0)

    # ── fixed-range scaling to [-1,1] ──────────────────────────
    data = np.clip(data, floor_db, ceil_db)
    data = (data - floor_db) / (ceil_db - floor_db)   # → [0,1]
    data = data * 2.0 - 1.0                          # → [-1,1]

    # ── atomic save ────────────────────────────────────────────
    out_npy_file = Path(out_npy_file)
    out_npy_file.parent.mkdir(parents=True, exist_ok=True)
    tmp_name = None
    try:
        with tempfile.NamedTemporaryFile(dir=out_npy_file.parent,
                                         suffix=".npy", delete=False) as tmp:
            tmp_name = tmp.name
            # Write through the open handle so flush/fsync act on the bytes
            # just written (re-opening by name fails on Windows).
            np.save(tmp, data)
            tmp.flush()
            os.fsync(tmp.fileno())
        os.replace(tmp_name, out_npy_file)
    except BaseException:
        # Do not leave a partial temp file next to the real outputs.
        if tmp_name is not None and os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise

    # ── optional stats file (only when the caller asked for one) ─
    if out_stats_json is not None:
        import json  # local import: this cell's import block has no json

        out_stats_json = Path(out_stats_json)
        out_stats_json.parent.mkdir(parents=True, exist_ok=True)
        with open(out_stats_json, "w") as f:
            json.dump({
                "db_floor": floor_db,
                "db_ceil":  ceil_db,
                "scaled_min": -1.0,
                "scaled_max":  1.0,
                "shape":      list(data.shape),
                "dtype":      str(data.dtype),
            }, f, indent=2)

    print(f"✅ Saved {data.shape[0]} segments → {out_npy_file}")
    return data


In [None]:
# Build the normalised arrays for every WaveFake subset (seven fake subsets,
# then the real recordings) with one loop instead of eight pasted calls.
WAVEFAKE_SRC = Path("/home/ml/Documents/voice/data_wavefake")
WAVEFAKE_DST = Path("/home/ml/Documents/voice/ResData/wavefake32")

FAKE_SUBSETS = [
    "ljspeech_full_band_melgan",
    "ljspeech_hifiGAN",
    "ljspeech_melgan",
    "ljspeech_melgan_large",
    "ljspeech_multi_band_melgan",
    "ljspeech_parallel_wavegan",
    "ljspeech_waveglow",
]

for subset in FAKE_SUBSETS:
    preprocess_logmel_to_npy_normalised(
        wav_dir=WAVEFAKE_SRC / "fake" / subset,
        out_npy_file=WAVEFAKE_DST / "fake" / f"normalized_{subset}.npy",
    )

# Real speech goes directly under the destination root (no fake/ sub-folder).
# The trailing semicolon stops the notebook from echoing the returned array.
preprocess_logmel_to_npy_normalised(
    wav_dir=WAVEFAKE_SRC / "real" / "wavs",
    out_npy_file=WAVEFAKE_DST / "normalized_real.npy",
);

In [3]:
# Build the normalised log-Mel array for the real evaluation recordings under data_train/eval/real.
# The call is the cell's last expression, so the notebook echoes the returned array below.
preprocess_logmel_to_npy_normalised(
    wav_dir="/home/ml/Documents/voice/data_train/eval/real",
    out_npy_file="/home/ml/Documents/voice/ResData/ASV128/test/real.npy"
)

✅ Saved 7649 segments → /home/ml/Documents/voice/ResData/ASV128/test/real.npy


array([[[[-0.6568533 , -0.7925539 , -1.        , ..., -1.        ,
          -1.        , -1.        ],
         [-0.6125681 , -0.74188995, -1.        , ..., -1.        ,
          -1.        , -1.        ],
         [-0.49124545, -0.45326233, -0.38913214, ..., -1.        ,
          -1.        , -1.        ],
         ...,
         [-1.        , -1.        , -1.        , ..., -1.        ,
          -1.        , -1.        ],
         [-1.        , -1.        , -1.        , ..., -1.        ,
          -1.        , -1.        ],
         [-1.        , -1.        , -1.        , ..., -1.        ,
          -1.        , -1.        ]]],


       [[[-0.65216905, -0.8166157 , -1.        , ..., -1.        ,
          -1.        , -1.        ],
         [-0.54957044, -0.707338  , -1.        , ..., -1.        ,
          -1.        , -1.        ],
         [-0.35180247, -0.39018136, -0.28695142, ..., -1.        ,
          -1.        , -1.        ],
         ...,
         [-1.        , -1.      