In [1]:
import pandas as pd
import numpy as np
import soundfile as sf
import librosa

In [28]:
df = pd.read_csv("audio/train.csv")
sample = df.iloc[100]["filename"]

In [44]:
path = f"audio/train/{sample}"
sr = 44100
n_fft = int(0.025 * sr)
hop_length = int(0.010 * sr)
n_mels = 128
y, _ = librosa.load(path, sr=sr)
S = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, window='hamming')
log_S = librosa.power_to_db(S, ref=np.max)

In [53]:
patch_size = 16
overlap    = 6
stride     = patch_size - overlap   # 10 :contentReference[oaicite:5]{index=5}

# dimensiones del espectrograma
n_freq, n_time = log_S.shape        # (128, ~100·t)

# número de parches por dimensión
n_patches_freq = (n_freq  - patch_size) // stride + 1
n_patches_time = (n_time - patch_size) // stride + 1

# total de parches N = n_patches_freq * n_patches_time
N = n_patches_freq * n_patches_time  # debería coincidir con 12·ceil((100t−16)/10) :contentReference[oaicite:6]{index=6}


In [54]:
from numpy.lib.stride_tricks import sliding_window_view

# crea una vista con ventanas de 16×16 :contentReference[oaicite:7]{index=7}
windows = sliding_window_view(
    log_S,
    window_shape=(patch_size, patch_size)
)

# windows tiene forma (n_freq-15, n_time-15, 16, 16);
# muestreamos cada stride-ésimo elemento
patches = windows[
    ::stride,        # freq
    ::stride,        # time
    :, :             # dentro de cada ventana
]
# aplanamos los dos primeros ejes en uno solo:
patches = patches.reshape(-1, patch_size, patch_size)  # (N, 16, 16)


## saving

In [None]:
import numpy as np
import pandas as pd
import librosa
import h5py
from numpy.lib.stride_tricks import sliding_window_view

# --- Parámetros de audio y parches ---
sr         = 44100
n_fft      = int(0.025 * sr)
hop_length = int(0.010 * sr)
n_mels     = 128
patch_size = 16
overlap    = 6
stride     = patch_size - overlap
duration   = 3.0  # segundos

# --- Leer CSV y preparar HDF5 ---
df        = pd.read_csv("audio/train.csv")
audio_dir = "audio/train"
label_cols = [c for c in df.columns if c != "filename"]

num_samples = len(df)
# podemos calcular N (número de parches) a partir de un ejemplo:
y, _       = librosa.load(f"{audio_dir}/{df.iloc[0]['filename']}", sr=sr, duration=duration)
S          = librosa.feature.melspectrogram(y=y, sr=sr,
                                            n_fft=n_fft, hop_length=hop_length,
                                            n_mels=n_mels, window="hamming")
log_S      = librosa.power_to_db(S, ref=np.max)
windows  = sliding_window_view(log_S, window_shape=(patch_size, patch_size))
ws       = windows[::stride, ::stride]     # shape = (n_wf, n_wt,16,16)
n_wf, n_wt = ws.shape[:2]
N        = n_wf * n_wt


num_classes = len(label_cols)

# Creamos el archivo HDF5 y los datasets vacíos
h5f = h5py.File("ast_dataset.h5", "w")
dset_data   = h5f.create_dataset(
    "data",
    shape=(num_samples, N, patch_size, patch_size),
    dtype=np.float32,
    chunks=(1, N, patch_size, patch_size),  # chunk por muestra
    compression="lzf"
)
dset_labels = h5f.create_dataset(
    "labels",
    shape=(num_samples, num_classes),
    dtype=np.int8,
    chunks=(1, num_classes),
    compression="lzf"
)

# --- Procesamiento en “chunks” de filas ---
chunk_size = 1000
for start in range(0, num_samples, chunk_size):
    end = min(start + chunk_size, num_samples)
    print(f"Procesando muestras {start}–{end-1}")
    for i in range(start, end):
        row = df.iloc[i]
        # 1) carga y pad
        y, _ = librosa.load(f"{audio_dir}/{row['filename']}", sr=sr, duration=duration)
        if len(y) < int(sr * duration):
            y = np.pad(y, (0, int(sr * duration) - len(y)))
        # 2) mel → dB
        S     = librosa.feature.melspectrogram(
                    y=y, sr=sr,
                    n_fft=n_fft, hop_length=hop_length,
                    n_mels=n_mels, window="hamming"
               )
        log_S = librosa.power_to_db(S, ref=np.max)
        # 3) extraer parches
        windows = sliding_window_view(log_S, window_shape=(patch_size, patch_size))
        patches = windows[::stride, ::stride]
        patches = patches.reshape(-1, patch_size, patch_size)  # (N,16,16)
        # 4) extraer etiquetas multi‑hot
        labels = row[label_cols].values.astype(np.int8)        # [0/1,…]

        # 5) escribir en HDF5
        dset_data[i, ...]   = patches
        dset_labels[i, ...] = labels

# Cerramos
h5f.close()
print("¡Listo! Tienes ast_dataset.h5 con:")
"""
  data:   62191 × 348 × 16 × 16
  labels: 62191 × 42
"""


In [6]:
import h5py
import torch
from torch.utils.data import Dataset, DataLoader

class H5AudioDataset(Dataset):
    def __init__(self, h5_path):
        self.h5f = h5py.File(h5_path, "r")
        self.X = self.h5f["data"]
        self.Y = self.h5f["labels"]
    def __len__(self):
        return self.X.shape[0]
    def __getitem__(self, idx):
        x = torch.from_numpy(self.X[idx]).float()   # (N,16,16)
        y = torch.from_numpy(self.Y[idx]).float()   # (num_classes,)
        return x, y

# Carga y DataLoader
ds     = H5AudioDataset("ast_dataset.h5")
loader = DataLoader(ds, batch_size=32, shuffle=True, num_workers=4)

for patches, targets in loader:
    print(patches.shape, targets.shape)



torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) torch.Size([32, 42])
torch.Size([32, 348, 16, 16]) t