In [12]:
import pandas as pd
import numpy as np
import soundfile as sf
import librosa

In [28]:
# Read the training manifest and pick the file name stored at positional
# row 100 to use as a worked example in the cells below.
df = pd.read_csv("audio/train.csv")
sample = df["filename"].iloc[100]

In [44]:
# Framing parameters: a 25 ms analysis window and a 10 ms hop, both
# converted to a whole number of samples at the working rate `sr`.
sr = 44100
n_fft, hop_length = int(0.025 * sr), int(0.010 * sr)
n_mels = 128

# Load the example clip at `sr`; the sample rate returned by librosa is
# discarded because it was requested explicitly.
path = f"audio/train/{sample}"
y, _ = librosa.load(path, sr=sr)

# Mel power spectrogram (Hamming window), then converted to decibels
# relative to the spectrogram's own maximum.
S = librosa.feature.melspectrogram(
    y=y,
    sr=sr,
    n_fft=n_fft,
    hop_length=hop_length,
    n_mels=n_mels,
    window='hamming',
)
log_S = librosa.power_to_db(S, ref=np.max)

In [53]:
# Square patches of side `patch_size`; consecutive patches share `overlap`
# rows/columns, so the step between patch origins is 16 - 6 = 10.
patch_size, overlap = 16, 6
stride = patch_size - overlap

# Spectrogram dimensions: (frequency bins, time frames).
n_freq, n_time = log_S.shape

# Number of patch origins that fit along each axis. Floor division means
# any trailing bins/frames that cannot fill a whole patch are left out.
n_patches_freq, n_patches_time = (
    (extent - patch_size) // stride + 1
    for extent in (n_freq, n_time)
)

# Total number of patches for this clip. With 128 frequency bins the
# frequency axis contributes (128 - 16) // 10 + 1 = 12 of them.
N = n_patches_freq * n_patches_time


In [54]:
from numpy.lib.stride_tricks import sliding_window_view

# View of every patch_size x patch_size window over the spectrogram.
# For the 2-D `log_S` the result has shape
# (n_freq - patch_size + 1, n_time - patch_size + 1, patch_size, patch_size).
windows = sliding_window_view(log_S, (patch_size, patch_size))

# Keep only the windows whose origin falls on the stride grid (every
# `stride`-th position along frequency and along time), then merge the
# two grid axes into a single patch axis: (N, patch_size, patch_size).
patches = windows[::stride, ::stride]
patches = patches.reshape(-1, patch_size, patch_size)


## Saving: build patch tensors and multi-hot labels for the whole training set

In [65]:
import numpy as np
import pandas as pd
import librosa
import torch
from numpy.lib.stride_tricks import sliding_window_view

# --- Audio and patch parameters (same values as the exploratory cells) ---
sr         = 44100
n_fft      = int(0.025 * sr)   # 25 ms analysis window, in samples
hop_length = int(0.010 * sr)   # 10 ms hop, in samples
n_mels     = 128
patch_size = 16
overlap    = 6
stride     = patch_size - overlap
duration   = 3.0               # seconds of audio kept per clip
target_len = int(sr * duration)

# --- Manifest and audio directory ---
df        = pd.read_csv("audio/train.csv")
audio_dir = "audio/train"

# Every column except 'filename' is treated as a label column.
label_cols = [c for c in df.columns if c != "filename"]


def load_fixed_length(path):
    """Load one clip at `sr` and force it to exactly `target_len` samples.

    Parameters
    ----------
    path : str
        Path of the audio file to read.

    Returns
    -------
    y : np.ndarray
        1-D waveform of length `target_len`.
    was_padded : bool
        True when the file was shorter than `duration` seconds and zeros
        were appended at the end.
    """
    y, _ = librosa.load(path, sr=sr, duration=duration)
    was_padded = len(y) < target_len
    if was_padded:
        y = np.pad(y, (0, target_len - len(y)))
    # Guard in case the loader hands back more samples than requested, so
    # every clip yields the same number of spectrogram frames.
    return y[:target_len], was_padded


def log_mel_patches(y):
    """Turn a waveform into a stack of overlapping log-mel patches.

    Computes a mel power spectrogram (Hamming window), converts it to dB
    relative to its own maximum, and cuts it into `patch_size` x `patch_size`
    patches whose origins are `stride` apart on both axes.

    Returns
    -------
    np.ndarray
        Array of shape (N, patch_size, patch_size).
    """
    S = librosa.feature.melspectrogram(
        y=y, sr=sr,
        n_fft=n_fft, hop_length=hop_length,
        n_mels=n_mels, window="hamming"
    )
    log_S = librosa.power_to_db(S, ref=np.max)
    windows = sliding_window_view(log_S, window_shape=(patch_size, patch_size))
    return windows[::stride, ::stride].reshape(-1, patch_size, patch_size)


num_samples = len(df)
if num_samples == 0:
    raise ValueError("audio/train.csv contains no rows; nothing to process")

# The output array is allocated once, as soon as the first clip tells us the
# per-clip patch shape, and then filled in place. This avoids holding a list
# of per-clip arrays *and* the stacked copy at the same time.
# NOTE(review): the array is still num_samples * N * 16 * 16 * 4 bytes; with
# the ~62k rows seen in the recorded output that is likely tens of GB —
# confirm it fits in RAM (or switch to np.memmap / per-shard saving).
data_array = None

for i, filename in enumerate(df["filename"]):
    # Progress line every 100 clips, matching the notebook's recorded output.
    if i % 100 == 0:
        print(f"Procesados {i} audios de {num_samples}")

    # 1) Load and pad/trim to a fixed duration
    y, was_padded = load_fixed_length(f"{audio_dir}/{filename}")
    if was_padded:
        print(f"Audio {filename} es más corto que {duration} segundos, padding...")

    # 2-3) Log-mel spectrogram -> 16x16 patches with overlap 6 (stride 10)
    patches = log_mel_patches(y)

    if data_array is None:
        data_array = np.empty((num_samples, *patches.shape), dtype=np.float32)
    # A clip whose patch shape differs from the first one fails right here
    # instead of after the whole loop has finished.
    data_array[i] = patches   # (N, 16, 16) copied into slot i

# 4) Multi-hot label matrix, taken in one vectorised step: (num_samples, num_classes)
labels_array = df[label_cols].to_numpy(dtype=np.int64)

# 5) Convert to PyTorch tensors
tensor_data   = torch.from_numpy(data_array).float()      # float32 model input
tensor_labels = torch.from_numpy(labels_array).float()    # float targets for BCEWithLogitsLoss

# 6) Save everything in a single .pt file
torch.save(
    {"data": tensor_data, "labels": tensor_labels, "label_cols": label_cols},
    "ast_multilabel_dataset.pt"
)
print("Guardado ast_multilabel_dataset.pt con",
      tensor_data.shape, "y", tensor_labels.shape)


Procesados 0 audios de 62191
Procesados 100 audios de 62191
Procesados 200 audios de 62191
Procesados 300 audios de 62191
Procesados 400 audios de 62191
Procesados 500 audios de 62191
Procesados 600 audios de 62191
Procesados 700 audios de 62191


KeyboardInterrupt: 