In [7]:
import os
import torch
import torchaudio
import tarfile
import wandb
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import gc
from torch.utils.data import Dataset
from torchaudio.datasets import GTZAN
from torch.utils.data import DataLoader
import torchaudio.transforms as tt
from torch.utils.data import random_split
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedShuffleSplit
from torch.utils.data import Subset

In [2]:
# Configuración de la semilla para reproducibilidad
random_seed = 42
torch.manual_seed(random_seed)

# Parámetros de configuración inicial
samplerate = 22050
init_batch_size = 20
init_num_epochs = 10
init_lr = 0.0005
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Función para extraer el género de cada archivo
def parse_genres(fname):
    parts = fname.split('/')[-1].split('.')[0]
    return parts

# Clase para cargar y manejar el dataset
class MusicDataset(Dataset):
    def __init__(self, root):
        super().__init__()
        self.root = root
        self.files = []
        for c in os.listdir(root):
            # Agrega cada archivo .wav a la lista con su path completo
            self.files += [os.path.join(root, c, fname) for fname in os.listdir(os.path.join(root, c)) if fname.endswith('.wav')]
        self.classes = list(set(parse_genres(fname) for fname in self.files))

    def __len__(self):
        return len(self.files)

    def __getitem__(self, idx):
        fpath = self.files[idx]
        genre = parse_genres(fpath)
        class_idx = self.classes.index(genre)
        audio = torchaudio.load(fpath)[0]
        return audio, class_idx

# Cargar el dataset
data_dir = './genres_5sec'
dataset = MusicDataset(data_dir)

# Crear los pesos de las clases para balancear el dataset
labels = [dataset[i][1] for i in range(len(dataset))]
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weights = torch.tensor(class_weights, dtype=torch.float)

In [4]:
# Dividir el dataset en entrenamiento, validación y prueba usando StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=random_seed)
for train_idx, temp_idx in split.split(range(len(dataset)), labels):
    train_dataset = Subset(dataset, train_idx)
    temp_dataset = Subset(dataset, temp_idx)

val_test_labels = [labels[i] for i in temp_idx]
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=random_seed)
for val_idx, test_idx in split.split(temp_idx, val_test_labels):
    val_dataset = Subset(dataset, [temp_idx[i] for i in val_idx])
    test_dataset = Subset(dataset, [temp_idx[i] for i in test_idx])

# Creación de data loaders para cada conjunto
train_loader = DataLoader(train_dataset, batch_size=init_batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=init_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=init_batch_size, shuffle=False)

In [5]:
# Define a model class to experiment with different architectures
class ExperimentNN(nn.Module):
    def __init__(self, input_size, num_classes, layer_size, layers):
        super(ExperimentNN, self).__init__()
        print(f"Initializing model with {layers} layers and {layer_size} layer size")
        self.layer_sizes = layer_size
        self.fc_layers = nn.ModuleList()

        for layer in range(layers - 1):
            if layer == 0:
                self.fc_layers.append(nn.Linear(input_size, layer_size))
            else:
                self.fc_layers.append(nn.Linear(layer_size, layer_size))
        
        # Output layer
        self.fc_layers.append(nn.Linear(layer_size, num_classes))

    def forward(self, x):
        print("Running forward pass...")
        for fc in self.fc_layers[:-1]:  # Skip last layer
            x = F.relu(fc(x))
        x = self.fc_layers[-1](x)  # Output layer (no activation)
        return x

def train_model(model, criterion, optimizer, epochs, train_loader, val_loader, device):
    print("Starting training...")
    best_loss = float("inf")
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}...")

        # Training loop
        model.train()
        train_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(1), labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        train_loss /= len(train_loader)
        print(f"Train loss: {train_loss:.4f}")

        # Validation step
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs.squeeze(1), labels)
                val_loss += loss.item()
        val_loss /= len(val_loader)
        print(f"Validation loss: {val_loss:.4f}")

        if val_loss < best_loss:
            best_loss = val_loss
            print(f"New best validation loss: {best_loss:.4f}")

        # Clear cache and collect garbage
        gc.collect()
        torch.cuda.empty_cache()

    return best_loss

In [8]:
# Define the search space for hyperopt
layers = [2, 3, 5, 6, 9]
sizes = [32, 64, 128, 256]

best_val_loss = float("inf")
learning_rate = 0.0005
weight_decay = 1e-4
best_model = None
input_size = 22050*5
num_classes=10

# Definir los parámetros del modelo
input_size = 22050 * 5  # Ejemplo: 22050 muestras por 5 segundos de audio
num_classes = 10        # Número de clases (géneros musicales)
layer_size = 64         # Tamaño de las capas ocultas
layers = 3              # Número total de capas (incluyendo la capa de salida)


# Experiment loop
for layer in range(layers):
    for size in sizes:
        print(f"Experimenting with {layer} layers and {size} hidden layer size...")
        # Initialize the model
        model = ExperimentNN(input_size, num_classes, layer_size, layers).to(device)

        # Set the criterion and optimizer
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

        validation_loss = train_model(model, criterion, optimizer, 10, train_loader, val_loader, device)

        print(f"Validation loss for {layer} layers and {size} hidden units: {validation_loss}")

        if validation_loss < best_val_loss:
            best_val_loss = validation_loss
            best_model = model

print(f"Best model found with validation loss: {best_val_loss}")

Experimenting with 0 layers and 32 hidden layer size...
Initializing model with 3 layers and 64 layer size
Starting training...
Epoch 1/10...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Running forward pass...
Train loss: 2.2903

In [15]:
# Evaluación final en el conjunto de test
print("Evaluating best model on test set...")
model.eval()
test_loss = 0.0
correct = 0
total = len(test_loader.dataset)
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)

        # Eliminar la dimensión innecesaria
        outputs = outputs.squeeze(1)

        print(f"outputs shape after squeeze: {outputs.shape}")  # Imprimir la nueva forma

        loss = criterion(outputs, labels)  # Usar outputs directamente sin squeeze
        test_loss += loss.item()

        # Obtener las predicciones (ahora outputs debe tener la forma [batch_size, num_classes])
        predicted = outputs.argmax(dim=1)  # Obtiene el índice de la clase predicha

        # Asegurarse de que predicted y labels tengan las mismas dimensiones
        correct += (predicted == labels).sum().item()

test_loss /= len(test_loader)
accuracy = 100 * correct / total

print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {accuracy:.2f}%")

Evaluating best model on test set...
Running forward pass...
outputs shape after squeeze: torch.Size([20, 10])
Running forward pass...
outputs shape after squeeze: torch.Size([20, 10])
Running forward pass...
outputs shape after squeeze: torch.Size([20, 10])
Running forward pass...
outputs shape after squeeze: torch.Size([20, 10])
Running forward pass...
outputs shape after squeeze: torch.Size([20, 10])
Running forward pass...
outputs shape after squeeze: torch.Size([20, 10])
Running forward pass...
outputs shape after squeeze: torch.Size([20, 10])
Running forward pass...
outputs shape after squeeze: torch.Size([9, 10])
Test Loss: 2.7520, Test Accuracy: 17.45%
