In [8]:
import torch
import torchaudio
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence

# Ensure you have torchaudio dataset ready
torchaudio.datasets.LIBRISPEECH(root="./data", download=True)

def prepare_data():
    train_dataset = torchaudio.datasets.LIBRISPEECH(
        root="./data", url="train-clean-100", download=True
    )

    def collate_fn(batch):
        # Pads the audio to the same length for batching
        waveforms, sample_rates = zip(*[(item[0], item[1]) for item in batch])
        lengths = torch.tensor([waveform.size(1) for waveform in waveforms])
        waveforms = nn.utils.rnn.pad_sequence(waveforms, batch_first=True)
        return waveforms, lengths

    train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_fn)
    return train_loader

# Simple Autoencoder model
class AudioAutoencoder(nn.Module):
    def __init__(self):
        super(AudioAutoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(32000, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
        )

        self.decoder = nn.Sequential(
            nn.Linear(512, 1024),
            nn.ReLU(),
            nn.Linear(1024, 32000),
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

def train_autoencoder():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data Preparation
    train_loader = prepare_data()

    # Model, Loss, Optimizer
    model = AudioAutoencoder().to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training Loop
    epochs = 2
    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for waveforms, lengths in train_loader:
            waveforms = waveforms.to(device)

            # Flatten the audio for linear processing (basic setup)
            waveforms_flat = waveforms.view(waveforms.size(0), -1)

            # Forward pass
            outputs = model(waveforms_flat)

            # Compute loss
            loss = criterion(outputs, waveforms_flat)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(train_loader):.4f}")

if __name__ == "__main__":
    train_autoencoder()

RuntimeError: The size of tensor a (244480) must match the size of tensor b (89520) at non-singleton dimension 1