In [18]:
import os
import warnings
import numpy as np
import torch
import torch.nn as nn
import torchaudio
from pathlib import Path
from torch.utils.data import Dataset, DataLoader

# from constants import *

In [5]:
# IPython magic: display the output of plotting commands directly within the
# notebook, below the cell that produced it.
%matplotlib inline

# Suppress every warning whose category is DeprecationWarning.
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [6]:
# Single seed shared by every random number generator seeded below, so the
# generators cannot accidentally be given different values. With a fixed seed
# the sequence of random numbers is the same every time the code is run.
SEED = 999

# Seed NumPy's global random number generator.
np.random.seed(SEED)

# Seed PyTorch's default random number generator.
torch.manual_seed(SEED)

# When running on CUDA, these two flags are needed for determinism: select
# deterministic cuDNN algorithms and disable the cuDNN benchmark auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [7]:
# Ask PyTorch once whether CUDA can be used; everything below derives from it.
GPU_AVAILABLE = torch.cuda.is_available()

# Report which device the training will run on.
print('Training on GPU.' if GPU_AVAILABLE else 'No GPU available, training on CPU.')

# Device handle: CUDA when available, otherwise the CPU.
DEVICE = torch.device('cuda') if GPU_AVAILABLE else torch.device('cpu')

No GPU available, training on CPU.


In [8]:
# Shell escape: run the `nvidia-smi` command-line tool to show the GPU status.
# On a machine where the tool is not installed the shell only reports
# "command not found" (as in the captured output of this cell).
!nvidia-smi

zsh:1: command not found: nvidia-smi


In [13]:
# Audio sampling rate, in samples per second.
SAMPLE_RATE = 48_000

# FFT size: the number of samples covering 64 ms at SAMPLE_RATE (= 3072).
N_FFT = SAMPLE_RATE * 64 // 1000

# Hop between STFT frames: the number of samples covering 16 ms (= 768).
HOP_LENGTH = SAMPLE_RATE * 16 // 1000

In [17]:
# Training split: directory of input recordings and directory of target recordings.
TRAIN_INPUT_DIR, TRAIN_TARGET_DIR = (
    Path('datasets/class_9_train_input'),
    Path('datasets/clean_trainset_28spk_wav'),
)

# Test split: directory of input recordings and directory of target recordings.
TEST_INPUT_DIR, TEST_TARGET_DIR = (
    Path('datasets/class_9_test_input'),
    Path('datasets/clean_testset_wav'),
)

In [16]:
class SpeechDataset(Dataset):
    """
    Paired clean/noisy speech dataset that yields fixed-size STFT tensors.

    For every index the clean and the noisy recording at that position of the
    two (sorted) file lists are loaded, forced to exactly ``max_len`` samples
    (short clips are left-padded with zeros, long clips keep only their first
    ``max_len`` samples) and transformed with a normalized STFT.

    Args:
        clean_files: iterable of paths to the clean recordings.
        noisy_files: iterable of paths to the noisy recordings.
        n_fft: FFT size passed to ``torch.stft``.
        hop_length: hop between successive STFT frames, in samples.
        max_len: number of samples every waveform is padded or cut to.

    Note:
        Files are paired purely by their position after sorting and
        ``__len__`` reports the number of noisy files, so both lists are
        expected to hold matching recordings in the same order.
    """
    def __init__(self, clean_files, noisy_files, n_fft=64, hop_length=16, max_len=165000):
        super().__init__()
        self.clean_files = sorted(clean_files)
        self.noisy_files = sorted(noisy_files)
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.max_len = max_len

    def __len__(self):
        """Return the number of noisy files."""
        return len(self.noisy_files)

    def __getitem__(self, index):
        """
        Return the ``(clean_stft, noisy_stft)`` pair stored at ``index``.

        Each element is a real-valued tensor of shape
        ``(1, n_fft // 2 + 1, n_frames, 2)``; the last dimension holds the
        real and the imaginary part of every STFT coefficient.
        """
        # load wav files to waveform tensors
        x_clean = self.load_sample(self.clean_files[index])
        x_noisy = self.load_sample(self.noisy_files[index])

        # pad / cut both waveforms to the common length
        x_clean = self._prepare_sample(x_clean)
        x_noisy = self._prepare_sample(x_noisy)

        # apply the STFT
        return self._stft(x_clean), self._stft(x_noisy)

    def load_sample(self, file):
        """Load an audio file with torchaudio and return only its waveform tensor."""
        waveform, _ = torchaudio.load(file)
        return waveform

    def _stft(self, waveform):
        """
        Compute the normalized STFT of ``waveform`` as a real-valued tensor.

        ``torch.stft`` requires ``return_complex`` to be given for real inputs,
        so the complex result is requested explicitly and converted with
        ``torch.view_as_real`` into the trailing ``(real, imag)`` layout.
        No window is passed, so ``torch.stft`` falls back to its default
        (all-ones) window.
        """
        spectrum = torch.stft(
            input=waveform,
            n_fft=self.n_fft,
            hop_length=self.hop_length,
            normalized=True,
            return_complex=True,
        )
        return torch.view_as_real(spectrum)

    def _prepare_sample(self, waveform):
        """
        Force a waveform to exactly ``self.max_len`` samples.

        Only the first channel (row 0) of ``waveform`` is used. A clip shorter
        than ``max_len`` is placed at the end of the output, preceded by
        zeros; a longer clip is truncated to its first ``max_len`` samples; an
        empty clip yields an all-zero output.

        Args:
            waveform: 2D tensor indexed as ``(channel, sample)``.

        Returns:
            A ``float32`` tensor of shape ``(1, max_len)``.
        """
        # keep at most max_len samples of the first channel
        samples = waveform[0, :self.max_len]
        kept = samples.shape[0]
        # initialize the output with zeros
        output = torch.zeros((1, self.max_len), dtype=torch.float32)
        # Guard the empty case explicitly: a slice starting at "-0" would
        # select the whole row and make the assignment fail.
        if kept > 0:
            output[0, self.max_len - kept:] = samples
        return output

In [None]:
# Recursively gather every .wav file below each dataset directory, as a list
# in sorted path order (sorted() consumes the rglob generator directly).
train_input_files = sorted(TRAIN_INPUT_DIR.rglob('*.wav'))
train_target_files = sorted(TRAIN_TARGET_DIR.rglob('*.wav'))
test_input_files = sorted(TEST_INPUT_DIR.rglob('*.wav'))
test_target_files = sorted(TEST_TARGET_DIR.rglob('*.wav'))

In [None]:
# Report how many files were found per split. Each label must be paired with
# its own list: the test-target line previously printed the train-target count.
print(f"Train Input Files: {len(train_input_files)}")
print(f"Train Target Files: {len(train_target_files)}")
print(f"Test Input Files: {len(test_input_files)}")
print(f"Test Target Files: {len(test_target_files)}")

In [None]:
# SpeechDataset expects the clean recordings first and the noisy ones second,
# so the target files map to `clean_files` and the input files to `noisy_files`.
train_dataset = SpeechDataset(
    clean_files=train_target_files,
    noisy_files=train_input_files,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
)
test_dataset = SpeechDataset(
    clean_files=test_target_files,
    noisy_files=test_input_files,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
)

In [None]:
# Training batches: two samples each, drawn in a reshuffled order every epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=2, shuffle=True)

# Test batches: one sample each.
# NOTE(review): the test loader is shuffled as well — confirm this is intended,
# since evaluation order then depends on the random seed.
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=True)