In [1]:
import sys
sys.path.append('../')  # Adjust the path as necessary
from src.data.librisubset import LibriSampledDataset

In [2]:
# Absolute, machine-specific path handed to LibriSampledDataset as its sample list.
file_path = '/scratch/f006pq6/projects/asr-grad-reconstruction/samples/samples_below_4s_bucket_500_all_minh.txt'

# Very wide length bounds and no transforms -- presumably so that no sample is
# filtered out and the raw items are returned (confirm against LibriSampledDataset).
dataset = LibriSampledDataset(
    file_path,
    min_length=0,
    max_length=100_000_000,
    transform=None,
    target_transform=None,
)

In [3]:
import h5py
import os

# Assume `dataset` is your PyTorch dataset object
def save_dataset_item_as_hdf5(dataset, output_dir):
    '''
    Write every item of ``dataset`` to its own HDF5 file inside ``output_dir``.

    Each item must unpack into five values
    ``(numpy_array, text, file_path, duration, sampling_rate)``. Item ``idx`` is
    stored as ``<output_dir>/dataset_item_<idx>.h5`` with one HDF5 dataset per
    field, named ``array``, ``text``, ``file_path``, ``duration`` and
    ``sampling_rate``. Files are opened in ``'w'`` mode, so a file left over
    from a previous run is replaced. One progress line is printed per item.

    Args:
        dataset: Object supporting ``len()`` and integer indexing.
        output_dir: Directory that receives the files; created if missing.

    Returns:
        None.
    '''
    # exist_ok=True replaces the separate exists()/makedirs() pair, which could
    # fail if the directory appeared between the check and the creation.
    os.makedirs(output_dir, exist_ok=True)

    total_samples = len(dataset)

    # Iterate over the dataset
    for idx in range(total_samples):
        # Get the data from the dataset
        numpy_array, text, file_path, duration, sampling_rate = dataset[idx]

        # Create the filename for the HDF5 file (e.g., dataset_item_0.h5, dataset_item_1.h5, ...)
        hdf5_filename = os.path.join(output_dir, f'dataset_item_{idx}.h5')

        # Write the data to an HDF5 file
        with h5py.File(hdf5_filename, 'w') as hdf5_file:
            # Audio samples, stored with whatever dtype/shape the array already has
            hdf5_file.create_dataset('array', data=numpy_array)

            # Transcript and source path are stored as variable-length strings
            hdf5_file.create_dataset('text', data=text, dtype=h5py.string_dtype())
            hdf5_file.create_dataset('file_path', data=file_path, dtype=h5py.string_dtype())

            # Scalar metadata
            hdf5_file.create_dataset('duration', data=duration)
            hdf5_file.create_dataset('sampling_rate', data=sampling_rate)

        print(f'Saved item {idx} to {hdf5_filename}')

# Export every item of `dataset` into its own HDF5 file.
# The target directory is a relative path, so it is resolved against the
# notebook's current working directory.
output_dir = 'output_hdf5_files'
save_dataset_item_as_hdf5(dataset=dataset, output_dir=output_dir)


/scratch/f006pq6/datasets/.deep-speaker-wd/LibriSpeech/train-other-500/3557/8342/3557-8342-0013.wav
Saved item 0 to output_hdf5_files/dataset_item_0.h5
/scratch/f006pq6/datasets/.deep-speaker-wd/LibriSpeech/train-clean-360/4133/6541/4133-6541-0027.wav
Saved item 1 to output_hdf5_files/dataset_item_1.h5
/scratch/f006pq6/datasets/.deep-speaker-wd/LibriSpeech/train-clean-360/986/129388/986-129388-0101.wav
Saved item 2 to output_hdf5_files/dataset_item_2.h5
/scratch/f006pq6/datasets/.deep-speaker-wd/LibriSpeech/train-other-500/726/124445/726-124445-0117.wav
Saved item 3 to output_hdf5_files/dataset_item_3.h5
/scratch/f006pq6/datasets/.deep-speaker-wd/LibriSpeech/train-other-500/51/121055/51-121055-0201.wav
Saved item 4 to output_hdf5_files/dataset_item_4.h5
/scratch/f006pq6/datasets/.deep-speaker-wd/LibriSpeech/train-other-500/5725/50452/5725-50452-0020.wav
Saved item 5 to output_hdf5_files/dataset_item_5.h5
/scratch/f006pq6/datasets/.deep-speaker-wd/LibriSpeech/train-other-500/3798/16981/

In [10]:
from torch.utils.data import Dataset
class HDF5SampledDataset(Dataset):
    '''
    Dataset backed by a directory of per-item HDF5 files.

    Every ``.h5`` file in ``hdf5_dir`` is expected to contain the datasets
    ``array``, ``text``, ``file_path``, ``duration`` and ``sampling_rate``.
    Only files whose stored ``duration`` satisfies
    ``min_length <= duration < max_length`` are exposed through indexing.

    Files are ordered naturally, i.e. digit runs in the file name are compared
    as numbers, so ``dataset_item_2.h5`` comes before ``dataset_item_10.h5``.

    Indexing returns ``(audio, text)`` after the optional transforms.
    '''

    def __init__(self, hdf5_dir, min_length=0, max_length=100000, transform=None, target_transform=None):
        '''
        Scan ``hdf5_dir`` and keep the files whose duration is within bounds.

        Args:
            hdf5_dir: Directory containing the ``.h5`` files.
            min_length: Inclusive lower bound on the stored ``duration``.
            max_length: Exclusive upper bound on the stored ``duration``.
            transform: Optional callable applied to the audio array.
            target_transform: Optional callable applied to the text.
        '''
        self.hdf5_dir = hdf5_dir
        self.min_length = min_length
        self.max_length = max_length
        self._transform = transform
        self._target_transform = target_transform

        # Natural ordering keeps index i aligned with "..._i.h5" beyond 10 files;
        # plain sorted() would place "..._10.h5" before "..._2.h5".
        self.hdf5_files = sorted(
            (name for name in os.listdir(hdf5_dir) if name.endswith('.h5')),
            key=self._natural_key,
        )

        # Every file is opened once here to read its duration for filtering.
        self.filtered_files = []
        for hdf5_file in self.hdf5_files:
            with h5py.File(os.path.join(hdf5_dir, hdf5_file), 'r') as hdf5:
                duration = hdf5['duration'][()]
            if self.min_length <= duration < self.max_length:
                self.filtered_files.append(hdf5_file)

    @staticmethod
    def _natural_key(name):
        '''Sort key that compares digit runs in ``name`` numerically.'''
        import re

        # re.split with a capturing group yields text at even positions and the
        # captured digit runs at odd positions, so two keys never compare a
        # str against an int.
        pieces = re.split(r'([0-9]+)', name)
        key = [int(piece) if pos % 2 else piece for pos, piece in enumerate(pieces)]
        # The raw name breaks ties such as "a1" vs "a01" deterministically.
        return key, name

    @staticmethod
    def _as_str(value):
        '''Return ``value`` as ``str``, decoding UTF-8 when it is ``bytes``.'''
        # Depending on the h5py version a string scalar is read back as bytes
        # or as str; only bytes need decoding.
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def __len__(self):
        '''Number of files that passed the duration filter.'''
        return len(self.filtered_files)

    def __getitem__(self, index):
        '''
        Load item ``index`` from disk.

        Returns:
            Tuple ``(audio, text)`` with the transforms applied when they are set.
        '''
        hdf5_path = os.path.join(self.hdf5_dir, self.filtered_files[index])

        # Only the fields that are returned are read from the file.
        with h5py.File(hdf5_path, 'r') as hdf5:
            audio = hdf5['array'][:]
            text = self._as_str(hdf5['text'][()])

        # Apply optional transformations to audio
        if self._transform is not None:
            audio = self._transform(audio)

        # Apply optional transformations to target text
        if self._target_transform is not None:
            text = self._target_transform(text)

        return audio, text

In [11]:
# Reload the exported items from disk, keeping only those whose stored
# duration d satisfies 1 <= d < 1000. Note that this rebinds `dataset`.
dataset = HDF5SampledDataset(hdf5_dir='output_hdf5_files', min_length=1, max_length=1000)

In [14]:
# Inspect the first item of the filtered dataset.
# NOTE(review): HDF5SampledDataset.__getitem__ returns (audio, text), yet the
# recorded output below shows four elements -- it appears to be stale from an
# earlier revision of the class; re-run this cell to refresh it.
dataset[0]

(array([ 77,  58,  59, ..., 107, -62,   8], dtype=int16),
 'i have',
 '/scratch/f006pq6/datasets/.deep-speaker-wd/LibriSpeech/train-other-500/3557/8342/3557-8342-0013.wav',
 310)