In [1]:
import os
import numpy as np
import tensorflow as tf
from typing import Sequence, Optional, Union
import math

2022-08-31 13:34:08.155060: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
class input(object):
    def __init__(self, filePaths:Sequence[str], labels:Sequence[float], startTimes:Optional[Sequence[float]]=None, endTimes:Optional[Sequence[float]]=None, batchSize:Optional[int]=None, duration:Union[float,str]=10, sr:int=4000):
        # Check that data has consistent shape
        cols = [filePaths, labels, startTimes, endTimes]
        if any(len(col)!=len(cols[0]) for col in cols if col is not None):
            raise Exception(f'Expected lists with equal length, but received lists with lengths {[len(col) for col in cols if col is not None]}')
        # Check that startTime and endTime are either both specified or both unspecified (assuming both are either a sequence of float or None)
        if type(startTimes)!=type(endTimes):
            raise Exception(f'startTimes and endTimes must either both have type Sequence[float] or both be None, but received startTimes with type {type(startTimes)} and endTimes with type {type(endTimes)}.')
        # Check that batch size is not larger than the total amount of data
        if batchSize is not None and batchSize > len(filePaths):
            raise Exception(f'batchSize={batchSize} cannot be larger than the length of filePaths')

        self.__filePaths = filePaths
        self.__labels = labels
        self.__startTimes = startTimes if startTimes is not None else [None]*len(filePaths)
        self.__endTimes = endTimes if endTimes is not None else [None]*len(filePaths)

        self.totalSize = len(filePaths)
        self.batchSize = batchSize
        self.numBatches = math.floor(self.totalSize / batchSize) if batchSize is not None else 0
        self.sr = sr
        self.duration = float(duration)

        def dataset(self) -> tf.data.Dataset:
            try:
                out = self.__dataset
            except AttributeError:
                datasetCols = [self.filePaths, self.labels, self.startTimes, self.endTimes]
                self.__dataset = (
                    tf.data.Dataset.from_tensor_slices(datasetCols)
                    .map(self.__preprocessor, num_parallel_calls=tf.data.AUTOTUNE)
                )
            return out

        def __preprocessor(self, filePath, label, startTime, endTime):
            # Load and decode audio file
            encodedAudio = tf.io.read_file(filePath)
            rawAudio, _ = tf.audio.decode_wav(encodedAudio)

            # If specified, extract the sample from the audio file using startTime and endTime.
            # Otherwise, use the entire audio file as the sample.
            if startTime is None or endTime is None:
                audio = rawAudio
            else:
                startInd = math.floor(startTime*self.sr)
                endInd = math.floor(endTime*self.sr)
                audio = rawAudio[startInd:endInd]

            # Resize samples to uniform length, either by padding it with silence or truncating it
            sampleLength = math.floor(self.duration*self.sr)
            buffer = tf.fill(dims=(sampleLength), value=0.)

            originalLength = tf.size(audio)
            trimmedLength = math.floor(self.duration*self.sr)
            startBound = originalLength - trimmedLength
            




In [105]:
data_dir = "data/raw_training/training_data/"
filePath = data_dir + '2530_AV.wav'
label = 1.0
startTime = 0.1
endTime = 0.2

sr = 4000
duration = 0.12

# Load and decode audio file
encodedAudio = tf.io.read_file(filePath)
rawAudio, _ = tf.audio.decode_wav(encodedAudio)

# If specified, extract the sample from the audio file using startTime and endTime.
# Otherwise, use the entire audio file as the sample.
if startTime is None or endTime is None:
    audio = rawAudio
else:
    startInd = math.floor(startTime*sr)
    endInd = math.floor(endTime*sr)
    audio = rawAudio[startInd:endInd]
audio = tf.squeeze(audio)

# Resize samples to uniform length, either by padding it with silence or truncating it
originalLength = tf.size(audio)
trimmedLength = tf.cast(tf.math.floor(duration*sr), tf.int32)
# Define the range of possible indices where the slice window may start
startBounds = tf.sort(tf.concat([originalLength-trimmedLength, tf.constant(0)], 0))
# Pad audio on both sides to the outer possible boundary of the slice window
padSize = tf.math.abs(startBounds[0])
paddedAudio = tf.pad(audio, paddings=[[padSize, padSize]])
# Get a random starting point for the slice window relative to the unpadded audio
startBounds = tf.cast(startBounds, tf.float32)
unpaddedStart = tf.math.round(tf.random.uniform(shape=[1], minval=startBounds[0], maxval=startBounds[1]))
# Slice the audio
paddedStart = tf.cast(unpaddedStart + tf.cast(padSize, tf.float32), tf.int32)
shape = tf.reshape(trimmedLength, [1])
trimmedAudio = tf.slice(paddedAudio, paddedStart, shape)



(400,)
tf.Tensor(400, shape=(), dtype=int32) tf.Tensor(480, shape=(), dtype=int32)


<tf.Tensor: shape=(480,), dtype=float32, numpy=
array([ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
       -8.85009766e-04,  1.00708008e-03, -9.76562500e-04, -9.46044922e-04,
       -2.62451172e-03, -2.62451172e-03, -1.09863281e-03, -2.59399414e-03,
       -3.66210938e-03, -5.88989258e-03, -9.27734375e-03, -1.28784180e-02,
       -1.84020996e-02, -1.88903809e-02, -1.77001953e-02, -1.86462402e-02,
       -1.76696777e-02, -2.01721191e-02, -1.81579590e-02, -1.48010254e-02,
       -1.45568848e-02, -1.17492676e-02, -6.34765625e-03, -4.54711914e-03,
       -3.29589844e-03,  2.74658203e-04, -1.00708008e-03, -2.68554688e-03,
        2.59399414e-03,  4.42504883e-03,  3.75366211

In [65]:
audio = tf.range(1,6)


originalLength = tf.size(audio)
trimmedLength = tf.constant(9)
# Define the range of possible indices where the slice window may start
startBounds = tf.sort(tf.concat([originalLength-trimmedLength, tf.constant(0)], 0))
# Pad audio on both sides to the outer possible boundary of the slice window
padSize = tf.math.abs(startBounds[0])
paddedAudio = tf.pad(audio, paddings=[[padSize, padSize]])
# Get a random starting point for the slice window
startBounds = tf.cast(startBounds, tf.float32)
start = tf.math.round(tf.random.uniform(shape=[], minval=startBounds[0], maxval=startBounds[1]))
# Slice the audio
trimmedAudio = tf.slice(audio, tf.cast(start, tf.int32), trimmedLength)

InvalidArgumentError: Expected begin and size arguments to be 1-D tensors of size 1, but got shapes [] and [] instead. [Op:Slice]