In [1]:
import numpy as np
import scipy as sp
import tensorflow as tf
from tensorflow import keras
import os
from scipy.io import wavfile
import math

from datetime import date, datetime

# 1. Preprocessing

*This code is identical to the code found in the TensorFlow training notebook.*

Basics:
 - Provide an input and a target audio file in the config.
 - This creates three folders (`train`, `val`, `test`) in `Data/`, which are used for training.


In [2]:
def prepare_training_data(config):
    """Split a paired input/target recording into train, val and test WAV files.

    Reads the two WAV files named in ``config``, converts 16-bit PCM samples to
    floating point, splits both signals with ``slice_on_mod`` and writes the
    pieces below ``config["output_path"]`` with ``save_wav``.

    Args:
        config: Mapping with the keys ``"input_audio_path"``,
            ``"target_audio_path"``, ``"output_path"`` and ``"name"``.

    Raises:
        ValueError: If the two files differ in length, or if either file has
            more than one dimension (i.e. is not mono).
    """
    # The sample rates returned by wavfile.read are not used by this function.
    _, in_data = wavfile.read(config["input_audio_path"])
    _, out_data = wavfile.read(config["target_audio_path"])

    # Raise instead of exiting the interpreter so that a notebook kernel keeps
    # running and the caller sees which check failed.
    if len(in_data) != len(out_data):
        raise ValueError("input and target files have different lengths")

    if len(in_data.shape) > 1 or len(out_data.shape) > 1:
        raise ValueError("expected mono files")

    # Convert PCM16 to FP32
    if in_data.dtype == np.int16:
        in_data = in_data / 32767
        print("In data converted from PCM16 to FP32")
    if out_data.dtype == np.int16:
        out_data = out_data / 32767
        print("Out data converted from PCM16 to FP32")

    clean_data = in_data.astype(np.float32).flatten()
    target_data = out_data.astype(np.float32).flatten()

    # Split the data on a twenty percent mod
    in_train, out_train, in_val, out_val = slice_on_mod(clean_data, target_data)

    # The "test" folder receives the same signals as the "val" folder.
    folders = (
        ("train", in_train, out_train),
        ("test", in_val, out_val),
        ("val", in_val, out_val),
    )
    for folder, split_input, split_target in folders:
        prefix = config["output_path"] + "/" + folder + "/" + config["name"]
        save_wav(prefix + "-input.wav", split_input)
        save_wav(prefix + "-target.wav", split_target)


In [3]:
def slice_on_mod(input_data, target_data, mod=5):
    """Split paired signals into training and validation parts.

    Both signals are cut into 100 chunks with ``np.array_split``. Every chunk
    whose index is a multiple of ``mod`` goes to the validation set, the rest
    stay in the training set.

    Args:
        input_data: Array holding the input signal.
        target_data: Array holding the target signal.
        mod: Chunk-index modulus selecting the validation chunks; cast to int.

    Returns:
        Tuple ``(training_input, training_target, val_input, val_target)``.
        Validation chunks are concatenated from the highest chunk index down
        to the lowest; training chunks keep their ascending order.
    """
    step = int(mod)

    # Cut both signals into 100 chunks
    input_chunks = np.array_split(input_data, 100)
    target_chunks = np.array_split(target_data, 100)
    chunk_count = len(input_chunks)

    # Validation chunks are collected in descending index order
    val_indices = [idx for idx in range(chunk_count - 1, -1, -1) if idx % step == 0]
    train_indices = [idx for idx in range(chunk_count) if idx % step != 0]

    # Join the validation chunks into one-dimensional signals
    val_input_data = np.concatenate([input_chunks[idx] for idx in val_indices])
    val_target_data = np.concatenate([target_chunks[idx] for idx in val_indices])

    # Join the remaining chunks back into the training signals
    training_input_data = np.concatenate([input_chunks[idx] for idx in train_indices])
    training_target_data = np.concatenate([target_chunks[idx] for idx in train_indices])
    return training_input_data, training_target_data, val_input_data, val_target_data

In [4]:
def save_wav(name, data, sample_rate=44100):
    """Write ``data`` to ``name`` as a 32-bit float WAV file.

    Args:
        name: Destination path. Missing parent directories are created.
        data: Audio samples; flattened to one dimension and cast to float32.
        sample_rate: Sample rate written to the file header, 44100 by default.
    """
    directory = os.path.dirname(name)

    # dirname() is "" for a bare filename and os.makedirs("") raises, so only
    # create directories when the path has a directory part. exist_ok avoids
    # failing when the directory appears between a check and the creation.
    if directory:
        os.makedirs(directory, exist_ok=True)

    samples = np.asarray(data).flatten().astype(np.float32)
    wavfile.write(name, sample_rate, samples)

In [5]:
# Source recordings to split, plus the folder and file-name prefix for the results.
importConfig = dict(
    input_audio_path="TrainingData/flanger-input.wav",
    target_audio_path="TrainingData/flanger-target.wav",
    output_path="Data",
    name="flanger",
)

# Writes the train/val/test WAV files below importConfig["output_path"].
prepare_training_data(importConfig)

In data converted from PCM16 to FP32
Out data converted from PCM16 to FP32


# Dataloader

In [6]:
# Cuts numpy audio into equal-length frames and returns them as a float32 tf tensor of shape
# [seg_num, frame_len, channels]; frame_len = 0 keeps the whole signal as a single frame
def framify(audio, frame_len):
    """Cut an audio array into frames and return them as a float32 tensor.

    Args:
        audio: NumPy array of shape ``[samples]`` or ``[samples, channels]``.
        frame_len: Number of samples per frame. ``0`` puts the whole signal
            into one frame. Samples that do not fill a complete frame at the
            end of the signal are dropped.

    Returns:
        ``tf.Tensor`` of dtype float32 and shape ``[seg_num, frame_len, channels]``.
    """
    # If audio is mono, add a dummy channel dimension
    audio = np.expand_dims(audio, 1) if len(audio.shape) == 1 else audio

    # Number of complete frames; a frame_len of 0 means one frame spanning everything
    seg_num = audio.shape[0] // frame_len if frame_len else 1
    frame_len = frame_len if frame_len else audio.shape[0]
    channels = audio.shape[1]

    # Frames are consecutive slices along the sample axis, so one reshape of the
    # truncated signal gives the same result as stacking the slices one by one.
    frames = audio[:seg_num * frame_len].reshape(seg_num, frame_len, channels)

    return tf.convert_to_tensor(frames.astype(np.float32, copy=False))

In [7]:
# The SubSet class holds a subset of data,
# frame_len sets the length of audio per frame (in samples, it is passed to framify unchanged),
# if set to 0 a single frame is used instead
class SubSet:
    """Container for the framed audio of one data subset.

    ``data`` maps an extension name (for example ``'input'``) to a one-element
    tuple holding a tensor of frames as returned by ``framify``.
    """

    def __init__(self, frame_len):
        self.data = {}
        self.frame_len = frame_len
        # Never written by the methods of this class.
        self.conditioning = None
        # Sample rate of the first file added; later files must match it.
        self.fs = None

    def add_data(self, fs, audio, ext, cond_val):
        """Frame ``audio`` and append the frames to the entry for ``ext``.

        Args:
            fs: Sample rate of ``audio``; must equal the rate of earlier data.
            audio: NumPy audio array handed to ``framify``.
            ext: Key under which the frames are stored; an empty value is
                stored under ``'data'``.
            cond_val: Accepted for interface compatibility; not used here.
        """
        if not self.fs:
            self.fs = fs
        assert self.fs == fs, "data with different sample rate provided to subset"
        ext = ext if ext else 'data'
        framed_data = framify(audio, self.frame_len)

        if ext in self.data:
            # framify returns [seg_num, frame_len, channels], so new frames are
            # appended along axis 0. Concatenating along axis 1 would instead
            # join frames of different files into longer frames.
            existing = self.data[ext][0]
            self.data[ext] = (tf.concat([existing, framed_data], axis=0),)
        else:
            self.data[ext] = (framed_data,)

In [8]:
class DataSet:
    """Collection of named ``SubSet`` objects filled from WAV files in ``data_dir``.

    For every entry of ``extensions`` the file ``<filename>-<extension>.wav``
    is read, so the default loads an ``-input`` and a ``-target`` file.
    """

    def __init__(self, data_dir='../Dataset/', extensions=('input', 'target')):
        self.extensions = extensions if extensions else ['']
        self.subsets = {}
        assert isinstance(data_dir, str), "data_dir should be string,not %r" % (type(data_dir),)
        self.data_dir = data_dir

    # add a subset called 'name', desired 'frame_len' is given in samples, or 0 for just one long frame
    def create_subset(self, name, frame_len=0):
        """Register a new, empty subset under ``name``."""
        assert isinstance(name, str), "data subset name must be a string, not %r" % (type(name),)
        assert name not in self.subsets, "subset %r already exists" % name
        self.subsets[name] = SubSet(frame_len)

    # load a file of 'filename' into the existing subset named by 'set_names'
    def load_file(self, filename, set_names='train', splits=None, cond_val=None):
        """Read ``<filename>-<ext>.wav`` for every extension into one subset.

        Args:
            filename: Path prefix relative to ``data_dir``.
            set_names: Name of the target subset, or a list of names.
            splits: Split markers, one per entry of ``set_names`` when more
                than one name is given.
            cond_val: Passed through to ``SubSet.add_data``.

        Raises:
            AssertionError: If the number of names and split markers differ or
                a named subset does not exist.
            NotImplementedError: If more than one subset name is given;
                splitting one file across subsets is not implemented.

        If any of the files is missing, a message is printed and nothing is
        added to the subset, so its extensions never get out of step.
        """
        # Assertions and checks
        if isinstance(set_names, str):
            set_names = [set_names]
        assert len(set_names) == 1 or (splits is not None and len(set_names) == len(splits)), \
            "number of subset names must equal number of split markers"
        # Asserting on a list of lookups is always true for a non-empty list,
        # so test every name explicitly.
        assert all(each in self.subsets for each in set_names), \
            "set_names contains subsets that don't exist yet"
        if len(set_names) != 1:
            # Fail loudly instead of returning without loading anything.
            raise NotImplementedError("splitting a file across several subsets is not implemented")

        # Read each of the 'extensions' before storing any of them
        loaded = []
        for ext in self.extensions:
            file_loc = os.path.join(self.data_dir, filename + '-' + ext)
            if not file_loc.endswith('.wav'):
                file_loc = file_loc + '.wav'
            try:
                fs, samples = wavfile.read(file_loc)
            except FileNotFoundError:
                print("File Not Found At: " + file_loc)
                return
            loaded.append((fs, samples.astype(np.float32), ext))

        for fs, raw_audio, ext in loaded:
            self.subsets[set_names[0]].add_data(fs, raw_audio, ext, cond_val)

# Training

In [9]:
class StatefulLSTM(tf.keras.Model):
    """Stateful LSTM layer followed by a linear dense layer.

    Inputs and outputs are laid out as ``[batch, time, channels]``. Because the
    LSTM is stateful, every call must use the batch size given at construction.
    """

    def __init__(self, input_size=1, output_size=1, hidden_size=32, skip=1, bias_fl=True, batch_size=4096):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        # Stored only; call() does not add a skip connection.
        self.skip = skip
        self.batch_size = batch_size

        self.lstm = keras.layers.LSTM(units=hidden_size, return_sequences=True, stateful=True,
                                      return_state=True, batch_size=batch_size)
        self.dense = keras.layers.Dense(units=output_size, activation=None, batch_size=batch_size,
                                        use_bias=bias_fl)

        # Build the LSTM before training, because a stateful LSTM needs to know the batch size.
        # The shape is (batch, time steps, features): the sequence length is left open and the
        # feature dimension is the number of input channels.
        self.lstm.build((batch_size, None, input_size))

    def call(self, x):
        """Run ``x`` through the LSTM and the dense layer and return the output sequence."""
        x, _, _ = self.lstm(x)
        x = self.dense(x)
        return x

    def reset_hidden(self, batch_size):
        """Reset the LSTM state. ``batch_size`` is accepted but not used."""
        self.lstm.reset_states()

    def train_epoch(self, input_data, target_data, loss_fcn, optim, bs, init_len=200, up_fr=1000):
        """Train for one epoch with truncated back-propagation through time.

        Args:
            input_data: Input frames, ``[segments, samples, channels]``.
            target_data: Target frames with the same layout as ``input_data``.
            loss_fcn: Callable invoked as ``loss_fcn(y_true, y_pred)``.
            optim: Optimizer providing ``apply_gradients``.
            bs: Number of segments per batch.
            init_len: Samples processed at the start of each batch to
                initialise the hidden state before any weight update.
            up_fr: Samples processed between two weight updates.

        Returns:
            The mean over all batches of the mean loss per weight update.

        Raises:
            ValueError: If there are fewer than ``bs`` segments.
        """
        # Shuffle the segments at the start of the epoch
        shuffle = tf.random.shuffle(tf.range(input_data.shape[0]))

        self.reset_hidden(bs)

        # The stateful LSTM only accepts batches of the size it was built with, so a
        # trailing incomplete batch is left out. The segments are reshuffled every
        # epoch, so different segments are left out each time.
        num_batches = input_data.shape[0] // bs
        if num_batches == 0:
            raise ValueError(f"need at least {bs} segments to form one batch, got {input_data.shape[0]}")

        # Iterate over the batches
        ep_loss = 0
        for batch_i in range(num_batches):
            # Use tf.gather to index the tensors
            batch_indices = shuffle[batch_i * bs:(batch_i + 1) * bs]
            input_batch = tf.gather(input_data, batch_indices, axis=0)
            target_batch = tf.gather(target_data, batch_indices, axis=0)

            # Initialise the hidden state by processing the first init_len samples
            # without updating any weights. Training only starts once the hidden
            # state has settled.
            if init_len > 0:
                self(input_batch[:, 0:init_len, :])

            batch_loss = 0
            num_updates = 0
            # Iterate over the remaining samples in the mini batch
            for start_i in range(init_len, input_batch.shape[1], up_fr):
                input_chunk = input_batch[:, start_i:start_i + up_fr, :]
                target_chunk = target_batch[:, start_i:start_i + up_fr, :]

                with tf.GradientTape() as tape:
                    output = self(input_chunk)
                    # Keras losses take (y_true, y_pred); the order matters for
                    # losses that normalise by the energy of y_true.
                    loss = loss_fcn(target_chunk, output)
                gradients = tape.gradient(loss, self.trainable_variables)
                optim.apply_gradients(zip(gradients, self.trainable_variables))

                print(f"loss: {loss}")

                batch_loss += loss
                num_updates += 1

            # Add the average batch loss to the epoch loss; segments no longer than
            # init_len produce no update and contribute nothing.
            if num_updates:
                ep_loss += batch_loss / num_updates

            # Reset the hidden states to zeros so the next batch of unrelated
            # segments does not start from this batch's state.
            self.reset_hidden(bs)

        return ep_loss / num_batches

In [10]:
class ESRLoss(tf.keras.losses.Loss):
    """Error-to-signal ratio: mean squared error divided by the mean squared target."""

    def __init__(self):
        super().__init__()
        # Added to the denominator so that an all-zero target does not divide by zero.
        self.epsilon = 1e-5

    def call(self, y_true, y_pred):
        """Return mean((y_true - y_pred)^2) / (mean(y_true^2) + epsilon)."""
        error = y_true - y_pred
        error_power = tf.reduce_mean(tf.square(error))
        signal_power = tf.reduce_mean(tf.square(y_true)) + self.epsilon
        return error_power / signal_power

class DCLoss(tf.keras.losses.Loss):
    """Squared difference of the mean signal values, normalised by the target energy."""

    def __init__(self):
        super().__init__()
        # Added to the denominator so that an all-zero target does not divide by zero.
        self.epsilon = 1e-5

    def call(self, y_true, y_pred):
        """Return the normalised squared difference of the per-signal means.

        Rank-3 inputs are treated as ``[batch, time, channels]``, the layout
        ``StatefulLSTM.train_epoch`` slices, so the mean is taken over the time
        axis (1). Averaging over axis 0 there would average across the batch
        instead of across time. Inputs of any other rank are averaged over
        axis 0.
        """
        time_axis = 1 if tf.TensorShape(y_true.shape).rank == 3 else 0
        mean_true = tf.reduce_mean(y_true, axis=time_axis)
        mean_pred = tf.reduce_mean(y_pred, axis=time_axis)
        loss = tf.reduce_mean(tf.square(mean_true - mean_pred))
        energy = tf.reduce_mean(tf.square(y_true)) + self.epsilon
        return loss / energy

class LossWrapper(tf.keras.losses.Loss):
    """Weighted sum of the loss functions named in ``loss_weights``."""

    def __init__(self, loss_weights):
        """Create the configured losses.

        Args:
            loss_weights: Mapping from loss name (``'ESR'`` and/or ``'DC'``) to
                its weight. Names other than these two are ignored.

        Raises:
            KeyError: If ``loss_weights`` names neither ``'ESR'`` nor ``'DC'``.
        """
        super().__init__()
        # Map the loss names to their corresponding classes
        loss_dict = {'ESR': ESRLoss, 'DC': DCLoss}
        # Use only the losses that were given a weight, so that a configuration
        # with a single loss works as well.
        selected = [name for name in loss_dict if name in loss_weights]
        if not selected:
            raise KeyError("loss_weights must contain at least one of %r" % (list(loss_dict),))
        # Create instances of the loss functions
        self.loss_functions = [loss_dict[name]() for name in selected]
        # Assign the weights
        self.loss_factors = [loss_weights[name] for name in selected]

    def call(self, y_true, y_pred):
        """Return the sum of every configured loss multiplied by its weight."""
        total_loss = 0
        for loss_function, factor in zip(self.loss_functions, self.loss_factors):
            total_loss += loss_function(y_true, y_pred) * factor
        return total_loss


In [11]:
# Training settings read by the cells below.
config = dict(
    input_size=1,  # Number of input channels
    output_size=1,  # Number of output channels
    skip_con=0,  # Whether there is a skip connection from the input to the output
    epochs=10,
    batch_size=16,
    init_length=200,  # Number of sequence samples to process before starting weight updates
    up_fr=1000,  # For recurrent models, number of samples to run between weight updates
    validation_f=1,  # Validation frequency (in epochs)
    val_chunk=1000,  # Number of sequence samples to process in each chunk of validation
    learning_rate=0.0005,
    hidden_size=32,
    loss_fcns=dict(ESR=0.75, DC=0.25),
    hardware_device="flanger",
    save_location="Results-PyTorch",
    export_json=1,
    export_torchscript=1,
    stateful_lstm=1,
)

In [None]:
# Create <cwd>/<save_location>/<hardware_device>; makedirs also creates the parent folder.
current_directory = os.getcwd()
result_parent_path = os.path.join(current_directory, config["save_location"])
result_path = os.path.join(result_parent_path, config["hardware_device"])
os.makedirs(result_path, exist_ok=True)

save_path = os.path.join(config["save_location"], config["hardware_device"])

physical_devices = tf.config.list_physical_devices()
print(f"These are the physical devices available:\n{physical_devices}")

try:
    # Disable all GPUS
    tf.config.set_visible_devices([], 'GPU')
    visible_devices = tf.config.get_visible_devices()
    print(f"These are the visible devices:\n{visible_devices}")
except (RuntimeError, ValueError) as device_error:
    # Changing the visible devices fails once TensorFlow has initialised its runtime.
    # Training can continue with the current devices, but say so instead of hiding it.
    print(f"Could not change the visible devices: {device_error}")

print("Creating Stateful LSTM")
network = StatefulLSTM(input_size=config["input_size"],
                       output_size=config["output_size"],
                       hidden_size=config["hidden_size"],
                       skip=config["skip_con"],
                       batch_size=config["batch_size"])

optimiser = tf.keras.optimizers.Adam(learning_rate=config["learning_rate"], weight_decay=1e-4, epsilon=1e-8)
loss_functions = tf.keras.losses.MeanSquaredError(reduction=tf.keras.losses.Reduction.SUM)

#loss_functions = LossWrapper(config["loss_fcns"])

network.compile(optimizer=optimiser, loss=loss_functions)
network.build((config["batch_size"],1,1))
network.summary()

dataset = DataSet(data_dir='Data')
dataset.create_subset('train', frame_len=22050)
dataset.load_file(os.path.join('train', config["hardware_device"]), 'train')

# The validation subset is loaded here but not used by the loop below.
dataset.create_subset('val')
dataset.load_file(os.path.join('val', config["hardware_device"]), 'val')

# load_file only prints a message when a file is missing, so stop here with a
# clear error rather than with a KeyError inside the training loop.
assert {'input', 'target'} <= set(dataset.subsets['train'].data), "training data was not loaded"

for epoch in range(1, config["epochs"] + 1):
    print("Epoch: ", epoch)

    # Run 1 epoch of training
    epoch_loss = network.train_epoch(dataset.subsets['train'].data['input'][0],
                                     dataset.subsets['train'].data['target'][0],
                                     loss_functions, optimiser, config['batch_size'], config['init_length'], config['up_fr'])

    print("Epoch loss:", epoch_loss)


These are the physical devices available:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
These are the visible devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
Creating Stateful LSTM
Model: "stateful_lstm"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 multiple                  4352      
                                                                 
 dense (Dense)               multiple                  33        
                                                                 
Total params: 4385 (17.13 KB)
Trainable params: 4385 (17.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch:  1
loss: 280.5711669921875
loss: 223.938720703125
loss: 250.28607177734375
loss: 256.096923828125
loss: 240.67544555664062
loss: 238.42738342285156
loss: 153.2185516357422
loss: 

In [27]:


# NOTE(review): neither `train` nor `trainConfig` is defined in any cell visible in this
# notebook, and the execution count of this cell is out of sequence - presumably a leftover
# from an earlier session. Confirm where these names come from, or remove this cell.
train(trainConfig)

These are the physical devices available:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
These are the visible devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]
Creating Stateful LSTM
Model: "stateful_lstm_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               multiple                  4352      
                                                                 
 dense_4 (Dense)             multiple                  33        
                                                                 
Total params: 4385 (17.13 KB)
Trainable params: 4385 (17.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch:  1
tf.Tensor([ 4 11  6  7  8 15 10 14 12 13  3  9  1  2  5  0], shape=(16,), dtype=int32)
Iterate over the batches2
(16, 22050, 1)
(16, 22050, 1)


KeyboardInterrupt: 