Uncomment the following code to run on Google Colab

In [None]:
# ! pip install pydub
# ! pip install ffmpeg
# ! pip install soundfile
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import shutil

import numpy as np
import librosa
import tensorflow as tf 
import tensorflow.keras.layers as layers
from pydub import AudioSegment
import soundfile as sf

## Preprocess

The next cell is a copy of `preprocessing.py`, pasted here so that the notebook can run on its own.

In [None]:
import numpy as np
import librosa
import tensorflow as tf 
from pydub import AudioSegment

def mu_law_encode(signal, quantization_channels):
    """Compand a waveform with the mu-law curve and quantize it to integer codes.

    Args:
        signal: Array-like of float samples, expected to lie in [-1, +1].
            Magnitudes above 1 are clamped to 1 so the resulting codes can
            never fall outside the valid range.
        quantization_channels: Number of discrete levels (e.g. 256).

    Returns:
        A list of integer codes, each in [0, quantization_channels - 1].
    """
    mu = quantization_channels - 1
    samples = np.asarray(signal, dtype=np.float64)

    # Mu-law companding; the clamp keeps slightly out-of-range samples valid.
    magnitude = np.log1p(mu * np.minimum(np.abs(samples), 1.0)) / np.log1p(mu)
    companded = np.sign(samples) * magnitude

    # Map [-1, +1] onto [0, mu]. Adding 0.5 before the truncating integer
    # cast rounds each value to the nearest level.
    quantized = ((companded + 1) / 2 * mu + 0.5).astype(np.int32)

    return list(quantized)

class Preprocess:
    """Load audio files from a directory and build next-sample training data.

    Each audio file is resampled, trimmed and mu-law encoded. `create_dataset`
    then slides a window of `time_steps` codes over every file: the window is
    the model input and the code that follows it is the (one-hot) target.
    """

    def __init__(self, time_steps, sampling_rate, datapath, quantization_channels=256):
        self.time_steps = time_steps
        self.sampling_rate = sampling_rate
        self.datapath = datapath  # directory that contains the .wav (and optionally .mp3) files
        self.inputs = []  # one mu-law encoded sequence per audio file
        self.quantization_channels = quantization_channels
        self.normalized = False

    def load_data(self):
        """Convert any .mp3 files to .wav, then load and encode every .wav file.

        Fills `self.inputs` with one encoded sequence per file.

        Raises:
            FileNotFoundError: if the directory contains no .wav files.
        """
        # Start from a clean slate so that calling this twice (e.g. when a
        # notebook cell is re-run) does not duplicate every file.
        self.inputs = []

        # transform mp3 to wav
        mp3_files = librosa.util.find_files(self.datapath, ext=['mp3'])
        print("Found {} mp3 files".format(len(mp3_files)))
        for converted, mp3_file in enumerate(mp3_files, start=1):
            wav_file = mp3_file[:-4] + '.wav'
            sound = AudioSegment.from_file(mp3_file, format="mp3")
            sound.export(wav_file, format="wav")
            print(f"Created {converted} .wav files")

        wav_files = librosa.util.find_files(self.datapath, ext=['wav'])

        if len(wav_files) == 0:
            raise FileNotFoundError("No .wav files found in the directory")

        print("Found {} wav files".format(len(wav_files)))

        for file in wav_files:
            # Load as mono at the requested sampling rate; samples range from -1 to 1,
            # so no additional rescaling is applied and the data is flagged as normalized.
            audio, _ = librosa.load(file, sr=self.sampling_rate, mono=True)
            self.normalized = True

            # trim the audio file
            audio, _ = librosa.effects.trim(audio)

            # discretize the audio file
            audio = mu_law_encode(audio, self.quantization_channels)
            self.inputs.append(audio)

        print("Finished loading data")

    def create_dataset(self):
        """Load the audio and split it into training and test windows.

        Returns:
            (x_tr, x_test, y_tr, y_test): `x_*` are integer arrays of shape
            (num_windows, time_steps); `y_*` are one-hot tensors of shape
            (num_windows, quantization_channels). The first 20% of the
            windows form the test split.

        Raises:
            FileNotFoundError: propagated from `load_data`.
            ValueError: if no file is longer than `time_steps` samples.
        """
        self.load_data()

        self.sample_audio = self.inputs[0]

        x_parts = []
        y_parts = []

        for cnt, sequence in enumerate(self.inputs, start=1):
            # tf.one_hot needs integer class indices, so force the codes to
            # integers inside the valid range whatever the encoder produced.
            codes = np.asarray(sequence).astype(np.int32)
            codes = np.clip(codes, 0, self.quantization_channels - 1)

            if len(codes) > self.time_steps:
                # Every window of `time_steps` codes is an input; the code right
                # after it is the target. The last window has no successor.
                windows = np.lib.stride_tricks.sliding_window_view(codes, self.time_steps)
                x_parts.append(windows[:-1])
                y_parts.append(codes[self.time_steps:])

            print(f"Loaded {cnt} input data")

        if not x_parts:
            raise ValueError("No audio file is longer than time_steps samples")

        x = np.concatenate(x_parts)
        y = np.concatenate(y_parts)
        y = tf.one_hot(y, self.quantization_channels)

        test_size = 0.2
        split = int(len(x) * test_size)
        x_tr, x_test = x[split:], x[:split]
        y_tr, y_test = y[split:], y[:split]

        print("Finished creating dataset")
        return x_tr, x_test, y_tr, y_test

## Wavenet

The next cell is a copy of `wavenet.py`, pasted here so that the notebook can run on its own.

In [None]:
import numpy as np
import tensorflow as tf 
import tensorflow.keras.layers as layers
import soundfile as sf

def mu_law_decode(signal, quantization_channels):
    """Turn mu-law codes back into waveform amplitudes.

    Args:
        signal: NumPy array of quantized codes.
        quantization_channels: Number of discrete levels used when encoding.

    Returns:
        NumPy array of decoded amplitudes.
    """
    mu = quantization_channels - 1

    # Rescale the codes from [0, mu] to [-1, +1].
    companded = 2 * (signal.astype(np.float32) / mu) - 1

    # Invert the companding curve: |x| = ((1 + mu)^|y| - 1) / mu, sign restored.
    scale = 1.0 / mu
    growth = (1.0 + mu) ** np.abs(companded) - 1.0
    return np.sign(companded) * scale * growth

class Wavenet(tf.keras.Model):
    """Dilated causal convolution classifier that predicts the next audio code.

    The input is a window of `timesteps` mu-law codes; the output is a softmax
    distribution over `quantization_channels` possible next codes.
    """

    def __init__(self, timesteps = 32, quantization_channels=256, **kwargs):
        super().__init__(**kwargs)
        self.timesteps = timesteps
        self.quantization_channels = quantization_channels
        self.model = tf.keras.Sequential([
            # The window length follows `timesteps` instead of a fixed 32 so
            # that the constructor argument is actually honoured.
            layers.Embedding(self.quantization_channels, 100, input_length=self.timesteps, trainable=True),
            layers.Conv1D(64, 3, padding='causal', activation='relu'),
            layers.Dropout(0.2),
            layers.MaxPool1D(2),
            layers.Conv1D(128, 3, activation='relu', dilation_rate=2, padding='causal'),
            layers.Dropout(0.2),
            layers.MaxPool1D(2),
            layers.Conv1D(256, 3, activation='relu', dilation_rate=4, padding='causal'),
            layers.Dropout(0.2),
            layers.MaxPool1D(2),
            layers.GlobalMaxPool1D(),
            layers.Dense(256, activation='relu'),
            layers.Dense(self.quantization_channels, activation='softmax'),
        ])

    def call(self, inputs, training=None):
        """Run the wrapped Sequential model, forwarding the training flag."""
        return self.model(inputs, training=training)

    def generate(self, generate_time, sampling_rate, output_path="generated.wav"):
        """Generate audio from random input windows and write it to a .wav file.

        Args:
            generate_time: Length of the generated audio in seconds.
            sampling_rate: Samples per second of the generated audio.
            output_path: File the audio is written to.

        The decoded samples are also stored in `self.out`.

        NOTE(review): every output sample is predicted independently from a
        random window; the prediction is never fed back as input.
        """
        mean = self.quantization_channels / 2
        # NOTE(review): the 0.909 factor comes from the original code; its
        # rationale is not documented here.
        std = mean * 0.909
        # The shape passed to tf.random.normal must be integral even when
        # generate_time is fractional.
        no_samples = int(generate_time * sampling_rate)
        noise = tf.random.normal((no_samples, self.timesteps), mean=mean, stddev=std, dtype=tf.float32)

        # The Embedding layer uses its inputs as row indices, so they have to be
        # integers inside [0, quantization_channels - 1]; raw Gaussian noise
        # regularly falls outside that range.
        max_code = float(self.quantization_channels - 1)
        inputs = tf.cast(tf.clip_by_value(noise, 0.0, max_code), tf.int32)

        # forward pass:
        predicted_output = self.model.predict(inputs)
        print(f"Model prediction has shape {predicted_output.shape}")

        # generate predictions
        labels = np.argmax(predicted_output, axis=-1)
        print(f"Labels has shape {labels.shape}")

        # decode the predictions
        self.out = mu_law_decode(labels, self.quantization_channels)
        sf.write(output_path, self.out, sampling_rate)
        print("Finished generating audio")

Define the hyperparameters and run the preprocessing

In [None]:
# Hyperparameters shared by preprocessing, the model and audio generation.
ts = 32     # time steps: number of samples in each model input
sr = 16000  # sampling rate: number of samples per second
qc = 256    # quantization channels: number of possible values for each sample

# Load the audio under 'audiotest' and split it into training and validation data.
preprocess = Preprocess(
    datapath='audiotest',
    time_steps=ts,
    sampling_rate=sr,
    quantization_channels=qc,
)
x_tr, x_val, y_tr, y_val = preprocess.create_dataset()

Play and visualize sample input audio

In [None]:
import matplotlib.pyplot as plt
import librosa.display
from IPython.display import Audio

def visualize_input(datapath):
    """Plot the waveform of the first .wav file found under `datapath`.

    The file is loaded at the notebook-level sampling rate `sr`.

    Args:
        datapath: Directory that is searched for .wav files.

    Returns:
        The loaded audio samples.

    Raises:
        FileNotFoundError: if `datapath` contains no .wav files.
    """
    wav_files = librosa.util.find_files(datapath, ext=['wav'])
    if not wav_files:
        raise FileNotFoundError(f"No .wav files found in '{datapath}'")

    audio, _ = librosa.load(wav_files[0], sr=sr)

    librosa.display.waveshow(audio, sr=sr)
    plt.title('Input amplitude (normalized) with respect to time')

    return audio

# Show a file from the same directory the dataset was built from, so that the
# plotted and played audio really is a sample of the training input.
audio = visualize_input(preprocess.datapath)
Audio(audio, rate=sr)

Sanity check:

In [None]:
print(x_tr.shape)   # expected (num, ts)
print(x_val.shape)  # expected (num/4, ts)
print(y_tr.shape)   # expected (num, qc)
print(y_val.shape)  # expected (num/4, qc)

# Enforce the expectations above so a wrong shape stops the run here instead
# of surfacing later as an obscure error during training.
assert x_tr.shape[1] == ts and x_val.shape[1] == ts
assert y_tr.shape[1] == qc and y_val.shape[1] == qc
assert x_tr.shape[0] == y_tr.shape[0]
assert x_val.shape[0] == y_val.shape[0]

print(x_tr.dtype)  # dtype of the quantized sample codes fed to the model

Display the model

In [None]:
# Instantiate the network with the notebook-wide hyperparameters.
model = Wavenet(quantization_channels=qc, timesteps=ts)

# Push a single training window through the model before printing the layer
# summary (presumably so that every layer is built and has a known output shape).
model(x_tr[:1])
model.model.summary()

In [None]:
%load_ext tensorboard
import numpy as np
import pandas as pd
import tensorflow as tf
import datetime

# Remove the logs of previous runs. shutil is used instead of a shell `rm` so
# the cell also works where that command is unavailable; a missing directory
# is not an error.
shutil.rmtree("./logs", ignore_errors=True)

# Give each run its own timestamped log directory under logs/fit.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

Train the model

In [None]:
# The targets are one-hot vectors, hence the categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train with per-epoch validation and TensorBoard logging.
history = model.fit(
    x_tr,
    y_tr,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
    verbose=1,
    callbacks=[tensorboard_callback],
)

In [None]:
# Open the TensorBoard UI on the logs written under logs/fit.
%tensorboard --logdir logs/fit

In [None]:
# Display the metric values recorded by model.fit, keyed by metric name.
history.history

Visualization for loss and accuracy

In [None]:
# The model is compiled with metrics=['accuracy'], so the history is keyed by
# 'accuracy' / 'val_accuracy'. Older Keras releases used 'acc' / 'val_acc';
# accept either name rather than failing with a KeyError.
history_dict = history.history
acc_key = 'accuracy' if 'accuracy' in history_dict else 'acc'

acc = history_dict[acc_key]
val_acc = history_dict['val_' + acc_key]
loss = history_dict['loss']
val_loss = history_dict['val_loss']

import matplotlib.pyplot as plt

epochs = range(1, len(acc) + 1)

# Accuracy curves
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'bo', label='Training acc')
ax_acc.plot(epochs, val_acc, 'b', label='Validation acc')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

# Loss curves, on a separate figure
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'bo', label='Training loss')
ax_loss.plot(epochs, val_loss, 'b', label='Validation loss')
ax_loss.set_title('Training and validation loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()

Generate audio and save to `generated.wav`

In [None]:
# Generate 10 seconds of audio at `sr` samples per second; the result is written
# to generated.wav and kept in model.out.
model.generate(generate_time = 10, sampling_rate = sr)

Visualization of output

In [None]:
# Draw the generated waveform on an explicit axes, then embed an audio player.
fig, ax = plt.subplots()
librosa.display.waveshow(model.out, sr=sr, ax=ax)
ax.set_title('Generated amplitude (normalized) with respect to time')
Audio(model.out, rate=sr)