# Exploratory data analysis

We extract one sample from the VCTK dataset: the recording itself, as a time series stored in a NumPy array, together with the transcript of the text read by the speaker.

In [5]:
from constants import *
import tensorflow_datasets as tfds
import tensorflow as tf
from DatasetGenerator import DatasetGenerator
import numpy as np
from metrics import *
import librosa
import librosa.display
import soundfile as sf
import datetime
import matplotlib.pyplot as plt
import random
import re

# Load the VCTK dataset and pick one recording from the first 101 entries of the
# training split, keeping both its waveform and its transcript.
print("Loading the sample vocal recording from the VCTK dataset...")
dataset = tfds.load("vctk", with_info=False)
sample_array = None
transcript = None

# NOTE(review): no random seed is set, so a different recording is selected on
# every run; later cells that assume a particular recording length or transcript
# should be re-checked after a re-run.
recording_index = 0
chosen_recording = random.randint(0, 100)

# Walk the training split until the chosen position is reached.
for recording_index, sample in enumerate(dataset['train']):
    if recording_index == chosen_recording:
        # The transcript is kept as-is because later cells call .numpy() on it.
        transcript = sample['text']
        sample_array = np.array(sample['speech'], dtype=float)
        break

# Fail here, with a clear message, rather than in a later cell with a confusing
# error about None.
if sample_array is None:
    raise ValueError("Recording no. {} was not found in the VCTK training split".format(chosen_recording))

print("The selected sample's transcript: {}".format(str(transcript)))

ImportError: cannot import name 'LayerNormalization' from 'tensorflow.python.keras.layers.normalization' (/home/calandrinon/anaconda3/envs/AudioSuperResolution/lib/python3.8/site-packages/tensorflow/python/keras/layers/normalization/__init__.py)

In [None]:
# Report the position in the training split at which the loading loop stopped.
print(f"Index of the recording: {recording_index}")

In [None]:
# Show the (abbreviated) contents of the waveform array.
print(f"Sample array: {sample_array}")

In [None]:
import pandas as pd

# Wrap the waveform in a pandas Series to get summary statistics
# (count, mean, std, min, quartiles, max) in a single call.
sample_array_as_dataframe = pd.Series(data=sample_array)
sample_array_as_dataframe.describe()

As expected, most of the values are zero.

In [None]:
# Keep only the non-zero amplitudes and summarise them again.
sample_is_nonzero = sample_array != 0
sample_array_as_dataframe_without_zeroes = pd.Series(sample_array[sample_is_nonzero])
sample_array_as_dataframe_without_zeroes.describe()

Even after filtering the zeroes, a lot of values still seem to be very close to 0.

In [None]:
# Two stacked boxplots: the full sample on top, the sample without zeroes below.
figure, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))

for axis, series in zip(axes, (sample_array_as_dataframe,
                               sample_array_as_dataframe_without_zeroes)):
    axis.boxplot(series)

plt.show()

The range of the values is enormous (-20000, 20000), so a boxplot doesn't really help.

In [None]:
# Histograms of the amplitude distribution, with and without the zero-valued samples.
figure, axes = plt.subplots(2, 1, figsize=(10, 10))

axes[0].set_title("Histogram of the sample")
axes[0].hist(sample_array_as_dataframe, bins=100)

axes[1].set_title("Histogram of the sample without zeroes")
axes[1].hist(sample_array_as_dataframe_without_zeroes, bins=100)

# plt.show() renders the figure; the previous argument-less plt.plot() drew
# nothing and only echoed an empty list as the cell output.
plt.show()

We notice that there are lots of values centered around zero, which means that there is quite a lot of silence in a sample. In order to visualize the distribution in a clearer way, we can display the distribution of all values within and outside the interquartile range.

In [None]:
# Split the non-zero amplitudes into the values inside the interquartile range
# [Q1, Q3] and the values outside of it.
first_quartile = np.quantile(sample_array_as_dataframe_without_zeroes, 0.25, axis=0)
third_quartile = np.quantile(sample_array_as_dataframe_without_zeroes, 0.75, axis=0)

# The "within" mask is inclusive so that the two subsets partition the data:
# with strict inequalities on both sides, values exactly equal to a quartile
# were silently dropped from both subsets.
is_within_the_interquartile_range = (
    (sample_array_as_dataframe_without_zeroes >= first_quartile)
    & (sample_array_as_dataframe_without_zeroes <= third_quartile)
)
is_outside_the_interquartile_range = (
    (sample_array_as_dataframe_without_zeroes < first_quartile)
    | (sample_array_as_dataframe_without_zeroes > third_quartile)
)

sample_array_as_dataframe_within_the_interquartile_range = pd.Series(
    sample_array_as_dataframe_without_zeroes[is_within_the_interquartile_range])
sample_array_as_dataframe_outside_the_interquartile_range = pd.Series(
    sample_array_as_dataframe_without_zeroes[is_outside_the_interquartile_range])
sample_array_as_dataframe_within_the_interquartile_range.describe()

In [None]:
# Summary statistics of the non-zero amplitudes that fall outside the interquartile range.
sample_array_as_dataframe_outside_the_interquartile_range.describe()

In [None]:
# Histograms of the amplitudes inside and outside the interquartile range.
figure, axes = plt.subplots(2, 1, figsize=(10, 10))

axes[0].set_title("Histogram of the sample within the IQR")
axes[0].hist(sample_array_as_dataframe_within_the_interquartile_range, bins=100)
axes[1].set_title("Histogram of the sample outside the IQR")
axes[1].hist(sample_array_as_dataframe_outside_the_interquartile_range, bins=100)

# plt.show() renders the figure; the previous argument-less plt.plot() drew
# nothing and only echoed an empty list as the cell output.
plt.show()

From the histogram displaying the distribution of values outside the IQR, we can deduce that most values are located somewhere between -10000 and +10000.

In [None]:
# Boxplots of the two IQR-based subsets, one per row.
figure, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))

boxplot_panels = (
    ("Boxplot for values within the IQR", sample_array_as_dataframe_within_the_interquartile_range),
    ("Boxplot for values outside the IQR", sample_array_as_dataframe_outside_the_interquartile_range),
)
for axis, (panel_title, panel_values) in zip(axes, boxplot_panels):
    axis.set_title(panel_title)
    axis.boxplot(panel_values)

plt.show()

## Checking the spectral content and what chunk size would fit for the task

In [None]:
# Mel spectrogram of the complete recording, in dB relative to its peak power.
# librosa and librosa.display are already imported in the first cell.
# The signal is passed as `y=` because recent librosa releases accept the audio
# argument by keyword only; the keyword form also works on older releases.
audio_clip_spectrogram = librosa.feature.melspectrogram(y=sample_array, sr=VCTK_DATASET_SAMPLING_RATE)
decibel_units = librosa.power_to_db(audio_clip_spectrogram, ref=np.max)
figure, axes = plt.subplots(1, 1, figsize=(10,8))

spectrogram_plot = librosa.display.specshow(decibel_units, x_axis='time',
                    y_axis='mel', sr=VCTK_DATASET_SAMPLING_RATE, ax=axes)

figure.colorbar(spectrogram_plot, ax=axes, format='%+2.0f dB')
# Decode the transcript bytes instead of slicing the "b'...'" repr, which left
# the surrounding quotes (and any escape sequences) in the title.
axes.set_title(transcript.numpy().decode("utf-8"))
plt.show()

We can use IPython.display.Audio(...) to embed the recording in the Jupyter Notebook.

In [None]:
from IPython.display import Audio

# Save the full recording as 16-bit PCM and embed a player for it.
full_recording_filename = "exploratory-data-analysis-track-no-{}-high-res.wav".format(chosen_recording)
full_recording_path = "outputs/exploratory-data-analysis/" + full_recording_filename
sf.write(full_recording_path, np.int16(sample_array), VCTK_DATASET_SAMPLING_RATE)

Audio(full_recording_path, rate=VCTK_DATASET_SAMPLING_RATE)

Length of the sample recording

In [None]:
# Number of audio samples in the selected recording.
print(len(sample_array))

In [None]:
# Drop roughly the first third of the recording, save the remainder and plot
# its mel spectrogram.
# NOTE(review): the "after 1 sec" naming presumably assumes a recording of about
# three seconds; the slice itself is purely proportional to the length.
content_after_one_second_filename = "exploratory-data-analysis-track-no-{}-high-res-after-1-sec.wav".format(chosen_recording)
cropped_sample_array = sample_array[len(sample_array) - 2 * len(sample_array) // 3:]
sf.write("outputs/exploratory-data-analysis/{}".format(content_after_one_second_filename), np.int16(cropped_sample_array), VCTK_DATASET_SAMPLING_RATE)

# The signal is passed as `y=` because recent librosa releases accept the audio
# argument by keyword only; the keyword form also works on older releases.
audio_clip_spectrogram = librosa.feature.melspectrogram(y=cropped_sample_array, sr=VCTK_DATASET_SAMPLING_RATE)
decibel_units = librosa.power_to_db(audio_clip_spectrogram, ref=np.max)
figure, axes = plt.subplots(1, 1, figsize=(10,8))

spectrogram_plot = librosa.display.specshow(decibel_units, x_axis='time',
                                            y_axis='mel', sr=VCTK_DATASET_SAMPLING_RATE, ax=axes)

figure.colorbar(spectrogram_plot, ax=axes, format='%+2.0f dB')
# Decode the transcript bytes instead of slicing the "b'...'" repr, which left
# the surrounding quotes in the title.
axes.set_title(transcript.numpy().decode("utf-8"))
plt.show()

In [None]:
# Embed a player for the cropped recording written in the previous cell.
Audio("outputs/exploratory-data-analysis/" + content_after_one_second_filename,
      rate=VCTK_DATASET_SAMPLING_RATE)

In [None]:
# Keep the first half of the cropped clip, save it and plot its mel spectrogram.
# NOTE(review): this is "approximately one second" only if the cropped clip
# lasts about two seconds; confirm for the selected recording.
approximately_one_second_of_the_sample = cropped_sample_array[: len(cropped_sample_array) // 2]

first_second_clip_filename = "exploratory-data-analysis-track-no-{}-high-res-first-sec.wav".format(chosen_recording)
sf.write("outputs/exploratory-data-analysis/{}".format(first_second_clip_filename), np.int16(approximately_one_second_of_the_sample), VCTK_DATASET_SAMPLING_RATE)

# The signal is passed as `y=` because recent librosa releases accept the audio
# argument by keyword only; the keyword form also works on older releases.
audio_clip_spectrogram = librosa.feature.melspectrogram(y=approximately_one_second_of_the_sample, sr=VCTK_DATASET_SAMPLING_RATE)
decibel_units = librosa.power_to_db(audio_clip_spectrogram, ref=np.max)
figure, axes = plt.subplots(1, 1, figsize=(10, 8))

spectrogram_plot = librosa.display.specshow(decibel_units, x_axis='time',
                                            y_axis='mel', sr=VCTK_DATASET_SAMPLING_RATE, ax=axes)

figure.colorbar(spectrogram_plot, ax=axes, format='%+2.0f dB')
# The recording is chosen at random, so the title is taken from the actual
# transcript rather than from a hard-coded sentence that only matched one run.
axes.set_title(transcript.numpy().decode("utf-8"))
plt.show()

In [None]:
# Embed a player for the roughly one-second clip written in the previous cell.
Audio("outputs/exploratory-data-analysis/" + first_second_clip_filename,
      rate=VCTK_DATASET_SAMPLING_RATE)

In [None]:
# Number of audio samples contained in the roughly one-second clip.
print(f"Number of samples in a clip of approximately 1 second: {len(approximately_one_second_of_the_sample)}")

We notice that the NumPy array holding approximately one second of audio contains close to 48000 samples, which is consistent with the 48 kHz sample rate reported online for the VCTK recordings.

For a chunk of 100 milliseconds (a tenth of a second) from a 48 kHz recording, we need $ \frac{48000}{10} = 4800 $ samples.
An interesting idea would be to try multiple chunk sizes, one for chunks of 100 milliseconds, one for chunks of 250 milliseconds, for chunks of 500 milliseconds and one for chunks of 1 second (for which the model would take more time and data to train).
This is how a chunk of 100 milliseconds looks:

In [None]:
# Take the first 4800 samples (100 ms at 48 kHz) of the one-second clip, save
# them and plot their mel spectrogram.
chunk_of_100_ms = approximately_one_second_of_the_sample[:4800]

clip_of_100_milliseconds_filename = "exploratory-data-analysis-track-no-{}-high-res-first-100-ms.wav".format(chosen_recording)
sf.write("outputs/exploratory-data-analysis/{}".format(clip_of_100_milliseconds_filename), np.int16(chunk_of_100_ms), VCTK_DATASET_SAMPLING_RATE)

# The signal is passed as `y=` because recent librosa releases accept the audio
# argument by keyword only; the keyword form also works on older releases.
audio_clip_spectrogram = librosa.feature.melspectrogram(y=chunk_of_100_ms, sr=VCTK_DATASET_SAMPLING_RATE)
decibel_units = librosa.power_to_db(audio_clip_spectrogram, ref=np.max)
figure, axes = plt.subplots(1, 1, figsize=(10,8))

spectrogram_plot = librosa.display.specshow(decibel_units, x_axis='time',
                                            y_axis='mel', sr=VCTK_DATASET_SAMPLING_RATE, ax=axes)

figure.colorbar(spectrogram_plot, ax=axes, format='%+2.0f dB')
# Decode the transcript bytes instead of slicing the "b'...'" repr, which left
# the surrounding quotes in the title.
axes.set_title(transcript.numpy().decode("utf-8"))
plt.show()

Obviously, the spectrogram doesn't really help in this case, because the audio clip is extremely short.
Here is how it sounds:

In [None]:
# Embed a player for the 100 ms chunk written in the previous cell.
Audio("outputs/exploratory-data-analysis/" + clip_of_100_milliseconds_filename,
      rate=VCTK_DATASET_SAMPLING_RATE)

Indeed, it is a very short sound, but sliding a window over a low-resolution recording and running the model on each small 100-millisecond chunk in the execution script might work properly (this remains to be seen after training).

We can try downsampling the clip and perhaps also interpolating it afterwards to compare the spectrograms.

In [None]:
# Keep every RESAMPLING_FACTOR-th sample, then write and play the result while
# still declaring the original sampling rate.
downsampled_array = np.array(sample_array[::RESAMPLING_FACTOR])
downsampled_recording_filename = "exploratory-data-analysis-track-no-{}-downsampled-without-interpolation.wav".format(chosen_recording)
downsampled_recording_path = "outputs/exploratory-data-analysis/" + downsampled_recording_filename
sf.write(downsampled_recording_path, np.int16(downsampled_array), VCTK_DATASET_SAMPLING_RATE)
Audio(downsampled_recording_path, rate=VCTK_DATASET_SAMPLING_RATE)

Due to the recording getting downsampled by a factor of 4, the pitch of the voice in the WAV file sounds hilariously high and cartoonish. We have to adjust the sample rate properly.

In [None]:
# Overwrite the downsampled file, this time declaring the reduced sampling rate
# so that it plays back at the right speed and pitch.
sf.write("outputs/exploratory-data-analysis/" + downsampled_recording_filename,
         np.int16(downsampled_array),
         DOWNSAMPLED_RATE)
Audio("outputs/exploratory-data-analysis/" + downsampled_recording_filename,
      rate=DOWNSAMPLED_RATE)

Now we can compare the spectrograms of the high-res and low-res clips:

In [None]:
# Mel spectrograms (in dB relative to each signal's peak power) of the original
# recording and of its downsampled version, stacked for comparison.
# The signals are passed as `y=` because recent librosa releases accept the
# audio argument by keyword only; the keyword form also works on older releases.
high_res_audio_clip_spectrogram = librosa.feature.melspectrogram(y=sample_array, sr=VCTK_DATASET_SAMPLING_RATE)
high_res_decibel_units = librosa.power_to_db(high_res_audio_clip_spectrogram, ref=np.max)
low_res_audio_clip_spectrogram = librosa.feature.melspectrogram(y=downsampled_array, sr=DOWNSAMPLED_RATE)
low_res_decibel_units = librosa.power_to_db(low_res_audio_clip_spectrogram, ref=np.max)
figure, axes = plt.subplots(2, 1, figsize=(10, 8))

axes[0].set_title("Low-res")
low_res_spectrogram_plot = librosa.display.specshow(low_res_decibel_units, x_axis='time',
                                                     y_axis='mel', sr=DOWNSAMPLED_RATE, ax=axes[0])

axes[1].set_title("High-res")
high_res_spectrogram_plot = librosa.display.specshow(high_res_decibel_units, x_axis='time',
                            y_axis='mel', sr=VCTK_DATASET_SAMPLING_RATE, ax=axes[1])

figure.tight_layout()
# A single colorbar, driven by the low-res plot, is shared by both panels.
figure.colorbar(low_res_spectrogram_plot, ax=[axes[0], axes[1]], format='%+2.0f dB')
plt.show()

Now we can also interpolate the downsampled signal and compare all 3 versions.

In [None]:
# Upsample the downsampled signal again with the project's DatasetGenerator
# helper, then write and play the result at the original sampling rate.
downsampled_and_interpolated_array = DatasetGenerator.upsample(downsampled_array, RESAMPLING_FACTOR)
downsampled_recording_with_interpolation_filename = "exploratory-data-analysis-track-no-{}-downsampled-with-interpolation.wav".format(chosen_recording)
downsampled_recording_with_interpolation_path = "outputs/exploratory-data-analysis/" + downsampled_recording_with_interpolation_filename
sf.write(downsampled_recording_with_interpolation_path,
         np.int16(downsampled_and_interpolated_array),
         VCTK_DATASET_SAMPLING_RATE)
Audio(downsampled_recording_with_interpolation_path, rate=VCTK_DATASET_SAMPLING_RATE)

In [None]:
# Mel spectrograms (in dB relative to each signal's peak power) of the
# downsampled signal, the downsampled-and-interpolated signal and the original.
# The signals are passed as `y=` because recent librosa releases accept the
# audio argument by keyword only; the keyword form also works on older releases.
high_res_audio_clip_spectrogram = librosa.feature.melspectrogram(y=sample_array, sr=VCTK_DATASET_SAMPLING_RATE)
high_res_decibel_units = librosa.power_to_db(high_res_audio_clip_spectrogram, ref=np.max)
low_res_audio_clip_spectrogram = librosa.feature.melspectrogram(y=downsampled_array, sr=DOWNSAMPLED_RATE)
low_res_decibel_units = librosa.power_to_db(low_res_audio_clip_spectrogram, ref=np.max)
# The interpolated signal is back at the original sampling rate (it is written
# to disk and displayed with VCTK_DATASET_SAMPLING_RATE), so its mel filterbank
# must be built for that rate too. Using DOWNSAMPLED_RATE here mislabelled the
# frequency axis of the middle panel.
low_res_with_interpolation_audio_clip_spectrogram = librosa.feature.melspectrogram(y=downsampled_and_interpolated_array, sr=VCTK_DATASET_SAMPLING_RATE)
low_res_with_interpolation_decibel_units = librosa.power_to_db(low_res_with_interpolation_audio_clip_spectrogram, ref=np.max)
figure, axes = plt.subplots(3, 1, figsize=(12, 8))

axes[0].set_title("Low-res")
low_res_spectrogram_plot = librosa.display.specshow(low_res_decibel_units, x_axis='time',
                                                    y_axis='mel', sr=DOWNSAMPLED_RATE, ax=axes[0])

axes[1].set_title("Low-res and interpolated")
low_res_and_interpolated_spectrogram_plot = librosa.display.specshow(low_res_with_interpolation_decibel_units, x_axis='time',
                                                    y_axis='mel', sr=VCTK_DATASET_SAMPLING_RATE, ax=axes[1])

axes[2].set_title("High-res")
high_res_spectrogram_plot = librosa.display.specshow(high_res_decibel_units, x_axis='time',
                                                     y_axis='mel', sr=VCTK_DATASET_SAMPLING_RATE, ax=axes[2])

figure.tight_layout()
# A single colorbar, driven by the low-res plot, is shared by all three panels.
figure.colorbar(low_res_spectrogram_plot, ax=[axes[0], axes[1], axes[2]], format='%+2.0f dB')
plt.show()

One thing to notice is that interpolating the downsampled signal recovers a few of the high frequency components, but not enough of them.

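Downsampling the signal removes the frequency components above the new Nyquist frequency (half of the downsampled rate); in the spectrograms above, most of the content above roughly 4096 Hz is gone.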

Obviously, the high-resolution recording has harmonics that are much more fine-grained and well-defined when compared to the low-res-and-interpolation version of the recording.

## Some line plots

In [None]:
# Left column: waveforms of the three versions of the recording.
# Right column: magnitude of their discrete Fourier transforms.
figure, axes = plt.subplots(3, 2, figsize=(10, 8))

axes[0, 0].set_title("High-res")
axes[0, 0].plot(sample_array)
axes[1, 0].set_title("Low-res + interpolation")
axes[1, 0].plot(downsampled_and_interpolated_array)
axes[2, 0].set_title("Low-res")
axes[2, 0].plot(downsampled_array)

# np.fft.fft returns complex values; plotting them directly makes matplotlib
# drop the imaginary part, so the magnitude spectrum is plotted instead.
axes[0, 1].set_title("High-res FFT")
axes[0, 1].plot(np.abs(np.fft.fft(sample_array)))
axes[1, 1].set_title("Low-res + interpolation FFT")
axes[1, 1].plot(np.abs(np.fft.fft(downsampled_and_interpolated_array)))
axes[2, 1].set_title("Low-res FFT")
axes[2, 1].plot(np.abs(np.fft.fft(downsampled_array)))

figure.tight_layout()
plt.show()

Of course, the number of samples in the downsampled array is 4 times lower.

We can display the line plots for a small chunk of 100 milliseconds.


In [None]:
# Waveform of the 100 ms chunk in its three versions: original, downsampled,
# and downsampled then upsampled again with the DatasetGenerator helper.
figure, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 8))

high_res_axis, low_res_axis, interpolated_axis = axes

high_res_axis.set_title("Lineplot of the high-resolution 100 millisecond chunk")
high_res_axis.plot(chunk_of_100_ms)

low_res_axis.set_title("Lineplot of the downsampled low-res 100 millisecond chunk")
low_res_axis.plot(chunk_of_100_ms[::RESAMPLING_FACTOR])

interpolated_axis.set_title("Lineplot of the downsampled and interpolated 100 millisecond chunk")
interpolated_axis.plot(
    DatasetGenerator.upsample(chunk_of_100_ms[::RESAMPLING_FACTOR], RESAMPLING_FACTOR)
)

figure.tight_layout()
plt.show()

Nothing can really be inferred from these plots, so let's cut the chunk into even smaller pieces.

In [None]:
# Zoom in on the last 100 samples of the 100 ms chunk and plot the original,
# the downsampled and the downsampled-then-upsampled waveforms.
figure, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 8))

very_short_sample = chunk_of_100_ms[-100:]

high_res_axis, low_res_axis, interpolated_axis = axes

high_res_axis.set_title("High-res line plot")
high_res_axis.plot(very_short_sample)

low_res_axis.set_title("Low-res line plot")
low_res_axis.plot(very_short_sample[::RESAMPLING_FACTOR])

interpolated_axis.set_title("Low-res + interpolation line plot")
interpolated_axis.plot(
    DatasetGenerator.upsample(very_short_sample[::RESAMPLING_FACTOR], RESAMPLING_FACTOR)
)

figure.tight_layout()
plt.show()

As expected, the low-resolution downsampled chunk has rough corners all over the place. The low-resolution + cubic spline interpolation version is smoother
(probably due to the fact that cubic spline interpolation is used, instead of linear interpolation).

It also explains why the old model, trained on extremely short chunks of 256 samples, is disappointing: instead of feeding the
downsampled chunks to the model directly, it first applies cubic spline interpolation to them, which smooths the waveform. This
is probably why the old model had a very poor effect on high-frequency phonemes such as 'S'.

In [None]:
# Left column: waveforms of the last 100 samples of the 100 ms chunk in its
# three versions. Right column: magnitude of their discrete Fourier transforms.
figure, axes = plt.subplots(3, 2, figsize=(24, 24))

# Recomputed here so that the cell does not depend on the previous one.
very_short_sample = chunk_of_100_ms[len(chunk_of_100_ms)-100:]

axes[0, 0].set_title("High-res line plot")
axes[0, 0].plot(very_short_sample)
axes[1, 0].set_title("Low-res line plot")
axes[1, 0].plot(very_short_sample[0::RESAMPLING_FACTOR])
axes[2, 0].set_title("Low-res + interpolation line plot")
axes[2, 0].plot(DatasetGenerator.upsample(very_short_sample[0::RESAMPLING_FACTOR], RESAMPLING_FACTOR))

# np.fft.fft returns complex values; plotting them directly makes matplotlib
# drop the imaginary part, so the magnitude spectrum is plotted instead.
axes[0, 1].set_title("High-res FFT")
axes[0, 1].plot(np.abs(np.fft.fft(very_short_sample)))
axes[1, 1].set_title("Low-res FFT")
axes[1, 1].plot(np.abs(np.fft.fft(very_short_sample[0::RESAMPLING_FACTOR])))
axes[2, 1].set_title("Low-res + interpolation FFT")
axes[2, 1].plot(np.abs(np.fft.fft(DatasetGenerator.upsample(very_short_sample[0::RESAMPLING_FACTOR], RESAMPLING_FACTOR))))

figure.tight_layout()
plt.show()