In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import IPython
from IPython.display import clear_output

import torch
import torch.utils.data as data
import torchvision.transforms as transforms
import numpy as np
from IPython.display import Audio
import librosa
import librosa.display
from tqdm import tqdm
import h5py
import os
from pathlib import Path

from GANsynth_pytorch.pytorch_nsynth_lib.nsynth import NSynth

from GANsynth_pytorch import phase_operation

In [None]:
from GANsynth_pytorch import spec_ops as spec_ops
from GANsynth_pytorch import phase_operation as phase_op
from GANsynth_pytorch import spectrograms_helper as spec_helper

In [None]:
# Dataset split to process; every path below is derived from it
subset = 'valid'

# Input (json + wav) and output (HDF5) locations under the split's root directory
base_path = Path(f'~/code/data/nsynth/{subset}').expanduser()
json_wav_path = base_path / 'json_wav/'
h5_save_path = base_path / 'hdf5_float32_mel/no_target/'

# Hop size, in samples, used for the STFT-based plots of this notebook
HOP_LENGTH = 512

# Per the original note, audio samples are loaded as int16 numpy arrays:
# dividing by the largest int16 value rescales them to floats in about [-1, 1]
INT16_MAX = np.iinfo(np.int16).max
toFloat = transforms.Lambda(lambda x: x / INT16_MAX)

# Optional arguments left disabled in the original notebook:
#   blacklist_pattern=["string"]  (blacklist string instruments)
#   categorical_field_list=["instrument_family", "pitch"]
dataset = NSynth(str(json_wav_path), transform=toFloat)

# Sampling rate taken from the metadata of the first item;
# assumes a constant sampling rate across the whole dataset
FS_HZ = dataset[0][2]['sample_rate']

# One sample per batch: sequential order for the export, random order for the previews
loader = data.DataLoader(dataset, shuffle=False, batch_size=1)
shuffled_loader = data.DataLoader(dataset, shuffle=True, batch_size=1)

In [None]:
# Sanity check: show the raw content of the first dataset item
first_item = dataset[0]
print(first_item)

In [None]:
def expand(mat):
    """Pad a 2D matrix with two extra copies of its last column.

    The last column is looked up dynamically, so the function works for any
    matrix width (the original implementation only behaved as documented for
    inputs with exactly 126 columns).

    Args:
        mat: 2D array-like of shape (rows, columns), with at least one column.

    Returns:
        A new numpy array of shape (rows, columns + 2) whose two trailing
        columns are copies of the last column of `mat`.

    Raises:
        ValueError: if `mat` is not two-dimensional or has no column.
    """
    mat = np.asarray(mat)
    if mat.ndim != 2 or mat.shape[1] == 0:
        raise ValueError(
            f"expected a 2D matrix with at least one column, got shape {mat.shape}")
    # slicing with `-1:` keeps the column two-dimensional, as required by hstack
    last_column = mat[:, -1:]
    return np.hstack((mat, last_column, last_column))

In [None]:
def plot_representations(magnitude: np.ndarray, angle: np.ndarray,
                         IF: np.ndarray,
                         log_melspec: np.ndarray, mel_IF: np.ndarray
                        ) -> None:
    """Plot five time-frequency representations side by side.

    The panels are drawn on a single row, in this order: `magnitude`,
    `angle` and `IF` on a linear frequency axis, then `log_melspec` and
    `mel_IF` on a mel frequency axis. The sampling rate and hop length
    passed to librosa are the notebook-level `FS_HZ` and `HOP_LENGTH`.

    Args:
        magnitude: array shown in the 'Spectrogram' panel.
        angle: array shown in the 'Phase' panel.
        IF: array shown in the 'IF' panel.
        log_melspec: array shown in the 'log-melspectrogram' panel.
        mel_IF: array shown in the 'mel-IF' panel.
    """
    # (data, frequency axis type, panel title); the mel panels use the
    # function arguments rather than same-purpose notebook globals
    panels = (
        (magnitude, 'linear', 'Spectrogram'),
        (angle, 'linear', "Phase"),
        (IF, 'linear', "IF"),
        (log_melspec, 'mel', "log-melspectrogram"),
        (mel_IF, 'mel', "mel-IF"),
    )

    for panel_index, (values, y_axis, title) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), panel_index)
        librosa.display.specshow(values, sr=FS_HZ, hop_length=HOP_LENGTH,
                                 y_axis=y_axis)
        plt.title(title)

    fig = plt.gcf()
    fig.set_size_inches(12, 6)
    plt.tight_layout()

    plt.show()
    
def plot_mel_representations_only(log_melspec: np.ndarray, mel_IF: np.ndarray
                                 ) -> None:
    """Plot the two mel-scaled representations side by side.

    Both panels use a mel frequency axis; the sampling rate and hop length
    passed to librosa are the notebook-level `FS_HZ` and `HOP_LENGTH`.

    Args:
        log_melspec: array shown in the 'log-melspectrogram' panel.
        mel_IF: array shown in the 'mel-IF' panel.
    """
    # plot the function arguments, not same-purpose notebook globals
    panels = (
        (log_melspec, "log-melspectrogram"),
        (mel_IF, "mel-IF"),
    )

    for panel_index, (values, title) in enumerate(panels, start=1):
        plt.subplot(1, len(panels), panel_index)
        librosa.display.specshow(values, sr=FS_HZ, hop_length=HOP_LENGTH,
                                 y_axis='mel')
        plt.title(title)

    plt.tight_layout()

    plt.show()

# Visualization of the computed representations

Re-run the cell below to visualize the representations of a different, randomly drawn input sample!

In [None]:
# Draw random samples until one has a pitch within [24, 84], the same range
# the HDF5 export loop keeps, so that the preview only shows samples that
# would actually be stored
shuffled_loader_iterator = iter(shuffled_loader)
for samples, _label, targets in shuffled_loader_iterator:
    # the pitch is read from the metadata of the item
    pitch = targets['pitch'].data.numpy()[0]
    if 24 <= pitch <= 84:
        break
else:
    raise RuntimeError('no sample with a pitch within [24, 84] in the dataset')

sample_name = targets['note_str'][0]

# batch of size one -> drop the batch dimension
sample = samples.data.numpy().squeeze()
spec = librosa.stft(sample, n_fft=2048, hop_length=HOP_LENGTH)

# log-magnitude (offset avoids log(0)) and phase; only the first 1024
# frequency rows are kept
magnitude = np.log(np.abs(spec) + 1.0e-6)[:1024]
angle = np.angle(spec)

IF = phase_operation.instantaneous_frequency(angle, time_axis=1)[:1024]

# pad the time axis with two copies of the last frame
magnitude = expand(magnitude)
IF = expand(IF)

# check the shapes before handing the arrays to the mel conversion
assert magnitude.shape == (1024, 128)
assert IF.shape == (1024, 128)

logmelmag2, mel_p = spec_helper.specgrams_to_melspecgrams(magnitude, IF)

plot_representations(magnitude, angle[:1024], IF, logmelmag2, mel_p)
plot_mel_representations_only(logmelmag2, mel_p)

In [None]:
import random

# Sanity check of the export: plot the content of one randomly chosen HDF5
# file from the export directory `h5_save_path`
stored_sample_paths = sorted(h5_save_path.glob('*.h5'))

if not stored_sample_paths:
    # nothing exported yet: report it instead of failing on an empty choice
    print(f'No .h5 file found in {h5_save_path}, run the export cell first.')
else:
    random_stored_sample = random.choice(stored_sample_paths)
    with h5py.File(random_stored_sample, 'r') as sample_file:
        # `[()]` reads each dataset fully into memory as a numpy array, so the
        # values remain usable once the file is closed
        IF = sample_file['IF'][()]
        logmelmag2 = sample_file['mel_Spec'][()]
        mel_p = sample_file['mel_IF'][()]

    # (data, frequency axis type, panel title)
    stored_panels = (
        (logmelmag2, 'mel', "log-melspectrogram"),
        (IF, 'linear', "IF"),
        (mel_p, 'mel', "mel-IF"),
    )
    for panel_index, (values, y_axis, title) in enumerate(stored_panels, start=1):
        plt.subplot(1, len(stored_panels), panel_index)
        librosa.display.specshow(values, sr=FS_HZ, hop_length=HOP_LENGTH,
                                 y_axis=y_axis)
        plt.title(title)

    plt.tight_layout()

    plt.show()

In [None]:
# Set to True to resume an interrupted export without recomputing existing files
SKIP_EXISTING = False
# Number of samples written so far during this run
count = 0

# h5py does not create missing parent directories: make sure the output
# directory exists before writing the first file
h5_save_path.mkdir(parents=True, exist_ok=True)

for samples, _label, targets in tqdm(loader):
    sample_name = targets['note_str'][0]
    h5_file_path = h5_save_path / f'{sample_name}.h5'

    if SKIP_EXISTING and h5_file_path.is_file():
        # skip already created file
        continue

    # the pitch is read from the metadata of the item
    pitch = targets['pitch'].data.numpy()[0]

    if pitch < 24 or pitch > 84:
        # filter-out extreme pitches, as advocated by GANSynth
        continue

    # batch of size one -> drop the batch dimension
    sample = samples.data.numpy().squeeze()
    # use the notebook-level hop length so that the stored representations
    # match the time axis assumed by the plotting helpers
    spec = librosa.stft(sample, n_fft=2048, hop_length=HOP_LENGTH)

    # log-magnitude (offset avoids log(0)) and phase; only the first 1024
    # frequency rows are kept
    magnitude = np.log(np.abs(spec) + 1.0e-6)[:1024]
    angle = np.angle(spec)

    IF = phase_operation.instantaneous_frequency(angle, time_axis=1)[:1024]

    # pad the time axis with two copies of the last frame
    magnitude = expand(magnitude)
    IF = expand(IF)

    # check the shapes before handing the arrays to the mel conversion
    assert magnitude.shape == (1024, 128)
    assert IF.shape == (1024, 128)

    logmelmag2, mel_p = spec_helper.specgrams_to_melspecgrams(magnitude, IF)

    # one file per sample: four float32 datasets plus the pitch as an attribute
    with h5py.File(h5_file_path, 'w') as sample_file:
        sample_file.create_dataset("Spec", data=magnitude.astype(np.float32))
        sample_file.create_dataset("IF", data=IF.astype(np.float32))
        sample_file.create_dataset("mel_Spec", data=logmelmag2.astype(np.float32))
        sample_file.create_dataset("mel_IF", data=mel_p.astype(np.float32))
        sample_file.attrs.create("pitch", data=pitch)

    if count % 500 == 0:
        # periodic visual and audio check of the sample just written
        clear_output(wait=True)
        plot_representations(magnitude, angle[:1024], IF, logmelmag2, mel_p)
        # play back at the dataset sampling rate instead of a hard-coded value
        IPython.display.display(IPython.display.Audio(sample, rate=FS_HZ))
    count += 1