<a href="https://colab.research.google.com/github/wojciechsadlik/MelGAN-VC-ThesisExperiments/blob/master/MelGAN_VC_generating_spectrograms.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#We'll be using TF 2.x and a pinned torchaudio
#(the magic below only requests the TensorFlow 2.x line, not a specific 2.1 release)

try:
  #notebook magic asking the runtime for TensorFlow 2.x
  %tensorflow_version 2.x
except Exception:
  #best effort: carry on with whichever TensorFlow is already installed
  pass
import tensorflow as tf
!pip install soundfile                    #to save wav files
#--no-deps installs only torchaudio itself, so pip does not pull in or replace its dependencies
!pip install --no-deps torchaudio==0.5.0

Collecting torchaudio==0.5.0
  Downloading torchaudio-0.5.0-cp37-cp37m-manylinux1_x86_64.whl (3.2 MB)
[K     |████████████████████████████████| 3.2 MB 23.8 MB/s 
[?25hInstalling collected packages: torchaudio
  Attempting uninstall: torchaudio
    Found existing installation: torchaudio 0.10.0+cu111
    Uninstalling torchaudio-0.10.0+cu111:
      Successfully uninstalled torchaudio-0.10.0+cu111
Successfully installed torchaudio-0.5.0


In [2]:
#Mount Google Drive at /content/drive.
#The dataset paths used later in this notebook (DATASET_BASE_PATH) live under /content/drive/MyDrive,
#so this cell has to run before any audio is read or any spectrogram is written.

import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Imports

from __future__ import print_function, division
from glob import glob
import scipy
import soundfile as sf
import matplotlib.pyplot as plt
from IPython.display import clear_output
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Concatenate, Conv2D, Conv2DTranspose, GlobalAveragePooling2D, UpSampling2D, LeakyReLU, ReLU, Add, Multiply, Lambda, Dot, BatchNormalization, Activation, ZeroPadding2D, Cropping2D, Cropping1D
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.initializers import TruncatedNormal, he_normal
import tensorflow.keras.backend as K
import datetime
import numpy as np
import random
import matplotlib.pyplot as plt
import collections
from PIL import Image
from skimage.transform import resize
import imageio
import librosa
import librosa.display
from librosa.feature import melspectrogram
import os
import time
import IPython
import shutil

In [4]:
#Hyperparameters

hop=192               #hop size in samples (STFT window size = 6*hop)
sr=16000              #sampling rate
min_level_db=-100     #dB floor used by normalize() to map spectrograms into [-1, 1]
ref_level_db=20       #dB offset subtracted from spectrograms before normalization

shape=24              #length of time axis of split spectrograms to feed to generator
vec_len=128           #length of vector generated by siamese network
bs = 16               #batch size
delta = 2.            #constant for siamese loss

In [5]:
#There seems to be a problem with Tensorflow STFT, so we'll be using pytorch to handle offline mel-spectrogram generation and waveform reconstruction
#For waveform reconstruction, a gradient-based method is used:

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
from functools import partial
import math
import heapq
from torchaudio.transforms import MelScale, Spectrogram

#torch.set_default_tensor_type('torch.cuda.FloatTensor')

#Power spectrogram (power=2) with FFT size and window length of 6*hop samples and a hop of `hop` samples
specobj = Spectrogram(n_fft=6*hop, win_length=6*hop, hop_length=hop, pad=0, power=2, normalized=True)
specfunc = specobj.forward
#Mel projection with n_mels=hop, i.e. as many mel bins as the hop size
#NOTE(review): n_stft is not passed here; presumably this relies on how the pinned torchaudio version sizes the filterbank - confirm before upgrading torchaudio
melobj = MelScale(n_mels=hop, sample_rate=sr, f_min=0.)
melfunc = melobj.forward

def melspecfunc(waveform):
  """Return the mel-scaled spectrogram of `waveform`.

  The waveform is first passed through `specfunc`, and that result is then
  projected onto the mel scale with `melfunc`.
  """
  return melfunc(specfunc(waveform))

def normalize(S):
  """Linearly rescale `S` so that `min_level_db` maps to -1 and 0 maps to 1, clipped to [-1, 1]."""
  unit_scaled = (S - min_level_db) / -min_level_db
  centered = unit_scaled * 2. - 1.
  return np.clip(centered, -1, 1)

def melprep(wv,hop=192):
  """Turn a waveform into a normalized mel spectrogram in dB.

  `wv` is flattened into a single-row tensor, run through `melspecfunc`,
  converted to dB with `librosa.power_to_db`, shifted down by `ref_level_db`
  and finally passed through `normalize`.

  `hop` is accepted but not used by the body.
  """
  batch = torch.Tensor(wv).view(1,-1)
  mel = torch.squeeze(melspecfunc(batch)).detach().cpu()
  mel_db = librosa.power_to_db(np.array(mel)) - ref_level_db
  return normalize(mel_db)

def stftprep(wv,hop=192):
  """Turn a waveform into a normalized STFT power spectrogram in dB.

  Same pipeline as the mel variant but without the mel projection: the
  flattened waveform goes through `specfunc`, is converted to dB, offset by
  `ref_level_db` and normalized.

  `hop` is accepted but not used by the body.
  """
  as_row = torch.Tensor(wv).view(1,-1)
  power_spec = torch.squeeze(specfunc(as_row))
  power_spec = np.array(power_spec.detach().cpu())
  return normalize(librosa.power_to_db(power_spec) - ref_level_db)

In [10]:
#Helper functions

#Generate spectrograms from waveform array
def to_melspec(data):
  """Convert every waveform in `data` into a normalized mel spectrogram.

  Returns a 1-D object array with one float32 array per waveform, each with a
  trailing singleton axis appended. The shape of the returned array is
  printed before returning.
  """
  count = data.shape[0]
  specs = np.empty(count, dtype=object)
  for idx in range(count):
    mel = np.array(melprep(data[idx]), dtype=np.float32)
    specs[idx] = mel[..., np.newaxis]
  print(specs.shape)
  return specs

def to_stftspec(data):
  """Convert every waveform in `data` into a normalized STFT spectrogram.

  Returns a 1-D object array with one float32 array per waveform, each with a
  trailing singleton axis appended. The shape of the returned array is
  printed before returning.
  """
  total = data.shape[0]
  specs = np.empty(total, dtype=object)
  for pos in range(total):
    stft = stftprep(data[pos])
    specs[pos] = np.expand_dims(np.array(stft, dtype=np.float32), axis=-1)
  print(specs.shape)
  return specs

#Waveform array from path of folder containing wav files
def audio_array(path):
  """Load every ``*.wav`` file directly inside `path` as a float32 waveform.

  Files are visited in sorted order so repeated runs produce the same
  ordering. Files that cannot be read or decoded are reported with
  'Unable to load:' and skipped.

  Args:
    path: folder containing the wav files.

  Returns:
    (waveforms, filenames). `filenames` holds the base name of every file that
    was loaded, in the same order as `waveforms`. When all clips have the same
    shape `waveforms` is a regular stacked array; otherwise it is a 1-D object
    array with one waveform per element.
  """
  waveforms = []
  filenames = []
  #glob order depends on the filesystem; sort it so the output is reproducible
  for wav_path in sorted(glob(f'{path}/*.wav')):
    try:
      #second argument requests a single channel; the decoded sample rate is not needed here
      samples, _ = tf.audio.decode_wav(tf.io.read_file(wav_path), 1)
      samples = np.array(samples, dtype=np.float32)
    except (UnicodeDecodeError, tf.errors.OpError):
      #skip unreadable files instead of aborting the whole folder
      print('Unable to load:\n' + wav_path)
      continue
    waveforms.append(samples)
    filenames.append(os.path.basename(wav_path))

  #same-shaped clips (or no clips at all) stack into an ordinary array
  if len({clip.shape for clip in waveforms}) <= 1:
    return np.array(waveforms), filenames

  #clips of different length: np.array() on such a list raises ValueError on
  #recent NumPy, so fill the object array element by element instead
  ragged = np.empty(len(waveforms), dtype=object)
  for idx, clip in enumerate(waveforms):
    ragged[idx] = clip
  return ragged, filenames

In [11]:
def save_spectrograms(dir_path, spectrograms, filenames, force_recreate=False):
  """Write each spectrogram into `dir_path` as an individual ``.npy`` file.

  Args:
    dir_path: output directory; created (including parents) when missing.
    spectrograms: sequence of arrays to save.
    filenames: file name for each spectrogram, in the same order. ``np.save``
      appends ``.npy`` when the name does not already end with it.
    force_recreate: when `dir_path` already exists, delete it and write the
      set again instead of leaving it untouched.

  Raises:
    ValueError: if there are fewer filenames than spectrograms.
  """
  if os.path.isdir(dir_path) and not force_recreate:
    print('Set ' + dir_path + ' already exists')
    return

  #validate before touching the disk so a bad call cannot wipe an existing set
  if len(filenames) < len(spectrograms):
    raise ValueError(
        'Got ' + str(len(spectrograms)) + ' spectrograms but only ' +
        str(len(filenames)) + ' filenames')

  if os.path.isdir(dir_path):
    shutil.rmtree(dir_path)
  os.makedirs(dir_path)

  #zip stops at the last spectrogram; surplus filenames are ignored
  for name, spectrogram in zip(filenames, spectrograms):
    np.save(os.path.join(dir_path, name), spectrogram)

In [14]:
#Root folder of the dataset on the mounted Google Drive
DATASET_BASE_PATH = '/content/drive/MyDrive/GTZAN_dataset'
#Folder with the source wav files, one sub-folder per genre (see generate_spectrograms_dataset)
#NOTE(review): the folder name suggests the clips were resampled to 16 kHz beforehand - confirm this matches `sr`
WAV_DATASET_PATH = os.path.join(DATASET_BASE_PATH, 'genres_16khz')

def generate_spectrograms_dataset(genre):
  """Create the mel and STFT spectrogram sets for one genre.

  The wav files are read from ``WAV_DATASET_PATH/<genre>``. The mel set is
  written to ``DATASET_BASE_PATH/genres_melspectrograms/<genre>`` and the STFT
  set to ``DATASET_BASE_PATH/genres_stftspectrograms/<genre>``.
  """
  waveforms, names = audio_array(os.path.join(WAV_DATASET_PATH, genre))

  #mel first, then STFT
  conversions = (
      ('genres_melspectrograms', to_melspec),
      ('genres_stftspectrograms', to_stftspec),
  )
  for subset, convert in conversions:
    specs = convert(waveforms)
    save_spectrograms(os.path.join(DATASET_BASE_PATH, subset, genre), specs, names)

In [15]:
#Genres to convert; each name is used as a sub-folder of WAV_DATASET_PATH
genres = ['jazz', 'classical']

#Write the mel and STFT spectrogram sets for every listed genre
for genre in genres:
  generate_spectrograms_dataset(genre)



(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2530, 1)
(192, 2502, 1)
(192, 2501, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2517, 1)
(192, 2502, 1)
(192, 2515, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2523, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2521, 1)
(192, 2541, 1)
(192, 2502, 1)
(192, 2502, 1)
(192, 2530

KeyboardInterrupt: ignored

In [None]:
#Scratch check: load the jazz clips and convert them to mel spectrograms without saving anything
awv, filenames = audio_array(os.path.join(WAV_DATASET_PATH, 'jazz'))
aspec = to_melspec(awv)

Unable to load:
/content/drive/MyDrive/GTZAN_dataset/genres_original/jazz/jazz.00054.wav




(99,)


In [None]:
#Shape of the first spectrogram in aspec
aspec[0].shape

(192, 3447, 1)