In [1]:
import pickle
import numpy as np
from scipy.io import wavfile
from util import audio
from hparams import hparams
from scipy import signal
import librosa
import librosa.display
import matplotlib.pyplot as plt


import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D, Input, Layer
from tensorflow.keras.layers import Embedding
from tensorflow.keras import regularizers
from tqdm import tqdm

from absl import app
from absl import flags
from absl import logging

import os

# small_data = 'data/lj/small.pickle'

# #Load small test set
# train_x = None
# train_y = None
# with open(small_data, "rb") as handle:
#     train_x, train_y = pickle.load(handle)
# print(train_x[0])
# print(train_y[0])

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [37]:
# Training parameters
num_epochs = 100
batch_size = 32
lr = 0.001
filters = 512
# Placeholders: no value has been chosen for these yet.
steps_per_epoch = loss = optimizer = None

# Dataset location and output directory (created up front if missing).
sample_rate = 22050 #Hz 16 bit PCM WAV files
wav_dir = './LJSPEECH-1.1/'
out_dir = './output'
os.makedirs(out_dir, exist_ok=True)

# STFT hop and window sizes in samples, converted from the millisecond
# durations in hparams. The original author tagged hop_length with "#change".
# NOTE(review): these use hparams.sample_rate, not the sample_rate constant
# above -- confirm the two agree.
hop_length, win_length = (
    int(duration_ms / 1000 * hparams.sample_rate)
    for duration_ms in (hparams.frame_shift_ms, hparams.frame_length_ms)
)
n_mels = 128
n_fft = 2048

def preemphasis(x, coef=None):
    """Apply a first-order pre-emphasis FIR filter to a signal.

    Computes ``y[n] = x[n] - coef * x[n-1]`` along the last axis of ``x``
    using ``scipy.signal.lfilter``.

    Args:
        x: Array-like signal to filter.
        coef: Pre-emphasis coefficient. When omitted (``None``) the value of
            ``hparams.preemphasis`` is used, which is the original behaviour.

    Returns:
        numpy.ndarray holding the filtered signal, same shape as ``x``.
    """
    if coef is None:
        # Resolved at call time so later changes to hparams are honoured.
        coef = hparams.preemphasis
    return signal.lfilter([1, -coef], [1], x)
def inv_preemphasis(x, coef=None):
    """Undo pre-emphasis with the matching first-order IIR filter.

    Computes ``y[n] = x[n] + coef * y[n-1]`` along the last axis of ``x``
    using ``scipy.signal.lfilter``; with the same ``coef`` this inverts
    ``y[n] = x[n] - coef * x[n-1]``.

    Args:
        x: Array-like pre-emphasised signal.
        coef: Pre-emphasis coefficient to invert. When omitted (``None``) the
            value of ``hparams.preemphasis`` is used, which is the original
            behaviour.

    Returns:
        numpy.ndarray holding the de-emphasised signal, same shape as ``x``.
    """
    if coef is None:
        # Resolved at call time so later changes to hparams are honoured.
        coef = hparams.preemphasis
    return signal.lfilter([1], [1, -coef], x)

In [27]:
# print(train_y[0])
# wavfile.write("test.wav", sample_rate, train_y[0])

In [53]:
#####PREPROCESSING#######
# For every row of metadata.csv: print the transcript and the file's native
# sampling rate, compute a normalised dB spectrogram and a mel spectrogram,
# write a reconstructed waveform for listening, and save both feature arrays
# (transposed) as .npy files in out_dir.

ind = 1
with open(os.path.join(wav_dir, 'metadata.csv'), encoding='utf-8') as f:
    for inp in f:
        # Skip blank rows; otherwise data[2] below raises IndexError.
        if not inp.strip():
            continue
        data = inp.strip().split('|')
        wav_file = os.path.join(wav_dir, 'wavs/' + data[0] + '.wav')
        text = data[2]
        print(text)

        # Loaded with sr=None only to report the file's native sampling rate;
        # the samples themselves are not used further.
        native_wav, rate = librosa.core.load(wav_file, sr=None)
        print(rate)

        # Waveform used for all features below.
        # NOTE(review): audio.load_wav is a project helper -- confirm which
        # sampling rate it returns.
        wav = audio.load_wav(wav_file)

        #Compute Spectrogram: pre-emphasis -> STFT magnitude -> dB relative to
        # the per-utterance maximum -> project normalisation.
        emphasized = preemphasis(wav)
        spectrogram = np.abs(librosa.stft(emphasized, n_fft=n_fft, hop_length=hop_length, win_length=win_length))
        spectrogram = librosa.amplitude_to_db(spectrogram, ref=np.max)
        spectrogram = audio._normalize(spectrogram)

        # Number of STFT frames (second axis of the librosa.stft output).
        # Was `spectrogram[1]`, which selects a row of values instead.
        # Assumes audio._normalize keeps the array shape -- TODO confirm.
        n_frames = spectrogram.shape[1]

        # Reconstruct audio from the spectrogram as a listening check. The
        # file name is relative, so it is not placed under out_dir.
        waveform = audio.inv_spectrogram(spectrogram)
        audio.save_wav(waveform, 'step-%d-audio.wav' % ind)

        #Compute Melspectrogram
        # NOTE(review): sr uses the notebook's sample_rate constant while
        # hop_length/win_length derive from hparams.sample_rate -- confirm
        # they agree.
        mel_spectrogram = librosa.feature.melspectrogram(y = wav, sr = sample_rate, n_fft=n_fft, 
                                                         hop_length=hop_length, 
                                                         win_length = win_length,
                                                         n_mels=n_mels)
        # melspectrogram on log scale (currently disabled; saved as returned)
#         mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

        #file saving (arrays are transposed before being written)
        spectrogram_filename = 'ljspeech-spec-%05d.npy' % ind
        mel_filename = 'ljspeech-mel-%05d.npy' % ind
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

        ind += 1
        


Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition,,,,,,,,,,,,,,,
22050


  out = out_full[ind]


in being comparatively modern.,,,,,,,,,,,,,,,,,,,
22050
For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process,,,,,,,,,,,,,,,,,
22050


In [25]:
# Sanity check: reload the first mel spectrogram written by preprocessing.
# The path is built from out_dir and the same filename pattern used when
# saving, so this cell keeps working if out_dir is changed.
np.load(os.path.join(out_dir, 'ljspeech-mel-%05d.npy' % 1))

array([[8.5631573e-06, 3.2648091e-05, 4.1935720e-05, ..., 3.9799815e-07,
        1.1173992e-06, 9.5501946e-07],
       [1.8225535e-05, 8.8215995e-05, 1.5128181e-04, ..., 5.3997394e-07,
        1.6153452e-06, 1.3832307e-06],
       [2.7657237e-05, 1.8380619e-04, 3.5688866e-04, ..., 6.9680118e-07,
        2.0345626e-06, 1.7245667e-06],
       ...,
       [4.7496592e-06, 7.8812947e-05, 3.5650693e-05, ..., 3.8170538e-07,
        1.4434181e-07, 2.6579690e-08],
       [1.0110153e-05, 6.9919974e-05, 5.7727666e-05, ..., 3.3499708e-07,
        1.7116160e-07, 3.1338125e-08],
       [1.3446468e-05, 6.3574065e-05, 8.6454798e-05, ..., 2.9122504e-07,
        1.7826329e-07, 3.3713881e-08]], dtype=float32)

In [30]:
#####PREPROCESSING#######
# For every row of metadata.csv: print the transcript, compute the project's
# spectrogram and a mel spectrogram, write a reconstructed waveform for
# listening, and save both feature arrays (transposed) as .npy files in
# out_dir.
# NOTE(review): this notebook contains another preprocessing cell that writes
# the same output files -- confirm which one is meant to be kept.

ind = 1
with open(os.path.join(wav_dir, 'metadata.csv'), encoding='utf-8') as f:
    for inp in f:
        # Skip blank rows; otherwise data[2] below raises IndexError.
        if not inp.strip():
            continue
        data = inp.strip().split('|')
        wav_file = os.path.join(wav_dir, 'wavs/' + data[0] + '.wav')
        text = data[2]
        print(text)
        wav = audio.load_wav(wav_file)

        #Compute Spectrogram
        spectrogram = audio.spectrogram(wav)

        # Reconstruct audio from the spectrogram as a listening check. The
        # file name is relative, so it is not placed under out_dir.
        waveform = audio.inv_spectrogram(spectrogram)
        audio.save_wav(waveform, 'step-%d-audio.wav' % ind)

        #Compute Melspectrogram
        # NOTE(review): no win_length is passed here, unlike the other
        # preprocessing cell, so librosa's default window length applies --
        # confirm that is intended.
        mel_spectrogram = librosa.feature.melspectrogram(y = wav, sr = sample_rate, n_fft=n_fft, 
                                                         hop_length=hop_length, n_mels=n_mels)
        # melspectrogram on log scale (currently disabled; saved as returned)
#         mel_spectrogram = librosa.power_to_db(mel_spectrogram, ref=np.max)

        #file saving (arrays are transposed before being written)
        spectrogram_filename = 'ljspeech-spec-%05d.npy' % ind
        mel_filename = 'ljspeech-mel-%05d.npy' % ind
        np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
        np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

        ind += 1
        


Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition,,,,,,,,,,,,,,,


  out = out_full[ind]


in being comparatively modern.,,,,,,,,,,,,,,,,,,,
For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process,,,,,,,,,,,,,,,,,
