# data_load.py

In [2]:
import numpy as np
import tensorflow as tf
import codecs
import re
import os
import unicodedata

In [3]:
# Special tokens plus the punctuation and space characters kept in the symbol table.
PAD = "_"
EOS = "~"
PUNC = "!'(),-.:;?"
SPACE = " "

# Each group is a contiguous run of code points (end of range exclusive):
# U+1100..U+1112, U+1161..U+1175 and U+11A8..U+11C2 respectively.
JAMO_LEADS = "".join(map(chr, range(0x1100, 0x1113)))
JAMO_VOWELS = "".join(map(chr, range(0x1161, 0x1176)))
JAMO_TAILS = "".join(map(chr, range(0x11A8, 0x11C3)))

# Full symbol table: PAD first, EOS second, then every valid character.
VALID_CHARS = "".join((JAMO_LEADS, JAMO_VOWELS, JAMO_TAILS, PUNC, SPACE))
ALL_SYMBOLS = "".join((PAD, EOS, VALID_CHARS))

vocab = ALL_SYMBOLS

In [4]:
# English character vocabulary; this rebinding replaces the symbol table assigned
# to `vocab` in the previous cell. "P" sits at index 0 (presumably the padding
# symbol -- TODO confirm) and "E" at index 1 (appended as the EOS marker by load_data).
vocab = "PE abcdefghijklmnopqrstuvwxyz'.?"

In [5]:
def load_vocab():
    """Build the two lookup tables for the module-level `vocab` string.

    Returns:
      A tuple `(char2idx, idx2char)`: a dict mapping each character of
      `vocab` to its position, and the inverse dict mapping each position
      back to its character.
    """
    char2idx, idx2char = {}, {}
    for position, symbol in enumerate(vocab):
        char2idx[symbol] = position
        idx2char[position] = symbol
    return char2idx, idx2char

In [6]:
load_vocab()

({'P': 0,
  'E': 1,
  ' ': 2,
  'a': 3,
  'b': 4,
  'c': 5,
  'd': 6,
  'e': 7,
  'f': 8,
  'g': 9,
  'h': 10,
  'i': 11,
  'j': 12,
  'k': 13,
  'l': 14,
  'm': 15,
  'n': 16,
  'o': 17,
  'p': 18,
  'q': 19,
  'r': 20,
  's': 21,
  't': 22,
  'u': 23,
  'v': 24,
  'w': 25,
  'x': 26,
  'y': 27,
  'z': 28,
  "'": 29,
  '.': 30,
  '?': 31},
 {0: 'P',
  1: 'E',
  2: ' ',
  3: 'a',
  4: 'b',
  5: 'c',
  6: 'd',
  7: 'e',
  8: 'f',
  9: 'g',
  10: 'h',
  11: 'i',
  12: 'j',
  13: 'k',
  14: 'l',
  15: 'm',
  16: 'n',
  17: 'o',
  18: 'p',
  19: 'q',
  20: 'r',
  21: 's',
  22: 't',
  23: 'u',
  24: 'v',
  25: 'w',
  26: 'x',
  27: 'y',
  28: 'z',
  29: "'",
  30: '.',
  31: '?'})

In [7]:
def text_normalize(text):
    """Reduce `text` to lower-case characters drawn from the module-level `vocab`.

    The text is decomposed to NFD and its combining marks (Unicode category
    ``Mn``) are dropped, then it is lower-cased, every character that is not
    in `vocab` is replaced by a space, and runs of spaces are collapsed.

    Args:
      text: A string.

    Returns:
      The normalized string. Leading/trailing spaces are not stripped.
    """
    decomposed = unicodedata.normalize('NFD', text)
    text = ''.join(char for char in decomposed
                   if unicodedata.category(char) != 'Mn')  # Strip accents

    text = text.lower()
    # re.escape keeps every vocab character literal inside the character class;
    # unescaped, symbols such as "-", "]", "^" or "\" would change the pattern.
    text = re.sub("[^{}]".format(re.escape(vocab)), " ", text)
    text = re.sub("[ ]+", " ", text)
    return text

In [8]:
text_normalize('hello everybody')

'hello everybody'

In [13]:
# Root directory of the speech corpus; metadata.csv and the wavs/ folder are
# looked up underneath it.
data = "/home/jeon/Downloads/LJSpeech-1.1/"
# data = "/data/private/voice/nick"
# Text file holding the test sentences (its first line is skipped when parsed).
test_data = 'harvard_sentences.txt'

## def load_data

In [14]:
def load_data(mode="train"):
    """Load transcripts for training/evaluation, or test sentences otherwise.

    Args:
      mode: "train" or "eval" reads `metadata.csv` under the module-level
        `data` directory ("train" skips the first line, "eval" keeps only the
        first line). Any other value reads the module-level `test_data` file.

    Returns:
      For "train"/"eval": a tuple `(fpaths, text_lengths, texts)` of parallel
      lists -- wav file paths, number of symbols per transcript (EOS
      included), and the int32 symbol ids of each transcript serialized to
      raw bytes.
      Otherwise: an int32 array of shape (num_sentences, maxlen) holding the
      symbol ids of each sentence, right-padded with zeros.
    """
    # Load vocabulary
    char2idx, idx2char = load_vocab()

    if mode in ("train", "eval"):
        # Parse
        fpaths, text_lengths, texts = [], [], []
        transcript = os.path.join(data, 'metadata.csv')
        # Context manager so the file handle is closed once the lines are read.
        with codecs.open(transcript, 'r', 'utf-8') as f:
            lines = f.readlines()
        if mode == "train":
            lines = lines[1:]
        else:  # We attack only one sample!
            lines = lines[:1]

        for line in lines:
            fname, _, text = line.strip().split("|")

            fpath = os.path.join(data, "wavs", fname + ".wav")
            fpaths.append(fpath)

            text = text_normalize(text) + "E"  # E: EOS
            text = [char2idx[char] for char in text]
            text_lengths.append(len(text))
            # tobytes() yields the same raw bytes as the deprecated tostring().
            texts.append(np.array(text, np.int32).tobytes())

        return fpaths, text_lengths, texts
    else:
        # Parse. The path comes from the module-level `test_data`; no `hp`
        # object is defined in this file.
        with codecs.open(test_data, 'r', 'utf-8') as f:
            lines = f.readlines()[1:]
        # Drop the first space-delimited field of each line, normalize, add EOS.
        sents = [text_normalize(line.split(" ", 1)[-1]).strip() + "E" for line in lines]  # text normalization, E: EOS
        lengths = [len(sent) for sent in sents]
        # An empty sentence file yields an empty (0, 0) array instead of an IndexError.
        maxlen = max(lengths) if lengths else 0
        texts = np.zeros((len(sents), maxlen), np.int32)
        for i, sent in enumerate(sents):
            texts[i, :len(sent)] = [char2idx[char] for char in sent]
        return texts

In [15]:
char2idx, idx2char = load_vocab()

In [16]:
fpaths, text_lengths, texts = [], [], []
transcript = os.path.join(data, 'metadata.csv')
lines = codecs.open(transcript,'r','utf-8').readlines()

In [10]:
#lines = lines[1:]

In [17]:
for line in lines:
    fname, _, text = line.strip().split("|")
    # Each line is split on "|" into the clip id (e.g. LJ001-0001) and its text fields.
    fpath = os.path.join(data, "wavs", fname + ".wav")
    fpaths.append(fpath)

    text = text_normalize(text) + "E"  # E: EOS (appended to the end of every transcript line)
    text = [char2idx[char] for char in text]  # map each character to its integer index
    text_lengths.append(len(text))  # number of symbols in this wav file's transcript
    texts.append(np.array(text, np.int32).tobytes())
    # tobytes: construct Python bytes containing a copy of the raw contents of data memory
    # (replaces the deprecated ndarray.tostring, which produced the same bytes)
    

In [18]:
text_lengths

[150,
 31,
 155,
 89,
 144,
 74,
 113,
 26,
 102,
 116,
 75,
 105,
 44,
 168,
 166,
 79,
 137,
 124,
 111,
 64,
 125,
 110,
 130,
 119,
 107,
 87,
 139,
 66,
 74,
 95,
 100,
 105,
 99,
 136,
 79,
 98,
 60,
 139,
 62,
 103,
 74,
 113,
 64,
 97,
 38,
 87,
 93,
 93,
 115,
 100,
 76,
 79,
 73,
 64,
 103,
 81,
 142,
 103,
 149,
 111,
 56,
 83,
 82,
 91,
 107,
 132,
 99,
 128,
 77,
 130,
 113,
 100,
 99,
 86,
 105,
 141,
 135,
 31,
 58,
 92,
 54,
 97,
 75,
 73,
 137,
 104,
 118,
 136,
 130,
 53,
 102,
 125,
 104,
 63,
 60,
 149,
 115,
 80,
 118,
 132,
 116,
 90,
 77,
 31,
 116,
 128,
 62,
 93,
 130,
 77,
 99,
 69,
 84,
 62,
 116,
 127,
 104,
 93,
 90,
 102,
 108,
 88,
 65,
 139,
 45,
 55,
 141,
 136,
 58,
 89,
 93,
 119,
 129,
 82,
 75,
 90,
 76,
 91,
 80,
 86,
 56,
 137,
 100,
 69,
 107,
 87,
 103,
 104,
 111,
 132,
 74,
 155,
 89,
 149,
 76,
 82,
 70,
 130,
 121,
 56,
 74,
 134,
 98,
 114,
 40,
 129,
 73,
 107,
 150,
 29,
 143,
 119,
 135,
 50,
 60,
 107,
 79,
 112,
 45,
 115,
 67,
 99,
 1

In [19]:
texts

[b'\x12\x00\x00\x00\x14\x00\x00\x00\x0b\x00\x00\x00\x10\x00\x00\x00\x16\x00\x00\x00\x0b\x00\x00\x00\x10\x00\x00\x00\t\x00\x00\x00\x02\x00\x00\x00\x0b\x00\x00\x00\x10\x00\x00\x00\x02\x00\x00\x00\x16\x00\x00\x00\n\x00\x00\x00\x07\x00\x00\x00\x02\x00\x00\x00\x11\x00\x00\x00\x10\x00\x00\x00\x0e\x00\x00\x00\x1b\x00\x00\x00\x02\x00\x00\x00\x15\x00\x00\x00\x07\x00\x00\x00\x10\x00\x00\x00\x15\x00\x00\x00\x07\x00\x00\x00\x02\x00\x00\x00\x19\x00\x00\x00\x0b\x00\x00\x00\x16\x00\x00\x00\n\x00\x00\x00\x02\x00\x00\x00\x19\x00\x00\x00\n\x00\x00\x00\x0b\x00\x00\x00\x05\x00\x00\x00\n\x00\x00\x00\x02\x00\x00\x00\x19\x00\x00\x00\x07\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x14\x00\x00\x00\x07\x00\x00\x00\x02\x00\x00\x00\x03\x00\x00\x00\x16\x00\x00\x00\x02\x00\x00\x00\x12\x00\x00\x00\x14\x00\x00\x00\x07\x00\x00\x00\x15\x00\x00\x00\x07\x00\x00\x00\x10\x00\x00\x00\x16\x00\x00\x00\x02\x00\x00\x00\x05\x00\x00\x00\x11\x00\x00\x00\x10\x00\x00\x00\x05\x00\x00\x00\x07\x00\x00\x00\x14\x00\x00\x00\x10\x00\x00\x0

In [22]:
# NOTE(review): "E" is appended before text_normalize() runs, so the lower-casing
# step turns it into 'e' (see the last element of the character list printed
# below); appending it after normalization would keep the upper-case EOS symbol.
text_1 = text_normalize(lines[0].strip().split("|")[1]+"E")

In [24]:
[char for char in text_1]

['p',
 'r',
 'i',
 'n',
 't',
 'i',
 'n',
 'g',
 ' ',
 'i',
 'n',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'o',
 'n',
 'l',
 'y',
 ' ',
 's',
 'e',
 'n',
 's',
 'e',
 ' ',
 'w',
 'i',
 't',
 'h',
 ' ',
 'w',
 'h',
 'i',
 'c',
 'h',
 ' ',
 'w',
 'e',
 ' ',
 'a',
 'r',
 'e',
 ' ',
 'a',
 't',
 ' ',
 'p',
 'r',
 'e',
 's',
 'e',
 'n',
 't',
 ' ',
 'c',
 'o',
 'n',
 'c',
 'e',
 'r',
 'n',
 'e',
 'd',
 ' ',
 'd',
 'i',
 'f',
 'f',
 'e',
 'r',
 's',
 ' ',
 'f',
 'r',
 'o',
 'm',
 ' ',
 'm',
 'o',
 's',
 't',
 ' ',
 'i',
 'f',
 ' ',
 'n',
 'o',
 't',
 ' ',
 'f',
 'r',
 'o',
 'm',
 ' ',
 'a',
 'l',
 'l',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'a',
 'r',
 't',
 's',
 ' ',
 'a',
 'n',
 'd',
 ' ',
 'c',
 'r',
 'a',
 'f',
 't',
 's',
 ' ',
 'r',
 'e',
 'p',
 'r',
 'e',
 's',
 'e',
 'n',
 't',
 'e',
 'd',
 ' ',
 'i',
 'n',
 ' ',
 't',
 'h',
 'e',
 ' ',
 'e',
 'x',
 'h',
 'i',
 'b',
 'i',
 't',
 'i',
 'o',
 'n',
 'e']

In [16]:
fname,_, text = line.strip().split("|")
#print(fname)
fpath = os.path.join(data, "wavs",fname + ".wav")

In [17]:
text = text_normalize(text) + "E"
text

'the recommendations we have here suggested would greatly advance the security of the office without any impairment of our fundamental liberties.E'

In [14]:
text[1]

10

In [17]:
# "~" is the EOS symbol of the jamo symbol table (EOS = '~').
text = text_normalize(text) + "~"
text = [char2idx[char] for char in text] # split into individual jamo, then map each one through char2idx
text 

[8,
 29,
 5,
 39,
 45,
 79,
 11,
 21,
 7,
 21,
 57,
 5,
 39,
 49,
 13,
 41,
 79,
 16,
 34,
 49,
 2,
 34,
 7,
 39,
 49,
 79,
 20,
 23,
 62,
 20,
 22,
 79,
 6,
 37,
 2,
 41,
 79,
 11,
 41,
 14,
 21,
 42,
 20,
 22,
 61,
 5,
 21,
 75,
 1,
 1]

In [18]:
text_lengths.append(len(text))
# tobytes() yields the same raw bytes as the deprecated ndarray.tostring().
texts.append(np.array(text, np.int32).tobytes())

In [19]:
# Read the test sentences, dropping the first line of the file.
lines = codecs.open(test_data, 'r', 'utf-8').readlines()[1:]
#sents = [text_normalize(line.split(" ", 1)[-1]).strip() + "E" for line in lines] # text normalization, E: EOS
# Keep what follows the first space of each line, normalize it, and append EOS.
sents = [text_normalize(line.split(" ", 1)[-1]).strip() + "~" for line in lines] # text normalization, ~: EOS
lengths = [len(sent) for sent in sents]
maxlen = sorted(lengths, reverse=True)[0]  # length of the longest sentence
texts = np.zeros((len(sents), maxlen), np.int32) #[20,84]
# Fill each row with symbol ids; positions past the sentence end stay 0.
for i, sent in enumerate(sents):
    texts[i, :len(sent)] = [char2idx[char] for char in sent]

## def get_batch()

In [18]:
import librosa

In [19]:
# Module-level settings read by get_batch() and get_spectrograms().
batch_size = 16
prepro=False  # when True, get_batch() loads saved arrays from mels/ and mags/ instead of computing spectrograms
preemphasis = .97 #or None
r = 4 # Reduction factor. Paper => 2, 3, 5
sr = 22050 #sample rate
n_fft = 2048 #fft points(sample)
n_mels = 80 #Number of Mel bands to generate
frame_shift = 0.0125 # seconds
frame_length = 0.05 # seconds
hop_length = int(sr*frame_shift) # samples. (275)
win_length = int(sr*frame_length) # samples.(1102)

In [158]:
def get_batch():
    """Loads training data and put them in queues.

    Returns:
      texts: batched int32 symbol ids, one row per utterance.
      mels: batched float32 mel spectrograms with last dimension n_mels*r.
      mags: batched float32 magnitude spectrograms with last dimension n_fft//2+1.
      fnames: batched string tensor of wav file names.
      num_batch: An int. `len(fpaths) // batch_size`.
    """
    with tf.device('/gpu:0'):
        # Load data
        fpaths, text_lengths, texts = load_data() # list
        maxlen, minlen = max(text_lengths), min(text_lengths) #max : 188, min : 12

        # Calc total batch count
        num_batch = len(fpaths) // batch_size #13099/16 = 818

        # convert_to_tensor: turns the Python lists / ndarrays into tensors
        fpaths = tf.convert_to_tensor(fpaths)
        text_lengths = tf.convert_to_tensor(text_lengths)
        texts = tf.convert_to_tensor(texts)

        # Create Queues
        fpath, text_length, text = tf.train.slice_input_producer([fpaths, text_lengths, texts], shuffle=True)

        # Parse
        text = tf.decode_raw(text, tf.int32)  # (None,)

        if prepro:
            def _load_spectrograms(fpath):
                # tf.py_func hands a string tensor to Python as bytes; decode it
                # so the str-based replace()/format() calls below do not raise
                # TypeError or embed a b'...' repr in the path.
                if isinstance(fpath, bytes):
                    fpath = fpath.decode("utf-8")
                fname = os.path.basename(fpath)
                mel = "mels/{}".format(fname.replace("wav", "npy"))
                mag = "mags/{}".format(fname.replace("wav", "npy"))
                # np.load reads back the array stored in a *.npy file written by np.save
                return fname, np.load(mel), np.load(mag)

            fname, mel, mag = tf.py_func(_load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32])
        else:
            fname, mel, mag = tf.py_func(load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32])  # (None, n_mels)

        # Add shape information
        fname.set_shape(())
        text.set_shape((None,))
        mel.set_shape((None, n_mels*r))
        mag.set_shape((None, n_fft//2+1))

        # Batching
        _, (texts, mels, mags, fnames) = tf.contrib.training.bucket_by_sequence_length(
                                            input_length=text_length,
                                            tensors=[text, mel, mag, fname],
                                            batch_size=batch_size,
                                            bucket_boundaries=list(range(minlen + 1, maxlen - 1, 20)),
                                            num_threads=16,
                                            capacity=batch_size * 4,
                                            dynamic_pad=True)

    return texts, mels, mags, fnames, num_batch


In [None]:
def _load_spectrograms(fpath):
    """Load the saved mel and magnitude spectrograms that belong to a wav file.

    Args:
      fpath: Path of the wav file, as str or bytes (tf.py_func passes string
        tensors to Python as bytes).

    Returns:
      fname: The base name of `fpath` as a str.
      mel: The array stored in `mels/<fname with "wav" replaced by "npy">`.
      mag: The array stored in `mags/<fname with "wav" replaced by "npy">`.
    """
    # Decode first: bytes.replace() rejects str arguments, and formatting bytes
    # into a str path would embed the b'...' repr in the file name.
    if isinstance(fpath, bytes):
        fpath = fpath.decode("utf-8")
    fname = os.path.basename(fpath)
    npy_name = fname.replace("wav", "npy")
    mel = "mels/{}".format(npy_name)
    mag = "mags/{}".format(npy_name)
    return fname, np.load(mel), np.load(mag)

In [20]:
#Create Queues
fpath,text_length,text = tf.train.slice_input_producer([fpaths, text_lengths, texts], shuffle=True)
# Creates an input queue that reads items from the given arrays; with shuffle=True the items are returned in shuffled (random) order instead of sequentially.

Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(tuple(tensor_list)).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.range(limit).shuffle(limit).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensor_slices(input_tensor).shuffle(tf.shape(input_tensor, out_type=tf.int64)[0]).repeat(num_epochs)`. If `shuffle=False`, omit the `.shuffle(...)`.
Instructions for updating:
Queue-based input pipelines have been replaced by `tf.data`. Use `tf.data.Dataset.from_tensors(tensor).repeat(num_epochs)`.
Instructions for updating:
To construct input pipelines, use the `tf.data` module.
Instructions for updating:
To 

In [None]:
sess = tf.Session()
sess.run(fpath)

In [44]:
#parse
tf.decode_raw(text,tf.int32) #Reinterpret the bytes of a string as a vector of numbers

<tf.Tensor 'DecodeRaw_3:0' shape=(?,) dtype=int32>

In [None]:
fname, mel, mag = tf.py_func(_load_spectrograms, [fpath], [tf.string, tf.float32, tf.float32])
#tf.py_func(func, inp, Tout,name=None)
#func : a Python function
#inp : Tensor list
#Tout : list of TensorFlow data types describing the values returned by func

In [46]:
def load_spectrograms(fpath):
    """Compute the spectrograms of one wav file and pad them for the reduction factor.

    Args:
      fpath: Path of the wav file, as str or bytes (tf.py_func passes string
        tensors to Python as bytes).

    Returns:
      fname: The base name of `fpath` as a str.
      mel: The mel spectrogram, zero-padded along time to a multiple of `r`
        and reshaped to (-1, n_mels*r).
      mag: The magnitude spectrogram, zero-padded along time by the same amount.
    """
    if isinstance(fpath, bytes):
        fpath = fpath.decode("utf-8")
    fname = os.path.basename(fpath)
    mel, mag = get_spectrograms(fpath)
    t = mel.shape[0]
    # Uses the module-level `r` / `n_mels` like get_batch() does; no `hp`
    # object is defined in this file.
    num_paddings = r - (t % r) if t % r != 0 else 0  # for reduction
    mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant")
    mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant")
    return fname, mel.reshape((-1, n_mels * r)), mag

TypeError: expected str, bytes or os.PathLike object, not Tensor

In [159]:
get_batch()

(<tf.Tensor 'bucket_by_sequence_length/bucket/dequeue_top:2' shape=(16, ?) dtype=int32>,
 <tf.Tensor 'bucket_by_sequence_length/bucket/dequeue_top:3' shape=(16, ?, 320) dtype=float32>,
 <tf.Tensor 'bucket_by_sequence_length/bucket/dequeue_top:4' shape=(16, ?, 1025) dtype=float32>,
 <tf.Tensor 'bucket_by_sequence_length/bucket/dequeue_top:5' shape=(16,) dtype=string>,
 818)

#### def get_spectrograms(fpath)

UnboundLocalError: local variable 'sr' referenced before assignment

22050

In [105]:
y, sr = librosa.load(fpath[0], sr=sr)


In [126]:
def get_spectrograms(fpath):
    '''Returns normalized log(melspectrogram) and log(magnitude) from the sound file at `fpath`.

    Reads the module-level settings `sr`, `preemphasis`, `n_fft`, `hop_length`,
    `win_length`, `n_mels`, `ref_db` and `max_db`.

    Args:
      fpath: A string. The full path of a sound file.

    Returns:
      mel: A 2d float32 array of shape (T, n_mels) <- Transposed
      mag: A 2d float32 array of shape (T, 1+n_fft/2) <- Transposed
    '''
    # Loading sound file. The returned rate is discarded on purpose: binding it
    # to `sr` would make `sr` a local name and reading `sr=sr` would then raise
    # UnboundLocalError.
    y, _ = librosa.load(fpath, sr=sr)

    # Trimming
    y, _ = librosa.effects.trim(y)

    # Preemphasis (skipped when the setting is None)
    if preemphasis is not None:
        y = np.append(y[0], y[1:] - preemphasis * y[:-1])

    # stft
    linear = librosa.stft(y=y,
                          n_fft=n_fft,  # fft window size
                          hop_length=hop_length,  # number audio of frames between STFT columns
                          win_length=win_length)  # each frame is windowed to win_length, then zero-padded to n_fft

    # magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)

    # mel spectrogram. Keyword arguments work whether or not the installed
    # librosa accepts these parameters positionally.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, t)

    # to decibel
    mel = 20 * np.log10(np.maximum(1e-5, mel))
    mag = 20 * np.log10(np.maximum(1e-5, mag))

    # normalize
    mel = np.clip((mel - ref_db + max_db) / max_db, 1e-8, 1)
    mag = np.clip((mag - ref_db + max_db) / max_db, 1e-8, 1)

    # Transpose
    mel = mel.T.astype(np.float32)  # (T, n_mels)
    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)

    return mel, mag


In [111]:
fpaths, text_lengths, texts = load_data() # list
#fpaths = tf.convert_to_tensor(fpaths)
#fpath, text_length, text = tf.train.slice_input_producer([fpaths, text_lengths, texts], shuffle=True)
fpath=fpaths

In [112]:
y,sr = librosa.load(fpath[0],sr=sr)
#librosa.load : loads and decodes the audio as a time series y, represented
#as a one-dimensional Numpy floating point array
print(y.shape)
print(y)

(41885,)
[-0.00027466  0.          0.         ... -0.00088501 -0.00097656
 -0.00109863]


In [113]:
#Trimming
y,_ = librosa.effects.trim(y) # cut off the silent sections
#librosa.effects.trim(y, top_db=60, ref=<function amax>, frame_length=2048, hop_length=512)
#y : np.ndarray : Audio signal, can be mono or stereo
#top_db : The threshold(in decibels) below reference to consider as silence
#ref : The reference power. By default, it uses np.max and compares to the peak power in the signal
#frame_length : The number of samples per analysis frame
#hop_length : The number of samples between analysis frames

#Returns:
#y : The trimmed signal

In [114]:
#Preemphasis
#Filter equation: y(t) = x(t) - a*x(t-1); a is typically 0.95 or 0.97
y = np.append(y[0],y[1:] - preemphasis * y[:-1])
y

array([-0.00027466,  0.00026642,  0.        , ...,  0.00018066,
       -0.0001181 , -0.00015137], dtype=float32)

The purpose of windowing your data before processing it with a discrete Fourier transform (DFT or FFT), is to minimize spectral leakage, which happens when you try to Fourier-transform non-cyclical data.

In [115]:
linear = librosa.stft(y=y,
                     n_fft = n_fft, #2048
                     hop_length = hop_length, # 275
                     win_length = win_length) #1102

#n_fft : fft window size
#hop_length : number audio of frames between STFT columns
#win_length : each frame of audio is windowed by window()

In [116]:
linear.shape

(1025, 153)

In [117]:
mag = np.abs(linear)
mag #1025, 153

array([[3.1162318e-04, 2.3022071e-04, 6.9906760e-04, ..., 3.6570873e-05,
        4.5403279e-04, 1.6208979e-03],
       [1.5589545e-04, 3.3332393e-04, 9.5406699e-04, ..., 3.6265690e-05,
        4.5633435e-04, 1.6248456e-03],
       [1.7460428e-05, 2.9214678e-04, 1.0676493e-03, ..., 2.7739532e-05,
        5.1014096e-04, 1.6056864e-03],
       ...,
       [1.1037842e-03, 8.9600298e-04, 1.7855663e-03, ..., 4.6173885e-04,
        7.4204023e-04, 8.0666738e-04],
       [1.3081902e-03, 9.2918915e-04, 1.4224973e-03, ..., 5.2605360e-04,
        5.8439485e-04, 1.1481394e-03],
       [1.3986207e-03, 4.7905929e-04, 1.2892666e-03, ..., 5.1225943e-04,
        3.6660090e-04, 1.2798454e-03]], dtype=float32)

In [118]:
#mel spectrogram
# Arguments are passed by keyword so the call does not depend on the installed
# librosa accepting them positionally.
mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
#librosa.filters.mel(sr,n_fft,n_mels)
#Parameters
#sr : sampling rate of the incoming signal
#n_fft : number of FFT components
#n_mels : number of Mel bands to generate

#Returns
#Mel transform matrix
mel = np.dot(mel_basis,mag) #80,153

In [119]:
mel.shape

(80, 153)

In [120]:
#to decibel
mel = 20*np.log10(np.maximum(1e-5, mel))
mag = 20*np.log10(np.maximum(1e-5, mag))
print(mel)
print(mag)

[[-88.65034008 -87.50387218 -82.51284442 ... -91.25797869 -86.80219664
  -77.60555594]
 [-83.16915507 -78.81166387 -76.97727208 ... -87.75419142 -82.93924756
  -74.42842983]
 [-80.06756403 -68.94547844 -63.38376998 ... -78.32026547 -79.11120623
  -69.27933936]
 ...
 [-69.77365614 -50.35317403 -34.21485416 ... -72.80852549 -72.95838369
  -74.60560138]
 [-73.35550172 -58.83531701 -41.25358973 ... -72.51057883 -74.99295577
  -76.47260883]
 [-77.8713282  -64.84242293 -46.82061344 ... -78.38566662 -79.97135739
  -82.47509988]]
[[-70.1274   -72.75711  -63.10962  ... -88.7373   -66.85825  -55.804886]
 [-76.14333  -69.54267  -60.40842  ... -88.81008  -66.81434  -55.78376 ]
 [-95.158905 -70.68797  -59.431427 ... -91.138016 -65.84619  -55.886784]
 ...
 [-59.14232  -60.95381  -54.96448  ... -66.71207  -62.591454 -61.86611 ]
 [-57.666584 -60.637917 -56.938972 ... -65.5794   -64.66587  -58.800106]
 [-57.086    -66.39222  -57.793144 ... -65.810196 -68.71613  -57.85685 ]]


In [121]:
max_db = 100
ref_db = 20
mel = np.clip((mel - ref_db + max_db)/max_db, 1e-8,1) # values below 1e-8 become 1e-8, values above 1 become 1
mag = np.clip((mag - ref_db + max_db)/max_db, 1e-8,1)

print(mel)
print(mag)

[[1.00000000e-08 1.00000000e-08 1.00000000e-08 ... 1.00000000e-08
  1.00000000e-08 2.39444406e-02]
 [1.00000000e-08 1.18833613e-02 3.02272792e-02 ... 1.00000000e-08
  1.00000000e-08 5.57157017e-02]
 [1.00000000e-08 1.10545216e-01 1.66162300e-01 ... 1.67973453e-02
  8.88793771e-03 1.07206606e-01]
 ...
 [1.02263439e-01 2.96468260e-01 4.57851458e-01 ... 7.19147451e-02
  7.04161631e-02 5.39439862e-02]
 [6.64449828e-02 2.11646830e-01 3.87464103e-01 ... 7.48942117e-02
  5.00704423e-02 3.52739117e-02]
 [2.12867180e-02 1.51575771e-01 3.31793866e-01 ... 1.61433338e-02
  2.86426075e-04 1.00000000e-08]]
[[9.8725967e-02 7.2428897e-02 1.6890381e-01 ... 9.9999999e-09
  1.3141747e-01 2.4195114e-01]
 [3.8566664e-02 1.0457329e-01 1.9591583e-01 ... 9.9999999e-09
  1.3185662e-01 2.4216241e-01]
 [9.9999999e-09 9.3120269e-02 2.0568573e-01 ... 9.9999999e-09
  1.4153808e-01 2.4113220e-01]
 ...
 [2.0857681e-01 1.9046189e-01 2.5035521e-01 ... 1.3287933e-01
  1.7408547e-01 1.8133888e-01]
 [2.2333419e-01 1.93620

In [122]:
mel = mel.T.astype(np.float32)
mag = mag.T.astype(np.float32)

print(mel)
print(mag)

[[9.99999994e-09 9.99999994e-09 9.99999994e-09 ... 1.02263436e-01
  6.64449856e-02 2.12867185e-02]
 [9.99999994e-09 1.18833613e-02 1.10545218e-01 ... 2.96468258e-01
  2.11646825e-01 1.51575774e-01]
 [9.99999994e-09 3.02272793e-02 1.66162297e-01 ... 4.57851470e-01
  3.87464106e-01 3.31793875e-01]
 ...
 [9.99999994e-09 9.99999994e-09 1.67973451e-02 ... 7.19147474e-02
  7.48942122e-02 1.61433332e-02]
 [9.99999994e-09 9.99999994e-09 8.88793729e-03 ... 7.04161599e-02
  5.00704423e-02 2.86426075e-04]
 [2.39444412e-02 5.57157025e-02 1.07206605e-01 ... 5.39439879e-02
  3.52739133e-02 9.99999994e-09]]
[[9.8725967e-02 3.8566664e-02 9.9999999e-09 ... 2.0857681e-01
  2.2333419e-01 2.2914001e-01]
 [7.2428897e-02 1.0457329e-01 9.3120269e-02 ... 1.9046189e-01
  1.9362083e-01 1.3607781e-01]
 [1.6890381e-01 1.9591583e-01 2.0568573e-01 ... 2.5035521e-01
  2.3061028e-01 2.2206856e-01]
 ...
 [9.9999999e-09 9.9999999e-09 9.9999999e-09 ... 1.3287933e-01
  1.4420600e-01 1.4189804e-01]
 [1.3141747e-01 1.31856

#### def load_spectrograms(fpath):

In [123]:
def load_spectrograms(fpath):
    """Compute the spectrograms of one wav file and pad them for the reduction factor.

    Args:
      fpath: Path of the wav file, as str or bytes (tf.py_func passes string
        tensors to Python as bytes).

    Returns:
      fname: The base name of `fpath` as a str.
      mel: The mel spectrogram, zero-padded along time to a multiple of `r`
        and reshaped to (-1, n_mels*r).
      mag: The magnitude spectrogram, zero-padded along time by the same amount.
    """
    if isinstance(fpath, bytes):
        fpath = fpath.decode("utf-8")
    fname = os.path.basename(fpath)
    mel, mag = get_spectrograms(fpath)
    t = mel.shape[0]
    # Uses the module-level `r` / `n_mels` like get_batch() does; no `hp`
    # object is defined in this file.
    num_paddings = r - (t % r) if t % r != 0 else 0  # for reduction
    mel = np.pad(mel, [[0, num_paddings], [0, 0]], mode="constant")
    mag = np.pad(mag, [[0, num_paddings], [0, 0]], mode="constant")
    return fname, mel.reshape((-1, n_mels * r)), mag

In [127]:
get_spectrograms(fpath[0])

(array([[9.99999994e-09, 9.99999994e-09, 9.99999994e-09, ...,
         1.02263436e-01, 6.64449856e-02, 2.12867185e-02],
        [9.99999994e-09, 1.18833613e-02, 1.10545218e-01, ...,
         2.96468258e-01, 2.11646825e-01, 1.51575774e-01],
        [9.99999994e-09, 3.02272793e-02, 1.66162297e-01, ...,
         4.57851470e-01, 3.87464106e-01, 3.31793875e-01],
        ...,
        [9.99999994e-09, 9.99999994e-09, 1.67973451e-02, ...,
         7.19147474e-02, 7.48942122e-02, 1.61433332e-02],
        [9.99999994e-09, 9.99999994e-09, 8.88793729e-03, ...,
         7.04161599e-02, 5.00704423e-02, 2.86426075e-04],
        [2.39444412e-02, 5.57157025e-02, 1.07206605e-01, ...,
         5.39439879e-02, 3.52739133e-02, 9.99999994e-09]], dtype=float32),
 array([[9.8725967e-02, 3.8566664e-02, 9.9999999e-09, ..., 2.0857681e-01,
         2.2333419e-01, 2.2914001e-01],
        [7.2428897e-02, 1.0457329e-01, 9.3120269e-02, ..., 1.9046189e-01,
         1.9362083e-01, 1.3607781e-01],
        [1.6890381e-01

In [128]:
fname = os.path.basename(fpath[0])
mel,mag = get_spectrograms(fpath[0])

In [131]:
t = mel.shape[0]
t

153

In [132]:
num_paddings = r - (t%r) if t % r !=0 else 0
num_paddings

3

In [137]:
mel = np.pad(mel, [[0,num_paddings],[0,0]], mode = "constant")
mag = np.pad(mag, [[0,num_paddings],[0,0]], mode = "constant")
# Only one side is padded: num_paddings rows are added after the end of axis 0, nothing before it and nothing on axis 1
#np.pad(array,pad_width,mode)
#parameter
#pad_width : {sequence, array_like, int}
#Number of values padded to the edges of each axis. ((before_1, after_1), ... (before_N, after_N)) unique pad widths for each axis. ((before, after),) yields same before and after pad for each axis. (pad,) or int is a shortcut for before = after = pad width for all axes.

In [140]:
mag.shape

(156, 1025)

In [141]:
mel.shape

(156, 80)

# Modules.py

In [163]:
embed_size = 256 #num_units
vocab_size = 32  # same as the number of entries in the load_vocab() output shown earlier

In [None]:
def embed(inputs, vocab_size, num_units, zero_pad=True, scope="embedding", reuse=None):
    '''Looks up trainable embedding vectors for integer ids.

    Args:
      inputs: A `Tensor` of type `int32` or `int64` holding the ids to look
        up in the lookup table.
      vocab_size: An int. Number of rows of the lookup table.
      num_units: An int. Size of each embedding vector.
      zero_pad: A boolean. If True, the first row of the table (id 0) is
        replaced by constant zeros.
      scope: Optional scope for `variable_scope`.
      reuse: Boolean, whether to reuse the weights of a previous layer
        by the same name.

    Returns:
      A `Tensor` with one more rank than `inputs`; its last dimension is
      `num_units`.
    '''
    table_shape = [vocab_size, num_units]  # 32,256
    table_init = tf.truncated_normal_initializer(mean=0.0, stddev=0.01)

    with tf.variable_scope(scope, reuse=reuse):
        table = tf.get_variable('lookup_table',
                                dtype=tf.float32,
                                shape=table_shape,
                                initializer=table_init)
        if zero_pad:
            # Swap row 0 for zeros while keeping the remaining rows as they are.
            zero_row = tf.zeros(shape=[1, num_units])
            table = tf.concat((zero_row, table[1:, :]), 0)

    return tf.nn.embedding_lookup(table, inputs)


### tf.nn.embedding_lookup

In [6]:
tf.InteractiveSession()
# Look up positions 0..3 of a 1-D constant; the printed result is [10 20 30 40].
params = tf.constant([10,20,30,40])
ids = tf.constant([0,1,2,3])
print(tf.nn.embedding_lookup(params,ids).eval())

[10 20 30 40]




In [7]:
# Same lookup, but with the table passed as a list of two tensors.
# NOTE(review): presumably the ids are distributed over the two tensors by the
# default partition strategy -- no output was recorded for this cell, confirm by running it.
params1 = tf.constant([1,2])
params2 = tf.constant([10,20])
ids = tf.constant([2,0,2,1,2,3])
result = tf.nn.embedding_lookup([params1, params2], ids)
print(result.eval())

In [10]:
embeddings = tf.constant([[[1,1],[2,2],[3,3],[4,4]],[[11,11],[12,12],[13,13],[14,14]],
                          [[21,21],[22,22],[23,23],[24,24]]])
ids=tf.constant([0,2,1])
# Bound to `looked_up` rather than `embed` so that the embed() function defined
# earlier in this file is not overwritten by a tensor.
looked_up = tf.nn.embedding_lookup(embeddings, ids, partition_strategy='div')

with tf.Session() as session:
    result = session.run(looked_up)
    print (result)

[[[ 1  1]
  [ 2  2]
  [ 3  3]
  [ 4  4]]

 [[21 21]
  [22 22]
  [23 23]
  [24 24]]

 [[11 11]
  [12 12]
  [13 13]
  [14 14]]]


In [1]:
import numpy as np

In [3]:
mels = "mels/b'3_1711.npy'"
mels

"mels/b'3_1711.npy'"

In [4]:
# NOTE(review): this string surgery still leaves a stray quote and a doubled
# "mels" prefix (see the output below); decoding the bytes file name before
# formatting it would avoid the b'...' repr altogether.
"mels/{}".format(str(mels).replace("wav", "npy").replace("/b'",""))

"mels/mels3_1711.npy'"