In [61]:
import tensorflow as tf
import wave, os, glob
from IPython.display import display, Audio
from scipy.io.wavfile import read, write
import numpy as np
import pickle as pkl

from utils import *

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [62]:
# Number of samples in every training clip (32768, i.e. roughly two seconds
# at the 16000 Hz rate used when the clips are written to disk).
AUDIO_LENGTH = 2 * 16384
# Folder that holds the source recordings of the individual words.
PATH_DATA = './test_data/'

# Creating a dataset to train on

In [65]:
# Choosing the 3 tokens
def load_peak_normalized(filename):
    """Load one word recording from PATH_DATA and scale it to unit peak.

    The samples returned by scipy's ``read`` are passed through ``normalize``
    (imported from utils) and then divided by their largest absolute value,
    so the loudest sample of the result has magnitude 1.

    Args:
        filename: Name of a .wav file located inside PATH_DATA.

    Returns:
        The rescaled samples, in whatever array type ``normalize`` returns.
    """
    # read() returns (sample_rate, samples); only the samples are needed here.
    samples = normalize(np.array(read(PATH_DATA + filename)[1]))
    # NOTE(review): normalize() lives in utils and is not visible from this
    # cell, so the explicit peak scaling is kept even if it proves redundant.
    return samples / np.max(np.abs(samples))


water = load_peak_normalized('water_40.wav')
greasy = load_peak_normalized('greasy_2.wav')
suit = load_peak_normalized('suit_16.wav')

tokens = [water, greasy, suit]
for token in tokens:
    # After peak scaling every token should stay within [-1, 1].
    print(np.max(token), np.min(token))
    play(token)


0.9846217492427767 -1.0


1.0 -0.7839011678534843


1.0 -0.8664897845463306


In [66]:
# Build a single clip of AUDIO_LENGTH // 2 samples (about one second) that
# contains 'water' at a randomly chosen offset.
water_pad = pad_audio_random(audio=water, desired_length=AUDIO_LENGTH // 2)
play(water_pad)

# The first non-zero sample shows where the padded word was placed.
first_nonzero_index = np.nonzero(water_pad)[0][0]
print('audio starts at index: ', first_nonzero_index)

audio starts at index:  5301


The dataset will consist of various 2 second samples of: 

'water', 'greasy', 'suit' by themselves  
'water' then 'greasy' together  
'greasy' then 'water' together  
'water' then 'suit' together  
'suit' then 'water' together  

Crucially, the network will never see 'greasy' and 'suit' together in any order. The goal is to test whether the ciwGAN architecture will learn to concatenate two words that it has never seen concatenated before, by manipulating the learned categorical variables in some way (addition, for example).  
  
  
In the following code, we generate 100 samples of each of the aforementioned combinations. The time at which each word starts is chosen randomly for every sample.

In [67]:
num_samples = 100

# Every training category mapped to the token (or ordered pair of tokens) it
# is built from. 'greasy' and 'suit' are deliberately never paired.
token_sources = {
    'water': water,
    'greasy': greasy,
    'suit': suit,
    'water_greasy': (water, greasy),
    'greasy_water': (greasy, water),
    'water_suit': (water, suit),
    'suit_water': (suit, water),
}

# One generate_data call per category, issued in the order listed above.
train_dict = {
    label: generate_data(source, AUDIO_LENGTH, num_samples)
    for label, source in token_sources.items()
}

In [68]:
# Listen to the first ten clips of every category while gathering all clips
# into one flat list, which is then stacked into a single 2-D array.
collected_clips = []
for key, clips in train_dict.items():
    print(key)
    for sample in clips[0:10]:
        play(sample)

    collected_clips.extend(clips)

data = np.vstack(collected_clips)

water


greasy


suit


water_greasy


greasy_water


water_suit


suit_water


In [69]:
# Sanity check: overall array shape, then the largest and smallest sample.
print(np.shape(data))
print(data.max(), data.min())

(700, 32768)
1.0 -1.0


Save all generated samples as .wav files

In [70]:
path = './generated_data/'
# Create the output folder up front: write() opens the target file directly
# and fails when the directory is missing (e.g. on a fresh checkout).
os.makedirs(path, exist_ok=True)

# Largest int16 value, used to scale the clips before casting them to int16.
amplitude = np.iinfo(np.int16).max
for key in train_dict.keys():
    print(key)
    for i, audio in enumerate(train_dict[key]):
        # Files are named <category><index>.wav, e.g. water_suit40.wav.
        filename = path + key + str(i) + '.wav'
        # 16000 is the sample rate (Hz) stored in the .wav header.
        write(filename, 16000, (audio * amplitude).astype(np.int16))

water
greasy
suit
water_greasy
greasy_water
water_suit
suit_water


In [71]:
path = './generated_data/'
# Read one exported clip back from disk and play it to check the round trip.
# read() returns (sample_rate, samples); only the samples are kept.
# NOTE(review): despite its name, test_water holds a 'water_suit' clip.
reloaded_rate, reloaded_samples = read(path + 'water_suit40.wav')
test_water = np.array(reloaded_samples)
play(test_water)