### Exploring the functions in load_audio_to_mem.py

TensorFlow is fun! Open Source is like Enterprise -- well documented.
This whole library is based on Mozilla DeepSpeech

https://svds.com/tensorflow-rnn-tutorial/

Vikram Reddy, 4/17/17

Check-in Log:

Friday, 4/21/17: Goals - feed input and target into pipeline

Thursday, 4/27/17: Feed input and target into system

In [26]:
import os
import scipy.io.wavfile as wav
import numpy as np

from python_speech_features import mfcc

In [27]:
# Path of the sample recording that is loaded by the cells below.
wavfile = 'mydata/1970-28415-0023.wav'

def load_wavfile(wavfile):
    '''
    Load a wav file through scipy.io.wavfile.

    Returns a (rate, signal, name) tuple, where name is the file's base name
    with both the directory part and the extension removed.
    '''
    sample_rate, samples = wav.read(wavfile)
    # basename drops the directory, splitext then splits off the extension
    stem, _extension = os.path.splitext(os.path.basename(wavfile))
    return sample_rate, samples, stem

# Load the sample recording and print its sample rate, raw samples and base name.
frames_per_second, signal, data_name = load_wavfile(wavfile)
print(frames_per_second, signal, data_name)

16000 [-66 -34  15 ..., -22 -29 -32] 1970-28415-0023


In [28]:
# NOTE(review): unlike `wavfile`, this path has no 'mydata/' directory prefix -- confirm where the transcript lives.
textfile = '1970-28415-0023.txt'
def get_audio_and_transcript(textfile):
    '''
    Placeholder: nothing is implemented yet, so every call returns None.
    '''
    # TODO: implement; `textfile` is currently ignored.
    return None

In [29]:
def audiofile_to_input_vector(audio_filename, numcep, numcontext):
    '''
    Load a wav file and return its strided MFCC features.

    audio_filename: path of the wav file, read with scipy.io.wavfile.
    numcep: number of cepstral coefficients requested from mfcc().
    numcontext: accepted but not used yet -- this function does not build
        the past/future context windows.

    Returns the MFCC array with every second frame kept. Before striding,
    its shape is (number of frames by number of cepstrum (13 is default)).
    '''
    fs, audio = wav.read(audio_filename)

    # Get mfcc coefficients
    orig_inputs = mfcc(audio, samplerate=fs, numcep=numcep)

    # We only keep every second feature (BiRNN stride = 2).
    # Hand the features back to the caller instead of dropping them.
    return orig_inputs[::2]
    
    

In [30]:
# MFCC features of the loaded signal; numcep is not passed, so the library default applies.
coeff = mfcc(signal, samplerate=frames_per_second)
coeff.shape
# Keep every second frame (the same [::2] stride used in audiofile_to_input_vector).
strided_input = coeff[::2]
strided_input.shape

(139, 13)

In [31]:
# Start from an empty float32 array; it is resized in place to its final shape further down.
train_inputs = np.array([], np.float32)

In [32]:
# Number of MFCC coefficients per frame.
numcep = 13
# Number of context frames taken on each side (past and future) of a frame.
numcontext = 9

In [33]:
train_inputs

array([], dtype=float32)

#### Pass `refcheck=False` so that `resize` does not raise a ValueError when the array is referenced elsewhere

In [34]:
# One row per strided frame (taken from strided_input rather than a hard-coded
# count). Each row holds the frame's own numcep coefficients plus numcontext
# frames of past and of future context, numcep values each.
train_inputs.resize((strided_input.shape[0], numcep + 2 * numcep * numcontext), refcheck=False)
train_inputs

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [35]:
# All-zero stand-in for one frame's numcep coefficients, used as padding.
empty_mfcc = np.zeros(numcep)
empty_mfcc


array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [36]:
# One time slice per row of train_inputs.
time_slices = range(train_inputs.shape[0])
# First index that has a full window of numcontext past frames.
context_past_min = time_slices[0] + numcontext # 9
context_past_min
# Last index that has a full window of numcontext future frames.
context_future_max = time_slices[-1] - numcontext # 129
context_future_max
time_slices[-1]

138

In [37]:
time_slices

range(0, 139)

In [41]:
# Width of one flattened context window and of one complete feature row.
context_width = numcontext * numcep
row_width = numcep + 2 * context_width

for p in time_slices:
    # ---- past context ----
    # Slices before context_past_min lack a full window of history; the
    # shortfall (numcontext counting down to 0) is filled with zero rows.
    pad_before = max(0, context_past_min - p)
    zeros_before = [empty_mfcc] * pad_before
    frames_before = strided_input[max(0, p - numcontext):p]  # empty [0:0] slice when p == 0
    assert len(zeros_before) + len(frames_before) == numcontext
    if p == 0:
        # Debug output for the very first slice only.
        print(len(zeros_before))
        print()
        print(strided_input.shape)
        print(frames_before.shape)

    # ---- future context ----
    # Slices after context_future_max run out of following frames and are
    # padded with zero rows at the end.
    pad_after = max(0, p - context_future_max)
    zeros_after = [empty_mfcc] * pad_after
    frames_after = strided_input[p + 1:p + numcontext + 1]
    assert len(zeros_after) + len(frames_after) == numcontext

    # When no padding is needed the data slice is used as it is.
    past = np.concatenate((zeros_before, frames_before)) if pad_before else frames_before
    future = np.concatenate((frames_after, zeros_after)) if pad_after else frames_after

    # Flatten each window of numcontext frames into a single vector.
    past = np.resize(past, (context_width,))
    now = strided_input[p]
    future = np.resize(future, (context_width,))
    assert len(past) + len(now) + len(future) == row_width

    # Row layout: [past window | current frame | future window]
    train_inputs[p] = np.concatenate((past, now, future))

# Scale/Standardize the inputs using the mean and std over the whole array
train_inputs = (train_inputs - np.mean(train_inputs)) / np.std(train_inputs)


9

(139, 13)
(0, 13)


In [40]:
len(train_inputs)

139