# Read the example wav file
First we decode the file header, then we get at the audio data

In [61]:
def decode_wav_header(audio_buffer):
    print("-------------------------------------------------------------------------")
    print("Show the wav header information")
    print("-------------------------------------------------------------------------")
    
    riff = audio_buffer[:4].decode()
    print("File type: {}".format(riff))

    file_size = audio_buffer[7] << 24 | audio_buffer[6] << 16 | audio_buffer[5] << 8 | audio_buffer[4] 
    print("File size: 0x{:d}".format(file_size))

    audio_type = audio_buffer[8:12].decode()
    print("Audio type: {}".format(audio_type))

    format_marker = audio_buffer[12:15].decode()
    print("Format marker: {}".format(format_marker))

    data_length = audio_buffer[17] << 8 | audio_buffer[16]
    print("Data length in bits: {:d}".format(data_length))

    type_format = audio_buffer[21] << 8 | audio_buffer[20]
    if type_format == 1:
        print("PCM - 2 byte integer")
    else:
        print("Unknown format: {d}".format(type_format))

    no_of_channels = audio_buffer[23] << 8 | audio_buffer[22]
    print("No of channels: {:d}".format(no_of_channels))

    sample_rate = audio_buffer[27] << 24 | audio_buffer[26] << 16 | audio_buffer[25] << 8 | audio_buffer[24]
    print("Sample rate: {:d} Hz".format(sample_rate))

    s_rate_bps_ch = audio_buffer[31] << 24 | audio_buffer[30] << 16 | audio_buffer[29] << 8 | audio_buffer[28]
    print("(Sample rate * Bits per sample * Channels)/8: {:d}".format(s_rate_bps_ch))

    bts_ch = audio_buffer[35] << 8 | audio_buffer[34]
    print("(Bits per sample * channels)/8: {:d}".format(bts_ch))

    bits_per_sample = audio_buffer[35] << 8 | audio_buffer[34]
    print("Bits per sample: {:d}".format(bits_per_sample))

    data_section = audio_buffer[36:40].decode()
    print("Start of data section: {}".format(data_section))

    data_section_length = audio_buffer[43] << 24 | audio_buffer[42] << 16 | audio_buffer[41] << 8 | audio_buffer[40]
    print("Length of data section: {:d}".format(data_section_length))
    return sample_rate

Read the wav file into a buffer

In [62]:
audiofile = "yes-example.wav"

In [63]:
try:
    f = open(audiofile,'rb')
except:
    print("Cannot open file {}".format(audiofile))

In [64]:
audio_buffer = f.read()
f.close()

In [65]:
kAudioSampleFrequency = decode_wav_header(audio_buffer)
kAudioOneMsSize = kAudioSampleFrequency // 1000

-------------------------------------------------------------------------
Show the wav header information
-------------------------------------------------------------------------
File type: RIFF
File size: 0x32036
Audio type: WAVE
Format marker: fmt
Data length in bits: 16
PCM - 2 byte integer
No of channels: 1
Sample rate: 16000 Hz
(Sample rate * Bits per sample * Channels)/8: 32000
(Bits per sample * channels)/8: 16
Bits per sample: 16
Start of data section: data
Length of data section: 32000


In [66]:
# The following values are derived from values used during model training.
# If you change the way you preprocess the input, update all these constants.
kFeatureSliceSize = 40
kFeatureSliceCount = 49
kFeatureElementCount = (kFeatureSliceSize * kFeatureSliceCount)
kFeatureSliceStrideMs = 20
kFeatureSliceDurationMs = 30

In [None]:
kSilenceIndex = 0
kUnknownIndex = 1
kYesIndex = 2
kNoIndex = 3

In [68]:
stride_size = kFeatureSliceStrideMs * kAudioOneMsSize
window_size = kFeatureSliceDurationMs * kAudioOneMsSize
print("\nWindow size in samples: {:d}".format(window_size))
print("Stride size in samples: {:d}".format(stride_size))


Window size in samples: 480
Stride size in samples: 320


In [86]:
import numpy as np
import math

Convert from bytearray to int16 numpy array

In [55]:
audio_array = np.frombuffer(audio_buffer[44:], dtype=np.int16)

In [56]:
count = audio_array.size
print("No of audio samples: {:d}".format(count))

No of audio samples: 16000


In [58]:
trailing_10ms = np.zeros(160,dtype=np.int16)

In [87]:
class FeatureData:

    def __init__(self):
        self.slices=[]
        self.totalSlices = 0

    def addSlice(self, slice):

        self.totalSlices = self.totalSlices + 1
        self.slices.append(slice)

        if len (self.slices) > 49:
            self.slices.pop(0)

        # print ("total slices = %d\n" % self.totalSlices)
        # print ("addSlice(): spectrogram length = %d\n" % spectrogram.size())
        # print (spectrogram)


    def setInputTensorValues(self, inputTensor):
        # print (inputTensor)
        counter = 0
        for slice_index in range(len(self.slices)):
            slice = self.slices[slice_index]
            spectrogram = slice.getSpectrogram()
            for spectrogram_index in range (spectrogram.size()):
                inputTensor.setValue(counter, spectrogram[spectrogram_index])
                counter = counter + 1

        # set 1960 values on input tensor
        # print ("set %d values on input tensor\n" % (counter))


In [127]:
class Score:
    def __init__(self, kind, score):
        self.kind = kind
        self.score = score
        
class Results:
    
    def __init__(self):
        self.silence_data = []
        self.unknown_data = []
        self.yes_data = []
        self.no_data  = []
        self.index = 0

    def _computeAverageTotal (self, array_data):
        total = 0
        array_length = len(array_data)
        for i in range (array_length):
            total = total + array_data[i]
        return math.floor(total / array_length)

    def computeResults(self):
        topScore = 0
        topScoreKind = None
        silence = self._computeAverageTotal(self.silence_data)
        print("Average total of silence: {:d}".format(silence))

        if silence > 200:
            topScoreKind = "silence"
            topScore = silence

        unknown = self._computeAverageTotal(self.unknown_data)
        print("Average total of unknown: {:d}".format(unknown))

        if unknown > topScore and unknown > 200:
            topScoreKind = "unknown"
            topScore = unknown
        yes = self._computeAverageTotal(self.yes_data)
        print("Average total of yes: {:d}".format(yes))

        if yes > topScore and yes > 200:
            topScoreKind = "yes"
            topScore = yes

        no = self._computeAverageTotal(self.no_data)
        print("Average total of no: {:d}".format(no))

        if no > topScore and no > 200:
            topScoreKind = "no"
            topScore = no

        return Score (topScoreKind, topScore)

    def storeResults(self, silenceScore, unknownScore, yesScore, noScore):
        print("index: ",self.index)
        if self.index == 3:
            self.silence_data.pop(0)
            self.unknown_data.pop(0)
            self.yes_data.pop(0)
            self.no_data.pop(0)
        else:
            self.index += 1

        self.silence_data.append(silenceScore)
        self.unknown_data.append(unknownScore)
        self.yes_data.append(yesScore)
        self.no_data.append(noScore)
        print("Length of silence_data: ",len(self.silence_data), 
            "last silence value: {:d}".format(self.silence_data[len(self.silence_data)-1]))
        print("Length of unknown_data: ",len(self.unknown_data),
             "last unknown value: {:d}".format(self.unknown_data[len(self.unknown_data)-1]))            
        print("Length of yes_data:     ",len(self.yes_data),
             "last yes value: {:d}".format(self.yes_data[len(self.yes_data)-1]))                    
        print("Length of no_data:      ",len(self.silence_data),
              "last no value: {:d}".format(self.no_data[len(self.no_data)-1]))       

In [128]:
start_index = 0
feature_data = FeatureData()

In [129]:
r = Results()

In [131]:
r.storeResults(0, 0, 201, 0)
score = r.computeResults()
print("kind: {}, score: {}".format(score.kind, score.score))

index:  1
Length of silence_data:  2 last silence value: 0
Length of unknown_data:  2 last unknown value: 0
Length of yes_data:      2 last yes value: 201
Length of no_data:       2 last no value: 0
Average total of silence: 0
Average total of unknown: 0
Average total of yes: 201
Average total of no: 0
kind: yes, score: 201


In [132]:
r.storeResults(0, 201, 0, 0)
score = r.computeResults()
print("kind: {}, score: {}".format(score.kind, score.score))

index:  2
Length of silence_data:  3 last silence value: 0
Length of unknown_data:  3 last unknown value: 201
Length of yes_data:      3 last yes value: 0
Length of no_data:       3 last no value: 0
Average total of silence: 0
Average total of unknown: 67
Average total of yes: 134
Average total of no: 0
kind: None, score: 0


In [81]:
def segmentAudio(featureData, audio, trailing_10ms):
    # In this example we have an array of 1 second of audio data.
    # This is a 16,000 element array.
   r.storeResults(0, 201, 0, 0)

score = r.computeResults() # each micro second is 16 elements in this array.
    # the stride is how far over we adjust the start of the window on each step
    # in this example it is 20 ms (20x16=320).
    # The width of the window for which we capture the spectogram is 30ms (16x30=480).
    # this function will turn the input array into a dictionary of start time to wav data

    input_audio = np.concatenate((trailing_10ms, audio), axis=0)
    input_size = input_audio.size

    total_segments = math.floor(input_size / stride_size)
    start_index = 0

    for segment_index in range (total_segments):
        end_index = min (start_index +  window_size, input_size)
        print ("segment_index=%d,start_index=%d, end_index=%d, size=%d\n" % (segment_index, start_index, end_index, end_index-start_index))
        slice = Slice (input_audio[start_index:end_index], start_index)
        featureData.addSlice(slice)
        start_index = start_index + stride_size

    # return the trailing 10ms
    return np.array(input_audio[input_size-160:input_size], dtype=np.int16)


In [136]:
featureData = FeatureData()
inputBufferSize=320*4
while count > 0:
    # segment the 16000 element array into 320*4 parts
    currentStartIndex = start_index
    currentEndIndex = currentStartIndex + inputBufferSize

    currentSamples = np.array(audio_array[currentStartIndex:currentEndIndex], dtype=np.int16)

    trailing_10ms = segmentAudio(featureData, currentSamples, trailing_10ms)

    start_index = currentEndIndex
    count = count - inputBufferSize


segment_index=0,start_index=0, end_index=480, size=480



NameError: name 'Slice' is not defined