In [1]:
import wave
import struct
import matplotlib.pyplot as plt
import numpy as np

#we have whitecrown sparrow and american robin samples

class_ids = {
    'sparrow': 0,
    'robin': 1,
}


robin = wave.open('./recordings/ml-american-robin.wav', 'r')
frame_n = robin.getnframes()

In [3]:
# find silent frames of file
#
# @arg w sound wave file
# @arg threshold threshold value for sound intensity
#
# @return s_frame_val all silent frames
# @return frame_val all significant frames
# @sig_frames number of sig frames
# @frames total number of frames

def silent_frames(w,threshold, x=0, y=-1, verbose=True):
   
    #init frame arrayws
    init_size = w.getnframes()
    
    frames = np.empty(init_size)
    s_frames = np.empty(init_size)
    
    frames_n=0
    s_frames_n = 0
    
    if y == -1:
        y = init_size
    
    
    
    for i in range(w.getnframes()):
        # read a single frame and advance to next frame
        current_frame= w.readframes(1)
        
        if w.tell() >= x:
            #print(w.tell())

            # check for silence
            silent = True
            # wave frame samples are stored in little endian**
            # this example works for a single channel 16-bit per sample encoding
            unpacked_signed_value = struct.unpack("<h", current_frame) # *
            if abs(unpacked_signed_value[0]) > threshold:
                silent = False

            if silent:
                s_frames[s_frames_n] = w.tell()
                s_frames_n +=1
                if verbose:
                    print ("Frame %s is silent." % w.tell())
                    print ("silence found at second %s" % (w.tell()/w.getframerate()))
            else:
                frames[frames_n] = w.tell()
                frames_n +=1
                if verbose:
                    print ("Frame %s is not silent." % w.tell())
                
            if w.tell() == y:
                break

    frames = np.resize(frames,frames_n)
    s_frames = np.resize(s_frames,s_frames_n)
    
    
    print(frames_n)            
    return frames, s_frames        

In [4]:
frames, s_frames = silent_frames(robin, 50, verbose=False)

2536701


### 2003 pixels on png == 44100 frames on audio file

In [11]:
print(frames[::])
print(s_frames[::])

[1.692000e+03 1.693000e+03 1.694000e+03 ... 4.704835e+06 4.705076e+06
 4.705077e+06]
[1.000000e+00 2.000000e+00 3.000000e+00 ... 4.726654e+06 4.726655e+06
 4.726656e+06]


In [12]:
print(len(frames))
print(len(s_frames))

2536701
2189955


In [None]:
print(silent_frames[::])

In [None]:
len(silent_frames)

In [None]:
def frame_to_pixel(f):
    return f * 2003 / 44100

In [None]:
[frame_to_pixel(x) for x in[220000, 244200]]

In [112]:
# find silent frames of file
#
# @arg w sound wave file
# @arg frames list of heard frames
# @arg stride how far the window moves each iter
# @arg tol ratio of heard/silent calls considered acceptable
#
# @return ranges 2d numpy array containing list of ranges


def find_audio(w, frames, window_size=150, stride=1, tol=0.3):
    left_f = 0
    right_f = window_size
    
    frame_end = int(frames[len(frames)-1])
    
    r_index = 0
    l_index = 0
    
    #values to return for range
    left = -1
    right = -1
    
    ranges = np.zeros((w.getnframes()), dtype=(float,2))
    
    #get inital heard values
    heard = 0
    
    for f in frames:
            #outside window range set new values
            if f >= right_f:
                #set next window  ind 
                left_f += 1
                right_f = left_f + window_size
                
                #set index in frames
                r_index = heard
                l_index += 1
                
                break
            #increment heard frames
            heard += 1
            
    #ratio of heard to window size
    ratio = heard/window_size
    
    #if tol is high set left range
    if ratio >= tol:
        left = left_f
        
    i = 0
    
    while(right_f < frame_end):
        #check if left value in frame range
        if frames[l_index] <= left_f:
            l_index += 1
            heard -= 1
            #print('left out of range')
            

    
    
        #check if right value in frame range
        if frames[r_index] <= right_f:
            r_index += 1
            heard += 1
            #print('right in range')
            
           #check if r_index is at edge
        if(r_index >= frame_end-1):
            ranges[i]=[left_f,right_f]
            i+=1
            break   
            
            
            
        #print(ratio)
            
        ratio = heard/window_size
        
        if ratio >= tol and left == -1:
            left = left_f
            
        #print(ratio , '' , tol)

        
            
        if ratio < tol and left != -1:
            right = right_f
            ranges[i] = [left_f,right_f]
            i +=1
            left = -1
        
        right_f += 1
        left_f += 1
        
    ranges = np.resize(ranges,(i,2))        
   
    return ranges


In [113]:
ranges = find_audio(robin,frames, tol = 0.2)

array([[1.703000e+03, 1.853000e+03],
       [1.922000e+03, 2.072000e+03],
       [1.941000e+03, 2.091000e+03],
       ...,
       [4.704289e+06, 4.704439e+06],
       [4.704565e+06, 4.704715e+06],
       [4.704760e+06, 4.704910e+06]])

In [116]:
len(ranges)

4455