In [59]:
import math
import os
import tempfile

import numpy as np
from pydub import AudioSegment
from pydub.playback import play
from scipy import signal
from scipy.io import wavfile

In [14]:
def process_audio(segment, duration):
    """
    Trim or pad an audio segment to a fixed length and normalise its format.

    Arguments:
    segment -- pydub AudioSegment to process
    duration -- target length of the result, in ms

    Returns:
    segment -- mono AudioSegment resampled to 123000 Hz and `duration` ms long

    Raises:
    AssertionError -- if the processed segment is not `duration` ms long
    """
    # Keep at most the first `duration` ms of the input.
    segment = segment[:duration]
    # Overlay the (possibly shorter) clip onto silence of the target length so
    # that short clips end up padded to `duration` ms.
    padding = AudioSegment.silent(duration)
    segment = padding.overlay(segment)
    # Down-mix to a single channel, then set the frame rate to 123000.
    segment = segment.set_channels(1)
    segment = segment.set_frame_rate(123000)

    # Compare against the requested duration itself. The previous
    # ceil()-based check accepted clips up to a second short and rejected any
    # target that was not a whole number of seconds. A 1 ms tolerance absorbs
    # resampling round-off.
    actual_ms = segment.duration_seconds * 1000.0
    assert abs(actual_ms - duration) <= 1.0, (
        "segment needs to be exactly {}s long (got {:.4f}s).".format(
            duration / 1000.0, actual_ms / 1000.0
        )
    )
    return segment

In [13]:
def create_X(segment):
    """
    Compute the spectrogram of an audio segment.

    The segment is exported to a temporary WAV file, read back with
    scipy.io.wavfile and passed to scipy.signal.spectrogram with its default
    settings. The temporary file is always removed, even on failure.

    Arguments:
    segment -- object exposing export(path, format='wav'), e.g. a pydub AudioSegment

    Returns:
    frequencies -- array of sample frequencies
    times -- array of segment times
    x -- spectrogram of the samples
    """
    # Use a unique temporary path rather than a fixed "temp.wav" in the
    # working directory, which could overwrite (and then delete) a user file.
    fd, output_filepath = tempfile.mkstemp(suffix=".wav")
    os.close(fd)
    try:
        file_handle = segment.export(output_filepath, format='wav')
        # export() hands back an open file object; close it so the handle is
        # not leaked and the file can be removed on every platform.
        if file_handle is not None and hasattr(file_handle, "close"):
            file_handle.close()

        sample_rate, samples = wavfile.read(output_filepath)
        frequencies, times, x = signal.spectrogram(samples, sample_rate)
    finally:
        os.remove(output_filepath)

    return frequencies, times, x

In [63]:
def insert_ones(y, segment_end_ms, clip_ms=10000.0):
    """
    Update the label vector y. The labels of the 50 output steps strictly after the end of the segment
    are set to 1. By strictly we mean that the label of segment_end_y stays 0 while the
    50 following labels are set to 1 (fewer if the end of y is reached first).

    Arguments:
    y -- numpy array of shape (1, Ty), the labels of the training example (updated in place)
    segment_end_ms -- the end time of the segment in ms
    clip_ms -- total clip length in ms that the Ty output steps span (default 10000.0)

    Returns:
    y -- updated labels (the same array that was passed in)
    """
    # Take the number of output steps from y itself instead of a global.
    n_steps = y.shape[1]
    # Index of the output step in which the segment ends.
    segment_end_y = int(segment_end_ms * n_steps / clip_ms)

    # Clamp the window to [0, n_steps). Without the lower clamp a negative
    # end time would wrap around and label the tail of y.
    start = max(segment_end_y + 1, 0)
    stop = min(segment_end_y + 51, n_steps)
    if start < stop:
        y[0, start:stop] = 1

    return y

In [56]:
def create_Y(inserted_points):
    """
    Build the label vector for one training example.

    Arguments:
    inserted_points -- iterable of segment end times in ms

    Returns:
    labels -- numpy array of shape (1, Ty) produced by applying insert_ones
              once per end time to an all-zero vector
    """
    # Start from an all-zero label row.
    labels = np.zeros((1, Ty))

    # Let insert_ones mark the labels for each end time in turn.
    for end_ms in inserted_points:
        labels = insert_ones(labels, end_ms)

    expected_shape = (1, Ty)
    assert labels.shape == expected_shape, "y shape needs to follow Ty!"

    return labels

In [57]:
def txt_to_array(txt_filepath):
    """Read a text file and return its lines as a list of strings.

    Line terminators are removed; blank lines are kept as empty strings.
    """
    with open(txt_filepath, mode='r') as handle:
        contents = handle.read()
    lines = contents.splitlines()
    return lines

In [18]:
# Quick check of txt_to_array on a single label file.
# NOTE(review): INPUT_DIRECTORY is only assigned in a later cell and TXT is not
# defined anywhere in the visible notebook — confirm both exist before running
# this cell on a fresh kernel.
txt_to_array(INPUT_DIRECTORY + TXT[0])

['3.55', '6.44']

In [48]:
def create_training_example(segment, inserted_points):
    """
    Turn one audio clip and its end times into a training example.

    Arguments:
    segment -- audio segment handed to process_audio
    inserted_points -- segment end times in ms, handed to create_Y

    Returns:
    (frequencies, times, x, y, inserted_points) -- the three outputs of create_X
    for the clip processed to 10000 ms, the labels from create_Y, and the
    end times unchanged
    """
    # Fix the clip length to 10000 ms before computing the spectrogram.
    clip = process_audio(segment, 10000)
    freqs, time_bins, spectrogram = create_X(clip)

    labels = create_Y(inserted_points)
    return freqs, time_bins, spectrogram, labels, inserted_points

In [79]:
# Build the stacked feature / label arrays from every .wav / .txt pair in a directory.
def create_X_Y(input_directory, output_directory, debug = False):
    """
    Create the training arrays from a directory of audio clips and label files.

    The .wav and .txt files in `input_directory` are sorted by name and paired
    by position; each .txt file holds one end time (in seconds) per line.

    Arguments:
    input_directory -- directory holding the .wav clips and their .txt label files
    output_directory -- currently unused; kept so existing callers keep working
    debug -- when True, print the file pair and end times for every example

    Returns:
    (X, Y) -- numpy arrays stacking the transposed spectrogram and the
              transposed label vector of every example

    Raises:
    AssertionError -- if the number of .txt and .wav files differ, or not all
                      examples ended up in X / Y
    """
    names = sorted(os.listdir(input_directory))
    txts = [name for name in names if name.endswith(".txt")]
    wavs = [name for name in names if name.endswith(".wav")]
    total = len(wavs)

    # Files are paired by position, so differing counts would either crash
    # mid-loop or silently attach labels to the wrong clip.
    assert len(txts) == total, "Found {} .txt files for {} .wav files".format(len(txts), total)

    X, Y = [], []
    for wav, txt in zip(wavs, txts):
        segment = AudioSegment.from_wav(os.path.join(input_directory, wav))

        arr = txt_to_array(os.path.join(input_directory, txt))
        # Seconds -> milliseconds; skip blank lines so a trailing empty line
        # in a label file does not break float().
        arr_ms = [float(line) * 1000 for line in arr if line.strip()]
        if debug:
            print("Creating example for {} and {}".format(wav, txt) )
            print("Inserted points for {} is {}".format(wav, arr_ms))
        _, _, x, y, _ = create_training_example(segment, arr_ms)
        X.append(np.transpose(x))
        Y.append(np.transpose(y))

    # Convert before checking: plain lists have no .shape attribute, so the
    # previous asserts raised AttributeError instead of validating anything.
    X = np.array(X)
    Y = np.array(Y)

    assert X.shape[0] == total, "Not all examples are added to X"
    assert Y.shape[0] == total, "Not all examples are added to Y"

    return (X, Y)

In [80]:
# Dimensions shared by the feature and label arrays.
# NOTE(review): these values were read off a previously created example and a
# model summary that are not part of the visible notebook — re-check them if
# the clip length, frame rate or model changes.
Tx = 5490 # Presumably the number of spectrogram time steps per example (based on created training example)
n_freq = 129 # Presumably the number of spectrogram frequency bins (based on created training example)
Ty = 1369 # Length of the label vector y (based on model.summary() in 1.4 with shape := (Tx, n_freq))

In [81]:
# Directory scanned by create_X_Y for .wav clips and .txt label files.
INPUT_DIRECTORY = "../ignored_audio_examples/bryan_cont/"
# Passed to create_X_Y, which does not currently use it.
OUTPUT_DIRECTORY = "../ignored_examples/"
# NOTE: the saved output of this cell ends in CouldntDecodeError for yb_1.wav
# (ffmpeg: "invalid start code ... in RIFF header"), so that run never assigned X and Y.
X, Y = create_X_Y(INPUT_DIRECTORY, OUTPUT_DIRECTORY, debug = True)

Creating example for aru_1.wav and aru_1.txt
Inserted points for aru_1.wav is [715.0, 9470.0]
Creating example for aru_10.wav and aru_10.txt
Inserted points for aru_10.wav is [3600.0, 8790.0]
Creating example for aru_11.wav and aru_11.txt
Inserted points for aru_11.wav is [3290.0, 7720.0]
Creating example for aru_12.wav and aru_12.txt
Inserted points for aru_12.wav is [1920.0]
Creating example for aru_13.wav and aru_13.txt
Inserted points for aru_13.wav is [4510.0, 7520.0]
Creating example for aru_2.wav and aru_2.txt
Inserted points for aru_2.wav is [3320.0, 7260.0]
Creating example for aru_3.wav and aru_3.txt
Inserted points for aru_3.wav is [2890.0, 8440.0]
Creating example for aru_4.wav and aru_4.txt
Inserted points for aru_4.wav is [3550.0, 6440.0]
Creating example for aru_5.wav and aru_5.txt
Inserted points for aru_5.wav is [4850.0, 8950.0]
Creating example for aru_6.wav and aru_6.txt
Inserted points for aru_6.wav is [1600.0, 7120.0]
Creating example for aru_7.wav and aru_7.txt
In

CouldntDecodeError: Decoding failed. ffmpeg returned error code: 1

Output from ffmpeg/avlib:

b'ffmpeg version 4.2.2 Copyright (c) 2000-2019 the FFmpeg developers\n  built with Apple clang version 11.0.0 (clang-1100.0.33.17)\n  configuration: --prefix=/usr/local/Cellar/ffmpeg/4.2.2_2 --enable-shared --enable-pthreads --enable-version3 --enable-avresample --cc=clang --host-cflags= --host-ldflags= --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libbluray --enable-libmp3lame --enable-libopus --enable-librubberband --enable-libsnappy --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-librtmp --enable-libspeex --enable-libsoxr --enable-videotoolbox --disable-libjack --disable-indev=jack\n  libavutil      56. 31.100 / 56. 31.100\n  libavcodec     58. 54.100 / 58. 54.100\n  libavformat    58. 29.100 / 58. 29.100\n  libavdevice    58.  8.100 / 58.  8.100\n  libavfilter     7. 57.100 /  7. 57.100\n  libavresample   4.  0.  0 /  4.  0.  0\n  libswscale      5.  5.100 /  5.  5.100\n  libswresample   3.  5.100 /  3.  5.100\n  libpostproc    55.  5.100 / 55.  5.100\n[wav @ 0x7f8429000000] invalid start code [0][0][0][0] in RIFF header\n../ignored_audio_examples/bryan_cont/yb_1.wav: Invalid data found when processing input\n'