In [1]:
import numpy as np
from pydub import AudioSegment
from pydub.playback import play
import random
import sys
import io
import os
import glob
import IPython
import wave
import pylab
from tf_utils import *
import matplotlib.pyplot as plt
from scipy import signal
from scipy.io import wavfile

In [8]:
# Input data locations, relative to the notebook's working directory.
POSITIVE_DIRECTORY = "./raw_data/positive_data/"  # recordings of the positive word
BACKGROUND_DIRECTORY = "./raw_data/background_data/"  # background recordings
NEGATIVES_DIRECTORY = "./raw_data/google_dataset/"  # one sub-directory per negative word
# NOTE(review): the three directories below are not referenced in the cells shown here;
# presumably they are used by later processing/export steps -- confirm.
NEGATIVES_TRUNCATED_DIRECTORY = "./raw_data/google_dataset_truncated/"
AUDIO_EXAMPLES_DIRECTORY = "./audio_examples/"
AUDIO_IGNORED_EXAMPLES_DIRECTORY = "./audio_ignored_examples/"

# Individual files (inside the directories above) used for the walkthrough below.
POSITIVE_EXAMPLE = "jh_1.wav"
BACKGROUND_EXAMPLE = "bg_10.wav"

# NOTE(review): presumably the file name for a generated clip; unused in the cells shown here.
AUDIO_EXAMPLE = "example_train.wav"

## General Approach

The general idea is to randomly iterate through the word directories, randomly select a recording from each chosen directory, and concatenate the recordings to form a ten-second continuous stream of words.

For now: collect 9 words, select a background, overlay the 9 words onto it, and then put "basically" inside as well.

In [16]:
# Sub-directories of NEGATIVES_DIRECTORY whose name contains no underscore;
# each one is treated as a word directory.
NEGATIVES_FILENAMES = [
    name
    for name in os.listdir(NEGATIVES_DIRECTORY)
    if '_' not in name and os.path.isdir(os.path.join(NEGATIVES_DIRECTORY, name))
]

# Map every word directory to the names of the wav recordings it contains.
NEGATIVES_AUDIONAMES = {}
for file in NEGATIVES_FILENAMES:
    NEGATIVES_AUDIONAMES[file] = [
        name
        for name in os.listdir(os.path.join(NEGATIVES_DIRECTORY, file))
        if name.endswith("wav")
    ]

In [18]:
def random_element(lst, n):
    """
    Draw n items from lst uniformly at random, with replacement.

    Indices are drawn independently with np.random.randint, so the same item
    may be returned more than once.

    Arguments:
    lst -- indexable sequence to sample from (expected to be non-empty)
    n -- number of items to draw

    Returns:
    picked -- list with the n sampled items, in draw order
    """
    picked = []
    for position in np.random.randint(len(lst), size = n):
        picked.append(lst[position])
    return picked

def random_negatives(negative_audionames, n, directory=None):
    """
    Load n randomly chosen negative recordings.

    n word directories are drawn at random with random_element (so a word can be
    drawn more than once); one recording is then drawn from each and loaded with
    AudioSegment.from_wav.

    Arguments:
    negative_audionames -- dict mapping a word directory name to the list of wav
                           file names inside that directory
    n -- number of recordings to load
    directory -- root folder that contains the word directories; when omitted the
                 module-level NEGATIVES_DIRECTORY is used (looked up at call time)

    Returns:
    output -- list of the n loaded audio segments, in draw order

    Raises:
    ValueError -- if a drawn word directory has no recordings listed
    """
    # Resolve the default lazily so a re-run of the configuration cell is picked up.
    # Reading a module-level name needs no `global` declaration.
    if directory is None:
        directory = NEGATIVES_DIRECTORY

    output = []
    for word in random_element(list(negative_audionames.keys()), n):
        audio_names = negative_audionames[word]
        if not audio_names:
            # Fail with a readable message instead of an opaque error from the sampler.
            raise ValueError("No recordings available for word directory '{}'".format(word))
        # Pick one recording of this word and load it.
        chosen = random_element(audio_names, 1)[0]
        output.append(AudioSegment.from_wav(os.path.join(directory, word, chosen)))
    return output

In [59]:
# Randomly collect the negative words (7 for now)
negatives = random_negatives(NEGATIVES_AUDIONAMES, 7)

In [25]:
# Play each selected negative recording in turn
for sound in negatives:
    play(sound)

In [60]:
# Load the background recording and make it quieter in one step
# (subtracting 20 from an AudioSegment lowers its gain by 20 dB)
background = AudioSegment.from_wav(BACKGROUND_DIRECTORY + BACKGROUND_EXAMPLE) - 20

# Load the recording of the positive word
positive = AudioSegment.from_wav(POSITIVE_DIRECTORY + POSITIVE_EXAMPLE)

In [61]:
# Randomly choose where the positive word goes in the list of negatives.
# list.insert accepts every index in 0..len(negatives), and np.random.randint excludes
# its upper bound, so len(negatives) + 1 is needed for the positive to also be able to
# land after the last negative.
insertion_point = np.random.randint(0, len(negatives) + 1)

In [62]:
from copy import deepcopy
# Work on a copy so that inserting the positive leaves `negatives` unchanged
sounds = deepcopy(negatives)
# Place the positive word at the randomly chosen index
sounds.insert(insertion_point, positive)

In [57]:
# Play the negatives together with the inserted positive, in list order
for sound in sounds:
    play(sound)

In [None]:
def insert_audio_clip(background, audio_clip, previous_segments):
    """
    Overlay audio_clip onto background at a randomly chosen time slot that does not
    overlap any slot already recorded in previous_segments.

    NOTE(review): get_random_time_segment and is_overlapping are not defined in this
    notebook; presumably they come from the `tf_utils` star import -- confirm.

    Arguments:
    background -- background audio recording; must provide overlay(clip, position=...)
    audio_clip -- the audio clip to be inserted/overlaid
    previous_segments -- list of time slots that are already occupied; the newly
                         chosen slot is appended to it (the list is modified in place)

    Returns:
    new_background -- the background with audio_clip overlaid
    segment_time -- the time slot that was chosen; its first item is the overlay position
    """

    # Duration of the clip in ms
    clip_length_ms = len(audio_clip)

    # Draw candidate slots until one is free of overlaps with the occupied slots
    while True:
        candidate = get_random_time_segment(clip_length_ms)
        if not is_overlapping(candidate, previous_segments):
            break

    # Remember the slot so later insertions avoid it
    previous_segments.append(candidate)

    # Superpose the clip on the background, starting at the beginning of the slot
    mixed = background.overlay(audio_clip, position = candidate[0])

    return mixed, candidate

In [84]:
def create_continuous(bg, sounds):
    """
    Concatenate sounds back-to-back and overlay the result onto bg from position 0.

    The length of the concatenated audio is printed as a progress message.

    Arguments:
    bg -- background audio; must provide overlay(segment, position=...)
    sounds -- list of audio segments to join, in playback order

    Returns:
    result -- bg with the concatenated sounds overlaid, or bg itself when sounds is empty
    """
    # Nothing to overlay: return the background untouched instead of failing on sounds[0].
    if len(sounds) == 0:
        print("Length of continuous: {}".format(0))
        return bg

    continuous = sounds[0]
    for clip in sounds[1:]:
        # Plain `+` builds a new object, so sounds[0] can never be modified in place.
        continuous = continuous + clip
    print("Length of continuous: {}".format(len(continuous)))

    # Superpose the concatenated audio and the background.
    # NOTE(review): per pydub's documentation the overlay result keeps the length of bg,
    # so audio that runs past the end of bg would be cut off -- confirm.
    result = bg.overlay(continuous, position = 0)
    return result

In [85]:
# Concatenate the list of sounds and overlay the result onto the background
example = create_continuous(background, sounds)

Length of continuous: 9053


In [87]:
# Play the generated example
play(example)

## TODO:
- Change everything into functions i.e. generalize the steps
- Currently, the number of negative sounds is hardcoded. Ideally, try to get up to 8-9 negatives depending on the length of the positives. Use a function to measure its length and see how many negatives can be picked
- Randomly insert a few milliseconds of silence between the sounds so that the full 10 seconds are filled.

Less urgent:
- Try to cut off the empty (silent) parts at the start and end of the positives. There are so few positives that it might be possible to do this manually.