# Randomly create test sentences from Google Speech Commands Dataset

In [2]:
import numpy as np
import random
import sys
import io
import os
import glob
import copy
import math

In [3]:
# Input folders, given relative to the current working directory.
# POSITIVE_DIRECTORY and BACKGROUND_DIRECTORY are scanned for wav files below;
# NEGATIVES_DIRECTORY is scanned for one sub-folder per word.
POSITIVE_DIRECTORY = "./raw_data/positive_data/"
BACKGROUND_DIRECTORY = "./raw_data/background_data/"
NEGATIVES_DIRECTORY = "./raw_data/google_dataset/"
# NOTE(review): not referenced in the visible code -- presumably a trimmed copy
# of the negatives dataset; confirm against the rest of the notebook.
NEGATIVES_TRUNCATED_DIRECTORY = "./raw_data/google_dataset_truncated/"
# NOTE(review): not referenced in the visible code -- presumably output folders
# for generated audio; confirm.
AUDIO_EXAMPLES_DIRECTORY = "./audio_examples/"
AUDIO_IGNORED_EXAMPLES_DIRECTORY = "./audio_ignored_examples/"

# Individual sample file names.
# NOTE(review): presumably located in POSITIVE_DIRECTORY / BACKGROUND_DIRECTORY
# respectively; verify where they are loaded.
POSITIVE_EXAMPLE = "jh_1.wav"
BACKGROUND_EXAMPLE = "bg_10.wav"

# NOTE(review): not referenced in the visible code -- presumably the name of a
# generated example file; confirm.
AUDIO_EXAMPLE = "example_train.wav"

In [4]:
# Word folders of the negatives dataset: keep only entries that are
# directories and whose name contains no underscore.
NEGATIVES_FILENAMES = []
for entry in os.listdir(NEGATIVES_DIRECTORY):
    if '_' in entry:
        continue
    if os.path.isdir(os.path.join(NEGATIVES_DIRECTORY, entry)):
        NEGATIVES_FILENAMES.append(entry)

# Map each word folder to the file names inside it that end in "wav".
NEGATIVES_AUDIONAMES = {
    word: [clip for clip in os.listdir(NEGATIVES_DIRECTORY + word + "/") if clip.endswith("wav")]
    for word in NEGATIVES_FILENAMES
}

# File names ending in "wav" found directly in the positive / background folders.
POSITIVES_AUDIONAMES = [clip for clip in os.listdir(POSITIVE_DIRECTORY) if clip.endswith("wav")]
BACKGROUND_AUDIONAMES = [clip for clip in os.listdir(BACKGROUND_DIRECTORY) if clip.endswith("wav")]

In [5]:
def create_test(num_pos_min, num_pos_max, num_total, debug=False, positive_word="basically"):
    """
    Creates a randomized sentence from the Google Speech Commands Dataset

    Negative words are drawn without repetition from the keys of
    NEGATIVES_AUDIONAMES; the positive word is then placed at randomly chosen,
    distinct gaps between them (including before the first and after the last
    negative), so two positives are never adjacent.

    Arguments:
    num_pos_min -- minimum number of positives
    num_pos_max -- maximum number of positives (inclusive)
    num_total -- total number of words in one test example
    debug -- if True, print how many positives were drawn
    positive_word -- the word inserted as a positive (defaults to "basically")

    Returns:
    String of positives and negatives in randomized order, separated by spaces

    Raises:
    ValueError -- propagated from numpy when the request cannot be sampled
    without replacement, e.g. more negatives than available words, or more
    positives than gaps between negatives
    """
    # randint's upper bound is exclusive, hence the +1 to make num_pos_max reachable.
    num_positives = np.random.randint(num_pos_min, num_pos_max+1)
    negatives = list(np.random.choice(list(NEGATIVES_AUDIONAMES.keys()), num_total - num_positives, replace=False))

    if debug:
        print("{} positives in {} words".format(num_positives, num_total))

    # Pick distinct gaps (0 .. len(negatives)) that will each receive one positive.
    # Gap i sits immediately before negatives[i]; gap len(negatives) is the end.
    chosen = np.random.choice(list(range(len(negatives)+1)), num_positives, replace=False)
    gaps = {int(point) for point in chosen}

    # Single merge pass instead of repeated list.insert with index bookkeeping.
    words = []
    for gap, negative in enumerate(negatives):
        if gap in gaps:
            words.append(positive_word)
        words.append(negative)
    if len(negatives) in gaps:
        words.append(positive_word)

    return " ".join(words)

In [22]:
# Example: build one 20-word sentence containing between 1 and 8 positives,
# printing the number of positives that was drawn.
create_test(1, 8, 20, debug=True)

4 positives in 20 words


'seven one bed two cat yes basically off wow basically three down tree happy basically no on basically go house'