# Curate Dataset Where Test Set Voices are Combined

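Each curated audio clip joins two speech recordings, one from each of two speakers recorded in the same room setup: one speaker's voice is facing (its direction of voice falls within the chosen facing range) and the other speaker's voice is not facing.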

Default settings: each curated audio clip combines one facing voice and one non-facing voice, taken from Speaker 2 and Speaker 9 in the held-out session "upstairs".

In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display
import soundfile as sf

from IPython import display
from matplotlib import pyplot

In [2]:
# A simple wrapper class for (1-channel) audio data
# data is a 1-D NumPy array containing the data
# rate is a number expressing the samples per second
class Audio:
    """Thin wrapper pairing 1-channel audio samples with their sampling rate.

    data -- 1-D NumPy array of samples
    rate -- number of samples per second
    """

    def __init__(self, data, rate):
        self.data = data
        self.rate = rate

    def play(self):
        """Return an IPython audio widget for the samples."""
        return display.Audio(self.data, rate=self.rate)

    def plot_wave(self):
        """Plot the waveform via librosa.display."""
        # librosa 0.10 removed waveplot in favour of waveshow; keep using
        # waveplot where it still exists so older installs behave as before.
        draw = getattr(librosa.display, 'waveplot', None) or librosa.display.waveshow
        draw(self.data, sr=self.rate)

    def plot_spectrum(self):
        """Plot a dB-scaled magnitude spectrogram on a linear frequency axis."""
        n_fft = int(self.rate / 20)
        # Same hop the STFT would default to (n_fft // 4); made explicit and
        # integral so the spectrogram's time axis matches the transform.
        hop_length = n_fft // 4
        # n_fft must be passed by keyword: newer librosa releases reject it
        # as a positional argument.
        magnitude = np.abs(librosa.stft(self.data, n_fft=n_fft, hop_length=hop_length))
        D = librosa.amplitude_to_db(magnitude, ref=np.max)
        librosa.display.specshow(D, y_axis='linear', sr=self.rate, hop_length=hop_length)

    @classmethod
    def fromfile(cls, fn):
        """Build an Audio from the file fn, loaded with sr=None (no resampling)."""
        return cls(*librosa.load(fn, sr=None))

In [3]:
def get_facing_not_facing_paths(subject, room, wall, dov_range):
    '''
    subject - the speaker number
    room - upstairs or downstairs
    wall - wall or nowall microphone setup
    dov_range - angles that qualify as facing

    Return a tuple (facing_paths, not_facing_paths) for a single subject in a
    single room with a single microphone wall setup. Each element of either
    list is a "bundle": a list of the four .wav paths
    (recording{r}_{angle}_1.wav .. recording{r}_{angle}_4.wav) that share one
    recording directory, recording index r and angle. Bundles whose angle is
    in dov_range go to facing_paths; all others go to not_facing_paths.

    Paths are built under <current working directory>/data/raw/ and are not
    checked for existence. Every bundle appears exactly once.
    '''
    data_dir = os.path.join(os.path.abspath('.'), 'data/raw/')
    subject_name = 's' + str(subject)

    # One entry per trial of the requested room/wall setup. Listing the setup
    # only once keeps each recording directory (and so each bundle) unique.
    setup = f'{room}_{wall}'
    trial_names = [f'{setup}_trial{t}' for t in range(1, 2+1)]

    # Position directory names, e.g. 'A0_1_0', 'A1_1_45', 'A2_1_90', 'B0_3_0', ...
    polar_pos_names = [f'{letter}{i}_{number}_{45*i}'
                       for letter, number in [('A', '1'), ('B', '3'), ('C', '5')]
                       for i in range(3)]

    # <data_dir>/<subject>/<subject>_<trial>/<position>
    recording_paths = [os.path.join(data_dir, subject_name, subject_name + '_' + trial, pos)
                       for trial in trial_names
                       for pos in polar_pos_names]

    # Sort every (directory, angle, recording index) bundle into facing / not facing.
    facing_paths = []
    not_facing_paths = []
    for recording_path in recording_paths:
        for angle in range(0, 360, 45):
            target = facing_paths if angle in dov_range else not_facing_paths
            for r in range(2):
                target.append([os.path.join(recording_path, f'recording{r}_{angle}_{i}.wav')
                               for i in range(1, 4+1)])
    return (facing_paths, not_facing_paths)

In [4]:
def name_combined_audio_file(file1, file2):
    '''
    file1 - the first audio file in the combined audio
    file2 - the second audio file in the combined audio

    Helper function for combine_facing_and_not_facing_audio to aid in the naming of
    the combined audio recordings.

    The name is built from the last three path components of each file:
    <dir-3>_<dir-2>_<file1 name without .wav>-<dir-3>_<dir-2>_<file2 name>.
    Raises IndexError if either path has fewer than three components.
    '''
    # Normalise the platform separator so paths built with os.path.join split
    # the same way everywhere (a no-op where os.sep is already '/').
    parts1 = file1.replace(os.sep, '/').split('/')
    parts2 = file2.replace(os.sep, '/').split('/')

    # Remove only a trailing '.wav'. str.strip('.wav') would instead strip any
    # of the characters '.', 'w', 'a', 'v' from both ends of the name.
    stem1 = parts1[-1]
    if stem1.endswith('.wav'):
        stem1 = stem1[:-len('.wav')]

    first_half = parts1[-3] + '_' + parts1[-2] + '_' + stem1
    second_half = parts2[-3] + '_' + parts2[-2] + '_' + parts2[-1]
    return first_half + '-' + second_half

In [5]:
def combine_facing_and_not_facing_audio(facing_paths, not_facing_paths, prefix, cutoff):
    '''
    facing_paths - paths to audio recordings where the dov is facing.
    not_facing_paths - paths to recordings where the dov is not facing.
    prefix - where to write the audio files to; it is prepended verbatim to
             each generated file name, so a directory needs its trailing '/'.
    cutoff - number of speech combinations to curate

    Each element of facing_paths / not_facing_paths is a bundle of four
    per-channel file paths. For every (facing bundle, not-facing bundle) pair
    and every channel, two files are written: facing followed by not-facing,
    and not-facing followed by facing.

    This function terminates when cutoff audio combinations are written to disk
    (or when all pairs are exhausted). A cutoff value of 100 will produce
    100 x 4 channels x 2 order structure recordings. A cutoff of zero or less
    writes nothing.

    Returns None.
    '''
    # Nothing requested: stop before loading any audio.
    if cutoff <= 0:
        return

    for facing_bundle in facing_paths:
        facing_audio_files = [Audio.fromfile(a) for a in facing_bundle]
        for not_facing_bundle in not_facing_paths:
            not_facing_audio_files = [Audio.fromfile(a) for a in not_facing_bundle]
            for channel in range(4):
                facing = (facing_audio_files[channel], facing_bundle[channel])
                not_facing = (not_facing_audio_files[channel], not_facing_bundle[channel])
                # Both orderings are written with the facing recording's sample
                # rate; the not-facing recording's rate is not consulted.
                combined_rate = facing_audio_files[channel].rate
                for (first, first_path), (second, second_path) in ((facing, not_facing),
                                                                   (not_facing, facing)):
                    combined_data = np.append(first.data, second.data)
                    combined_audio_name = name_combined_audio_file(first_path, second_path)
                    sf.write(prefix + combined_audio_name, combined_data, combined_rate, subtype='PCM_24')
            cutoff -= 1
            if cutoff <= 0:
                return

In [186]:
# Curate the "45-to-negative-45" set: angles 0, 45 and 315 qualify as facing.
test_room = 'upstairs'
facing_dov = [0, 45, 315]

# Facing / not-facing bundles for each speaker and microphone wall setup.
sub2_facing_wall, sub2_not_facing_wall = get_facing_not_facing_paths(
    subject=2, room=test_room, wall='wall', dov_range=facing_dov)
sub2_facing_nowall, sub2_not_facing_nowall = get_facing_not_facing_paths(
    subject=2, room=test_room, wall='nowall', dov_range=facing_dov)

sub9_facing_wall, sub9_not_facing_wall = get_facing_not_facing_paths(
    subject=9, room=test_room, wall='wall', dov_range=facing_dov)
sub9_facing_nowall, sub9_not_facing_nowall = get_facing_not_facing_paths(
    subject=9, room=test_room, wall='nowall', dov_range=facing_dov)

# Speaker 2 facing with speaker 9 not facing first, then the reverse pairing;
# within each pairing the wall setup is written before the nowall setup.
for facing, not_facing, out_dir in (
    (sub2_facing_wall, sub9_not_facing_wall, 'data/combined/upstairs/wall/45-to-negative-45/'),
    (sub2_facing_nowall, sub9_not_facing_nowall, 'data/combined/upstairs/nowall/45-to-negative-45/'),
    (sub9_facing_wall, sub2_not_facing_wall, 'data/combined/upstairs/wall/45-to-negative-45/'),
    (sub9_facing_nowall, sub2_not_facing_nowall, 'data/combined/upstairs/nowall/45-to-negative-45/'),
):
    combine_facing_and_not_facing_audio(facing, not_facing, out_dir, 100)

In [187]:
# Curate the "90-to-negative-90" set: angles 0, 45, 90, 270 and 315 qualify as facing.
test_room = 'upstairs'
facing_dov = [0, 45, 90, 270, 315]

# Facing / not-facing bundles for each speaker and microphone wall setup.
sub2_facing_wall, sub2_not_facing_wall = get_facing_not_facing_paths(
    subject=2, room=test_room, wall='wall', dov_range=facing_dov)
sub2_facing_nowall, sub2_not_facing_nowall = get_facing_not_facing_paths(
    subject=2, room=test_room, wall='nowall', dov_range=facing_dov)

sub9_facing_wall, sub9_not_facing_wall = get_facing_not_facing_paths(
    subject=9, room=test_room, wall='wall', dov_range=facing_dov)
sub9_facing_nowall, sub9_not_facing_nowall = get_facing_not_facing_paths(
    subject=9, room=test_room, wall='nowall', dov_range=facing_dov)

# Speaker 2 facing with speaker 9 not facing first, then the reverse pairing;
# within each pairing the wall setup is written before the nowall setup.
for facing, not_facing, out_dir in (
    (sub2_facing_wall, sub9_not_facing_wall, 'data/combined/upstairs/wall/90-to-negative-90/'),
    (sub2_facing_nowall, sub9_not_facing_nowall, 'data/combined/upstairs/nowall/90-to-negative-90/'),
    (sub9_facing_wall, sub2_not_facing_wall, 'data/combined/upstairs/wall/90-to-negative-90/'),
    (sub9_facing_nowall, sub2_not_facing_nowall, 'data/combined/upstairs/nowall/90-to-negative-90/'),
):
    combine_facing_and_not_facing_audio(facing, not_facing, out_dir, 100)