# Pre-process the data for the cross-utterance split

Train set:

 - utterance 1: hey assistant
 - 10 participants
 - 2 sessions
 - 2 rooms: downstairs, upstairs
 - 3 user distances: 1, 3, 5
 - 3 user polar positions: 0, 45, 90
 - 8 spoken angles
    
Test set:
 - utterance 2: the quick brown fox jumped over the lazy sheep
 - 10 participants
 - 2 sessions
 - 2 rooms: downstairs, upstairs
 - 3 user distances: 1, 3, 5
 - 3 user polar positions: 0, 45, 90
 - 8 spoken angles

In [1]:
import os
import numpy as np
import pandas as pd
import librosa
import librosa.display

from IPython import display
from matplotlib import pyplot
import tqdm

In [2]:
class Audio:
    '''
    A simple wrapper class for (1-channel) audio data.

    data is a 1-D NumPy array containing the samples
    rate is a number expressing the samples per second
    '''

    def __init__(self, data, rate):
        self.data = data
        self.rate = rate

    def play(self):
        '''Return an IPython audio widget for the samples at self.rate.'''
        return display.Audio(self.data, rate=self.rate)

    def plot_wave(self):
        '''Draw the time-domain waveform with librosa.display.'''
        # Keep using waveplot where it exists; newer librosa releases only
        # ship waveshow, so fall back to that instead of failing.
        draw = getattr(librosa.display, 'waveplot', None)
        if draw is None:
            draw = librosa.display.waveshow
        draw(self.data, sr=self.rate)

    def plot_spectrum(self):
        '''Draw a linear-frequency dB spectrogram of the samples.'''
        # rate / 20 samples per frame, i.e. a 1/20 s analysis window
        n_fft = int(self.rate / 20)
        # Same integer hop the STFT falls back to by default (n_fft // 4);
        # passing it explicitly keeps the STFT and the plot axis in agreement.
        hop_length = n_fft // 4
        # n_fft is passed by keyword: recent librosa no longer accepts it
        # as a positional argument.
        spectrum = librosa.stft(self.data, n_fft=n_fft, hop_length=hop_length)
        D = librosa.amplitude_to_db(np.abs(spectrum), ref=np.max)
        librosa.display.specshow(D, y_axis='linear', sr=self.rate, hop_length=hop_length)

    @classmethod
    def fromfile(cls, fn):
        '''Load the file fn with librosa, keeping the sample rate librosa reports (sr=None).'''
        return cls(*librosa.load(fn, sr=None))

In [3]:
# Credit to Yihui Xiong, 2017, 
# https://github.com/xiongyihui/tdoa/blob/master/gcc_phat.py
def gcc_phat(sig, refsig, fs=1, max_tau=None, interp=16):
    '''
    This function computes the offset between the signal sig and the reference signal refsig
    using the Generalized Cross Correlation - Phase Transform (GCC-PHAT) method.

    sig, refsig: 1-D NumPy arrays holding the two signals
    fs: sample rate used to convert the lag from samples to seconds
    max_tau: optional bound (in seconds) on the lag that is searched;
             a falsy value (None or 0) leaves the search unbounded
    interp: interpolation factor applied to the cross-correlation

    Returns (tau, cc): tau is the estimated offset in seconds and cc is the
    cross-correlation window of length 2 * max_shift + 1, whose middle
    element corresponds to zero lag.
    '''

    # make sure the length for the FFT is larger or equal than len(sig) + len(refsig)
    n = sig.shape[0] + refsig.shape[0]

    # Generalized Cross Correlation Phase Transform
    SIG = np.fft.rfft(sig, n=n)
    REFSIG = np.fft.rfft(refsig, n=n)
    R = SIG * np.conj(REFSIG)

    # Phase transform: normalise every bin to unit magnitude. Bins with zero
    # magnitude (e.g. silent input) are left at 0 instead of producing NaN.
    magnitude = np.abs(R)
    phase_only = np.divide(R, magnitude, out=np.zeros_like(R), where=magnitude > 0)

    cc = np.fft.irfft(phase_only, n=(interp * n))

    max_shift = int(interp * n / 2)
    if max_tau:
        max_shift = min(int(interp * fs * max_tau), max_shift)

    # Re-centre so that negative lags come first. The explicit start index
    # matters when max_shift == 0: cc[-0:] would select the whole array.
    cc = np.concatenate((cc[cc.shape[0] - max_shift:], cc[:max_shift + 1]))

    # find max cross correlation index
    shift = np.argmax(np.abs(cc)) - max_shift

    tau = shift / float(interp * fs)

    return tau, cc

In [4]:
def get_data_split(participants, rooms, walls, trials, distances, polars, utterances, data_dir=None):
    '''
    Enumerate every recording file of a data split.

    Each argument is an iterable of values for one experimental factor; the
    split is the full cross product of those factors with 8 direction
    indices (0-7, written into the file name as multiples of 45) and 4
    file indices (1-4).

    distances and polars hold 2-element tuples: element 0 of each is
    concatenated into the first part of the position folder name and
    element 1 of each is used after it.

    data_dir: root folder of the raw recordings. Defaults to the location
              this notebook was originally run from.

    Returns (recording_paths, path_class): two parallel lists, where
    path_class[k] is the tuple (p, r, w, t, d, po, u, dov, i) that
    produced recording_paths[k].
    '''
    if data_dir is None:
        data_dir = os.path.join(('E:/python_project/dov-audio-filter/'), 'data/raw/')

    # Enumerate the factor combinations once so that paths and labels can
    # never drift out of step with each other.
    path_class = [(p, r, w, t, d, po, u, dov, i)
                  for p in participants
                  for r in rooms
                  for w in walls
                  for t in trials
                  for d in distances
                  for po in polars
                  for u in utterances
                  for dov in range(0, 8)
                  for i in range(1, 5)
                  ]

    recording_paths = [os.path.join(data_dir,
                                    p,
                                    f'{p}_{r}_{w}_{t}',
                                    f'{d[0]}{po[0]}_{d[1]}_{po[1]}',
                                    f'recording{u}_{dov*45}_{i}.wav')
                       for (p, r, w, t, d, po, u, dov, i) in path_class]

    return (recording_paths, path_class)

In [5]:
# Output columns: 8 metadata fields, then for each of the 6 channel pairs
# (i < j) three summary features and 23 raw cross-correlation values.
col_names = ['participants', 'rooms', 'walls', 'trials', 'distances', 'polars', 'utterances', 'dov',
             *[f'gccphat_{i}_{j}_{d}' for i in range(4) for j in range(i+1, 4) for d in ['maxshift', 'auc', 'peakval']],
             *[f'gccphatval_{i}_{j}_{k}' for i in range(4) for j in range(i+1, 4) for k in range(23)]]

def get_featurized_data(recording_paths, path_class):
    '''
    Compute GCC-PHAT features for every group of four consecutive recordings.

    recording_paths: list of audio file paths; each consecutive run of four
                     entries is loaded together and treated as four channels
    path_class: list parallel to recording_paths; the tuple of the first
                path in each group supplies the metadata columns

    Returns a DataFrame with the columns listed in col_names, one row per
    group of four paths. A trailing incomplete group is ignored.
    '''
    # Rows are collected in a list and the frame is built once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for num in tqdm.tqdm(range(len(recording_paths) // 4)):
        start = num * 4
        channel_recordings = recording_paths[start:start + 4]
        audio_files = [Audio.fromfile(r) for r in channel_recordings]
        four_channels = [a.data for a in audio_files]
        meta = path_class[start]
        data_row = {
            'participants': meta[0],
            'rooms': meta[1],
            'walls': meta[2],
            'trials': meta[3],
            'distances': meta[4][1],
            'polars': meta[5][1],
            'utterances': meta[6],
            'dov': meta[7]*45
        }
        for i in range(4):
            for j in range(i+1, 4):
                # The sample rate of the first file is used for every pair.
                tau, cc = gcc_phat(four_channels[i], four_channels[j],
                                   fs=audio_files[0].rate, max_tau=0.236 * 1e-3, interp=1)
                # NOTE(review): index 11 is the middle (zero-lag) element only
                # when cc has 23 entries, i.e. when int(fs * max_tau) == 11;
                # confirm all recordings share a sample rate for which this
                # holds. It is stored as 'peakval' although it is not
                # necessarily the maximum of cc.
                data_row[f'gccphat_{i}_{j}_peakval'] = cc[11]
                data_row[f'gccphat_{i}_{j}_auc'] = np.sum(cc)
                data_row[f'gccphat_{i}_{j}_maxshift'] = tau
                for k in range(23):
                    data_row[f'gccphatval_{i}_{j}_{k}'] = cc[k]
        rows.append(data_row)

    return pd.DataFrame(rows, columns=col_names)

In [6]:
# Experimental factors; get_data_split takes the cross product of these lists.
participants = [f's{idx}' for idx in range(1, 11)]
rooms = ['downstairs', 'upstairs']
walls = ['nowall', 'wall']
trials = ['trial1', 'trial2']
# 2-element tuples: both parts end up in the position folder name
# built by get_data_split.
distances = [('A', '1'), ('B', '3'), ('C', '5')]
polars = [('0', '0'), ('1', '45'), ('2', '90')]

# The two splits differ only in which utterance id they use.
train_utterances = ['0']
test_utterances = ['1']

train_paths, train_class = get_data_split(
    participants=participants,
    rooms=rooms,
    walls=walls,
    trials=trials,
    distances=distances,
    polars=polars,
    utterances=train_utterances,
)
test_paths, test_class = get_data_split(
    participants=participants,
    rooms=rooms,
    walls=walls,
    trials=trials,
    distances=distances,
    polars=polars,
    utterances=test_utterances,
)

In [8]:
# Display the training recording paths (the notebook output follows).
train_paths

['E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\A0_1_0\\recording0_0_1.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\A0_1_0\\recording0_0_2.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\A0_1_0\\recording0_0_3.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\A0_1_0\\recording0_0_4.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\A0_1_0\\recording0_45_1.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\A0_1_0\\recording0_45_2.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\A0_1_0\\recording0_45_3.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\A0_1_0\\recording0_45_4.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\A0_1_0\\recording0_90_1.wav',
 'E:/python_project/dov

In [7]:
from fft_hlbr import cal_hlbr

In [9]:
# Run cal_hlbr (imported from fft_hlbr) on the first four training paths.
# NOTE(review): presumably these are the four channel files of a single
# recording -- confirm against cal_hlbr's contract.
a = cal_hlbr(train_paths[:4])

In [10]:
# Show the values cal_hlbr returned for those four paths.
a

[-1.9317709654326936,
 -2.0433105179670186,
 -1.6307046245976053,
 -1.6591618888561692]

In [7]:
# Number of recording paths in the test split.
len(test_paths)

23040

In [8]:
# Build the feature tables for both splits.
# NOTE: in the recorded output below the first run completed, but the
# second was stopped by a KeyboardInterrupt, so test_df was not produced
# in that session.
train_df = get_featurized_data(train_paths,train_class)
test_df = get_featurized_data(test_paths,test_class)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5760/5760 [18:21<00:00,  5.23it/s]
  7%|█████████████▊                                                                                                                                                                               | 420/5760 [07:42<1:38:02,  1.10s/it]


KeyboardInterrupt: 

In [138]:
# Display the feature table.
# NOTE(review): `df` is not assigned in any visible cell -- confirm which
# frame it refers to.
df

Unnamed: 0,participants,rooms,walls,trials,distances,polars,utterances,dov,gccphat_0_1_maxshift,gccphat_0_1_auc,...,gccphatval_2_3_13,gccphatval_2_3_14,gccphatval_2_3_15,gccphatval_2_3_16,gccphatval_2_3_17,gccphatval_2_3_18,gccphatval_2_3_19,gccphatval_2_3_20,gccphatval_2_3_21,gccphatval_2_3_22
0,s1,downstairs,nowall,trial1,1,0,0,0,0.0,0.929901,...,0.000874,0.057902,0.000011,0.001541,-0.052348,0.000403,0.000849,0.023651,0.000048,0.001163
1,s1,downstairs,nowall,trial1,1,0,0,45,0.0,0.935565,...,0.000171,-0.050401,0.000652,0.000966,0.017136,0.000918,-0.000259,-0.035492,0.000443,0.000769
2,s1,downstairs,nowall,trial1,1,0,0,90,0.0,0.977008,...,-0.000743,-0.007790,-0.000195,0.001031,0.055236,0.002341,-0.001265,-0.038105,-0.000551,0.000891
3,s1,downstairs,nowall,trial1,1,0,0,135,0.0,0.909018,...,-0.001413,0.076345,-0.000457,0.001607,0.070554,0.003886,-0.000515,-0.046689,-0.001151,0.001109
4,s1,downstairs,nowall,trial1,1,0,0,180,0.0,0.897188,...,-0.000334,0.157349,0.002033,0.001508,0.063899,-0.000289,0.000181,-0.036023,0.000971,0.000908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,s1,downstairs,nowall,trial2,3,0,0,135,0.0,0.881426,...,-0.002377,0.098851,-0.000723,0.000069,0.122165,0.002764,-0.000752,-0.025217,0.000042,0.001375
196,s1,downstairs,nowall,trial2,3,0,0,180,0.0,0.870925,...,-0.000514,0.164111,0.000148,0.002987,-0.007813,0.001845,0.000172,0.007497,0.000091,0.001946
197,s1,downstairs,nowall,trial2,3,0,0,225,0.0,0.881492,...,-0.000612,0.230916,-0.000383,0.002959,0.075322,0.003034,-0.000470,-0.064106,-0.001116,0.001883
198,s1,downstairs,nowall,trial2,3,0,0,270,0.0,0.912883,...,0.000266,0.228883,-0.000124,0.000602,0.091180,0.001719,0.000700,-0.020110,-0.000277,0.000796


In [141]:
# Make sure the output folder exists first: to_csv does not create
# missing directories.
os.makedirs('data/featurized', exist_ok=True)
# NOTE(review): `df` is not assigned in any visible cell -- confirm it is
# the test-split feature table before writing it out as test.csv.
df.to_csv('data/featurized/test.csv', index=False)

In [145]:
# Inspect the paths from index 143*4 up to 800.
# NOTE(review): here `a` holds recording paths, unlike the cal_hlbr result
# bound to `a` earlier -- it was rebound in a cell that is not shown.
a[143*4:800]

['E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\C2_5_90\\recording1_315_1.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\C2_5_90\\recording1_315_2.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\C2_5_90\\recording1_315_3.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial1\\C2_5_90\\recording1_315_4.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial2\\A0_1_0\\recording0_0_1.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial2\\A0_1_0\\recording0_0_2.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial2\\A0_1_0\\recording0_0_3.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial2\\A0_1_0\\recording0_0_4.wav',
 'E:/python_project/dov-audio-filter/data/raw/s1\\s1_downstairs_nowall_trial2\\A0_1_0\\recording0_45_1.wav',
 'E:/python_pro

In [117]:
# Show the last entries of a[0].
# NOTE(review): in this cell `a` appears to be a (paths, ...) pair from an
# earlier session with a different folder naming scheme -- the cell that
# assigned it is not shown.
a[0][9212:]

['E:/python_project/dov-audio-filter/data/raw/s10\\s10_upstairs_wall_2\\c2_3_90\\recording2_315_1.wav',
 'E:/python_project/dov-audio-filter/data/raw/s10\\s10_upstairs_wall_2\\c2_3_90\\recording2_315_2.wav',
 'E:/python_project/dov-audio-filter/data/raw/s10\\s10_upstairs_wall_2\\c2_3_90\\recording2_315_3.wav',
 'E:/python_project/dov-audio-filter/data/raw/s10\\s10_upstairs_wall_2\\c2_3_90\\recording2_315_4.wav']

In [116]:
# Number of entries in a[0].
len(a[0])

9216