# Multi source classification

In [99]:
from itertools import combinations
import math
import pyroomacoustics as pra
from pyroomacoustics.utilities import normalize
from pyroomacoustics.transform import stft
from collections import defaultdict
from itertools import combinations

import numpy as np
from scipy.io import wavfile
from scipy import signal
import pandas as pd
import os
import sys

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams
from matplotlib import rc
from pandas.plotting import register_matplotlib_converters

import tensorflow as tf
from keras import Sequential
from keras.models import Model
from keras.layers import Input
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Conv1D
from keras.layers import MaxPooling1D

In [18]:
# Label resolution of classification
RESOLUTION = 10

# Number of samples to include while creating one ML feature
SAMPLES = 2048

# Determines the overlap of samples between consecutive features
STEP = 1024

# Training rooms dimensions
ROOMS = {
    'small' : np.array([4, 4, 3]),
    'medium' : np.array([6, 6, 3]),
    'large' : np.array([8, 8, 3])
}

# Testing rooms dimensions
TEST_ROOMS = {
    'small' : np.array([5, 5, 2]),
    'medium' : np.array([7, 7, 2]),
    'large' : np.array([9, 9, 2])
}

AUDIO_PATH = '../training_data/audio/multi_source'

# Number of microphones on the array
MICS_NUMBER = 6

MIC_COMBS = len(list(combinations(range(MICS_NUMBER), 2)))

In [19]:
def create_simulation_room(room_dim=[4, 4, 3], mic_pos=[2, 2, 1.5], room_fs=16000):
    """
    Utility function to create a shoebox room,
    with a microphone array included.
    """
    
    # Initialize room
    room = pra.ShoeBox(room_dim, fs=room_fs)

    # Declare microphone array position
    mic_center = mic_pos[:2]
    mic_height = mic_pos[-1]
    
    # Radius constant, will always be the same for MiniDSP array
    mic_radius = 0.045

    # Generate the microphone array
    mic_array_2D = pra.circular_2D_array(center=mic_center, M=6, phi0=0, radius=mic_radius)
    mic_array_3D = np.vstack((mic_array_2D, [mic_height] * 6))
    
    # Add the microphone array to room
    room.add_microphone_array(mic_array_3D)
    
    return room

def create_sound_sources(room_dim=[4, 4, 3], resolution=1):
    """
    Function that creates a list of all audio source,
    that will be placed inside the room for simulation.
    """
    
    wall_len = room_dim[0] / 2
    
    #Specify angle in distance ranges
    angle_range = range(0, 360, resolution)
    R = 1
   
    sources = []
    for angle in angle_range:
        source = [R * math.cos(math.radians(angle)) + wall_len, 
                  R * math.sin(math.radians(angle)) + wall_len, room_dim[-1] / 2]
        sources.append((source, angle))

    return sources

def simulate_room(audio_file, subset, room_type='small', mic_pos=[2, 2, 1.5]):
    """
    Function to simulate room with specified dimensions and microphone array position.
    Generates CSV files for all angle/distance combinations.
    
    Returns: a dictionary with all dataframes that were created.
    Keys are in a form of (angle, distance) tuples
    """
    
    file_name = os.path.basename(audio_file).split('.')[0]
    
    # Read the audio file
    fs, audio = wavfile.read(audio_file)
    
    # Get room dimensions
    if subset == 'train':
        room_dim = ROOMS[room_type]
    else:
        room_dim = TEST_ROOMS[room_type]
    
    # Create all sound sources
    sources = create_sound_sources(room_dim, 10)
    wav_signals = defaultdict(list)
    
    combs = len(list(combinations(sources, r=2)))
    i = 1
    
    for (source1, angle1), (source2, angle2) in combinations(sources, r=2):
        print(f'Simulating combination {i}/{combs}', end='\r')
        room = create_simulation_room(room_dim=room_dim, mic_pos=mic_pos, room_fs=fs)
        room.add_source(source1, signal=audio, delay=0.0)
        room.add_source(source2, signal=audio, delay=0.0)
        room.simulate()

        # Extract simulated signal for each microphone
        data = room.mic_array.signals.T
        data = np.array(normalize(data, bits=16), dtype=np.int16)
        
        save_name = f'{subset}_angles_{angle1}_{angle2}_room_{room_type}_{file_name}.wav'
        wavfile.write(f'{AUDIO_PATH}/{save_name}', fs, np.array(data))
        
        i += 1
        
    for source, angle in sources:
        print(f'Simulating single source {i}/{len(sources)}', end='\r')
        room = create_simulation_room(room_dim=room_dim, mic_pos=mic_pos, room_fs=fs)
        room.add_source(source, signal=audio, delay=0.0)
        room.simulate()
        data = room.mic_array.signals.T
        data = np.array(normalize(data, bits=16), dtype=np.int16)
        
        save_name = f'{subset}_angle_{angle}_room_{room_type}_{file_name}.wav'
        wavfile.write(f'{AUDIO_PATH}/{save_name}', fs, np.array(data))
            
    print('Simulation successful!')

In [20]:
# Simulate room for every audio file in the directory
training_sounds = os.listdir('../sounds/training')
testing_sounds = os.listdir('../sounds/testing')

print('Simulating training data:\n')
for sound in training_sounds:
    for room, dim in ROOMS.items():
        # Call the function above and store all results
        print('Audio file: ' + sound)
        print(f'Room: {room}')
        simulate_room(f'../sounds/training/{sound}', 'train', room, dim / 2)
        
print('\nSimulating testing data:\n')
for sound in testing_sounds:
    for room, dim in TEST_ROOMS.items():
        # Call the function above and store all results
        print('Audio file: ' + sound)
        print(f'Room: {room}')
        simulate_room(f'../sounds/testing/{sound}', 'test', room, dim / 2)

Simulating training data:

Audio file: CantinaBand3.wav
Room: small
Simulation successful!ce 631/36
Audio file: CantinaBand3.wav
Room: medium
Simulation successful!ce 631/36
Audio file: CantinaBand3.wav
Room: large
Simulation successful!ce 631/36

Simulating testing data:

Audio file: StarWars1.wav
Room: small
Simulation successful!ce 631/36
Audio file: StarWars1.wav
Room: medium
Simulation successful!ce 631/36
Audio file: StarWars1.wav
Room: large
Simulation successful!ce 631/36


In [132]:
def gcc_phat(x_1, x_2, FS=16000, interp=1):
    """
    Function that will compute the GCC-PHAT
    cross-correlation of two separate audio channels
    
    Returns:
        A 1-D GCC vector
    """
    
    n = len(x_1) + len(x_2) - 1
    n += 1 if n % 2 else 0
    
    # Fourier transforms of the two signals
    X_1 = np.fft.rfft(x_1, n=n)
    X_2 = np.fft.rfft(x_2, n=n)
    
     # Normalize by the magnitude of FFT - because PHAT
    np.divide(X_1, np.abs(X_1), X_1, where=np.abs(X_1) != 0)
    np.divide(X_2, np.abs(X_2), X_2, where=np.abs(X_2) != 0)
    
    # GCC-PHAT = [X_1(f)X_2*(f)] / |X_1(f)X_2*(f)|
    # See http://www.xavieranguera.com/phdthesis/node92.html for reference
    CC = X_1 * np.conj(X_2)
    cc = np.fft.irfft(CC, n=n * interp)
        
    # Maximum delay between a pair of microphones,
    # expressed in a number of samples.
    # 0.09 m is the mic array diameter and 
    # 340 m/s is assumed to be the speed of sound.
    max_len = math.ceil(0.09 / 340 * FS * interp)
    
    # Trim the cc vector to only include a 
    # small number of samples around the origin
    cc = np.concatenate((cc[-max_len:], cc[:max_len+1]))
    
    # Return the cross correlation
    return cc


def compute_gcc_matrix(observation, fs):
    """
    Creates a GCC matrix, where each row is a vector of GCC 
    between a given pair of microphones.
    """ 
    
    mic_pairs = combinations(range(MICS_NUMBER), r=2)

    # Initialize a transformed observation, that will be populated with GCC vectors
    # of the observation
    transformed_observation = []

    # Compute GCC for every pair of microphones
    for mic_1, mic_2 in mic_pairs:
        x_1 = observation[:, mic_1]
        x_2 = observation[:, mic_2]

        gcc = gcc_phat(x_1, x_2, FS=fs, interp=1)

        # Add the GCC vector to the GCC matrix
        transformed_observation.append(gcc)    
        
    return transformed_observation


def create_observations(wav_signals, fs, labels, samples=1, step=1, resolution=20, music=False):
    """
    Create list of observations from the pandas dataframe.
    Each observation will be a GCC matrix, where each row 
    is a vector of GCC between a given pair of microphones.
    
    Returns: 
        a tuple of observations and their corresponding labels
    """
    
    # Lists of observations and labels that will be populated
    X = []
    y = []
    
    # Loop through the signal frame and take subframes
    for i in range(0, len(wav_signals) - samples + 1, step):
        y.append(labels)
        
        # Extract the observation from subframe
        observation = np.array(wav_signals[i : i + samples])
        
        if music:
            # Transform observation into a STFT matrix
            transformed_observation = compute_stft_matrix(observation)
        else:
            # Transform observation into a GCC matrix
            transformed_observation = compute_gcc_matrix(observation, fs)
            
        X.append(transformed_observation)

    return X, y


def multi_hot_encode(encoder, y_train, y_test):
    """
    Creates a multi-hot encoding of categorical labels
    provided in y_train and y_test.
    """
    
    # One-hot encode training and testing labels
    enc = encoder.fit(y_train)
    y_train = enc.transform(y_train)
    y_test = enc.transform(y_test)
    
    return y_train, y_test
  
    
def create_whole_dataset(df_train, df_test, encoder, room=None, dist=None):
    """
    Creates an entire dataset by extracting values
    from train and tests dataframes.
    
    One-hot encodes the labels before returning.
    """
    
    # Can filter testing entries to only check performance
    # for given conditions
    if room:
        df_test = df_test[df_test.room == room]
    
    # Create train/test observations
    X_train = df_train.drop(columns=['room', 'label']).values.reshape(
        len(df_train), MIC_COMBS, -1)
    X_test = df_test.drop(columns=['room', 'label']).values.reshape(
        len(df_test), MIC_COMBS, -1)
    
    # Create train/test labels
    y_train, y_test = multi_hot_encode(
        encoder, df_train['label'].values, df_test['label'].values)
    
    return X_train, y_train, X_test, y_test


def create_dataframe(subset, samples=20, step=5, resolution=20, is_info=True):
    """
    Creates a whole dataframe 
    It is achieved by looping through all WAV files in the directory
    and creating observations from each of them. 
    
    These observations are then all concatenated together 
    into one large dataframe
    
    Returns:
        a pandas dataframe containing all data points (without any splits)
    """
    
    dataframes = []
    
    files = [file for file in os.listdir(AUDIO_PATH) if subset in file]

    # Loop through all WAVs
    for i, file in enumerate(files):
        if file[-3:] != 'wav': 
            continue
            
        print(f'{subset} file {i+1}/{len(files)}', end='\r')

        path = os.path.join(AUDIO_PATH, file)
        fs, wav_signals = wavfile.read(path)
        
        labels = (int(file.split('_')[2]), )
        if file.split('_')[1] == 'angles':
            labels = (int(file.split('_')[2]), int(file.split('_')[3]))
            
        # Create observations from a given WAV file
        X_temp, y_temp = create_observations(wav_signals, fs, labels, samples, step, resolution)
        
        cols = [
            f'mics{mic_1+1}{mic_2+1}_{i}' 
                for mic_1, mic_2 in combinations(range(MICS_NUMBER), r=2) 
                    for i in range(np.shape(X_temp)[2])
        ]
        
        df = pd.DataFrame(data=np.reshape(X_temp, (len(X_temp), -1)), columns=cols)
        
        # Add extra info columns
        if is_info:
            room = file.split('_')[5 if file.split('_')[1] == 'angles' else 4]
            df['room'] = room
            
        # Add label column
        df['label'] = y_temp
        dataframes.append(df)
        
    return pd.concat(dataframes, ignore_index=True)

In [133]:
df_train = create_dataframe('train', samples=SAMPLES, step=STEP, resolution=RESOLUTION)
print()
df_test = create_dataframe('test', samples=SAMPLES, step=STEP, resolution=RESOLUTION)
print()

df_train.to_csv('../training_data/multi_train_dataset.csv')
df_test.to_csv('../training_data/multi_test_dataset.csv')

encoder = MultiLabelBinarizer()
X_train, y_train, X_test, y_test = create_whole_dataset(df_train, df_test, encoder)

print(np.shape(X_train), np.shape(X_test), np.shape(y_train), np.shape(y_test))
pd.set_option('display.max_columns', 15)
df_train.head(10)

train file 1998/1998
test file 1998/1998
(127206, 15, 13) (41292, 15, 13) (127206, 36) (41292, 36)


Unnamed: 0,mics12_0,mics12_1,mics12_2,mics12_3,mics12_4,mics12_5,mics12_6,...,mics56_8,mics56_9,mics56_10,mics56_11,mics56_12,room,label
0,0.017008,-0.025037,-0.01462,-0.034485,0.090218,0.341672,0.36176,...,-0.032745,0.26611,0.018237,-0.101055,0.060874,large,"(0, 100)"
1,0.020044,-0.020852,0.022412,-0.073405,0.096545,0.307868,0.413434,...,-0.054997,0.235105,0.008377,-0.108711,0.05765,large,"(0, 100)"
2,0.000733,-0.010926,0.01031,-0.04972,0.070847,0.287535,0.453541,...,-0.045041,0.224659,0.00766,-0.077425,0.033713,large,"(0, 100)"
3,0.033825,-0.034581,-0.02321,-0.029677,0.082875,0.375525,0.201475,...,0.005436,0.343816,0.109243,-0.110293,0.090125,large,"(0, 100)"
4,0.010089,-0.02026,0.002919,-0.062344,0.090493,0.275583,0.476308,...,-0.040501,0.206523,-0.010091,-0.069473,0.067644,large,"(0, 100)"
5,0.040759,-0.051229,-0.012842,-0.04089,0.072353,0.402834,0.185805,...,-0.022587,0.374707,0.119225,-0.128495,0.096771,large,"(0, 100)"
6,0.053734,-0.065775,-0.019492,-0.045982,0.085565,0.418619,0.174143,...,0.016318,0.342236,0.094446,-0.046922,0.068393,large,"(0, 100)"
7,0.029235,-0.040301,-0.023376,-0.046448,0.088075,0.378493,0.180392,...,-0.005233,0.374659,0.112359,-0.113179,0.082425,large,"(0, 100)"
8,0.007024,-0.014353,-0.006071,-0.037153,0.065561,0.330927,0.261098,...,0.041435,0.275205,0.09665,-0.053802,0.065953,large,"(0, 100)"
9,0.053495,-0.043568,-0.036108,-0.025762,0.078903,0.340318,0.23844,...,0.024151,0.305723,0.080028,-0.076471,0.075838,large,"(0, 100)"


Only run this when all the variables are not stored in memory (i.e. after restarting the kernel):

In [None]:
df_train = pd.read_csv('../training_data/multi_train_dataset.csv', index_col=[0])
df_test = pd.read_csv('../training_data/multi_test_dataset.csv', index_col=[0])
encoder = MultiLabelBinarizer()
X_train, y_train, X_test, y_test = create_whole_dataset(df_train, df_test, encoder)
np.shape(X_train), np.shape(X_test), np.shape(y_train), np.shape(y_test)

In [134]:
# Transpose the observations because Conv1D requires timesteps as the 1st dim
if X_train.shape[1] == MIC_COMBS:
    X_train, X_test = np.transpose(X_train, axes=[0, 2, 1]), np.transpose(X_test, axes=[0, 2, 1])
X_train.shape, X_test.shape

((127206, 13, 15), (41292, 13, 15))

In [135]:
# Fit model
epochs, batch_size, verbose = 5, 32, 1

def create_model(X_train, y_train, X_test, y_test):
    n_timesteps, n_features, n_outputs = X_train.shape[1], X_train.shape[2], y_train.shape[1]

    # Init model
    model = Sequential()

    # Add layers
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(n_timesteps,n_features)))
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(Dropout(0.5))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_outputs, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')

    history = model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=verbose)
    
    return model, history

In [136]:
model, history = create_model(X_train, y_train, X_test, y_test)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [137]:
accuracy_score(y_test, model.predict(X_test).round())

0.6033129904097646

In [138]:
def evaluate_for_property(df_train, df_test, model, prop, value):
    """
    Measures the model prediction for test samples
    with a given property, such as room size.
    """
    
    encoder = MultiLabelBinarizer()
    
    # Filter test set by property value
    X_trn, y_trn, X_tst, y_tst = create_whole_dataset(
        df_train, df_test[df_test[prop]==value], encoder
    )
    
    # Evaluate the model on the filtered set
    X_tst = np.transpose(X_tst, axes=[0, 2, 1])
    acc = accuracy_score(y_tst, model.predict(X_tst).round())
    
    return round(acc, 3)

# Evaluate performance for different properties

print('Room sizes')
for room in ROOMS:
    acc = evaluate_for_property(df_train, df_test, model, 'room', room)
    print(f"{room} room accuracy: {acc}")

Room sizes
small room accuracy: 0.619
medium room accuracy: 0.6
large room accuracy: 0.591
