# Multichannel audio source separation by Consistent-ILRMA

In [None]:
%%shell
# Clone the example repository only when it is not already present:
# `git clone` exits with an error if the destination directory exists,
# which would make re-running this cell fail.
[ -d audio_source_separation ] || git clone https://github.com/tky823/audio_source_separation.git

In [None]:
# Switch to the ILRMA example directory of the cloned repository.
# NOTE(review): the absolute path assumes the clone lives under /content
# (e.g. Google Colab) - adjust when running elsewhere.
%cd "/content/audio_source_separation/egs/bss-example/ilrma"

## 1\. Data preparation
Create multichannel mixtures using speech from the [CMU ARCTIC database](http://www.festvox.org/cmu_arctic/) and impulse responses from the [Multi-Channel Impulse Response Database](https://www.iks.rwth-aachen.de/en/research/tools-downloads/databases/multi-channel-impulse-response-database/).

In [None]:
%%shell
# Source the example's preparation script from the current directory.
# It is expected to create the ./data tree read by later cells
# (script not shown here - confirm).
. ./prepare.sh

In [None]:
import sys
# Make the repository's src/ directory importable; presumably it provides the
# `algorithm` and `bss` packages imported below. The relative path only
# resolves when the working directory is the example directory.
sys.path.append("../../../src")

In [None]:
import numpy as np
import scipy.signal as ss
import soundfile as sf
import IPython.display as ipd
import matplotlib.pyplot as plt

In [None]:
from algorithm.stft import stft, istft
from bss.ilrma import ConsistentGaussILRMA

In [None]:
# Render all following figures at 200 dpi.
plt.rcParams['figure.dpi'] = 200

## 2\. Effect of consistency

### 2.1 Example of artificial spectrogram

In [None]:
# STFT settings for the synthetic example, in samples.
fft_size = 1024
hop_size = 256
# Size of the toy spectrogram: fft_size//2 + 1 frequency bins by 128 frames.
n_bins = fft_size // 2 + 1
n_frames = 128

In [None]:
# Region shown in the plots: a band around the middle of both axes
# (bins 15/32..17/32 and frames 3/8..5/8 of the full range, end exclusive).
start_bin = 15 * n_bins // 32
end_bin = 17 * n_bins // 32 + 1
start_frame = 3 * n_frames // 8
end_frame = 5 * n_frames // 8 + 1

In [None]:
# Seed NumPy's global RNG so the random phase drawn in the next cell is reproducible.
np.random.seed(111)

In [None]:
# Build a spectrogram that is zero everywhere except for a single
# unit-magnitude bin with a random phase at the centre of the plane.
# `np.complex` was a deprecated alias of the builtin `complex` and was removed
# in NumPy 1.24; `np.complex128` is the equivalent explicit dtype.
Y = np.zeros((n_bins, n_frames), dtype=np.complex128)
phase = 2 * np.pi * np.random.rand()
Y[n_bins//2, n_frames//2] = np.cos(phase) + 1j*np.sin(phase)
# Round trip iSTFT -> STFT with the same parameters; Y_hat is the spectrogram
# obtained by re-analysing the resynthesised signal.
y = istft(Y, fft_size=fft_size, hop_size=hop_size)
Y_hat = stft(y, fft_size=fft_size, hop_size=hop_size)

In [None]:
# Power spectrograms in dB for the original and the round-tripped spectrogram.
# Powers are floored at 1e-12 before the log, so empty bins map to -120 dB
# instead of -inf.
power = np.maximum(np.square(np.abs(Y)), 1e-12)
log_power = 10 * np.log10(power)
power_hat = np.maximum(np.square(np.abs(Y_hat)), 1e-12)
log_power_hat = 10 * np.log10(power_hat)

In [None]:
# Side-by-side view of the original spectrogram and its STFT(iSTFT(.)) round
# trip, restricted to the display window and capped at 0 dB.
frame_axis = np.arange(start_frame, end_frame)
bin_axis = np.arange(start_bin, end_bin)
panels = [
    (log_power, r'Spectrogram $|Y|^{2}$'),
    (log_power_hat, r'Spectrogram $|\mathrm{STFT}(\mathrm{iSTFT}(Y))|^{2}$'),
]
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, (spec_db, title) in zip(axes, panels):
    ax.pcolormesh(frame_axis, bin_axis, spec_db[start_bin:end_bin, start_frame:end_frame], vmax=0)
    ax.set_title(title)
    ax.axis("off")
plt.show()

### 2.2 Example of speech signal
Random dropout: about one in five time-frequency bins is set to zero at random.

In [None]:
# STFT settings for the speech example, in samples.
fft_size = 4096
hop_size = 2048
n_bins = fft_size // 2 + 1
# Only the lowest 256 bins are plotted.
start_bin = 0
end_bin = 256

In [None]:
# Load the speech example (samples and sampling rate) and keep its length in
# samples so the resynthesised signal can be matched to it later.
wav_path = "./data/cmu_us_aew_arctic/wav/arctic_a0001.wav"
y, sr = sf.read(wav_path)
T = y.shape[0]

In [None]:
# Seed NumPy's global RNG so the random mask drawn in the next cell is reproducible.
np.random.seed(111)

In [None]:
# Analyse the speech signal and read the frame count from the result.
Y = stft(y, fft_size=fft_size, hop_size=hop_size)
_, n_frames = Y.shape

# Binary keep/drop mask: integers drawn from {0, ..., 4} are clipped to at
# most 1, so a bin is zeroed with probability 1/5 and kept otherwise.
mask = np.minimum(np.random.randint(0, 5, size=(n_bins, n_frames)), 1)
Y = mask * Y

In [None]:
# Resynthesise the masked spectrogram (output length fixed to T) and analyse
# it again with the same STFT settings.
stft_kwargs = dict(fft_size=fft_size, hop_size=hop_size)
y = istft(Y, length=T, **stft_kwargs)
Y_hat = stft(y, **stft_kwargs)

In [None]:
# dB power of the masked spectrogram and of its round-tripped version;
# the 1e-12 floor keeps log10 finite for the zeroed bins.
magnitude, magnitude_hat = np.abs(Y), np.abs(Y_hat)
power = np.maximum(magnitude**2, 1e-12)
power_hat = np.maximum(magnitude_hat**2, 1e-12)
log_power = 10 * np.log10(power)
log_power_hat = 10 * np.log10(power_hat)

In [None]:
# Compare the masked spectrogram with its STFT(iSTFT(.)) round trip over all
# frames and the selected low-frequency bins.
frame_axis = np.arange(n_frames)
bin_axis = np.arange(start_bin, end_bin)
titles = (
    r'Spectrogram $|Y|^{2}$',
    r'Spectrogram $|\mathrm{STFT}(\mathrm{iSTFT}(Y))|^{2}$',
)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
for ax, spec_db, title in zip(axes, (log_power, log_power_hat), titles):
    ax.pcolormesh(frame_axis, bin_axis, spec_db[start_bin:end_bin])
    ax.set_title(title)
    ax.axis("off")
plt.show()

## 3\. Multichannel audio source separation

Configuration of STFT
- The reverberation time is $T_{60}=160$ [ms] in the impulse response.
- The window length is $4096$ samples (= $256$ [ms]), longer than $T_{60}$, because the rank-1 spatial model assumes that the reverberation fits within a single window.
- The hop length is half of the window length, i.e. $2048$ samples (= $128$ [ms]).

In [None]:
# STFT window length and hop length in samples, as listed in the configuration notes above.
fft_size, hop_size = 4096, 2048

### 3.1 2 speakers

In [None]:
# Microphone 3: the two speakers' convolved recordings are summed to form the mixture.
# NOTE(review): assumes the files are mono and of equal length - confirm against prepare.sh.
aew_mic3, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic3.wav")
axb_mic3, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic3.wav")
x_mic3 = aew_mic3 + axb_mic3

# Microphone 4: same two speakers.
aew_mic4, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic4.wav")
axb_mic4, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic4.wav")
x_mic4 = aew_mic4 + axb_mic4

# Stack the microphone signals row-wise: x is (n_channels, T).
x = np.vstack([x_mic3, x_mic4])
n_channels, T = x.shape
# As many sources as microphones are assumed.
n_sources = n_channels

#### Target sources after convolution of impulse response

In [None]:
# Audio players for both reference signals as recorded at microphone 3.
for target in (aew_mic3, axb_mic3):
    display(ipd.Audio(target, rate=sr))

#### Mixture

In [None]:
# One audio player per microphone channel of the mixture.
for channel in x:
    display(ipd.Audio(channel, rate=sr))

#### Execution of ILRMA

In [None]:
# STFT of the multichannel mixture. scipy's `noverlap` is the number of
# overlapping samples between segments, hence window length minus hop.
_, _, X = ss.stft(x, nperseg=fft_size, noverlap=fft_size-hop_size)

In [None]:
# Reseed NumPy's global RNG for reproducibility (presumably the separator
# draws its initial parameters from it - confirm in bss.ilrma), then create
# the separator with n_basis=2 and the same STFT settings as X.
np.random.seed(111)
ilrma = ConsistentGaussILRMA(n_basis=2, fft_size=fft_size, hop_size=hop_size)

In [None]:
# Show the separator's string representation.
print(ilrma)

In [None]:
# Run the separator on the mixture STFT for 100 iterations; Y is its output
# (converted back to the time domain and played as separated sources below).
Y = ilrma(X, iteration=100)

In [None]:
# Back to the time domain with the same STFT settings as the analysis, then
# keep only the first T samples so the outputs match the mixture length.
_, y_full = ss.istft(Y, nperseg=fft_size, noverlap=fft_size-hop_size)
y = y_full[:, :T]

#### Separated sources

In [None]:
# One audio player per separated source. The rows of y are source estimates,
# so iterate over n_sources (as the 3-speaker section does) rather than over
# the microphone count; the two are equal in this determined setting.
for idx in range(n_sources):
    display(ipd.Audio(y[idx], rate=sr))

In [None]:
# Loss recorded by the separator, plotted against the iteration index.
fig, ax = plt.subplots()
ax.plot(ilrma.loss, color='black')
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
plt.show()

### 3.2 3 speakers

In [None]:
# Microphone 2: the three speakers' convolved recordings are summed to form the mixture.
# NOTE(review): assumes the files are mono and of equal length - confirm against prepare.sh.
aew_mic2, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic2.wav")
axb_mic2, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic2.wav")
bdl_mic2, sr = sf.read("./data/cmu_us_bdl_arctic/trimmed/convolved-16000_deg330-mic2.wav")
x_mic2 = aew_mic2 + axb_mic2 + bdl_mic2

# Microphone 4: same three speakers.
aew_mic4, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic4.wav")
axb_mic4, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic4.wav")
bdl_mic4, sr = sf.read("./data/cmu_us_bdl_arctic/trimmed/convolved-16000_deg330-mic4.wav")
x_mic4 = aew_mic4 + axb_mic4 + bdl_mic4

# Microphone 5: same three speakers.
aew_mic5, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic5.wav")
axb_mic5, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic5.wav")
bdl_mic5, sr = sf.read("./data/cmu_us_bdl_arctic/trimmed/convolved-16000_deg330-mic5.wav")
x_mic5 = aew_mic5 + axb_mic5 + bdl_mic5

# Stack the microphone signals row-wise: x is (n_channels, T).
x = np.vstack([x_mic2, x_mic4, x_mic5])
n_channels, T = x.shape
# As many sources as microphones are assumed.
n_sources = n_channels

#### Target sources after convolution of impulse response

In [None]:
# Audio players for the three reference signals as recorded at microphone 2.
for target in (aew_mic2, axb_mic2, bdl_mic2):
    display(ipd.Audio(target, rate=sr))

#### Mixture

In [None]:
# One audio player per microphone channel of the mixture.
for channel in x:
    display(ipd.Audio(channel, rate=sr))

#### Execution of ILRMA

In [None]:
# STFT of the multichannel mixture. scipy's `noverlap` is the number of
# overlapping samples, i.e. window length minus hop. Passing the hop itself
# is only correct when the hop happens to be half the window, so the overlap
# is computed explicitly.
_, _, X = ss.stft(x, nperseg=fft_size, noverlap=fft_size-hop_size)

In [None]:
# Reseed NumPy's global RNG for reproducibility (presumably the separator
# draws its initial parameters from it - confirm in bss.ilrma), then create
# the separator with n_basis=2 and the same STFT settings as X.
np.random.seed(111)
ilrma = ConsistentGaussILRMA(n_basis=2, fft_size=fft_size, hop_size=hop_size)

In [None]:
# Show the separator's string representation.
print(ilrma)

In [None]:
# Run the separator on the mixture STFT for 100 iterations; Y is its output
# (converted back to the time domain and played as separated sources below).
Y = ilrma(X, iteration=100)

In [None]:
# Back to the time domain. scipy's `noverlap` is the number of overlapping
# samples (window length minus hop); passing the hop itself is only correct
# when the hop is exactly half the window, so the overlap is computed
# explicitly.
_, y = ss.istft(Y, nperseg=fft_size, noverlap=fft_size-hop_size)
# Keep only the first T samples so the outputs match the mixture length.
y = y[:,:T]

#### Separated sources

In [None]:
# One audio player per separated source estimate.
for source_idx in range(n_sources):
    separated = y[source_idx]
    display(ipd.Audio(separated, rate=sr))

In [None]:
# Loss recorded by the separator, plotted against the iteration index.
fig, ax = plt.subplots()
ax.plot(ilrma.loss, color='black')
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
plt.show()