# Multichannel audio source separation by gradient or natural gradient IVA

In [None]:
%%shell
# Fetch the repository; later cells add its `src` directory to sys.path and read
# sample audio from its `dataset` directory via relative paths.
git clone https://github.com/tky823/audio_source_separation.git

In [None]:
# Work from the IVA example directory: every relative path used below
# ("../../../src", "../../../dataset/...", "./prepare.sh", "./data/...") is resolved from here.
# NOTE(review): the absolute "/content" prefix presumably targets Google Colab — adjust elsewhere.
%cd "/content/audio_source_separation/egs/bss-example/iva"

In [None]:
import sys
# Put the repository's `src` directory on the import path; the `bss` package
# imported below is presumably located there.
sys.path.append("../../../src")

In [None]:
import numpy as np
import scipy.signal as ss
import soundfile as sf
import IPython.display as ipd
import matplotlib.pyplot as plt

In [None]:
from bss.iva import GradLaplaceIVA, NaturalGradLaplaceIVA

In [None]:
# Render all subsequent figures at 200 DPI.
plt.rcParams.update({'figure.dpi': 200})

## 1 Music source separation

### Data preparation for music source separation
We already created multichannel mixtures using the impulse responses of [Multi-Channel Impulse Response Database](https://www.iks.rwth-aachen.de/en/research/tools-downloads/databases/multi-channel-impulse-response-database/).
You can find the original sources (piano & violin) and their mixture in `audio_source_separation/dataset/sample-song/`.

### Target sources

In [None]:
# Load the two clean reference recordings (piano and violin).
# `sr` is overwritten by the second read, so both files are assumed to share the
# same sampling rate (the file names suggest 16 kHz — confirm).
source_piano, sr = sf.read("../../../dataset/sample-song/sample-2_piano_16000.wav")
source_violin, sr = sf.read("../../../dataset/sample-song/sample-2_violin_16000.wav")

In [None]:
# Play back each clean reference source, piano first and then violin.
for reference in (source_piano, source_violin):
    display(ipd.Audio(reference, rate=sr))

In [None]:
# Stack the reference sources row-wise (piano first, violin second).
# NOTE(review): `y` is reassigned further down with the separated signals, so this
# reference array is not retained under this name.
y = np.vstack([source_piano, source_violin])

### Mixture

In [None]:
# Read the multichannel mixture and keep a channel-major copy `x` of shape (n_channels, T).
mixture, sr = sf.read("../../../dataset/sample-song/sample-2_mixture_16000.wav")
T, n_channels = mixture.shape
x = np.transpose(mixture)
# Determined setting: separate as many sources as there are channels.
n_sources = n_channels

In [None]:
# Play back every channel of the mixture (one row of `x` per channel).
for channel_signal in x:
    display(ipd.Audio(channel_signal, rate=sr))

Configuration of STFT
- The reverberation time is $T_{60}=160$ [ms] in the impulse response.
- The window length is $4096$ samples (= $256$ [ms]), chosen to be longer than the reverberation time so that the rank-1 spatial model assumption holds.
- The hop length is half of the window length, i.e. $2048$ samples (= $128$ [ms]).

In [None]:
# STFT window length in samples, and a hop of half that window.
fft_size = 4096
hop_size = fft_size // 2

In [None]:
# Forward STFT of every channel; only the spectrogram (third return value) is kept.
# `noverlap` is the number of overlapping samples, i.e. window length minus hop.
X = ss.stft(x, nperseg=fft_size, noverlap=fft_size - hop_size)[2]

### Gradient IVA

#### Execution of IVA

In [None]:
# Fix NumPy's global random seed so repeated runs give the same result
# (presumably the IVA implementation draws from it — confirm in bss.iva).
np.random.seed(111)
# Gradient-descent Laplace IVA with its default settings.
iva = GradLaplaceIVA()

In [None]:
# Show the string representation of the freshly constructed separator.
print(iva)

In [None]:
# Run gradient IVA on the mixture spectrogram X for 10000 iterations.
# Y is inverted with istft below, so it is presumably a separated spectrogram
# laid out like X — confirm against bss.iva.
Y = iva(X, iteration=10000)

In [None]:
# Back to the time domain (istft returns (times, signal); only the signal is kept),
# then truncate to the first T samples so the length matches the input mixture.
y = ss.istft(Y, nperseg=fft_size, noverlap=fft_size - hop_size)[1]
y = y[:, :T]

#### Separated sources

In [None]:
# Play back each separated signal obtained with gradient IVA.
for source_idx in range(n_sources):
    separated_audio = ipd.Audio(y[source_idx], rate=sr)
    display(separated_audio)

In [None]:
# Plot the loss values stored on the separator against the iteration index.
fig, ax = plt.subplots()
ax.plot(iva.loss, color='black')
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
plt.show()

### Natural gradient IVA

#### Execution of IVA

In [None]:
# Re-seed with the same value used for gradient IVA so both methods start from
# the same random state (presumably consumed inside the IVA class — confirm).
np.random.seed(111)
# Natural-gradient Laplace IVA with its default settings; replaces the previous `iva`.
iva = NaturalGradLaplaceIVA()

In [None]:
# Show the string representation of the natural-gradient separator.
print(iva)

In [None]:
# Run natural-gradient IVA on the same mixture spectrogram X, using 500 iterations
# (the gradient IVA run above uses 10000).
Y = iva(X, iteration=500)

#### Separated sources

In [None]:
# Invert the separated spectrogram (istft returns (times, signal); keep the signal)
# and truncate to the first T samples so the length matches the input mixture.
y = ss.istft(Y, nperseg=fft_size, noverlap=fft_size - hop_size)[1]
y = y[:, :T]

In [None]:
# Play back each separated signal obtained with natural-gradient IVA.
for source_idx in range(n_sources):
    separated_audio = ipd.Audio(y[source_idx], rate=sr)
    display(separated_audio)

In [None]:
# Plot the loss values stored on the natural-gradient separator against the iteration index.
fig, ax = plt.subplots()
ax.plot(iva.loss, color='black')
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
plt.show()

## 2 Speech separation

### Data preparation for speech separation
Create multichannel mixtures using audio from the [CMU ARCTIC database](http://www.festvox.org/cmu_arctic/) and impulse responses from the [Multi-Channel Impulse Response Database](https://www.iks.rwth-aachen.de/en/research/tools-downloads/databases/multi-channel-impulse-response-database/).

In [None]:
%%shell
# Source the example's preparation script in the current shell.
# NOTE(review): presumably this creates the ./data/... wav files read below — confirm in prepare.sh.
. ./prepare.sh

Configuration of STFT
- The reverberation time is $T_{60}=160$ [ms] in the impulse response.
- The window length is $4096$ samples (= $256$ [ms]), chosen to be longer than the reverberation time so that the rank-1 spatial model assumption holds.
- The hop length is half of the window length, i.e. $2048$ samples (= $128$ [ms]).

In [None]:
# STFT window length in samples, and a hop of half that window.
fft_size = 4096
hop_size = fft_size // 2

In [None]:
# Load the impulse-response-convolved recordings of the two speakers (aew, axb)
# as captured at microphone 3. `sr` is overwritten by each read, so all files are
# assumed to share one sampling rate (file names suggest 16 kHz — confirm).
aew_mic3, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic3.wav")
axb_mic3, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic3.wav")
# The mixture at a microphone is the sample-wise sum of both speakers' signals there.
x_mic3 = aew_mic3 + axb_mic3

# Same two speakers as captured at microphone 4.
aew_mic4, sr = sf.read("./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic4.wav")
axb_mic4, sr = sf.read("./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic4.wav")
x_mic4 = aew_mic4 + axb_mic4

# One row per microphone: the first axis of `x` counts channels, matching the
# music example, and the determined setting uses as many sources as channels.
x = np.vstack([x_mic3, x_mic4])
n_channels, T = x.shape
n_sources = n_channels

### Target sources after convolution of impulse response

In [None]:
# Play back each speaker's convolved signal as captured at microphone 3.
for target in (aew_mic3, axb_mic3):
    display(ipd.Audio(target, rate=sr))

### Mixture

In [None]:
# Play back every channel of the speech mixture (one row of `x` per microphone).
for channel_signal in x:
    display(ipd.Audio(channel_signal, rate=sr))

### Execution of IVA

In [None]:
# Forward STFT of every channel; only the spectrogram is kept.
# `noverlap` is the number of overlapping samples (window length minus hop), not the
# hop itself. Passing `hop_size` directly only works when the hop is exactly half the
# window, and would silently disagree with the inverse STFT below for any other hop.
_, _, X = ss.stft(x, nperseg=fft_size, noverlap=fft_size-hop_size)

In [None]:
# Fix NumPy's global random seed for a reproducible run
# (presumably the IVA implementation draws from it — confirm in bss.iva).
np.random.seed(111)
# Fresh natural-gradient Laplace IVA instance with default settings for the speech mixture.
iva = NaturalGradLaplaceIVA()

In [None]:
# Show the string representation of the separator used for the speech mixture.
print(iva)

In [None]:
# Run natural-gradient IVA on the speech mixture spectrogram X for 500 iterations.
# Y is inverted with istft below, so it is presumably a separated spectrogram
# laid out like X — confirm against bss.iva.
Y = iva(X, iteration=500)

In [None]:
# Invert the separated spectrogram (istft returns (times, signal); keep the signal)
# using an overlap of window length minus hop, then truncate to the first T samples
# so the length matches the input mixture.
y = ss.istft(Y, nperseg=fft_size, noverlap=fft_size - hop_size)[1]
y = y[:, :T]

### Separated sources

In [None]:
# Play back each separated speech signal.
for source_idx in range(n_sources):
    separated_audio = ipd.Audio(y[source_idx], rate=sr)
    display(separated_audio)

In [None]:
# Plot the loss values stored on the separator against the iteration index.
fig, ax = plt.subplots()
ax.plot(iva.loss, color='black')
ax.set_xlabel('Iteration')
ax.set_ylabel('Loss')
plt.show()