# ILRMAによる多チャネル音源分離

In [None]:
# Fetch the audio_source_separation repository; the following cells change
# into its egs/bss-example/ilrma directory and import code from its src tree.
!git clone https://github.com/tky823/audio_source_separation.git

In [None]:
# Move into the ILRMA example directory of the cloned repository; the relative
# paths used later (./prepare.sh, ./data/..., ../../../src) are resolved from here.
# NOTE(review): the /content prefix presumably assumes Google Colab - adjust
# the path when running elsewhere.
%cd "/content/audio_source_separation/egs/bss-example/ilrma"

## データの準備
[効果音ラボ](https://soundeffect-lab.info/sound/voice/game.html)の音声，および[Multi-Channel Impulse Response Database](https://www.iks.rwth-aachen.de/en/research/tools-downloads/databases/multi-channel-impulse-response-database/)のインパルス応答を用いて，多チャネルの混合音をシミュレーションする．

In [None]:
# Source prepare.sh from the current directory in a shell.
# NOTE(review): presumably this creates the ./data/... wav files that are
# loaded in the cells below - confirm in prepare.sh.
!. ./prepare.sh

In [None]:
import sys
# Put the repository's src directory (relative to this example directory) on
# the module search path before importing from `algorithm.ilrma`.
# NOTE(review): presumably the `algorithm` package lives under src - verify.
sys.path.append("../../../src")

In [None]:
import numpy as np
import scipy.signal as ss
import librosa
import IPython.display as ipd
from algorithm.ilrma import GaussILRMA

窓長などについて
- このノートブックでは，すべてのデータを$16$ kHzにリサンプリングしている．
- $T_{60}=160$ [ms]の残響のインパルス応答を使用する．
- ILRMAは各音源の空間モデルがランク$1$である（時間周波数領域で瞬時混合が成り立つ）ことを仮定するため，フーリエ変換の窓長は残響時間より長い$4096$サンプル（$=256$ [ms]）としている．
- シフト長は，窓長の半分の$2048$サンプルとしている．

In [None]:
# STFT analysis window length in samples (256 ms at the 16 kHz used here).
fft_size = 4096
# Frame shift in samples: half of the window length.
hop_size = 2048

## 2音源分離

In [None]:
# Build a two-channel mixture of two sources (wizard, swordwoman).
# Every file is loaded with sr=None so librosa keeps the file's own sampling
# rate; `sr` is overwritten by each call and ends up as the rate of the last
# file loaded.

# Source images observed at microphone 3.
wizard_mic3, sr = librosa.load(
    "./data/wizard/convolved-16000_deg60-mic3.wav",
    sr=None,
)
swordwoman_mic3, sr = librosa.load(
    "./data/swordwoman/convolved-16000_deg300-mic3.wav",
    sr=None,
)

# Source images observed at microphone 4.
wizard_mic4, sr = librosa.load(
    "./data/wizard/convolved-16000_deg60-mic4.wav",
    sr=None,
)
swordwoman_mic4, sr = librosa.load(
    "./data/swordwoman/convolved-16000_deg300-mic4.wav",
    sr=None,
)

# The observation at each microphone is the sum of the source images.
y_mic3 = wizard_mic3 + swordwoman_mic3
y_mic4 = wizard_mic4 + swordwoman_mic4

# One row per microphone. The row count doubles as the number of sources
# (two microphones for two sources here); T is the length in samples.
y = np.vstack((y_mic3, y_mic4))
n_sources, T = y.shape

### 混合音

In [None]:
# Audio player for row 0 of the mixture (the microphone-3 observation),
# played at the sampling rate returned by librosa.load.
ipd.Audio(y[0], rate=sr)

In [None]:
# Audio player for row 1 of the mixture (the microphone-4 observation).
ipd.Audio(y[1], rate=sr)

### ILRMAの実行

In [None]:
# Short-time Fourier transform of every row (channel) of the mixture; the
# frequency and time axes returned by scipy are discarded.
# scipy's `noverlap` is the number of samples shared by adjacent frames, i.e.
# window length minus hop. Passing hop_size directly is only correct at 50 %
# overlap, so derive the overlap explicitly to stay correct if hop_size changes.
_, _, Y = ss.stft(y, nperseg=fft_size, noverlap=fft_size - hop_size)

In [None]:
# Fix NumPy's global random seed before building the separator.
# NOTE(review): presumably GaussILRMA initialises its parameters from NumPy's
# global RNG, which is what makes this seed matter - confirm in algorithm.ilrma.
np.random.seed(111)
# Gauss-ILRMA separator constructed with n_bases=2.
# NOTE(review): n_bases is presumably the number of NMF bases - confirm.
ilrma = GaussILRMA(n_bases=2)

In [None]:
# Apply the separator to the mixture spectrogram with iteration=200
# (presumably the number of update iterations - confirm in algorithm.ilrma).
# NOTE(review): S_hat is assumed to have the same layout as Y, since it is
# passed straight to ss.istft afterwards - verify.
S_hat = ilrma(Y, iteration=200)

In [None]:
# Inverse STFT of the separated spectrograms back to time-domain signals.
# `noverlap` must be the overlap between frames (window length minus hop),
# matching the forward transform; hop_size itself only equals that at 50 %
# overlap.
_, s = ss.istft(S_hat, nperseg=fft_size, noverlap=fft_size - hop_size)
# Keep at most the first T samples of each signal so the result is no longer
# than the mixture (the forward STFT may have zero-padded the input).
s = s[:, :T]

### 分離音

In [None]:
# Audio player for the first separated signal.
ipd.Audio(s[0], rate=sr)

In [None]:
# Audio player for the second separated signal.
ipd.Audio(s[1], rate=sr)

## 3音源分離

In [None]:
# Build a three-channel mixture of three sources (wizard, swordwoman,
# thief-boy). Every file is loaded with sr=None so librosa keeps the file's own
# sampling rate; `sr` is overwritten by each call and ends up as the rate of
# the last file loaded.

# Source images observed at microphone 2 and their sum.
wizard_mic2, sr = librosa.load("./data/wizard/convolved-16000_deg60-mic2.wav", sr=None)
swordwoman_mic2, sr = librosa.load("./data/swordwoman/convolved-16000_deg300-mic2.wav", sr=None)
thiefboy_mic2, sr = librosa.load("./data/thief-boy/convolved-16000_deg330-mic2.wav", sr=None)
y_mic2 = wizard_mic2 + swordwoman_mic2 + thiefboy_mic2

# Source images observed at microphone 4 and their sum.
wizard_mic4, sr = librosa.load("./data/wizard/convolved-16000_deg60-mic4.wav", sr=None)
swordwoman_mic4, sr = librosa.load("./data/swordwoman/convolved-16000_deg300-mic4.wav", sr=None)
thiefboy_mic4, sr = librosa.load("./data/thief-boy/convolved-16000_deg330-mic4.wav", sr=None)
y_mic4 = wizard_mic4 + swordwoman_mic4 + thiefboy_mic4

# Source images observed at microphone 5 and their sum.
wizard_mic5, sr = librosa.load("./data/wizard/convolved-16000_deg60-mic5.wav", sr=None)
swordwoman_mic5, sr = librosa.load("./data/swordwoman/convolved-16000_deg300-mic5.wav", sr=None)
thiefboy_mic5, sr = librosa.load("./data/thief-boy/convolved-16000_deg330-mic5.wav", sr=None)
y_mic5 = wizard_mic5 + swordwoman_mic5 + thiefboy_mic5

# One row per microphone. The row count doubles as the number of sources
# (three microphones for three sources here); T is the length in samples.
y = np.vstack([y_mic2, y_mic4, y_mic5])
n_sources, T = y.shape

### 混合音

In [None]:
# Audio player for row 0 of the mixture (the microphone-2 observation),
# played at the sampling rate returned by librosa.load.
ipd.Audio(y[0], rate=sr)

In [None]:
# Audio player for row 1 of the mixture (the microphone-4 observation).
ipd.Audio(y[1], rate=sr)

In [None]:
# Audio player for row 2 of the mixture (the microphone-5 observation).
ipd.Audio(y[2], rate=sr)

### ILRMAの実行

In [None]:
# Short-time Fourier transform of every row (channel) of the mixture; the
# frequency and time axes returned by scipy are discarded.
# scipy's `noverlap` is the number of samples shared by adjacent frames, i.e.
# window length minus hop. Passing hop_size directly is only correct at 50 %
# overlap, so derive the overlap explicitly to stay correct if hop_size changes.
_, _, Y = ss.stft(y, nperseg=fft_size, noverlap=fft_size - hop_size)

In [None]:
# Fix NumPy's global random seed before building the separator.
# NOTE(review): presumably GaussILRMA initialises its parameters from NumPy's
# global RNG, which is what makes this seed matter - confirm in algorithm.ilrma.
np.random.seed(111)
# Gauss-ILRMA separator constructed with n_bases=2.
# NOTE(review): n_bases is presumably the number of NMF bases - confirm.
ilrma = GaussILRMA(n_bases=2)

In [None]:
# Apply the separator to the mixture spectrogram with iteration=200
# (presumably the number of update iterations - confirm in algorithm.ilrma).
# NOTE(review): S_hat is assumed to have the same layout as Y, since it is
# passed straight to ss.istft afterwards - verify.
S_hat = ilrma(Y, iteration=200)

In [None]:
# Inverse STFT of the separated spectrograms back to time-domain signals.
# `noverlap` must be the overlap between frames (window length minus hop),
# matching the forward transform; hop_size itself only equals that at 50 %
# overlap.
_, s = ss.istft(S_hat, nperseg=fft_size, noverlap=fft_size - hop_size)
# Keep at most the first T samples of each signal so the result is no longer
# than the mixture (the forward STFT may have zero-padded the input).
s = s[:, :T]

### 分離音

In [None]:
# Audio player for the first separated signal.
ipd.Audio(s[0], rate=sr)

In [None]:
# Audio player for the second separated signal.
ipd.Audio(s[1], rate=sr)

In [None]:
# Audio player for the third separated signal.
ipd.Audio(s[2], rate=sr)