# ILRMAによる多チャネル音源分離

In [1]:
# Fetch the example repository into the current working directory.
# NOTE(review): no branch, tag or commit is given, so this checks out whatever the
# default branch currently points to; later cells may behave differently over time.
!git clone https://github.com/tky823/audio_source_separation.git

Cloning into 'audio_source_separation'...
remote: Enumerating objects: 480, done.
remote: Counting objects: 100% (480/480), done.
remote: Compressing objects: 100% (273/273), done.
remote: Total 480 (delta 178), reused 435 (delta 142), pack-reused 0
Receiving objects: 100% (480/480), 64.62 KiB | 1.50 MiB/s, done.
Resolving deltas: 100% (178/178), done.


In [2]:
# Move into the ILRMA example directory so the relative paths used by later
# cells (./prepare.sh, ./data/..., ../../../src) resolve from here.
# NOTE(review): the absolute /content prefix looks like a Google Colab layout —
# adjust it when running elsewhere.
%cd "/content/audio_source_separation/egs/bss-example/ilrma"

/content/audio_source_separation/egs/bss-example/ilrma


## データの準備
[CMU ARCTICデータベース](http://www.festvox.org/cmu_arctic/)の音声，および[Multi-Channel Impulse Response Database](https://www.iks.rwth-aachen.de/en/research/tools-downloads/databases/multi-channel-impulse-response-database/)のインパルス応答を用いて，多チャネルの混合音をシミュレーションする．

In [5]:
# Source prepare.sh in the shell started for this command. The log below shows it
# downloading CMU ARCTIC speaker archives into data/; presumably it also produces
# the trimmed, impulse-response-convolved wav files loaded later — confirm in the script.
!. ./prepare.sh

--2021-01-01 03:36:35--  http://festvox.org/cmu_arctic/packed/cmu_us_aew_arctic.tar.bz2
Resolving festvox.org (festvox.org)... 199.4.150.154
Connecting to festvox.org (festvox.org)|199.4.150.154|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 93295070 (89M) [application/x-bzip2]
Saving to: ‘data/cmu_us_aew_arctic.tar.bz2’


2021-01-01 03:36:37 (37.4 MB/s) - ‘data/cmu_us_aew_arctic.tar.bz2’ saved [93295070/93295070]

--2021-01-01 03:36:37--  http://festvox.org/cmu_arctic/packed/cmu_us_axb_arctic.tar.bz2
Resolving festvox.org (festvox.org)... 199.4.150.154
Connecting to festvox.org (festvox.org)|199.4.150.154|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 49831327 (48M) [application/x-bzip2]
Saving to: ‘data/cmu_us_axb_arctic.tar.bz2’


2021-01-01 03:36:40 (22.3 MB/s) - ‘data/cmu_us_axb_arctic.tar.bz2’ saved [49831327/49831327]

--2021-01-01 03:36:40--  http://festvox.org/cmu_arctic/packed/cmu_us_bdl_arctic.tar.bz2
Resolving festvox.org 

In [4]:
import sys

# Make the repository's src/ directory importable; presumably this is where the
# `algorithm` package imported below lives. The path is relative to the working
# directory, so it relies on the earlier %cd into egs/bss-example/ilrma.
SRC_DIR = "../../../src"
# Guard the append so re-running this cell does not pile up duplicate entries.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [6]:
import numpy as np
import scipy.signal as ss
import librosa
import IPython.display as ipd
from algorithm.ilrma import GaussILRMA

窓長などについて
- $T_{60}=160$ [ms]の残響のインパルス応答を使用する．
- 各音源の空間モデルがランク$1$である（混合系が周波数ごとの瞬時混合で近似できる）という仮定を満たすため，フーリエ変換の窓長は残響時間より長い$4096$サンプル（サンプリング周波数$16$ [kHz]で$256$ [ms]）としている．
- シフト長は，窓長の半分の$2048$サンプルとしている．

In [7]:
# STFT framing, in samples: the window length, and a hop of half a window.
fft_size = 4096
hop_size = fft_size // 2

## 2音源分離

In [9]:
# Microphone 3: load each speaker's reverberant recording (sr=None keeps the
# file's own sampling rate) and sum them into the observed mixture.
aew_mic3, sr = librosa.load(
    "./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic3.wav",
    sr=None,
)
axb_mic3, sr = librosa.load(
    "./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic3.wav",
    sr=None,
)
y_mic3 = aew_mic3 + axb_mic3

# Microphone 4: same two speakers, same procedure.
aew_mic4, sr = librosa.load(
    "./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic4.wav",
    sr=None,
)
axb_mic4, sr = librosa.load(
    "./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic4.wav",
    sr=None,
)
y_mic4 = aew_mic4 + axb_mic4

# One row per microphone. `sr` holds the rate returned by the last load.
y = np.stack((y_mic3, y_mic4), axis=0)
# The row count is the number of microphones; it is called n_sources because
# this example uses as many microphones as speakers. T is the length in samples.
n_sources = y.shape[0]
T = y.shape[1]

### 混合音

In [10]:
# Listen to the mixture observed at microphone 3 (row 0 of y).
ipd.Audio(data=y[0], rate=sr)

In [11]:
# Listen to the mixture observed at microphone 4 (row 1 of y).
ipd.Audio(data=y[1], rate=sr)

### ILRMAの実行

In [12]:
# Per-channel STFT of the mixture. scipy's `noverlap` is the number of samples
# shared by consecutive frames, not the hop, so it is derived from the hop size
# (hop = nperseg - noverlap). Passing hop_size directly only worked because the
# hop happens to be exactly half the window. The returned frequency and time
# axes are not needed here and are discarded.
_, _, Y = ss.stft(y, nperseg=fft_size, noverlap=fft_size - hop_size)

In [13]:
# Fix NumPy's global random seed before building the separator so repeated runs
# start from the same state (presumably GaussILRMA initialises its parameters
# from np.random — confirm in algorithm/ilrma.py).
np.random.seed(111)
# n_bases=2 is passed straight to GaussILRMA; presumably the number of NMF bases
# per source — TODO confirm against the class definition.
ilrma = GaussILRMA(n_bases=2)

In [14]:
# Run the separator on the mixture STFT Y for 200 iterations. S_hat is fed to
# ss.istft in the next cell, so it is presumably the separated spectrograms laid
# out like Y — confirm against GaussILRMA.
S_hat = ilrma(Y, iteration=200)

In [15]:
# Back to the time domain with the same framing as the forward STFT. scipy's
# `noverlap` is the frame overlap rather than the hop, hence fft_size - hop_size
# (numerically the same as before while the hop is half the window).
_, s = ss.istft(S_hat, nperseg=fft_size, noverlap=fft_size - hop_size)
# Keep at most the first T samples so the output is no longer than the mixture.
s = s[:, :T]

### 分離音

In [16]:
# Listen to the first separated signal (row 0 of s). Nothing in this notebook
# ties a given row to a particular speaker.
ipd.Audio(data=s[0], rate=sr)

In [17]:
# Listen to the second separated signal (row 1 of s).
ipd.Audio(data=s[1], rate=sr)

## 3音源分離

In [18]:
# Microphone 2: load the three speakers' reverberant recordings (sr=None keeps
# each file's own sampling rate) and sum them into the observed mixture.
aew_mic2, sr = librosa.load(
    "./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic2.wav",
    sr=None,
)
axb_mic2, sr = librosa.load(
    "./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic2.wav",
    sr=None,
)
bdl_mic2, sr = librosa.load(
    "./data/cmu_us_bdl_arctic/trimmed/convolved-16000_deg330-mic2.wav",
    sr=None,
)
y_mic2 = aew_mic2 + axb_mic2 + bdl_mic2

# Microphone 4: same three speakers, same procedure.
aew_mic4, sr = librosa.load(
    "./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic4.wav",
    sr=None,
)
axb_mic4, sr = librosa.load(
    "./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic4.wav",
    sr=None,
)
bdl_mic4, sr = librosa.load(
    "./data/cmu_us_bdl_arctic/trimmed/convolved-16000_deg330-mic4.wav",
    sr=None,
)
y_mic4 = aew_mic4 + axb_mic4 + bdl_mic4

# Microphone 5: same three speakers, same procedure.
aew_mic5, sr = librosa.load(
    "./data/cmu_us_aew_arctic/trimmed/convolved-16000_deg60-mic5.wav",
    sr=None,
)
axb_mic5, sr = librosa.load(
    "./data/cmu_us_axb_arctic/trimmed/convolved-16000_deg300-mic5.wav",
    sr=None,
)
bdl_mic5, sr = librosa.load(
    "./data/cmu_us_bdl_arctic/trimmed/convolved-16000_deg330-mic5.wav",
    sr=None,
)
y_mic5 = aew_mic5 + axb_mic5 + bdl_mic5

# One row per microphone. `sr` holds the rate returned by the last load.
y = np.stack((y_mic2, y_mic4, y_mic5), axis=0)
# The row count is the number of microphones; it is called n_sources because
# this example uses as many microphones as speakers. T is the length in samples.
n_sources = y.shape[0]
T = y.shape[1]

### 混合音

In [19]:
# Listen to the mixture observed at microphone 2 (row 0 of y).
ipd.Audio(data=y[0], rate=sr)

In [20]:
# Listen to the mixture observed at microphone 4 (row 1 of y).
ipd.Audio(data=y[1], rate=sr)

In [21]:
# Listen to the mixture observed at microphone 5 (row 2 of y).
ipd.Audio(data=y[2], rate=sr)

### ILRMAの実行

In [22]:
# Per-channel STFT of the three-microphone mixture. scipy's `noverlap` is the
# number of samples shared by consecutive frames, not the hop, so it is derived
# from the hop size (hop = nperseg - noverlap). Passing hop_size directly only
# worked because the hop happens to be exactly half the window. The returned
# frequency and time axes are not needed here and are discarded.
_, _, Y = ss.stft(y, nperseg=fft_size, noverlap=fft_size - hop_size)

In [47]:
# Re-seed NumPy's global RNG so this run starts from the same state as the
# two-source run (presumably GaussILRMA initialises its parameters from
# np.random — confirm in algorithm/ilrma.py).
np.random.seed(111)
# A fresh separator instance for the three-channel mixture. n_bases=2 is passed
# straight to GaussILRMA; presumably the number of NMF bases per source — TODO confirm.
ilrma = GaussILRMA(n_bases=2)

In [48]:
# Run the separator on the three-channel mixture STFT Y for 200 iterations.
# S_hat is fed to ss.istft in the next cell, so it is presumably the separated
# spectrograms laid out like Y — confirm against GaussILRMA.
S_hat = ilrma(Y, iteration=200)

In [49]:
# Back to the time domain with the same framing as the forward STFT. scipy's
# `noverlap` is the frame overlap rather than the hop, hence fft_size - hop_size
# (numerically the same as before while the hop is half the window).
_, s = ss.istft(S_hat, nperseg=fft_size, noverlap=fft_size - hop_size)
# Keep at most the first T samples so the output is no longer than the mixture.
s = s[:, :T]

### 分離音

In [50]:
# Listen to the first separated signal (row 0 of s). Nothing in this notebook
# ties a given row to a particular speaker.
ipd.Audio(data=s[0], rate=sr)

In [51]:
# Listen to the second separated signal (row 1 of s).
ipd.Audio(data=s[1], rate=sr)

In [52]:
# Listen to the third separated signal (row 2 of s).
ipd.Audio(data=s[2], rate=sr)