In [2]:
%matplotlib inline


# Vocal separation


This notebook demonstrates a simple technique for separating vocals (and
other sporadic foreground signals) from accompanying instrumentation.

This is based on the "REPET-SIM" method of `Rafii and Pardo, 2012
<http://www.cs.northwestern.edu/~zra446/doc/Rafii-Pardo%20-%20Music-Voice%20Separation%20using%20the%20Similarity%20Matrix%20-%20ISMIR%202012.pdf>`_, but includes a couple of modifications and extensions:

    - FFT windows overlap by 1/4, instead of 1/2
    - Non-local filtering is converted into a soft mask by Wiener filtering.
      This is similar in spirit to the soft-masking method used by `Fitzgerald, 2012
      <http://arrow.dit.ie/cgi/viewcontent.cgi?article=1086&context=argcon>`_,
      but is a bit more numerically stable in practice.



In [1]:
!wget -c -A '*.mp3' -r -l 1 -nd https://github.com/librosa/librosa_gallery/blob/master/audio/Cheese_N_Pot-C_-_16_-_The_Raps_Well_Clean_Album_Version.mp3


--2021-06-28 09:30:18--  https://github.com/librosa/librosa_gallery/blob/master/audio/Cheese_N_Pot-C_-_16_-_The_Raps_Well_Clean_Album_Version.mp3
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘Cheese_N_Pot-C_-_16_-_The_Raps_Well_Clean_Album_Version.mp3’

Cheese_N_Pot-C_-_16     [ <=>                ] 111.33K   615KB/s    in 0.2s    

2021-06-28 09:30:19 (615 KB/s) - ‘Cheese_N_Pot-C_-_16_-_The_Raps_Well_Clean_Album_Version.mp3’ saved [113997]

Loading robots.txt; please ignore errors.
--2021-06-28 09:30:19--  https://github.com/robots.txt
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 1481 (1.4K) [text/plain]
Saving to: ‘robots.txt.tmp’


2021-06-28 09:30:19 (42.8 MB/s) - ‘robots.txt.tmp’ saved [1481/1481]

--2021-06-28 09:30:19--  https://github.com/
Reusing existing con

In [None]:
import IPython
IPython.display.Audio('/content/music.mp3')

In [8]:
import numpy as np
import librosa.display
import soundfile as sf
import matplotlib.pyplot as plt
import librosa

y, sr = librosa.load('/content/music.mp3', sr=None)
output_file_path = "/content/music_margin_10.wav"

#get spectrogram magnitude and phase
S_full, phase = librosa.magphase(librosa.stft(y))
S_filter = librosa.decompose.nn_filter(S_full,
                                       aggregate=np.median,
                                       metric='cosine',
                                       width=int(librosa.time_to_frames(2, sr=sr)))
# S_filter should be minimum - output filter should not be greater than input
S_filter = np.minimum(S_full, S_filter)
#
margin_i, margin_v = 1, 10
power = 1

mask_i = librosa.util.softmask(S_filter,
                               margin_i * (S_full - S_filter),
                               power=power)

mask_v = librosa.util.softmask(S_full - S_filter,
                               margin_v * S_filter,
                               power=power)

# Once we have the masks, simply multiply them with the input spectrum
# to separate the components
S_foreground = mask_v * S_full
S_background = mask_i * S_full
D_foreground = S_foreground * phase
y_foreground = librosa.istft(D_foreground)
sf.write(output_file_path, y_foreground, samplerate=sr, subtype='PCM_24')

