In [5]:
import numpy as np
from scipy.optimize import minimize
import scipy.signal
import librosa
import os
import sox
import tempfile
import shutil
import matplotlib.pyplot as plt

%matplotlib inline

def get_feature_audio(filename, sr=8192, voiced_threshold=0.0005):
    """Load an audio file and return its per-sample energy and a voicing flag.

    Parameters
    ----------
    filename : str
        Path of the audio file; passed straight to ``librosa.load``.
    sr : int
        Sample rate requested from ``librosa.load`` (default 8192).
    voiced_threshold : float
        The file is flagged as voiced unless its peak squared amplitude is
        below this value (default 0.0005).

    Returns
    -------
    feature : np.array
        The squared samples of the mono signal.
    voiced : int
        1 if the file is considered voiced, 0 otherwise.

    """
    y, _ = librosa.load(filename, mono=True, sr=sr)
    feature = y ** 2.0

    # An empty signal has no peak (np.max would raise), so report it as
    # unvoiced instead of crashing the whole batch.
    if feature.size == 0 or np.max(feature) < voiced_threshold:
        return feature, 0
    return feature, 1


def linear_model(x, A, y):
    """Objective for the gain fit: 2-norm of the residual ``A.x - y``.

    Parameters
    ----------
    x : np.array
        Candidate weights.
    A : np.array
        Matrix multiplied with `x` via ``np.dot``.
    y : np.array
        Target the product is compared against.

    Returns
    -------
    error : float
        ``np.linalg.norm`` (``ord=2``) of the residual.

    """
    residual = np.dot(A, x) - y
    return np.linalg.norm(residual, ord=2)

def hex_to_stem_list(hex_file, n_channels=6):
    """Split a multi-channel file into one single-channel wav per channel.

    Each channel is written to ``<k>.wav`` (``k`` counted from 0) inside a
    freshly created temporary directory. The caller owns that directory and
    must delete it once the stems are no longer needed.

    Parameters
    ----------
    hex_file : str
        Path of the multi-channel input file.
    n_channels : int
        Number of channels to extract (default 6).

    Returns
    -------
    stem_files : list of str
        Paths of the extracted stems, in channel order.
    temp_path : str
        The temporary directory (with a trailing ``/``) holding the stems.

    """
    temp_path = tempfile.mkdtemp() + '/'
    stem_files = []
    try:
        for channel in range(n_channels):
            tfm = sox.Transformer()
            # The remix map is 1-based: output channel 1 is fed from input
            # channel ``channel + 1``.
            tfm.remix(remix_dictionary={1: [channel + 1]})
            output_path = os.path.join(temp_path, '{}.wav'.format(channel))
            stem_files.append(output_path)
            tfm.build(hex_file, output_path)
    except Exception:
        # Nobody else knows about the directory yet, so remove it here rather
        # than leaking it when the extraction fails part-way through.
        shutil.rmtree(temp_path, ignore_errors=True)
        raise
    return stem_files, temp_path  # a list of stem file names.

def analyze_mix_audio(mix_file, stem_files):
    """Estimate the gain of every stem inside a reference mix.

    The energy feature of the mix is modelled as a weighted sum of the stems'
    energy features; the weights are found by minimising ``linear_model``
    with every weight constrained to ``[0, 10]``.

    Parameters
    ----------
    mix_file : str
        Path of the reference mix.
    stem_files : list of str
        Paths of the individual stems.

    Returns
    -------
    mixing_coeffs : dict
        Maps the stem index (int, counted from 0) to its fitted weight (float).
    voiced_list : list of int
        Voicing flag of every stem as reported by ``get_feature_audio``.

    """
    mix_audio, _ = get_feature_audio(mix_file)

    stem_features = []
    voiced_list = []
    for stem_path in stem_files:
        feature, voiced = get_feature_audio(stem_path)
        if not voiced:
            print('unvoiced')
        # Unvoiced stems are zeroed so they cannot explain any of the mix.
        stem_features.append(feature * voiced)
        voiced_list.append(voiced)

    # The mix and the stems come from different files and need not contain
    # the same number of samples; fit on the part they all share, otherwise
    # the residual in linear_model fails with a shape mismatch.
    n_samples = min([len(mix_audio)] + [len(f) for f in stem_features])
    mix_audio = mix_audio[:n_samples]
    stem_audio = np.array([f[:n_samples] for f in stem_features])
    n_stems = stem_audio.shape[0]

    # force weights to be between 0 and 10
    bounds = tuple((0, 10.0) for _ in range(n_stems))
    res = minimize(
        linear_model, x0=np.ones((n_stems,)), args=(stem_audio.T, mix_audio),
        bounds=bounds
    )

    mixing_coeffs = {index: float(coef) for index, coef in enumerate(res['x'])}
    return mixing_coeffs, voiced_list

def mix_stem(mixing_coeffs, stem_files, output_file, voiced_list):
    """Merge the stems into one multi-channel file, applying the fitted gains.

    Parameters
    ----------
    mixing_coeffs : dict
        Maps stem index to gain, as returned by ``analyze_mix_audio``.
    stem_files : list of str
        Paths of the stems, in stem-index order.
    output_file : str
        Path of the file to write.
    voiced_list : list of int
        Voicing flag per stem; the gain of an unvoiced stem is forced to 0.

    Raises
    ------
    ValueError
        If the three per-stem inputs do not have the same length.

    """
    n_stems = len(stem_files)
    if not (len(mixing_coeffs) == len(voiced_list) == n_stems):
        raise ValueError(
            'mixing_coeffs, stem_files and voiced_list must have the same '
            'length, got {}, {} and {}'.format(
                len(mixing_coeffs), n_stems, len(voiced_list)
            )
        )

    # Channel k of the merged signal is stem k; keep them one-to-one.
    remix_dict = {channel: [channel] for channel in range(1, n_stems + 1)}

    # Read the gains in stem-index order instead of relying on dict ordering.
    # Wrapping dict.values() directly in np.array breaks on Python 3, where it
    # is a view rather than a list.
    gains = np.array(
        [mixing_coeffs[index] for index in sorted(mixing_coeffs)], dtype=float
    )
    gain_list = (gains * np.array(voiced_list)).tolist()
    print(gain_list)

    cbn = sox.Combiner()
    cbn.remix(remix_dictionary=remix_dict)
    cbn.gain(normalize=True)
    cbn.build(stem_files, output_file, input_volumes=gain_list, combine_type='merge')
        

def run(mix_file, hex_file, output_file, normalized):
    """Re-gain the channels of `hex_file` to match `mix_file`.

    The multi-channel file is split into stems, the gain of each stem inside
    the reference mix is estimated, and the stems are merged again with those
    gains applied. The temporary stem directory is removed afterwards, also
    when one of the steps fails.

    Parameters
    ----------
    mix_file : str
        Path of the reference mix.
    hex_file : str
        Path of the multi-channel file to re-gain.
    output_file : str
        Path of the re-gained file to write.
    normalized : str
        Currently unused; it was the target of the disabled
        ``normalize_hex(output_file, normalized)`` step.

    """
    stem_files, temp_path = hex_to_stem_list(hex_file)
    try:
        mixing_coeffs, voiced_list = analyze_mix_audio(mix_file, stem_files)
        mix_stem(mixing_coeffs, stem_files, output_file, voiced_list)
    finally:
        # Always clean up, so a failing file does not leave its stems behind.
        shutil.rmtree(temp_path)
#     normalize_hex(output_file, normalized)


In [6]:
def track_energy(wave, win_len, win):
    """Compute the energy of an audio signal

    The signal is cut into half-overlapping frames of `win_len` samples; each
    frame is half-wave rectified, square-rooted, weighted by `win` and
    averaged.

    Parameters
    ----------
    wave : np.array
        The signal from which to compute energy
    win_len: int
        The number of samples to use in energy computation
    win : np.array
        The windowing function to use in energy computation

    Returns
    -------
    energy : np.array
        Array of track energy

    """
    hop_len = win_len // 2

    # pre padding: zeros in front of the signal
    wave = np.pad(
        wave, pad_width=(win_len - hop_len, 0), mode='constant',
        constant_values=0
    )

    # post padding: extend to a whole number of windows. Integer ceiling
    # division keeps the round-up exact whatever `/` means for ints.
    n_windows = -(-len(wave) // win_len)
    # `size` is passed by keyword because recent librosa releases no longer
    # accept it positionally.
    wave = librosa.util.fix_length(wave, size=int(win_len * n_windows))

    # cut into frames
    wavmat = librosa.util.frame(wave, frame_length=win_len, hop_length=hop_len)

    # Envelope follower
    wavmat = hwr(wavmat) ** 0.5  # half-wave rectification + compression

    return np.mean((wavmat.T * win), axis=1)


def hwr(x):
    """Half-wave rectify `x`: negative values become zero.

    Parameters
    ----------
    x : array-like
        Values to rectify

    Returns
    -------
    x_hwr : array-like
        `x` with its negative part removed

    """
    magnitude = np.abs(x)
    # x + |x| is 2x where x is positive and 0 where it is negative.
    doubled_positive_part = x + magnitude
    return doubled_positive_part / 2

In [7]:
# Input/output locations. The lists below are paired by position later on,
# so every directory listing is sorted: os.listdir returns names in arbitrary
# order, which could otherwise match a mix with the wrong multi-channel file.
# NOTE(review): pairing by sorted order assumes corresponding mix and hex
# files sort to the same position -- confirm against the naming scheme.
base_dir_mix = '/Users/tom/Music/DataSet/test_set_ref/'
mix_list = [os.path.join(base_dir_mix, f) for f in sorted(os.listdir(base_dir_mix)) if f.endswith(".wav")]

base_dir_hex = '/Users/tom/Music/DataSet/test_set_cleaned2/'
hex_list = [os.path.join(base_dir_hex, f) for f in sorted(os.listdir(base_dir_hex)) if f.endswith(".wav")]

base_dir_out = '/Users/tom/Music/DataSet/test_set_gained3/'
out_list = [os.path.join(base_dir_out, f.split('.')[0]+'_gained.wav') for f in sorted(os.listdir(base_dir_hex)) if f.endswith(".wav")]

# Strip only the extension: splitting the full path on its first '.' would
# cut the path short if a directory name ever contained a dot.
normalized_list = [os.path.splitext(f)[0]+'_normalized.wav' for f in out_list]


In [8]:
# zip() stops at the shortest list, so a missing file in one directory would
# silently drop work; refuse to start instead.
if not (len(mix_list) == len(hex_list) == len(out_list) == len(normalized_list)):
    raise ValueError(
        'file lists differ in length: {} mixes, {} hex files, {} outputs, '
        '{} normalized outputs'.format(
            len(mix_list), len(hex_list), len(out_list), len(normalized_list)
        )
    )

for m, h, o, n in zip(mix_list, hex_list, out_list, normalized_list):
    print(m)
    run(m, h, o, n)

/Users/tom/Music/DataSet/test_set_ref/eh_BN1-129-Eb_mic_comp.wav
unvoiced
unvoiced




[4.1077378066831898, 4.7294065066376367, 3.8919206912441022, 1.1764851855540359, 0.0, 0.0]
/Users/tom/Music/DataSet/test_set_ref/eh_BN1-129-Eb_mic_solo.wav
unvoiced
unvoiced
unvoiced




[0.0, 0.40379564661032596, 0.6253629361524079, 0.22688155521898584, 0.0, 0.0]
/Users/tom/Music/DataSet/test_set_ref/jf_Funk3-112-C#_mic_comp.wav
unvoiced




[0.0, 0.1764492786019419, 0.19397988876274694, 0.099710802062194862, 0.10162058598472318, 0.21007482572327199]
/Users/tom/Music/DataSet/test_set_ref/jf_Funk3-112-C#_mic_solo.wav
unvoiced




[0.0, 1.5113976455012417, 1.8697324678340783, 0.62723129176374992, 0.13951840877399135, 0.48658014514707509]
/Users/tom/Music/DataSet/test_set_ref/js_SS2-107-Ab_mic_comp.wav




[0.19011408709686869, 0.3585622885470014, 0.31003495772637008, 0.26049461001489849, 0.18135763623379741, 0.44901833971025507]
/Users/tom/Music/DataSet/test_set_ref/js_SS2-107-Ab_mic_solo.wav
unvoiced
unvoiced
unvoiced




[0.0, 0.0, 0.89083034544848705, 0.20416363318996947, 0.28268712473321589, 0.0]
/Users/tom/Music/DataSet/test_set_ref/vl_Jazz2-110-Bb_mic_comp.wav




[2.713971229468251, 2.2077506048561371, 5.4477928959309523, 3.0339279105756791, 0.35063862853932254, 2.4333734426891418]
/Users/tom/Music/DataSet/test_set_ref/vl_Jazz2-110-Bb_mic_solo.wav
unvoiced
unvoiced




[0.0, 0.0, 10.0, 2.5166657317484451, 0.069041769034387121, 0.21476569422306105]


In [None]:
# Broadcasting scratch check: a (1, 3) row times a (3, 1) column yields a
# (3, 3) matrix whose rows are the row vector scaled by each column entry.
mat = np.multiply(
    np.array([[1, 2, 3]]),
    np.array([[1], [0], [1]]),
)

In [35]:
# Show the matrix one row per line.
for row_index in range(len(mat)):
    print(mat[row_index])

[1 2 3]
[0 0 0]
[1 2 3]
