In [None]:
# pip install pytube
# pip install pymusickit
# conda install ffmpeg <- must be installed using conda!

In [None]:
# Setup
from pytube import YouTube
import librosa
import numpy as np
from IPython.display import Audio
import re
import os
import subprocess
from pymusickit.key_finder import KeyFinder
from collections import defaultdict
import ipywidgets as widgets
import math
import soundfile

In [None]:
from vocal_remover import direct_call_on_audio_array
from decompose import run_decomposer

In [None]:
# Functions
# Convert mp4 to wav
def convert_mp4_to_wav(mp4_path):
    """Transcode a media file to WAV by invoking the ffmpeg executable.

    The WAV file is written next to the input, using the same path with the
    extension replaced by '.wav'. ffmpeg is run with '-y', so an existing
    output file is overwritten.

    Args:
        mp4_path: Path of the file to convert.

    Returns:
        Path of the WAV file that ffmpeg was asked to write.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    stem = os.path.splitext(mp4_path)[0]
    wav_path = stem + '.wav'
    subprocess.run(['ffmpeg', '-y', '-i', mp4_path, wav_path], check=True)
    return wav_path

# Save YouTube video from url
def save_youtube_audio(url, filename):
    """Download the audio of a YouTube video and store it as a WAV file.

    The first audio-only stream is downloaded to 'audio/<filename>.mp4',
    converted to WAV with convert_mp4_to_wav, and the intermediate mp4 is
    deleted afterwards.

    Args:
        url: URL of the YouTube video.
        filename: Base name (without extension) for the saved files.

    Returns:
        Path of the resulting WAV file.

    Raises:
        ValueError: If the video exposes no audio-only stream.
        subprocess.CalledProcessError: If the ffmpeg conversion fails.
    """
    yt = YouTube(url)
    stream = yt.streams.filter(only_audio = True).first()
    if stream is None:
        # .first() yields None for an empty query; fail with a clear message
        # instead of an AttributeError on the download call below.
        raise ValueError(f'No audio-only stream found for {url}')

    # The download target lives in 'audio/'; create it so a fresh checkout works.
    os.makedirs('audio', exist_ok=True)
    filepath = os.path.join('audio', f'{filename}.mp4')
    mp4_path = stream.download(filename = filepath)
    wav_path = convert_mp4_to_wav(mp4_path)
    # Only the WAV is needed downstream; drop the intermediate download.
    os.remove(mp4_path)
    return wav_path

# Load audio from file
def load_audio_from_file(path, duration = 180):
    """Load audio from disk with librosa, reading at most `duration` seconds.

    Args:
        path: Path of the audio file.
        duration: Maximum number of seconds to load (default 180).

    Returns:
        Tuple (audio, sr) as produced by librosa.load.
    """
    samples, sample_rate = librosa.load(path, duration=duration)
    return samples, sample_rate

# Display audio in notebook
def display_audio(audio, sr = None):
    """Wrap audio samples in an IPython Audio player widget.

    Args:
        audio: Audio data handed to IPython.display.Audio.
        sr: Sample rate passed as the player's `rate`.

    Returns:
        The IPython.display.Audio object; it renders as a player when it is
        the last expression of a cell.
    """
    player = Audio(audio, rate=sr)
    return player

# Save file to disk
def write_audio(path, y, sr):
    """Write audio samples to `path` using soundfile.

    Args:
        path: Destination file path.
        y: Audio data to write.
        sr: Sample rate of `y`.
    """
    soundfile.write(file=path, data=y, samplerate=sr)

# Extract layers (old)
def extract_fore_and_background(audio, sample_rate, margin_i = 2, margin_v = 10, power = 2):
    """Split audio into foreground and background with nearest-neighbour filtering.

    The magnitude spectrogram is filtered with librosa.decompose.nn_filter
    (median aggregation, cosine metric, neighbours at least 2 seconds apart).
    Soft masks built from the filtered and residual magnitudes are applied to
    the full spectrogram, and both parts are inverted back to waveforms using
    the original phase.

    Args:
        audio: Input waveform.
        sample_rate: Sample rate of `audio`; used to convert the 2 second
            neighbour width into STFT frames.
        margin_i: Margin applied to the residual when building the background mask.
        margin_v: Margin applied to the filtered magnitude when building the
            foreground mask.
        power: Exponent passed to librosa.util.softmask.

    Returns:
        Tuple (y_foreground, y_background) of reconstructed waveforms.
    """
    # The waveform is used as-is. Clipping it to [0, 1] here would discard
    # every negative sample and distort the signal before the STFT.
    S_full, phase = librosa.magphase(librosa.stft(audio))
    S_filter = librosa.decompose.nn_filter(S_full,
                                        aggregate=np.median,
                                        metric='cosine',
                                        width=int(librosa.time_to_frames(2, sr=sample_rate)))
    # The filtered estimate must never exceed the observed magnitude.
    S_filter = np.minimum(S_full, S_filter)

    mask_i = librosa.util.softmask(S_filter,
                               margin_i * (S_full - S_filter),
                               power=power)

    mask_v = librosa.util.softmask(S_full - S_filter,
                                margin_v * S_filter,
                                power=power)

    S_foreground = mask_v * S_full
    S_background = mask_i * S_full

    # Re-attach the original phase to go back to the time domain.
    y_foreground = librosa.istft(S_foreground * phase)
    y_background = librosa.istft(S_background * phase)

    return y_foreground, y_background

# Define key changes
# The twelve pitch classes, in ascending semitone order.
pitches = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
# All 24 keys: the twelve major keys followed by the twelve minor keys.
possible_keys = [f"{p} {mode}" for mode in ("major", "minor") for p in pitches]

# key_to_key_pitch_change[source][target] is the number of semitones (in the
# range -5..6) to shift audio in key `source` so that it fits key `target`.
# When the modes differ, the shift is offset by 3 semitones so that a key maps
# onto its relative major/minor with a shift of 0 (e.g. C major <-> A minor).
key_to_key_pitch_change = defaultdict(dict)
for source_key in possible_keys:
    source_pitch, source_mode = source_key.split()
    for target_key in possible_keys:
        target_pitch, target_mode = target_key.split()

        shift = pitches.index(target_pitch) - pitches.index(source_pitch)
        if source_mode != target_mode:
            shift += 3 if source_mode == "major" else -3

        # Fold into one octave, then pick the smaller of shifting up or down.
        shift %= 12
        if shift > 6:
            shift -= 12
        key_to_key_pitch_change[source_key][target_key] = shift

# Shift audio to match key       
def shift_to_match_target_key(source_song, target_song, source_audio, source_sr):
    """Pitch-shift the source audio so that its key fits the target song.

    The shift in semitones is looked up in key_to_key_pitch_change using the
    `key_primary` attribute of both songs.
    NOTE(review): assumes `key_primary` is formatted like the entries of
    possible_keys (e.g. "C major") — confirm against KeyFinder.

    Args:
        source_song: Song whose audio is being changed.
        target_song: Song whose key is the reference.
        source_audio: Waveform to shift.
        source_sr: Sample rate of `source_audio`.

    Returns:
        The pitch-shifted waveform.
    """
    shifts_from_source = key_to_key_pitch_change[source_song.key_primary]
    semitones = shifts_from_source[target_song.key_primary]
    return librosa.effects.pitch_shift(y=source_audio, sr=source_sr, n_steps=semitones)

# Change song tempo
def match_tempo(source_audio, source_sr, target_audio, target_sr):
    """Time-stretch the source audio so that its tempo matches the target's.

    Tempi are estimated with librosa.beat.beat_track. The stretch rate is
    rounded to one decimal place before being applied.

    Args:
        source_audio: Waveform to stretch (e.g. the vocals being modified).
        source_sr: Sample rate of `source_audio`.
        target_audio: Waveform whose tempo is the reference.
        target_sr: Sample rate of `target_audio`.

    Returns:
        The time-stretched version of `source_audio`.
    """
    source_tempo, _ = librosa.beat.beat_track(y = source_audio, sr = source_sr)
    target_tempo, _ = librosa.beat.beat_track(y = target_audio, sr = target_sr)

    # beat_track returns the tempo as a bare float in some librosa releases
    # and as an array in others; ravel()[0] handles both.
    source_bpm = np.ravel(source_tempo)[0]
    target_bpm = np.ravel(target_tempo)[0]

    # time_stretch speeds the signal up for rate > 1, so reaching the target
    # tempo requires target / source (not source / target).
    rate = np.round(target_bpm / source_bpm, 1)
    return librosa.effects.time_stretch(source_audio, rate = rate)


def slider_to_db(slider_value, min_db=-40.0, max_db=0.0):
    """Linearly map a slider position onto a decibel range.

    A position of 0 maps to `min_db` and a position of 1 maps to `max_db`.

    Args:
        slider_value: Slider position.
        min_db: Decibel value at position 0.
        max_db: Decibel value at position 1.

    Returns:
        The corresponding decibel value.
    """
    db_span = max_db - min_db
    return slider_value * db_span + min_db

def db_to_amplitude(db_value):
    """Convert a level in decibels to a linear amplitude factor (10^(dB/20))."""
    exponent = db_value / 20.0
    return 10 ** exponent

def calculate_rms(y,scale=None):
    """Return the root-mean-square level of `y`, optionally scaled by a slider gain.

    Args:
        y: Audio samples.
        scale: Optional slider position. When given, it is converted to
            decibels with slider_to_db and then to a linear factor with
            db_to_amplitude, and the RMS is multiplied by that factor.
            None (the default) returns the plain RMS.

    Returns:
        The (optionally scaled) RMS value.
    """
    rms = np.sqrt(np.mean(np.square(y)))

    # Compare against None rather than truthiness: a slider position of 0.0
    # is a valid setting (the minimum level) and must not be treated as
    # "no scaling".
    if scale is not None:
        db_value = slider_to_db(scale)
        amp_mult = db_to_amplitude(db_value)
        rms = rms * amp_mult

    return rms

# Put audio layers together
def combine_audio_layers(audio_list, vol_list=None):
    """Mix several audio layers into one peak-normalised signal.

    All layers are trimmed to the length of the shortest one and summed.

    * If `vol_list` has one entry per layer, each layer is multiplied by
      calculate_rms(layer, scale=vol) before summing.
    * Otherwise (no `vol_list`, or a length mismatch) every layer after the
      first is rescaled so that its RMS matches the RMS of the first layer.

    NOTE(review): in the `vol_list` branch each layer is weighted by its own
    RMS, which makes loud layers louder still — confirm this is intended.
    NOTE(review): len(audio) is used as the sample count, so layers are
    presumably 1-D or sample-major arrays — confirm against the stem shapes.

    Args:
        audio_list: Non-empty list of numpy arrays, one per layer.
        vol_list: Optional list of slider positions, one per layer.

    Returns:
        The mixed signal divided by its absolute peak. If the mix is entirely
        silent it is returned unscaled instead of dividing by zero.
    """
    length = min(len(audio) for audio in audio_list)

    if vol_list is not None and len(audio_list) == len(vol_list):
        combined = audio_list[0][:length] * calculate_rms(audio_list[0], scale=vol_list[0])

        # Set volume level of each audio file from given list
        for audio, vol in zip(audio_list[1:], vol_list[1:]):
            adjusted_audio = audio * calculate_rms(audio, scale=vol)
            combined = combined + adjusted_audio[:length]
    else:
        # Set volume level based on first audio file
        combined = audio_list[0][:length]
        target_rms = calculate_rms(audio_list[0]) # Choose first layer as target
        for audio in audio_list[1:]:
            layer_rms = calculate_rms(audio)
            # A silent layer has RMS 0; leave it untouched rather than
            # dividing by zero and filling the mix with NaN.
            adjusted_audio = audio * (target_rms / layer_rms) if layer_rms > 0 else audio
            combined = combined + adjusted_audio[:length]

    peak = np.max(np.abs(combined))
    if peak == 0:
        return combined
    return combined / peak

# Mixing Tool

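This notebook downloads the audio of two YouTube videos, separates each song into vocal, backing, and percussion stems, matches the key and tempo of the stems, and lets you mix them together.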

Required Python version: 3.10. Using conda is recommended, because ffmpeg must be installed through conda.

How to use this notebook: read the explanations and run cells one by one.

## 1. Choose input audio files

Provide YouTube links to the two songs you want to mix.

In [None]:
# YouTube links of the two songs to mix
url1 = 'https://www.youtube.com/watch?v=ozXZnwYTMbs'
url2 = 'https://www.youtube.com/watch?v=84Nby3G1AOE'

# Download both tracks and convert them to WAV files
audiopath1, audiopath2 = save_youtube_audio(url1, 'audio1'), save_youtube_audio(url2, 'audio2')

# KeyFinder analyses each file; the song objects are kept for the key lookup later
song1, song2 = KeyFinder(audiopath1), KeyFinder(audiopath2)

# Reuse the waveform and sample rate that KeyFinder already holds
audio1, audio2 = song1.waveform, song2.waveform
sr1, sr2 = song1.sr, song2.sr

In [None]:
# Extracting layers: each call returns a pair, kept here as (fore, back)
(fore1, back1), (fore2, back2) = (
    direct_call_on_audio_array(audio1, sr1),
    direct_call_on_audio_array(audio2, sr2),
)

In [None]:
# Write both backing tracks to disk, since run_decomposer takes a file path.
# NOTE(review): the arrays are transposed before writing — presumably the
# stems are (channels, samples); confirm against direct_call_on_audio_array.
write_audio('back1.wav', back1.T, sr1)
write_audio('back2.wav', back2.T, sr2)

# Only the third value returned by run_decomposer (the percussion track) is used below.
_11,_12,perc1 = run_decomposer('back1.wav', sr1)
# Song 2 is decomposed at its own sample rate (sr2), matching how back2.wav was written.
_21,_22,perc2 = run_decomposer('back2.wav', sr2)

In [None]:
# Shift the foreground of song 2 so that its key fits song 1
shifted_audio = shift_to_match_target_key(
    source_song=song2,
    target_song=song1,
    source_audio=fore2,
    source_sr=sr2,
)

## 2. Listen to extracted stems and mix

Listen to the extracted backing, vocal, and percussion stems.

In [None]:
# Backing stem of song 1; the player is the cell's last expression, so it renders
print('Backing audio:')
display_audio(audio=back1, sr=sr1)

In [None]:
# shifted_audio was derived from song 2 (pitch-shifted at sr2), so it must be
# played back at sr2 rather than song 1's sample rate.
print('Shifted vocals to match:')
display_audio(shifted_audio, sr2)

In [None]:
# Percussion extracted from the backing track of song 1
print('Percussion track 1:')
display_audio(audio=perc1, sr=sr1)

Now choose which stems you want to mix together and configure the volume of each one.

In [None]:
print('VOLUME CONFIGURATION')

# Both sliders use the same range: 0.0 to 1.0 in steps of 0.1, starting at 1.0
slider_settings = dict(min=0.0, max=1.0, step=0.1, value=1.0)

print('Backing:')
s1 = widgets.FloatSlider(**slider_settings)
display(s1)

print('Shifted vocals:')
s2 = widgets.FloatSlider(**slider_settings)
display(s2)

In [None]:
# Read the current slider positions and mix the two stems with those volumes
volumes = [s1.value, s2.value]
new = combine_audio_layers([back1, shifted_audio], volumes)

print('Combined audio:')
display_audio(audio=new, sr=sr1)

If you want to adjust the volume further, make sure to run the previous two cells in the same order again until you're satisfied with the mix ...

In [None]:
# shifted_audio comes from song 2, so its sample rate is sr2 (not sr1)
matched_tempo = match_tempo(shifted_audio, sr2, back2, sr2)

In [None]:
# An empty volume list selects the RMS-matching branch of combine_audio_layers
# (every layer is levelled to the first one). The argument is required, so
# omitting it raises a TypeError.
new = combine_audio_layers([back2, matched_tempo], [])
# Both layers originate from song 2, so play the mix at sr2.
display_audio(new, sr2)