In [3]:
import IPython.display as ipd

import pandas as pd

import itertools
import random
import numpy as np
import gradio as gr
import librosa
import os
import torch
import torchaudio
from torchaudio.pipelines import CONVTASNET_BASE_LIBRI2MIX
from torch.utils.data import TensorDataset, DataLoader
from torch.nn import functional as F
from transformers import pipeline

from tqdm import tqdm

import sys
sys.path.append('../')
from src.crepe_model import CREPEModel

# Run on the first CUDA GPU when PyTorch can see one; fall back to the CPU otherwise.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

  import multipart


In [5]:
# Mixed recording used as the running example in the cells below.
mixed_audio_path = "../audio/test_data/leon_7_03_jmzen_5_03.wav"
# torchaudio.load returns the waveform tensor together with the file's sample rate.
audio, sample_rate = torchaudio.load(mixed_audio_path)

In [8]:
# Build the pretrained Conv-TasNet separator and place it on the selected device.
model = CONVTASNET_BASE_LIBRI2MIX.get_model().to(device)
print("Initialized CONVTASNET_BASE_LIBRI2MIX model.")

Initialized CONVTASNET_BASE_LIBRI2MIX model.


  state_dict = torch.load(path)


In [11]:
# Separate the mixture. The waveform is flattened to (batch=1, channel=1, time)
# and no gradients are tracked because this is inference only.
with torch.no_grad():
    pred = model(
        audio.to(device).reshape(1, 1, -1)
    )

In [13]:
pred.shape

torch.Size([1, 2, 54785])

In [None]:
# Save each separated voice as its own 8 kHz WAV file.
# `pred` lives on `device`; the tensors are moved to the CPU first so the save also
# works when the separator ran on a GPU (`.cpu()` is a no-op on a CPU-only machine).
# NOTE(review): F.normalize L2-normalises along dim 1 (the time axis of a (1, time)
# tensor), i.e. it produces a unit-norm waveform rather than a peak-normalised one,
# so the saved files are likely very quiet — confirm this is intended.
torchaudio.save('../audio/test_data/split_full_1.wav', F.normalize(pred[:1, 0].cpu()), 8000)
torchaudio.save('../audio/test_data/split_full_2.wav', F.normalize(pred[:1, 1].cpu()), 8000)

In [14]:
def audio_to_frames(vocals, frame_size=1024, hop_length=160):
    """
    Slice a waveform into overlapping fixed-length frames.

    Frames are taken along dimension 0 of `vocals`, so the input must be laid
    out as (samples,) or (samples, channels). With the defaults every frame is
    1024 samples long and consecutive frames start 160 samples apart, which is
    a 10 ms hop for audio sampled at 16 kHz.

    Args:
        vocals: torch.Tensor whose first dimension indexes samples.
        frame_size: number of samples in each frame.
        hop_length: number of samples between the starts of adjacent frames.

    Returns:
        A view of `vocals` with the frame index first and the samples of each
        frame last: (num_frames, frame_size) for 1-D input and
        (num_frames, channels, frame_size) for 2-D input, where
        num_frames = (num_samples - frame_size) // hop_length + 1.

    Raises:
        RuntimeError: raised by Tensor.unfold when `vocals` holds fewer than
            `frame_size` samples.
    """
    # unfold works out the number of frames itself, so no manual count is needed.
    return vocals.unfold(dimension=0, size=frame_size, step=hop_length)

In [24]:
# Take the first separated voice and upsample it from 8 kHz to 16 kHz, then frame it.
# The transpose puts samples on dimension 0, the axis audio_to_frames slices along.
vocals = librosa.resample(pred[:1, 0].cpu().numpy(), orig_sr=8000, target_sr=16000)
frames = audio_to_frames(torch.tensor(vocals.T))

In [26]:
vocals.shape

(1, 109570)

In [28]:
frames.shape

torch.Size([679, 1, 1024])

In [47]:
# Load the pretrained CREPE pitch model.
# NOTE: this rebinds `model`, which held the Conv-TasNet separator in earlier cells.
# The model is moved to `device` because the inference loop moves every batch there
# with `.to(device)`; leaving the weights on the CPU fails on a GPU machine.
model = CREPEModel.from_pretrained("omgitsqing/CREPE_MIR-1K_16").to(device)
model.eval()

CREPEModel(
  (model): Sequential(
    (0): Conv2d(1, 512, kernel_size=(512, 1), stride=(4, 1), padding=(254, 0), bias=False)
    (1): ReLU()
    (2): BatchNorm2d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (4): Dropout(p=0.25, inplace=False)
    (5): Conv2d(512, 64, kernel_size=(64, 1), stride=(1, 1), padding=same, bias=False)
    (6): ReLU()
    (7): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (9): Dropout(p=0.25, inplace=False)
    (10): Conv2d(64, 64, kernel_size=(64, 1), stride=(1, 1), padding=same, bias=False)
    (11): ReLU()
    (12): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): MaxPool2d(kernel_size=(2, 1), stride=(2, 1), padding=0, dilation=1, ceil_mode=False)
    (14): Dropout(p

In [38]:
# Standardisation constants applied to every frame before pitch inference
# (presumably the mean/std of the pitch model's training data — verify against the training code).
train_mean, train_std = 0.3302, 0.6109

In [42]:
# Standardise the frames, then floor every value at 1e-8 (no upper bound).
# NOTE(review): the floor discards all negative standardised samples — confirm this
# mirrors the preprocessing the pitch model was trained with.
test_data = ((frames - train_mean) / train_std).clamp(min=1e-8)

In [44]:
# Batch the frames in their original order, 20 at a time. TensorDataset is handed the
# same tensor twice, so every batch arrives as an (x, y) pair with identical contents.
test_loader = DataLoader(
    TensorDataset(test_data, test_data),
    batch_size=20,
    shuffle=False,
)

In [52]:
# One label per 0.1-semitone step from MIDI 36.0 up to 76.9 (410 labels),
# followed by a final 'silence' class.
note_range = [
    *librosa.midi_to_note([tenth / 10 for tenth in range(360, 770)], cents=True),
    'silence',
]

In [72]:
# Predict one note label per frame, batch by batch.
all_notes = []
with torch.no_grad():  # inference only: do not build an autograd graph for every batch
    for x, _targets in test_loader:
        # lay each frame out as (batch, 1024, 1) before handing it to the model
        reshaped = torch.reshape(x, (x.shape[0], 1024, 1)).to(device)
        pitches = model(reshaped)

        # assign the note with the highest probability to each frame
        pred_notes = [str(note_range[p.argmax()]) for p in pitches.cpu().numpy()]
        all_notes.extend(pred_notes)

In [74]:
len(all_notes)

679

In [28]:
def split_audio(audio_file):
    """
    Wrapper function for Gradio

    Takes input audio and outputs two audio files, each containing one voice.

    Args:
        audio_file: `(sample_rate, samples)` tuple as delivered by
            `gr.Audio(type="numpy")`, or None when nothing was submitted.
            `samples` is treated as (num_samples,) or (num_samples, channels).

    Returns:
        The paths "split_1.wav" and "split_2.wav", written to the current
        working directory at 8 kHz.

    Raises:
        gr.Error: if no audio (or an empty recording) was submitted.
    """
    if audio_file is None:
        raise gr.Error("No audio file submitted!")
    sr, audio = audio_file

    # Rate the separated tracks are written at; the model input is brought to the
    # same rate so the saved files play back at the original speed and pitch.
    target_sr = 8000

    # The separator is fed a single mono waveform. Averaging the channels avoids
    # interleaving stereo samples when the array is flattened to (1, 1, time) below.
    waveform = np.asarray(audio, dtype=np.float32)
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)
    if waveform.size == 0:
        raise gr.Error("No audio file submitted!")

    if sr != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sr, target_sr=target_sr)

    model = CONVTASNET_BASE_LIBRI2MIX.get_model().to(device)
    model.eval()

    with torch.no_grad():
        mixture = torch.tensor(waveform, dtype=torch.float32).reshape(1, 1, -1)
        pred = model(mixture.to(device))

    # pred[0] holds one row per separated voice; move it to the CPU before writing
    # so saving also works when the model ran on a GPU.
    # NOTE(review): F.normalize L2-normalises each row (unit-norm waveform), which is
    # not peak normalisation and likely yields very quiet files — confirm intended.
    sources = pred[0].cpu()
    torchaudio.save('split_1.wav', F.normalize(sources[:1, :]), target_sr)
    torchaudio.save('split_2.wav', F.normalize(sources[1:, :]), target_sr)

    torch.cuda.empty_cache()

    return "split_1.wav", "split_2.wav"

def audio_to_frames(vocals, frame_size=1024, hop_length=160):
    """
    Slice a waveform into overlapping fixed-length frames.

    Frames are taken along dimension 0 of `vocals`, so the input must be laid
    out as (samples,) or (samples, channels). With the defaults every frame is
    1024 samples long and consecutive frames start 160 samples apart, which is
    a 10 ms hop for audio sampled at 16 kHz.

    Args:
        vocals: torch.Tensor whose first dimension indexes samples.
        frame_size: number of samples in each frame.
        hop_length: number of samples between the starts of adjacent frames.

    Returns:
        A view of `vocals` with the frame index first and the samples of each
        frame last: (num_frames, frame_size) for 1-D input and
        (num_frames, channels, frame_size) for 2-D input, where
        num_frames = (num_samples - frame_size) // hop_length + 1.

    Raises:
        RuntimeError: raised by Tensor.unfold when `vocals` holds fewer than
            `frame_size` samples.
    """
    # unfold works out the number of frames itself, so no manual count is needed.
    return vocals.unfold(dimension=0, size=frame_size, step=hop_length)
    
def audio_to_pitch(audio_file):
    """
    Wrapper function for Gradio

    Takes input audio and returns the note predicted for every frame
    (1024-sample frames taken every 10 ms of the audio resampled to 16 kHz).

    Args:
        audio_file: `(sample_rate, samples)` tuple as delivered by
            `gr.Audio(type="numpy")`, or None when nothing was submitted.
            `samples` is treated as (num_samples,) or (num_samples, channels).

    Returns:
        str: the string form of a list with one label per frame, each label
        being a note name with a cents offset or 'silence'.

    Raises:
        gr.Error: if no audio was submitted or it is shorter than one frame.
    """
    if audio_file is None:
        raise gr.Error("No audio file submitted!")
    sr, audio = audio_file

    frame_size = 1024

    # librosa.resample works along the last axis, so a (samples, channels) array has
    # to be reduced to a single mono channel first; otherwise the channel axis would
    # be resampled instead of time.
    # NOTE(review): the samples keep whatever amplitude scale Gradio delivers (no
    # rescaling to [-1, 1]) — confirm this matches what the pitch model was trained on.
    waveform = np.asarray(audio, dtype=np.float32)
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    vocals = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
    if vocals.shape[0] < frame_size:
        # Tensor.unfold would otherwise fail with an opaque size error.
        raise gr.Error("Audio is too short to transcribe!")
    frames = audio_to_frames(torch.tensor(vocals))

    # The batches below are moved to `device`, so the model has to live there too.
    model = CREPEModel.from_pretrained("omgitsqing/CREPE_MIR-1K_32").to(device)
    model.eval()

    train_mean = 0.3302
    train_std  = 0.6109

    # Standardise, then floor at 1e-8.
    # NOTE(review): the floor drops every negative standardised sample — confirm it
    # mirrors the training-time preprocessing.
    test_data = torch.clip((frames - train_mean) / train_std, min=1e-8, max=None)

    # One label per 0.1-semitone step from MIDI 36.0 to 76.9, plus a 'silence' class.
    note_range = list(librosa.midi_to_note([i/10 for i in range(360 , 770)], cents=True))
    note_range.append('silence')

    all_notes = []
    with torch.no_grad():  # inference only: skip autograd bookkeeping
        # Same ordered batches of 20 frames the DataLoader used to produce.
        for batch in test_data.split(20):
            reshaped = torch.reshape(batch, (batch.shape[0], frame_size, 1)).to(device)
            pitches = model(reshaped)

            # assign the note with the highest probability to each frame
            pred_notes = [str(note_range[p.argmax()]) for p in pitches.cpu().numpy()]
            all_notes.extend(pred_notes)

    torch.cuda.empty_cache()

    return str(all_notes)

In [31]:
# Gradio UI: record/upload a duet, listen to the two separated tracks, then
# transcribe the pitches of any track.
with gr.Blocks() as demo:

    # 1. Get users to record a duet
    gr.Markdown(
    """
    # Karaoke Chaos!
    Sing your hearts out with a friend. We'll split the tracks and show you the notes you hit.
    """)
    audio_file = gr.Audio(type="numpy")
    gr.Examples(
        examples=[
            "leon_7_03_jmzen_5_03.wav",
            "leon_7_jmzen_5.wav",
        ],
        inputs=audio_file,
    )
    b1 = gr.Button("Submit masterpiece")

    # 2. Let users listen to split tracks
    gr.Markdown(
    """
    ## Split Tracks:
    Are your voices split as you expected? Listen to the tracks below.
    """)
    wav_path_1 = gr.Audio(interactive=False)
    wav_path_2 = gr.Audio(interactive=False)

    # 3. Let users re-upload a track and transcribe its pitches
    gr.Markdown(
    """
    ## Transcribe Pitches:
    Select `split_1`/`split_2` to transcribe your split vocals or upload a new audio file to transcribe the pitches.
    """)
    audio_file_to_transcribe = gr.Audio(type="numpy")
    gr.Examples(
        examples=[
            "split_1.wav",
            "split_2.wav",
        ],
        inputs=audio_file_to_transcribe,
    )
    b2 = gr.Button("Transcribe Pitches")

    # The first positional argument of gr.Textbox is its initial *value*; "Pitches"
    # is meant as the caption of the output box, so it is passed as the label.
    transcribed_pitch = gr.Textbox(label="Pitches", type="text")

    # Wire each button to its handler.
    b1.click(split_audio, inputs=audio_file, outputs=[wav_path_1, wav_path_2])
    b2.click(audio_to_pitch, inputs=audio_file_to_transcribe, outputs=transcribed_pitch)


demo.launch(debug=True, height=1500)

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.


