In [None]:
import librosa
import torch
import soundfile as sf
from nemo.collections.tts.models import AudioCodecModel

# Identifier of the pretrained codec checkpoint handed to `from_pretrained`.
model_name = "nvidia/mel-codec-22khz"

# Restore the pretrained codec, then put it in evaluation mode for inference.
nemo_codec_model = AudioCodecModel.from_pretrained(model_name)
nemo_codec_model = nemo_codec_model.eval()

In [None]:
import math
import wget
import os
import librosa
import torch
import numpy as np
import IPython.display as ipd
import matplotlib.pyplot as plt
from pathlib import Path


# Utility for displaying signals and metrics
def show_signal(signal: np.ndarray, sample_rate: int = 16000, tag: str = 'Signal'):
    """Plot a waveform next to its dB-scaled magnitude spectrogram.

    Args:
        signal: Audio samples. The time axis is built from ``len(signal)``, so
            a 1-D (mono) waveform is assumed.
        sample_rate: Sampling rate in Hz used to scale both time axes.
        tag: Title placed above both panels.

    Raises:
        ValueError: If ``signal`` contains no samples.
    """
    # Fail early with a readable message instead of an opaque reduction error
    # from `t.max()` after a figure has already been created.
    if len(signal) == 0:
        raise ValueError('show_signal() requires a non-empty signal')

    # `librosa.display` is a submodule that a bare `import librosa` does not
    # load on every librosa release; import it explicitly before use.
    import librosa.display

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 2.5))

    # Left panel: time-domain waveform
    t = np.arange(0, len(signal)) / sample_rate

    ax[0].plot(t, signal)
    ax[0].set_xlim(0, t.max())
    ax[0].grid()
    ax[0].set_xlabel('time / s')
    ax[0].set_ylabel('amplitude')
    ax[0].set_title(tag)

    # Right panel: STFT magnitude in dB, relative to the loudest bin
    n_fft = 1024
    hop_length = 256

    magnitude = np.abs(librosa.stft(signal, n_fft=n_fft, hop_length=hop_length))
    D = librosa.amplitude_to_db(magnitude, ref=np.max)
    img = librosa.display.specshow(
        D,
        y_axis='linear',
        x_axis='time',
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        ax=ax[1],
    )
    ax[1].set_title(tag)

    fig.tight_layout()
    # One colorbar shared by both panels (takes its space from the axes in `ax`).
    fig.colorbar(img, format="%+2.f dB", ax=ax)


# Utility for displaying a latent representation
def show_latent(latent: np.ndarray, tag: str):
    """Render a latent array as an image with a colorbar.

    Args:
        latent: Array passed straight to ``imshow``; rows are labelled as
            latent vector indices and columns as time frames.
        tag: Title of the figure.
    """
    fig, axis = plt.subplots(figsize=(16, 3))
    image = axis.imshow(latent, aspect='equal')
    fig.colorbar(image, ax=axis)
    axis.set_title(tag)
    axis.set_xlabel('Time frame')
    axis.set_ylabel('Latent vector index')
    fig.tight_layout()

In [None]:
import time
from IPython.display import Audio

path_to_input_audio = './eo_05.mp3' # path of the input audio
path_to_output_audio = './eo_05_recon.mp3' # path of the reconstructed output audio


def _sync_if_cuda(device_name: str) -> None:
    """Wait for queued CUDA work to finish so wall-clock timings are meaningful."""
    if device_name == 'cuda':
        torch.cuda.synchronize()


# Load the input audio at the codec's sample rate
audio, _ = librosa.load(path_to_input_audio, sr=nemo_codec_model.sample_rate)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Add a leading batch dimension and move the samples to the compute device
audio_tensor = torch.from_numpy(audio).unsqueeze(dim=0).to(device)
# Number of samples of the single batch item
audio_len = torch.tensor([audio_tensor.shape[-1]], device=device)

with torch.no_grad():
    # get discrete tokens from audio
    # CUDA kernels run asynchronously: synchronise before starting and before
    # stopping the clock, otherwise only the launch overhead is measured.
    _sync_if_cuda(device)
    st = time.perf_counter()
    encoded_tokens, encoded_len = nemo_codec_model.encode(audio=audio_tensor, audio_len=audio_len)
    _sync_if_cuda(device)
    print(time.perf_counter() - st)  # encode time in seconds

    # Reconstruct audio from tokens
    _sync_if_cuda(device)
    st = time.perf_counter()
    reconstructed_audio, _ = nemo_codec_model.decode(tokens=encoded_tokens, tokens_len=encoded_len)
    _sync_if_cuda(device)
    print(time.perf_counter() - st)  # decode time in seconds

# save reconstructed audio
# NOTE(review): the container format is taken from the `.mp3` extension;
# confirm the installed soundfile/libsndfile build can write MP3.
output_audio = reconstructed_audio.cpu().numpy().squeeze()
sf.write(path_to_output_audio, output_audio, nemo_codec_model.sample_rate)

# Players for comparing the original against the reconstruction
display(Audio(path_to_input_audio))
display(Audio(path_to_output_audio))

In [None]:
# Shape of the tokens returned by `nemo_codec_model.encode` in the cell above.
encoded_tokens.shape

In [None]:
# Length of the loaded audio in seconds: sample count on the last axis divided
# by the codec sample rate.
# NOTE(review): `audio` is re-loaded with `feature_extractor.sampling_rate` in a
# later cell; if that cell has already run, this ratio may mix sample rates.
audio.shape[-1]/nemo_codec_model.sample_rate

In [None]:
# Convert audio to the encoded representation
# Inference only: skip autograd bookkeeping, as the encode/decode cell does.
with torch.no_grad():
    encoded, encoded_len = nemo_codec_model.encode_audio(audio=audio_tensor, audio_len=audio_len)

# Summary of the encoder output tensor
print('encoded information:')
print(f'\tshape (batch, codebook, time frame) : {encoded.size()}')
print(f'\tdtype                               : {encoded.dtype}')
print(f'\tmin                                 : {encoded.min()}')
print(f'\tmax                                 : {encoded.max()}')


# Show the encoded representation
# Drop singleton dimensions and move to host memory before plotting.
show_latent(encoded.detach().squeeze().cpu().numpy(), tag='Encoder output')

In [None]:
# Inference only: no gradients are needed for the quantizer round trip.
with torch.no_grad():
    # Encoder output to tokens
    tokens = nemo_codec_model.quantize(encoded=encoded, encoded_len=encoded_len)

    # Tokens back to a continuous vector
    dequantized = nemo_codec_model.dequantize(tokens=tokens, tokens_len=encoded_len)

print(tokens.shape)
print(dequantized.shape)

In [None]:
from datasets import load_dataset, Audio
from transformers import MimiModel, AutoFeatureExtractor

# Pretrained Mimi checkpoint; the model and its feature extractor (used to
# pre-process the audio) are both loaded from the same identifier.
mimi_checkpoint = "kyutai/mimi"
model = MimiModel.from_pretrained(mimi_checkpoint)
feature_extractor = AutoFeatureExtractor.from_pretrained(mimi_checkpoint)

# `device` is expected to exist already (it is set in the NeMo encode/decode cell).
model = model.to(device)
print("-")

In [None]:
path_to_input_audio = './eo_05.mp3' # path of the input audio
path_to_output_audio = './eo_05_recon.mp3' # path of the reconstructed output audio (not written by this cell)

# Load the input audio at the sample rate the Mimi feature extractor expects
audio, _ = librosa.load(path_to_input_audio, sr=feature_extractor.sampling_rate)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# pre-process the inputs
inputs = feature_extractor(raw_audio=audio, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
# Move the model input to the compute device once and reuse it below.
input_values = inputs["input_values"].to(device)

# explicitly encode then decode the audio inputs
# Inference only: keep autograd disabled for every model call in this cell.
with torch.inference_mode():
    encoder_outputs = model.encode(input_values)

# CUDA kernels run asynchronously: synchronise before starting and before
# stopping the clock so the printed value covers the actual decode work.
if device == "cuda":
    torch.cuda.synchronize()
st = time.perf_counter()
with torch.inference_mode(), torch.autocast(device_type='cuda', enabled=(device=="cuda")):
    audio_values = model.decode(encoder_outputs.audio_codes)[0]
if device == "cuda":
    torch.cuda.synchronize()
print(time.perf_counter() - st)  # decode time in seconds

# or the equivalent with a forward pass
# (this rebinds `audio_values` to the forward-pass output)
with torch.inference_mode():
    audio_values = model(input_values).audio_values

In [None]:
# Calls `encode_to_latent` on the Mimi model with the pre-processed input moved
# to `device`; the result is shown as the cell output.
# NOTE(review): confirm that the `transformers` MimiModel actually exposes
# `encode_to_latent` — TODO verify against the installed version.
model.encode_to_latent(inputs["input_values"].to(device))