In [3]:
from IPython.display import Audio
from transformers import AutoProcessor, EncodecModel
import torch
import torchaudio

In [4]:
# load audio to encode
audio_path = "../sounds/721313-hangpan.wav"
sample_rate = 24000
# Load the pretrained processor
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")
# Load the audio file
waveform, orig_sample_rate = torchaudio.load(audio_path)
# Resample the audio
waveform = torchaudio.functional.resample(waveform, orig_freq=orig_sample_rate, new_freq=sample_rate).squeeze(0)
# Listen to the file
Audio(waveform, rate=sample_rate)

In [5]:
# Preprocess the audio
inputs = processor(raw_audio=waveform, sampling_rate=sample_rate, return_tensors="pt")
# Load the pretrained model
encodec = EncodecModel.from_pretrained("facebook/encodec_24khz")
bandwidths = encodec.config.target_bandwidths
print(f"Available bandwidths: {bandwidths} kbps")

# Encode the audio
target_bandwidth = bandwidths[-1] # highest bandwidth
encoder_outputs = encodec.encode(**inputs, bandwidth=target_bandwidth)

codes = encoder_outputs.audio_codes
scales = encoder_outputs.audio_scales

model.safetensors:   0%|          | 0.00/93.1M [00:00<?, ?B/s]

  WeightNorm.apply(module, name, dim)
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)


Available bandwidths: [1.5, 3.0, 6.0, 12.0, 24.0] kbps


In [6]:
# compute bandwith
import math

print(f"Target bandwidth: {target_bandwidth:.2f} kbps")
duration = waveform.shape[-1] / sample_rate
# latent codes per second
cps = codes.shape[-2] * codes.shape[-1] / duration
# bits per codebook
bpc = math.log2(encodec.config.codebook_size)
calculated_bandwidth = cps * bpc / 1000
print(f"Calculated bandwidth: {calculated_bandwidth:.2f} kbps")

Target bandwidth: 24.00 kbps
Calculated bandwidth: 24.01 kbps


In [7]:
# Decode the audio
with torch.no_grad():
  decoder_outputs = encodec.decode(audio_codes=codes, audio_scales=scales)

audio = decoder_outputs.audio_values

# Display the audio
Audio(audio.numpy().squeeze(), rate=sample_rate)

In [8]:
print(f"Initial number of residual vectors: {codes.shape[-2]}")

compression_factor = 16
compressed_codes = codes[:, :, :codes.shape[-2]//compression_factor]

print(f"Compressed number of residual vectors: {compressed_codes.shape[-2]}")

with torch.no_grad():
  decoder_outputs = encodec.decode(audio_codes=codes, audio_scales=scales)

audio = decoder_outputs.audio_values

# Display the audio
Audio(audio.numpy().squeeze(), rate=sample_rate)

Initial number of residual vectors: 32
Compressed number of residual vectors: 2


In [9]:
# Choose a bandwidth
target_bandwidth = bandwidths[0]
print(f"Compression bandwidth: {target_bandwidth} kbps")

# Encode the audio
encoder_outputs = encodec.encode(**inputs, bandwidth=target_bandwidth)
codes = encoder_outputs.audio_codes
scales = encoder_outputs.audio_scales

# Save to .ecdc file
torch.save({"codes": codes, "scales": scales}, "compressed_audio.ecdc")

# Calculate compression rate
import os
original_size = os.path.getsize(audio_path)
compressed_size = os.path.getsize("compressed_audio.ecdc")
compression_rate = compressed_size / original_size*100
print(f"Compression rate: {compression_rate:.2f}%")

# Load back from .ecdc file
loaded_data = torch.load("compressed_audio.ecdc")
codes = loaded_data["codes"]
scales = loaded_data["scales"]

# Decode the audio
with torch.no_grad():
  decoder_outputs = encodec.decode(audio_codes=codes, audio_scales=scales)

audio = decoder_outputs.audio_values

# Display the audio
Audio(audio.numpy().squeeze(), rate=sample_rate)


Compression bandwidth: 1.5 kbps
Compression rate: 1.42%


  loaded_data = torch.load("compressed_audio.ecdc")
