## Load Pre-trained Model

In [1]:
import dac
# from audiotools import AudioSignal

model_path = dac.utils.download(model_type="16khz")
model = dac.DAC.load(model_path)

2023-12-01 10:21:09.021023: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
model.quantizer

ResidualVectorQuantize(
  (quantizers): ModuleList(
    (0-11): 12 x VectorQuantize(
      (in_proj): Conv1d(1024, 8, kernel_size=(1,), stride=(1,))
      (out_proj): Conv1d(8, 1024, kernel_size=(1,), stride=(1,))
      (codebook): Embedding(1024, 8)
    )
  )
)

In [3]:
import torch
import numpy as np
from tqdm import tqdm
from pesq import pesq

testset = torch.load("./data/DNS_CHALLENGE/processed_yz/test.pt")
testset.shape

torch.Size([1158, 160000])

In [8]:
SR = 16000
n_quantizers = 12

device = "cuda"
model = model.to(device)

test_perf = []
model.eval()
with torch.inference_mode():
    for n_q in [12, 9, 6, 3]:
        print(f"Evaluating DAC at {n_q*.5:.2f}kbps")
        for i in tqdm(range(testset.size(0))):
            x = testset[i:i+1, :-80].unsqueeze(1).to(device)
            x_process = model.preprocess(x, sample_rate=SR)

            z, codes, latents, _, _ = model.encode(x_process, n_quantizers=n_q)
            x_recon = model.decode(z)

            obj_score = pesq(SR, 
                             x.squeeze(0).squeeze(0).cpu().numpy(), 
                             x_recon.squeeze(0).squeeze(0).cpu().numpy(), 'wb')
            test_perf.append(obj_score)
        print("Test PESQ: ", np.mean(test_perf))

Evaluating DAC at 6.00kbps


100%|██████████| 1158/1158 [11:15<00:00,  1.71it/s]


Test PESQ:  4.01297817380523
Evaluating DAC at 4.50kbps


100%|██████████| 1158/1158 [11:03<00:00,  1.74it/s]


Test PESQ:  3.5776374306085814
Evaluating DAC at 3.00kbps


100%|██████████| 1158/1158 [11:00<00:00,  1.75it/s]


Test PESQ:  3.0323141937497433
Evaluating DAC at 1.50kbps


100%|██████████| 1158/1158 [11:44<00:00,  1.64it/s]

Test PESQ:  2.588869664284026





In [35]:
z.shape, codes.shape, latents.shape

(torch.Size([1, 1024, 499]),
 torch.Size([1, 12, 499]),
 torch.Size([1, 96, 499]))

In [41]:
10 * 36 * 500 / 10

18000.0

In [38]:
10 * 12 * 500 / 10

6000.0