## Evaluate Released DAC16k Model

In [None]:
!pip install descript-audio-codec pesq

In [23]:
import dac
model_path = dac.utils.download(model_type="16khz")
model = dac.DAC.load(model_path)



In [24]:
device = "cuda"
model = model.to(device)

In [7]:
model.quantizer

ResidualVectorQuantize(
  (quantizers): ModuleList(
    (0-11): 12 x VectorQuantize(
      (in_proj): Conv1d(1024, 8, kernel_size=(1,), stride=(1,))
      (out_proj): Conv1d(8, 1024, kernel_size=(1,), stride=(1,))
      (codebook): Embedding(1024, 8)
    )
  )
)

In [10]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
import torch
import numpy as np
from tqdm import tqdm
from pesq import pesq

from huggingface_hub import hf_hub_download

repo_id = "Tracygu/dnscustom"
data_path = hf_hub_download(repo_id=repo_id, filename="DNS_CHALLENGE/processed_yz/test.pt", repo_type="dataset")
testset = torch.load(data_path, map_location=device)
testset.shape

# testset = torch.load("./data/DNS_CHALLENGE/processed_yz/test.pt")
# testset.shape

test.pt:   0%|          | 0.00/741M [00:00<?, ?B/s]

torch.Size([1158, 160000])

In [None]:
SR = 16000
n_quantizers = 12

device = "cuda"
model = model.to(device)

test_perf = []
model.eval()
with torch.inference_mode():
    for n_q in [12, 9, 6, 3]:
        print(f"Evaluating DAC at {n_q*.5:.2f}kbps")
        for i in tqdm(range(testset.size(0))):
            x = testset[i:i+1, :-80].unsqueeze(1).to(device)
            x_process = model.preprocess(x, sample_rate=SR)

            z, codes, latents, _, _ = model.encode(x_process, n_quantizers=n_q)
            x_recon = model.decode(z)

            obj_score = pesq(SR,
                             x.squeeze(0).squeeze(0).cpu().numpy(),
                             x_recon.squeeze(0).squeeze(0).cpu().numpy(), 'wb')
            test_perf.append(obj_score)
        print("Test PESQ: ", np.mean(test_perf))

Evaluating DAC at 6.00kbps


100%|██████████| 1158/1158 [11:15<00:00,  1.71it/s]


Test PESQ:  4.01297817380523
Evaluating DAC at 4.50kbps


100%|██████████| 1158/1158 [11:03<00:00,  1.74it/s]


Test PESQ:  3.5776374306085814
Evaluating DAC at 3.00kbps


100%|██████████| 1158/1158 [11:00<00:00,  1.75it/s]


Test PESQ:  3.0323141937497433
Evaluating DAC at 1.50kbps


100%|██████████| 1158/1158 [11:44<00:00,  1.64it/s]

Test PESQ:  2.588869664284026





In [None]:
z.shape, codes.shape, latents.shape

(torch.Size([1, 1024, 499]),
 torch.Size([1, 12, 499]),
 torch.Size([1, 96, 499]))

In [None]:
10 * 36 * 500 / 10

18000.0

In [None]:
10 * 12 * 500 / 10

6000.0

## Evaluate Reproduced DAC16k model

In [12]:
import dac

model = dac.DAC(
    encoder_dim=64,
    encoder_rates=[2,4,5,8],
    decoder_dim=1536,
    decoder_rates=[8,5,4,2],
    sample_rate=16000,
    n_codebooks=12,
    codebook_size=1024,
    codebook_dim=8,
    quantizer_dropout=0.5,
)



In [13]:
import torch
ckp = torch.load("./weights.pth", map_location=device)["state_dict"]
model.load_state_dict(ckp, strict=False)

<All keys matched successfully>

In [14]:
trainable_params = sum(
	p.numel() for p in model.parameters() if p.requires_grad
)
trainable_params

74175906

In [16]:
SR = 16000
n_quantizers = 12

device = "cuda"
model = model.to(device)

test_perf = []
model.eval()
with torch.inference_mode():
    for n_q in [12, 9, 6, 3]:
        print(f"Evaluating DAC at {n_q*.5:.2f}kbps")
        for i in tqdm(range(testset.size(0))):
            x = testset[i:i+1, :-80].unsqueeze(1).to(device)
            x_process = model.preprocess(x, sample_rate=SR)

            z, codes, latents, _, _ = model.encode(x_process, n_quantizers=n_q)
            x_recon = model.decode(z)

            obj_score = pesq(SR,
                             x.squeeze(0).squeeze(0).cpu().numpy(),
                             x_recon.squeeze(0).squeeze(0).cpu().numpy(), 'wb')
            test_perf.append(obj_score)
        print("Test PESQ: ", np.mean(test_perf))

Evaluating DAC at 6.00kbps


  2%|▏         | 26/1158 [00:15<10:53,  1.73it/s]


KeyboardInterrupt: 

## Evaluate Latency

In [25]:
import time, glob, torchaudio

eval_pth = "./eval_instances"
eval_files = glob.glob(f"{eval_pth}/*.wav")
eval_audios = [
    (torchaudio.load(f)[0].unsqueeze(1)).to(device) for f in eval_files
]
# eval_audios = eval_audios[:1]

model = model.to(device)
def compress(model, n_q):
    encoded = []
    for d in eval_audios:
        x_process = model.preprocess(d, sample_rate=16000)

        z, codes, latents, _, _ = model.encode(x_process, n_quantizers=n_q)

        encoded.append(z)

    return encoded


def recover(model, encoded):
    for encoded_d in encoded:
        x_recon = model.decode(encoded_d)
    return


In [None]:
start = time.time()
encoded = compress(model, n_q=12)
end = time.time()
print(f"DAC Compress Time ({len(eval_audios)} 10sec audio) on {device}: ", end - start)

start = time.time()
recover(model, encoded)
end = time.time()
print(f"DAC Recover Time ({len(eval_audios)} 10sec audio) on {device}: ", end - start)