In [None]:
!pip install descript-audio-codec pesq

In [1]:
import dac
# model_path = dac.utils.download(model_type="16khz")
# model = dac.DAC.load(model_path)

device = "cuda"
# model = model.to(device)

from metrics import MelDistance, SISDRLoss, PESQ, STFTDistance, PSNR, SNR

# Evaluation metrics keyed by the label used when printing results.
# NOTE: the identifier `metircs` (sic) is kept as-is because the evaluation
# cells further down look it up under exactly this name.
metircs = dict(
    Test_PESQ=PESQ(sample_rate=16000, device=device),
    Test_MelDist=MelDistance(
        # window lengths 32 ... 2048 (powers of two)
        win_lengths=[2 ** p for p in range(5, 12)],
        # mel-band counts 5 ... 320 (doubling each step)
        n_mels=[5 * 2 ** p for p in range(7)],
    ).to(device),
    Test_STFTDist=STFTDistance(win_lengths=[2048, 512]).to(device),
    Test_SNR=SNR(),
)

In [3]:
from huggingface_hub import notebook_login
# Show the Hugging Face login widget; only needed when the dataset is pulled
# from the Hub (the hf_hub_download path in the next cell is commented out).
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import torch
import numpy as np
from tqdm.notebook import tqdm

from huggingface_hub import hf_hub_download

# Alternative source (disabled): fetch the same file from the Hugging Face Hub.
# repo_id = "Tracygu/dnscustom"
# data_path = hf_hub_download(repo_id=repo_id, filename="DNS_CHALLENGE/processed_yz/test.pt", repo_type="dataset")
data_path = "../data/DNS_CHALLENGE/processed_yz/test.pt"

# Load the saved test tensor straight onto `device` and report its size as a sanity check.
testset = torch.load(f=data_path, map_location=device)
print(testset.size())

torch.Size([1158, 160000])


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset, default_collate

class EvalSet(Dataset):
    """Dataset over a pre-saved tensor of evaluation clips.

    The file at ``test_file_path`` is read with ``torch.load`` and must hold a
    tensor with at least two dimensions (clips along dim 0, samples along
    dim 1). Each item is one clip with its last ``trim_tail`` samples removed.

    Args:
        test_file_path: Path passed to ``torch.load``.
        trim_tail: Number of trailing samples dropped from every clip.
            Defaults to 80, the value that used to be hard-coded.

    Raises:
        ValueError: If ``trim_tail`` is negative.
    """

    def __init__(self, test_file_path, trim_tail: int = 80) -> None:
        super().__init__()
        if trim_tail < 0:
            raise ValueError(f"trim_tail must be >= 0, got {trim_tail}")
        self.testset = torch.load(test_file_path)
        self.trim_tail = trim_tail

    def __len__(self):
        """Return the number of clips (size of dim 0)."""
        return self.testset.size(0)

    def __getitem__(self, i):
        """Return clip ``i`` without its last ``trim_tail`` samples."""
        # Use an explicit end index: a plain ``[:-trim_tail]`` would return an
        # empty clip for trim_tail == 0. Clamp at 0 so clips shorter than the
        # trim still yield an empty tensor, as ``[:-80]`` did.
        end = max(self.testset.size(1) - self.trim_tail, 0)
        return self.testset[i, :end]

# Wrap the DNS test tensor in a dataset and iterate it in fixed order.
dns_eval_set = EvalSet(test_file_path=data_path)
eval_loader = DataLoader(
    dataset=dns_eval_set,
    batch_size=16,
    shuffle=False,
    collate_fn=default_collate,
)

# Peek at one batch to confirm the [batch, samples] layout.
first_batch = next(iter(eval_loader))
first_batch.shape

torch.Size([16, 159920])

## Evaluate Released DAC16kHz Model

In [10]:
SR = 16000
n_quantizers = 12

device = "cuda"
# Load the released 16 kHz checkpoint in this cell: the only other place it is
# loaded is commented out, so on a fresh kernel `model` would be undefined here.
model_path = dac.utils.download(model_type="16khz")
model = dac.DAC.load(model_path)
model = model.to(device)

model.eval()
with torch.inference_mode():
    # Sweep the number of active codebooks; the labels assume 0.5 kbps per codebook.
    for n_q in [12, 9, 6, 3]: # 6k -> 1.5k
        test_perf = {}
        for i, x in tqdm(enumerate(eval_loader), total=len(eval_loader), desc=f"Evaluating DAC-16kHz at {n_q*.5:.2f}kbps"):
            # x: [bs, T] -> [bs, 1, T]
            x = x.unsqueeze(1).to(device)
            x_process = model.preprocess(x, sample_rate=SR)
            z, codes, latents, _, _ = model.encode(x_process, n_quantizers=n_q)
            x_recon = model.decode(z)
            # Trim the reconstruction to the input length (equivalent to the
            # previous hard-coded [:-72] for the 159920-sample clips used here)
            # so the element-wise metrics compare equally long signals.
            x_recon = x_recon[:, :, :x.shape[-1]]

            reference, estimate = x.squeeze(1), x_recon.squeeze(1)
            for k, m in metircs.items():
                # Convert to a plain Python list before accumulating, as the
                # other evaluation cells do; the raw metric output has no
                # list-style .extend and may live on the GPU.
                test_perf.setdefault(k, []).extend(m(reference, estimate).tolist())

        for k, v in test_perf.items():
            test_perf[k] = np.mean(v)
        print(f"Test Metrics at {n_q*.5:.2f}kbps: ", end="")
        print(" | ".join(f"{k}: {v:.4f}" for k, v in test_perf.items()))

Evaluating DAC-16kHz at 6.00kbps:   0%|          | 0/73 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 2.3306075044246537, 'si-sdr': 10.987368748942309, 'pesq': 4.013627748827036}


Evaluating DAC-16kHz at 4.50kbps:   0%|          | 0/73 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 2.974803227834751, 'si-sdr': 11.448827898126604, 'pesq': 3.142567297749363}


Evaluating DAC-16kHz at 3.00kbps:   0%|          | 0/73 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 3.6125088269228764, 'si-sdr': 12.009477991041356, 'pesq': 1.941796916239414}


Evaluating DAC-16kHz at 1.50kbps:   0%|          | 0/73 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 4.516397475784513, 'si-sdr': 12.466802372460435, 'pesq': 1.2585744640575995}


In [None]:
# Shapes of the `z`, `codes` and `latents` values left over from the most
# recent `model.encode(...)` call.
z.shape, codes.shape, latents.shape

(torch.Size([1, 1024, 499]),
 torch.Size([1, 12, 499]),
 torch.Size([1, 96, 499]))

In [None]:
# Scratch bitrate arithmetic. Presumably: 10 bits per code (1024-entry
# codebook) x 36 codebooks x 500 frames, divided by 10 s -> bits per second.
# TODO confirm the meaning of each factor.
10 * 36 * 500 / 10

18000.0

In [None]:
# Same scratch arithmetic for 12 codebooks; presumably 10 bits x 12 codebooks
# x 500 frames / 10 s = 6000 bit/s (TODO confirm).
10 * 12 * 500 / 10

6000.0

## Evaluate Reproduced DAC16kHz model (0.5kbps -> 6.0kbps)

In [11]:
import dac
import torch

# Architecture of the reproduced 16 kHz model with 12 codebooks.
dac_config = dict(
    encoder_dim=64,
    encoder_rates=[2, 4, 5, 8],
    decoder_dim=1536,
    decoder_rates=[8, 5, 4, 2],
    sample_rate=16000,
    n_codebooks=12,
    codebook_size=1024,
    codebook_dim=8,
    quantizer_dropout=0.5,
)
model = dac.DAC(**dac_config)

# The checkpoint file stores the weights under the "state_dict" key.
checkpoint_path = "./weights.pth"
ckp = torch.load(checkpoint_path, map_location=device)["state_dict"]
# strict=False: mismatched keys are returned (and displayed as the cell
# output) instead of raising.
model.load_state_dict(ckp, strict=False)

In [12]:
# Total number of scalar parameters that have requires_grad set.
trainable_params = sum(
    map(
        lambda p: p.numel(),
        filter(lambda p: p.requires_grad, model.parameters()),
    )
)
trainable_params

74175906

In [13]:
SR = 16000
n_quantizers = 12

device = "cuda"
model = model.to(device)

model.eval()
with torch.inference_mode():
    # Sweep the number of active codebooks; the labels assume 0.5 kbps per codebook.
    for n_q in [12, 9, 6, 3]: # 6k -> 1.5k
        test_perf = {}
        for i, x in tqdm(enumerate(eval_loader), total=len(eval_loader), desc=f"Evaluating DAC-16kHz at {n_q*.5:.2f}kbps"):
            # x: [bs, T] -> [bs, 1, T]
            x = x.unsqueeze(1).to(device)
            x_process = model.preprocess(x, sample_rate=SR)
            z, codes, latents, _, _ = model.encode(x_process, n_quantizers=n_q)
            x_recon = model.decode(z)
            # Trim the reconstruction to the input length (equivalent to the
            # previous hard-coded [:-72] for the 159920-sample clips used here)
            # so the element-wise metrics compare equally long signals.
            x_recon = x_recon[:, :, :x.shape[-1]]

            reference, estimate = x.squeeze(1), x_recon.squeeze(1)
            for k, m in metircs.items():
                # Convert to a plain Python list before accumulating, as the
                # other evaluation cells do; the raw metric output has no
                # list-style .extend and may live on the GPU.
                test_perf.setdefault(k, []).extend(m(reference, estimate).tolist())

        for k, v in test_perf.items():
            test_perf[k] = np.mean(v)
        print(f"Test Metrics at {n_q*.5:.2f}kbps: ", end="")
        print(" | ".join(f"{k}: {v:.4f}" for k, v in test_perf.items()))

Evaluating DAC-16kHz at 6.00kbps:   0%|          | 0/73 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 1.823663345272677, 'si-sdr': -11.434682292744698, 'pesq': 4.064680259977186}


Evaluating DAC-16kHz at 4.50kbps:   0%|          | 0/73 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 1.9953874549305706, 'si-sdr': -10.059480801430597, 'pesq': 3.820185761171713}


Evaluating DAC-16kHz at 3.00kbps:   0%|          | 0/73 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 2.2183342746914154, 'si-sdr': -8.575224089458752, 'pesq': 3.446854174445948}


Evaluating DAC-16kHz at 1.50kbps:   0%|          | 0/73 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 2.6885750480060544, 'si-sdr': -4.684817125164262, 'pesq': 2.518783029702867}


## Evaluate Reproduced DAC16kHz model (0.5kbps -> 9.0kbps)

In [5]:
import dac
import torch

# Architecture of the reproduced 16 kHz model with 18 codebooks.
dac_config = dict(
    encoder_dim=64,
    encoder_rates=[2, 4, 5, 8],
    decoder_dim=1536,
    decoder_rates=[8, 5, 4, 2],
    sample_rate=16000,
    n_codebooks=18,
    codebook_size=1024,
    codebook_dim=8,
    quantizer_dropout=0.5,
)
model = dac.DAC(**dac_config)

# NOTE(review): the checkpoint load below is commented out, so as written this
# cell leaves `model` with freshly initialised weights. Re-enable it before
# running the evaluation cell if trained weights are expected.
# ckp = torch.load("../dac_runs/9kmodel/weights.pth", map_location=device)["state_dict"]
# model.load_state_dict(ckp, strict=False)

In [6]:
# Total number of scalar parameters that have requires_grad set.
trainable_params = sum(
    map(
        lambda p: p.numel(),
        filter(lambda p: p.requires_grad, model.parameters()),
    )
)
trainable_params

74335746

In [5]:
from tqdm import tqdm
SR = 16000
n_quantizers = 18

device = "cuda"
model = model.to(device)

model.eval()
with torch.inference_mode():
    # Sweep from all 18 codebooks down to 3 (labelled 9 kbps -> 1.5 kbps).
    for n_q in (18, 15, 12, 9, 6, 3):
        test_perf = {}
        progress = tqdm(
            enumerate(eval_loader),
            total=len(eval_loader),
            desc=f"Evaluating DAC-16kHz at {n_q*.5:.2f}kbps",
        )
        for i, x in progress:
            # Batches arrive as [bs, T]; add a channel axis for the codec.
            x = x.unsqueeze(1).to(device)
            x_process = model.preprocess(x, sample_rate=SR)
            z, codes, latents, _, _ = model.encode(x_process, n_quantizers=n_q)
            # Drop the last 72 samples of the decoder output.
            x_recon = model.decode(z)[..., :-72]

            reference, estimate = x.squeeze(1), x_recon.squeeze(1)
            for k, m in metircs.items():
                scores = m(reference, estimate).tolist()
                if k in test_perf:
                    test_perf[k].extend(scores)
                else:
                    test_perf[k] = scores

        # Collapse each metric's per-clip list into its mean.
        for k in list(test_perf):
            test_perf[k] = np.mean(test_perf[k])
        summary = " | ".join(f"{k}: {v:.4f}" for k, v in test_perf.items())
        print(f"Test Metrics at {n_q*.5:.2f}kbps: ", end="")
        print(summary)

Evaluating DAC-16kHz at 9.00kbps: 100%|██████████| 73/73 [07:43<00:00,  6.35s/it]


Test Metrics at 9.00kbps: Test_PESQ: 4.2572 | Test_MelDist: 1.6459 | Test_STFTDist: 1.1035 | Test_SNR: 13.0357


Evaluating DAC-16kHz at 7.50kbps: 100%|██████████| 73/73 [07:42<00:00,  6.34s/it]


Test Metrics at 7.50kbps: Test_PESQ: 4.1702 | Test_MelDist: 1.7451 | Test_STFTDist: 1.1224 | Test_SNR: 12.3376


Evaluating DAC-16kHz at 6.00kbps: 100%|██████████| 73/73 [07:42<00:00,  6.33s/it]


Test Metrics at 6.00kbps: Test_PESQ: 4.0619 | Test_MelDist: 1.8532 | Test_STFTDist: 1.1458 | Test_SNR: 11.6587


Evaluating DAC-16kHz at 4.50kbps: 100%|██████████| 73/73 [07:42<00:00,  6.33s/it]


Test Metrics at 4.50kbps: Test_PESQ: 3.8348 | Test_MelDist: 2.0179 | Test_STFTDist: 1.1817 | Test_SNR: 10.5353


Evaluating DAC-16kHz at 3.00kbps: 100%|██████████| 73/73 [07:42<00:00,  6.33s/it]


Test Metrics at 3.00kbps: Test_PESQ: 3.3909 | Test_MelDist: 2.2751 | Test_STFTDist: 1.2389 | Test_SNR: 8.7008


Evaluating DAC-16kHz at 1.50kbps: 100%|██████████| 73/73 [07:42<00:00,  6.34s/it]

Test Metrics at 1.50kbps: Test_PESQ: 2.5201 | Test_MelDist: 2.7276 | Test_STFTDist: 1.3451 | Test_SNR: 5.4607





## Evaluate Reproduced DAC16kHz Tiny 8M model (0.5kbps -> 9.0kbps)

In [2]:
import dac, torch
# Architecture of the reduced-width ("tiny") 16 kHz model with 18 codebooks.
dac_config = dict(
    encoder_dim=32,
    encoder_rates=[2, 4, 5, 8],
    decoder_dim=288,
    decoder_rates=[8, 5, 4, 2],
    sample_rate=16000,
    n_codebooks=18,
    codebook_size=1024,
    codebook_dim=8,
    quantizer_dropout=0.5,
)
model = dac.DAC(**dac_config)

# Total number of scalar parameters that have requires_grad set.
trainable_params = sum(
    map(
        lambda p: p.numel(),
        filter(lambda p: p.requires_grad, model.parameters()),
    )
)
trainable_params

8191092

In [5]:
device = "cuda"

# Read the checkpoint on the CPU first, copy the weights into the model,
# then move the whole model to `device`.
ckp_path = "/root/autodl-fs/dac_output/dac16khz9k_tiny/best/dac/weights.pth"
ckp = torch.load(ckp_path, map_location="cpu")["state_dict"]
# strict=False: mismatched keys do not raise.
model.load_state_dict(ckp, strict=False)
model = model.to(device)

In [6]:
from tqdm import tqdm
SR = 16000
n_quantizers = 18

device = "cuda"
model = model.to(device)

model.eval()
with torch.inference_mode():
    # Sweep from all 18 codebooks down to 3 (labelled 9 kbps -> 1.5 kbps).
    for n_q in (18, 15, 12, 9, 6, 3):
        test_perf = {}
        progress = tqdm(
            enumerate(eval_loader),
            total=len(eval_loader),
            desc=f"Evaluating DAC-16kHz at {n_q*.5:.2f}kbps",
        )
        for i, x in progress:
            # Batches arrive as [bs, T]; add a channel axis for the codec.
            x = x.unsqueeze(1).to(device)
            x_process = model.preprocess(x, sample_rate=SR)
            z, codes, latents, _, _ = model.encode(x_process, n_quantizers=n_q)
            # Drop the last 72 samples of the decoder output.
            x_recon = model.decode(z)[..., :-72]

            reference, estimate = x.squeeze(1), x_recon.squeeze(1)
            for k, m in metircs.items():
                scores = m(reference, estimate).tolist()
                if k in test_perf:
                    test_perf[k].extend(scores)
                else:
                    test_perf[k] = scores

        # Collapse each metric's per-clip list into its mean.
        for k in list(test_perf):
            test_perf[k] = np.mean(test_perf[k])
        summary = " | ".join(f"{k}: {v:.4f}" for k, v in test_perf.items())
        print(f"Test Metrics at {n_q*.5:.2f}kbps: ", end="")
        print(summary)

Evaluating DAC-16kHz at 9.00kbps: 100%|██████████| 73/73 [03:02<00:00,  2.50s/it]


Test Metrics at 9.00kbps: Test_PESQ: 3.9574 | Test_MelDist: 1.9587 | Test_STFTDist: 1.2000 | Test_SNR: 11.5076


Evaluating DAC-16kHz at 7.50kbps: 100%|██████████| 73/73 [03:04<00:00,  2.53s/it]


Test Metrics at 7.50kbps: Test_PESQ: 3.8491 | Test_MelDist: 2.0513 | Test_STFTDist: 1.2161 | Test_SNR: 10.8713


Evaluating DAC-16kHz at 6.00kbps: 100%|██████████| 73/73 [02:57<00:00,  2.43s/it]


Test Metrics at 6.00kbps: Test_PESQ: 3.7139 | Test_MelDist: 2.1607 | Test_STFTDist: 1.2365 | Test_SNR: 10.1725


Evaluating DAC-16kHz at 4.50kbps: 100%|██████████| 73/73 [02:56<00:00,  2.42s/it]


Test Metrics at 4.50kbps: Test_PESQ: 3.4286 | Test_MelDist: 2.3367 | Test_STFTDist: 1.2691 | Test_SNR: 8.9550


Evaluating DAC-16kHz at 3.00kbps: 100%|██████████| 73/73 [02:55<00:00,  2.41s/it]


Test Metrics at 3.00kbps: Test_PESQ: 2.9143 | Test_MelDist: 2.6121 | Test_STFTDist: 1.3223 | Test_SNR: 7.0585


Evaluating DAC-16kHz at 1.50kbps: 100%|██████████| 73/73 [03:04<00:00,  2.53s/it]

Test Metrics at 1.50kbps: Test_PESQ: 1.9847 | Test_MelDist: 3.1390 | Test_STFTDist: 1.4319 | Test_SNR: 3.6396





## Evaluate Released DAC24kHz model

In [None]:
# 24 kHz model: 0.75 kbps per codebook.
# Scratch arithmetic; presumably 10 bits per code x 32 codebooks x 750 frames,
# divided by 10 s -> bits per second (TODO confirm the meaning of each factor).
10 * 32 * 750 / 10

In [4]:
import dac
device = "cuda"

# Fetch the released 24 kHz checkpoint, build the model from it and place it on `device`.
model_path = dac.utils.download(model_type="24khz")
model = dac.DAC.load(model_path).to(device)



In [6]:
# Rebind `eval_loader` to the test tensor stored in ./test.pt.
libri_eval_set = EvalSet(test_file_path="./test.pt")
eval_loader = DataLoader(
    dataset=libri_eval_set,
    batch_size=16,
    shuffle=False,
    collate_fn=default_collate,
)

# Peek at one batch to confirm the [batch, samples] layout.
first_batch = next(iter(eval_loader))
first_batch.shape

torch.Size([16, 239920])

In [None]:
# Evaluate the 24 kHz model over `eval_loader` for several codebook counts and
# print the mean of each metric per setting.
SR = 24000
# NOTE(review): n_quantizers is assigned but not read in this cell; the sweep
# below tops out at 24 codebooks.
n_quantizers = 32

device = "cuda"
model = model.to(device)

model.eval()
with torch.inference_mode():
    # Labels use 0.75 kbps per codebook: 24 -> 18 kbps ... 4 -> 3 kbps.
    for n_q in [24, 20, 16, 12, 8, 4]: # 18k -> 3k
        test_perf = {"mel_dist": [], "si-sdr": [], "pesq": []}
        for i, x in tqdm(enumerate(eval_loader), total=len(eval_loader), desc=f"Evaluating DAC-24kHz at {n_q*.75:.2f}kbps"):
            # x: [bs, T] -> [bs, 1, T] after unsqueeze
            x = x.unsqueeze(1).to(device)

            x_process = model.preprocess(x, sample_rate=SR)
            z, codes, latents, _, _ = model.encode(x_process, n_quantizers=n_q)
            x_recon = model.decode(z)

            # Drop the last 72 decoder samples before comparing with the input.
            x_recon = x_recon[:, :, :-72]
            # NOTE(review): mel_distance_metric, sisdr_metric and pesq_metric are
            # not defined in the visible part of this notebook; the cell only runs
            # if they are still present in the kernel. The other evaluation cells
            # use the `metircs` dict instead.
            test_perf["mel_dist"].extend(mel_distance_metric(x.squeeze(1), x_recon.squeeze(1)).tolist())
            test_perf["si-sdr"].extend(sisdr_metric(x.squeeze(1), x_recon.squeeze(1)).tolist())
            # No .tolist() here: the result is extended into the list directly.
            test_perf["pesq"].extend(pesq_metric(x.squeeze(1), x_recon.squeeze(1)))

        # Replace each per-clip list with its mean.
        for k, v in test_perf.items():
            test_perf[k] = np.mean(v)
        print("Test Metrics: ", test_perf)

Evaluating DAC-24kHz at 18.00kbps:   0%|          | 0/63 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 1.7218352154493333, 'si-sdr': 7.79192368571338, 'pesq': 4.403405965089798}


Evaluating DAC-24kHz at 15.00kbps:   0%|          | 0/63 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 1.8206092302799224, 'si-sdr': 7.802491025560536, 'pesq': 4.361736610174179}


Evaluating DAC-24kHz at 12.00kbps:   0%|          | 0/63 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 1.963197175860405, 'si-sdr': 7.8063335680384185, 'pesq': 4.2500029585361485}


Evaluating DAC-24kHz at 9.00kbps:   0%|          | 0/63 [00:00<?, ?it/s]

Test Metrics:  {'mel_dist': 2.1523833882808687, 'si-sdr': 7.843131553557497, 'pesq': 4.019923070192337}


Evaluating DAC-24kHz at 6.00kbps:   0%|          | 0/63 [00:00<?, ?it/s]

## Evaluate Latency of Reproduced Model (computation time)

In [9]:
import time, glob, torchaudio
device = "cpu"

eval_pth = "/Users/tracy/Desktop/dns/eval_instances"
# glob does not guarantee an ordering, so sort to make the benchmarked subset
# the same on every run.
eval_files = sorted(glob.glob(f"{eval_pth}/*.wav"))

# Only the first 10 clips are timed, so only those are decoded, instead of
# loading every file in the directory and discarding the rest.
# torchaudio.load returns (waveform, sample_rate); unsqueeze(1) inserts an
# axis at position 1 of the waveform before it is passed to the model.
eval_audios = [
    torchaudio.load(f)[0].unsqueeze(1).to(device) for f in eval_files[:10]
]

model = model.to(device)
def compress(model, n_q, audios=None):
    """Encode each clip and return its codes.

    Args:
        model: Object exposing ``preprocess(x, sample_rate=...)`` and
            ``encode(x, n_quantizers=...)``; ``encode`` must return a 5-tuple
            whose second element is the codes.
        n_q: Number of quantizers passed to ``model.encode``.
        audios: Iterable of clips to encode. Defaults to the notebook-level
            ``eval_audios`` list, which is what the function always used before.

    Returns:
        list: One ``codes`` entry per input clip, in input order.
    """
    import torch  # local import: the cell defining this function does not import torch

    if audios is None:
        audios = eval_audios

    encoded = []
    # Disable autograd so the latency measurement is not inflated by graph
    # construction and retained activations. no_grad (rather than
    # inference_mode) keeps the returned codes usable outside this context.
    with torch.no_grad():
        for d in audios:
            x_process = model.preprocess(d, sample_rate=16000)
            _, codes, _, _, _ = model.encode(x_process, n_quantizers=n_q)
            encoded.append(codes)

    return encoded


def recover(model, encoded):
    """Decode every entry of ``encoded`` and discard the result.

    Intended for timing only: each item is turned back into a quantized latent
    with ``model.quantizer.from_codes`` and passed through ``model.decode``.

    Args:
        model: Object exposing ``quantizer.from_codes(codes)`` (returning a
            3-tuple whose first element is the quantized latent) and
            ``decode(z_q)``.
        encoded: Iterable of codes, e.g. the list returned by ``compress``.

    Returns:
        None
    """
    import torch  # local import: the cell defining this function does not import torch

    # Disable autograd so decoding time and memory are not inflated by graph
    # construction.
    with torch.no_grad():
        for encoded_d in encoded:
            z_q, _, _ = model.quantizer.from_codes(encoded_d)
            model.decode(z_q)
    return


In [10]:
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() follows the wall clock and can jump if it is adjusted.
start = time.perf_counter()
encoded = compress(model, n_q=18)
end = time.perf_counter()
print(f"DAC Compress Time ({len(eval_audios)} 10sec audio) on {device}: ", end - start)

start = time.perf_counter()
recover(model, encoded)
end = time.perf_counter()
print(f"DAC Recover Time ({len(eval_audios)} 10sec audio) on {device}: ", end - start)

DAC Compress Time (10 10sec audio) on cpu:  16.369736909866333
DAC Recover Time (10 10sec audio) on cpu:  32.90382790565491


In [1]:
import dac, torchaudio
# Fetch the released 16 kHz checkpoint and build the model from it.
model = dac.DAC.load(dac.utils.download(model_type="16khz"))

# Read one example recording; the returned sample rate is discarded.
wav_path = "/Users/tracy/Desktop/Audio_Codec/swin-debug-vis/test/mandarin_instance1.wav"
x, _ = torchaudio.load(wav_path)

RuntimeError: Failed to load audio from /Users/tracy/Desktop/Audio_Codec/swin-debug-vis/test/mandarin_instance1.wav

In [7]:
# Insert the extra axis into a new name instead of overwriting `x`, so
# re-running this cell does not keep stacking dimensions onto the waveform.
x_in = x.unsqueeze(1)
x_process = model.preprocess(x_in, sample_rate=16000)
# Encode with 12 quantizers, then decode the quantized latent back to audio.
z, codes, latents, _, _ = model.encode(x_process, n_quantizers=12)
x_recon = model.decode(z)

In [8]:
# Show the shape of the reconstruction produced by the previous cell.
x_recon.shape

torch.Size([1, 1, 159992])

In [9]:
# The decode call above ran with autograd enabled, so the reconstruction may
# still be attached to the graph; detach it and make sure it is on the CPU
# before handing it to torchaudio. squeeze(1) drops the size-1 axis at dim 1.
torchaudio.save("/Users/tracy/Desktop/recon_dac.wav", x_recon.squeeze(1).detach().cpu(), 16000)

In [2]:
# Display the module tree of the loaded model.
model

DAC(
  (encoder): Encoder(
    (block): Sequential(
      (0): Conv1d(1, 64, kernel_size=(7,), stride=(1,), padding=(3,))
      (1): EncoderBlock(
        (block): Sequential(
          (0): ResidualUnit(
            (block): Sequential(
              (0): Snake1d()
              (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
              (2): Snake1d()
              (3): Conv1d(64, 64, kernel_size=(1,), stride=(1,))
            )
          )
          (1): ResidualUnit(
            (block): Sequential(
              (0): Snake1d()
              (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
              (2): Snake1d()
              (3): Conv1d(64, 64, kernel_size=(1,), stride=(1,))
            )
          )
          (2): ResidualUnit(
            (block): Sequential(
              (0): Snake1d()
              (1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(27,), dilation=(9,))
              (2): Snake1d()
              