In [2]:
import glob
import json
import os
from pathlib import Path

import torch.nn.functional as F
import soundfile as sf
import torch
from tqdm import tqdm
from vqvae import VQVAE
import librosa
from librosa.util import normalize

# Parse the codec configuration; the file handle is only needed for the parse.
with open('./config_24k_320d.json', 'r') as f:
    config = json.load(f)
# Sampling rate declared by the config; audio read at any other rate is
# resampled to this value before encoding.
sample_rate = config['sampling_rate']

# Output directory, input directory searched for wav files, and the maximum
# number of files to encode.
outputdir, inputdir, num = '../outputdir', '../test-other', 1024

if __name__ == '__main__':
    # Encode up to `num` wav files found under `inputdir` with the VQVAE codec
    # and save a {file id: model output} mapping to `outputdir`.
    Path(outputdir).mkdir(parents=True, exist_ok=True)
    print("Init model and load weights")
    # Download the weights from
    # https://huggingface.co/Dongchao/AcademiCodec/blob/main/HiFi-Codec-24k-320d
    # and place them at ../ckpt/HiFi-Codec-24k-320d before running this cell.
    model = VQVAE('./config_24k_320d.json', '../ckpt/HiFi-Codec-24k-320d', with_encoder=True, return_acoustic_tokens_only=True)
    model.cuda()
    model.eval()
    print("Model ready")

    # Without recursive=True each `**` behaves like `*`, so this matches wav
    # files exactly two directory levels below `inputdir`.
    # glob order depends on the filesystem; sort so that the `[:num]` subset
    # is the same on every run and machine.
    wav_paths = sorted(glob.glob(f"{inputdir}/**/**/*.wav"))[:num]
    print(f"Globbed {len(wav_paths)} wav files.")

    fid_to_acoustic_token = {}
    # Inference only: disable autograd so no graph is built during the forward
    # passes or kept alive by the outputs stored in the dict.
    with torch.no_grad():
        for wav_path in tqdm(wav_paths):
            wav, sr = sf.read(wav_path)
            if wav.ndim > 1:
                # soundfile returns (frames, channels) for multi-channel
                # files; downmix to mono so resampling and the batch
                # dimension added below operate on a 1-D signal.
                wav = wav.mean(axis=1)
            if sr != sample_rate:
                wav = librosa.resample(wav, orig_sr=sr, target_sr=sample_rate)
            # Peak-normalize, then scale to 0.95 to leave a little headroom.
            wav = normalize(wav) * 0.95
            # Add a leading batch dimension and move the signal to the GPU.
            wav = torch.FloatTensor(wav).unsqueeze(0)
            wav = wav.to(torch.device('cuda'))
            vq_codes = model.encode(wav)
            acoustic_token = model(vq_codes)
            # File name without its extension is used as the key.
            fid = Path(wav_path).stem
            # NOTE(review): outputs are stored exactly as returned by the
            # model (presumably CUDA tensors -- confirm); loading the saved
            # file on a CPU-only machine would then need map_location='cpu'.
            fid_to_acoustic_token[fid] = acoustic_token

    torch.save(fid_to_acoustic_token, outputdir + '/fid_to_acoustic_token.pth')

Init model and load weights
Model ready
Globbed 1024 wav files.


100%|██████████| 1024/1024 [00:34<00:00, 29.81it/s]
