In [8]:
import tensorflow as tf
from IPython.display import Audio, display
import torchaudio
import numpy as np
import torch
from tqdm import tqdm

from main import SoundStreamEncoder, load_weights, load_model

# my_model = SoundStreamEncoder()
# my_model.build([1, 320, 1, 1])
my_model = load_model()


wav, sr = torchaudio.load('./sample.wav')
wav = torchaudio.functional.resample(wav, sr, 16000)
wav = wav[:, 3 * 16000: 8*16000]
wav = torch.mean(wav, 0)[None]

def run_model(wav, my=False):
    tflite_model = tf.lite.Interpreter('./soundstream_encoder.tflite')
    quant_encode = tf.lite.Interpreter('./quantizer.tflite').get_signature_runner('encode')
    quant_decode = tf.lite.Interpreter('./quantizer.tflite').get_signature_runner('decode')
    decoder = tf.lite.Interpreter('./lyragan.tflite').get_signature_runner()

    tflite_runner = tflite_model.get_signature_runner()
    lyra_decoded_frames = []
    for i in tqdm(range(0, wav.shape[1], 320)):
        frame = wav[:, i: i + 320]
        if not my:
            embeddings = tflite_runner(input_audio=frame)['output_0']
        else:
            embeddings = my_model(frame.numpy())

        x = quant_encode(input_frames=embeddings, num_quantizers=np.int32(np.array([16])))['output_0']
        x = quant_decode(encoding_indices=x)['output_0']
        x = decoder(input_audio=x)['output_0']
        lyra_decoded_frames.append(x)

    lyra_decoded_audio = np.concatenate(lyra_decoded_frames, 1)  
    return lyra_decoded_audio

In [9]:
lyra_decoded_audio = run_model(wav, my=False)
my_decoded_audio = run_model(wav, my=True)

100%|█████████████████████████████████████████████████████████████| 250/250 [00:00<00:00, 489.21it/s]
100%|██████████████████████████████████████████████████████████████| 250/250 [00:12<00:00, 20.14it/s]


In [10]:
lyra_obj = Audio(lyra_decoded_audio, rate=16000)
my_obj = Audio(my_decoded_audio, rate=16000)
orig_obj = Audio(wav, rate=16000)
display(orig_obj)
display(lyra_obj)
display(my_obj)


In [11]:
from visqol import visqol_lib_py
from visqol.pb2 import visqol_config_pb2
from visqol.pb2 import similarity_result_pb2
import torchaudio
import os

config = visqol_config_pb2.VisqolConfig()
config.options.use_speech_scoring = True
svr_model_path = 'lattice_tcditugenmeetpackhref_ls2_nl60_lr12_bs2048_learn.005_ep2400_train1_7_raw.tflite'
config.options.svr_model_path = os.path.join(os.path.dirname(visqol_lib_py.__file__), "model", svr_model_path)
api_16 = visqol_lib_py.VisqolApi()
config.audio.sample_rate = 16000
api_16.Create(config)

api_8 = visqol_lib_py.VisqolApi()
config.audio.sample_rate = 8000
api_8.Create(config)

def score(reference, degraded, sr16=False):
    api = api_16 if sr16 else api_8
    reference = np.float64(reference.squeeze())
    degraded = np.float64(degraded.squeeze())
    
    similarity_result = api.Measure(reference, degraded)
    return similarity_result.moslqo

In [12]:
lyra_score =  score(lyra_decoded_audio, wav, sr16=True)
my_score =  score(my_decoded_audio, wav, sr16=True)
print(lyra_score, my_score)

4.216936514323561 4.181812599037918


In [13]:
score(my_decoded_audio, lyra_decoded_audio, sr16=True), score(lyra_decoded_audio,my_decoded_audio, sr16=True)

(4.340300082788045, 4.335655664296156)