In [1]:
import numpy as np

from Metrics.FidelityMetrics import SNRMetric, PESQMetric, SECSMetric
import os
from util import *
import pandas as pd

In [2]:
# Accumulators for the three fidelity metrics imported from Metrics.FidelityMetrics.
# Later cells feed them (reference, degraded) pairs via update() and read the result via compute().
# NOTE(review): the "Loaded the voice encoder model" message printed by this cell
# presumably comes from SECSMetric's constructor — confirm.
snr_metric = SNRMetric()
pesq_metric = PESQMetric()
secs_metric = SECSMetric()

Loaded the voice encoder model on cuda in 3.80 seconds.


In [3]:
# Directory holding the paired "ffmpeg" (original) / "antifake" (protected) WAV files.
# The default is the path this notebook was originally run with; set the
# ANTIFAKE_DIR environment variable to point at the data on another machine
# instead of editing the notebook.
folder = os.environ.get("ANTIFAKE_DIR", "/mnt/d/repo/AntiFake/antifake")

In [4]:
# Every WAV file that sits directly inside `folder`.
subfiles = list(filter(lambda name: name.endswith(".wav"), os.listdir(folder)))

# Split the WAV files by the marker contained in their file name.
original = list(filter(lambda name: "ffmpeg" in name, subfiles))
antifake = list(filter(lambda name: "antifake" in name, subfiles))
def normalize_name(filename):
    """Return `filename` with every "ffmpeg" and "antifake" substring removed.

    The two variants of a clip differ only by that marker, so the stripped
    name can be used as a shared key to pair them.
    """
    stripped = filename
    for marker in ("ffmpeg", "antifake"):
        stripped = stripped.replace(marker, "")
    return stripped
# Index each group of files by its marker-free name so matching clips share a key.
original_dict = dict(zip(map(normalize_name, original), original))
antifake_dict = dict(zip(map(normalize_name, antifake), antifake))

# (original file, AntiFake file) for every key that appears in both groups.
paired_files = [
    (original_file, antifake_dict[key])
    for key, original_file in original_dict.items()
    if key in antifake_dict
]

# Sanity check: every file found its counterpart and there are exactly 76 pairs.
assert len(paired_files) == len(original_dict) == len(antifake_dict) == 76

In [7]:
# Start from fresh accumulators so that re-running this cell does not add the
# same 76 pairs on top of whatever an earlier run already fed into the metrics.
snr_metric = SNRMetric()
pesq_metric = PESQMetric()
secs_metric = SECSMetric()

# The loop variables deliberately do not reuse the names `original` / `antifake`:
# those are the file-name lists built by the pairing cell and must not be clobbered.
for original_name, antifake_name in paired_files:
    reference = load_wav(os.path.join(folder, original_name), target_sr=16000)
    distorted = load_wav(os.path.join(folder, antifake_name), target_sr=16000)

    # Trim both signals to the shorter of the two along dim 1
    # (assumes load_wav returns (channels, samples) tensors — TODO confirm).
    n_samples = min(reference.size(1), distorted.size(1))
    reference = reference[:, :n_samples]
    distorted = distorted[:, :n_samples]

    snr_metric.update(reference, distorted)
    pesq_metric.update(reference, distorted)
    secs_metric.update(reference, distorted)

print("snr", snr_metric.compute())
print("pesq", pesq_metric.compute())
print("secs", secs_metric.compute())

tensor(14.0513) tensor(228)
snr (tensor(14.0513), tensor(3.4403))
pesq (tensor(1.3316), tensor(0.2905))
secs (tensor(0.4164), tensor(0.0789))


In [12]:
# NISQA predictions; the cell relies on the columns "deg" and "mos_pred".
mos_data = pd.read_csv("./results/NISQA_results.csv")

# Keep only the rows whose "deg" entry contains "antifake".
is_antifake_row = mos_data["deg"].str.contains("antifake")
mos_data_antifake = mos_data[is_antifake_row]

# Mean and standard deviation of the predicted MOS over those rows.
antifake_mos = mos_data_antifake["mos_pred"]
mos_pred_mean = np.mean(antifake_mos)
mos_pred_std_dev = np.std(antifake_mos)

print("mos_pred_mean", mos_pred_mean)
print("mos_pred_std", mos_pred_std_dev)

mos_pred_mean 2.1834211365172735
mos_pred_std 0.49559235089312387


In [4]:
# Fresh accumulators: this cell evaluates a different dataset than the AntiFake cell.
snr_metric = SNRMetric()
pesq_metric = PESQMetric()
secs_metric = SECSMetric()

folder = "./sampled_pair"

# One sub-directory per speaker. sorted() makes the processing (and printed)
# order independent of the arbitrary order returned by os.listdir.
speakers = sorted(f for f in os.listdir(folder) if os.path.isdir(os.path.join(folder, f)))

for speaker in speakers:
    speaker_dir = f"{folder}/{speaker}"
    for i in range(1, 3):
        # Each speaker has two reference clips: <speaker>_1.wav and <speaker>_2.wav.
        prefix = f"{speaker}_{i}"
        original = load_wav(f"{speaker_dir}/{prefix}.wav", target_sr=16000)
        for subfile in sorted(os.listdir(speaker_dir)):
            if subfile.startswith(prefix) and "advspeech_ssim_only" in subfile:
                advspeech = load_wav(f"{speaker_dir}/{subfile}", target_sr=16000)

                # Trim to the shorter signal WITHOUT overwriting `original`:
                # the same reference is compared against several adversarial
                # files, and truncating it in place would let a short file
                # permanently shorten the reference used for the following ones.
                # (assumes load_wav returns (channels, samples) tensors — TODO confirm)
                n_samples = min(original.size(1), advspeech.size(1))
                reference = original[:, :n_samples]
                advspeech = advspeech[:, :n_samples]

                snr_metric.update(reference, advspeech)
                pesq_metric.update(reference, advspeech)
                secs_metric.update(reference, advspeech)
                print(prefix, subfile)

print("snr", snr_metric.compute())
print("pesq", pesq_metric.compute())
print("secs", secs_metric.compute())
        


Loaded the voice encoder model on cuda in 0.01 seconds.
1272_1 1272_1_advspeech_ssim_only_0.wav
1272_1 1272_1_advspeech_ssim_only_1.wav
1272_2 1272_2_advspeech_ssim_only_0.wav
1272_2 1272_2_advspeech_ssim_only_1.wav
1462_1 1462_1_advspeech_ssim_only_0.wav
1462_1 1462_1_advspeech_ssim_only_1.wav
1462_2 1462_2_advspeech_ssim_only_0.wav
1462_2 1462_2_advspeech_ssim_only_1.wav
1673_1 1673_1_advspeech_ssim_only_0.wav
1673_1 1673_1_advspeech_ssim_only_1.wav
1673_2 1673_2_advspeech_ssim_only_0.wav
1673_2 1673_2_advspeech_ssim_only_1.wav
174_1 174_1_advspeech_ssim_only_0.wav
174_1 174_1_advspeech_ssim_only_1.wav
174_2 174_2_advspeech_ssim_only_0.wav
174_2 174_2_advspeech_ssim_only_1.wav
1919_1 1919_1_advspeech_ssim_only_0.wav
1919_1 1919_1_advspeech_ssim_only_1.wav
1919_2 1919_2_advspeech_ssim_only_0.wav
1919_2 1919_2_advspeech_ssim_only_1.wav
1988_1 1988_1_advspeech_ssim_only_0.wav
1988_1 1988_1_advspeech_ssim_only_1.wav
1988_2 1988_2_advspeech_ssim_only_0.wav
1988_2 1988_2_advspeech_ssim_onl

In [16]:
# NISQA predictions for the AdvSpeech run; the cell relies on the columns "deg" and "mos_pred".
mos_data = pd.read_csv("./results/NISQA_results_advspeech.csv")

# Keep only the rows whose "deg" entry contains "advspeech".
is_advspeech_row = mos_data["deg"].str.contains("advspeech")
mos_data_advspeech = mos_data[is_advspeech_row]

# Mean and standard deviation of the predicted MOS over those rows.
advspeech_mos = mos_data_advspeech["mos_pred"]
mos_pred_mean = np.mean(advspeech_mos)
mos_pred_std_dev = np.std(advspeech_mos)

print("mos_pred_mean", mos_pred_mean)
print("mos_pred_std", mos_pred_std_dev)

mos_pred_mean 2.8318524998095302
mos_pred_std 0.6538042911371987


In [5]:
folder = "./sampled_pair"

# Each sub-directory of `folder` is treated as one speaker.
speakers = [
    entry
    for entry in os.listdir(folder)
    if os.path.isdir(os.path.join(folder, entry))
]

# Map "<speaker>_<i>" (i = 1, 2) to the path of that speaker's reference recording.
ref_dict = {
    f"{spk}_{idx}": f"{folder}/{spk}/{spk}_{idx}.wav"
    for spk in speakers
    for idx in range(1, 3)
}
        

In [6]:
# wespeaker provides the model that wespeaker_similarity() uses to score pairs of audio files.
import wespeaker
# Load wespeaker's 'english' model
# (presumably a pretrained English speaker-embedding model — confirm against the wespeaker docs).
model = wespeaker.load_model('english')
# Run the model on the first CUDA device.
model.set_device('cuda:0')



In [8]:
def wespeaker_similarity(input_folder, mode, refs=None, sim_model=None):
    """Score every file in ``{input_folder}/{mode}`` against its reference recording.

    A file is expected to be named ``<prefix>_<mode>...`` (for example
    ``1272_1_advspeech_0.wav`` with ``mode='advspeech'``); ``<prefix>`` is the
    key used to look up the reference recording.

    Args:
        input_folder: Directory that contains one sub-directory per mode.
        mode: Name of the sub-directory to scan; also the marker that separates
            the prefix from the rest of each file name.
        refs: Mapping from prefix to reference audio path. Defaults to the
            notebook-level ``ref_dict``.
        sim_model: Object exposing ``compute_similarity(path_a, path_b)``.
            Defaults to the notebook-level ``model``.

    Returns:
        List with one similarity score per file, in sorted file-name order.

    Raises:
        ValueError: If a file name does not contain ``mode`` after a non-empty
            prefix. Previously such a name silently produced a garbage prefix.
        KeyError: If no reference recording is registered for a file's prefix.
    """
    # Fall back to the notebook globals only when no explicit override is given.
    refs = ref_dict if refs is None else refs
    sim_model = model if sim_model is None else sim_model

    res = []
    mode_dir = f'{input_folder}/{mode}'

    # os.listdir returns entries in arbitrary order; sort for reproducible results.
    for filename in sorted(os.listdir(mode_dir)):
        mode_index = filename.find(mode)
        # find() returns -1 when the marker is missing and 0 when nothing precedes it;
        # slicing with either value would yield a meaningless prefix.
        if mode_index < 1:
            raise ValueError(
                f"cannot derive a reference prefix from {filename!r}: "
                f"expected '<prefix>_{mode}...'"
            )
        # Drop the separator character that precedes the mode marker.
        prefix = filename[:mode_index-1]
        if prefix not in refs:
            raise KeyError(
                f"no reference recording registered for prefix {prefix!r} (file {filename!r})"
            )

        original = refs[prefix]
        synthesized = f'{mode_dir}/{filename}'
        score = sim_model.compute_similarity(original, synthesized)
        res.append(score)
    return res



In [9]:
# Score the 'advspeech_ssim_only' files under the cosyvoice folder against their references.
synthesizer = "/mnt/d/voicedata/Libri_adv/cosyvoice"
advspeech_similarity = wespeaker_similarity(
    input_folder=synthesizer,
    mode='advspeech_ssim_only',
)

In [10]:
# Mean and standard deviation of the similarity scores computed in the previous cell.
advspeech_mean = np.mean(advspeech_similarity)
advspeech_std = np.std(advspeech_similarity)
print("advspeech: ", advspeech_mean, advspeech_std)

advspeech:  0.7281295326492279 0.075281447141619


In [44]:
# For each synthesizer, score the three variants of the data against the
# reference recordings and report mean / standard deviation per variant.
for syn in ('xTTS', 'openvoice', 'cosyvoice'):
    synthesizer = f"/mnt/d/voicedata/Libri_adv/{syn}"

    # All three score lists are computed (in this order) before anything is printed.
    original_similarity, antifake_similarity, advspeech_similarity = (
        wespeaker_similarity(synthesizer, variant)
        for variant in ('ffmpeg', 'antifake', 'advspeech')
    )

    print(syn)
    report = (
        ("original: ", original_similarity),
        ("antifake: ", antifake_similarity),
        ("advspeech: ", advspeech_similarity),
    )
    for label, scores in report:
        print(label, np.mean(scores), np.std(scores))

xTTS
original:  0.7996861952307978 0.04479642958655396
antifake:  0.5950886263944037 0.05735854769547628
advspeech:  0.6781693687856508 0.06247269280783284
openvoice
original:  0.6655337025264376 0.050913048547118755
antifake:  0.5826841509735555 0.05337649647797942
advspeech:  0.6107569044947417 0.04768445836260454
cosyvoice
original:  0.8987942161528688 0.025214177964369214
antifake:  0.6729299951387618 0.07127123018407143
advspeech:  0.7351352721931713 0.07062878341272188


original:  0.7996861952307978 0.04479642958655396
antifake:  0.5950886263944037 0.05735854769547628
advspeech:  0.6781693687856508 0.06247269280783284
