In [18]:
from Metrics.FidelityMetrics import SNRMetric, PESQMetric, SECSMetric
import os
import torchaudio
from util import *
import pandas as pd



In [2]:
# File names of the utterances that are looked up in each audio directory below.
filelist = [
    "5694_1.wav",
    "5694_2.wav",
    "6319_1.wav",
    "6319_2.wav",
]
import os
import torchaudio

def load_audio_files(file_list, parent_dir, recursive=True):
    """Load every audio file under `parent_dir` whose file name is in `file_list`.

    Args:
        file_list: Iterable of bare file names (not paths) to look for.
        parent_dir: Directory to search.
        recursive: When True, search `parent_dir` and all of its subdirectories
            with `os.walk`; when False, only look at the direct entries of
            `parent_dir`.

    Returns:
        A list of `(file_path, waveform, sample_rate)` tuples, where `waveform`
        and `sample_rate` are whatever `torchaudio.load(file_path)` returns.
        The list is sorted by file name (ties broken by full path), so results
        loaded from different directories for the same `file_list` line up
        index-by-index instead of depending on directory listing order.
    """
    wanted = set(file_list)

    if recursive:
        candidates = [
            os.path.join(root, name)
            for root, _, names in os.walk(parent_dir)
            for name in names
            if name in wanted
        ]
    else:
        candidates = [
            os.path.join(parent_dir, name)
            for name in os.listdir(parent_dir)
            # os.listdir also returns directories; only regular files can be loaded.
            if name in wanted and os.path.isfile(os.path.join(parent_dir, name))
        ]

    # Directory listings come back in arbitrary order; sort for reproducible output.
    candidates.sort(key=lambda path: (os.path.basename(path), path))

    audio_files = []
    for file_path in candidates:
        waveform, sample_rate = torchaudio.load(file_path)
        audio_files.append((file_path, waveform, sample_rate))
    return audio_files

# Load the same set of utterances from each source directory.
# Only the "./pop" lookup is restricted to the top level of its directory.
original_files = load_audio_files(file_list=filelist, parent_dir="./trail_ds")
syn_files = load_audio_files(file_list=filelist, parent_dir="./pop/CosyVoice")
gen_files = load_audio_files(file_list=filelist, parent_dir="./pop", recursive=False)

In [16]:
# Listen to the first loaded original recording at the sample rate it was loaded with.
import IPython.display as ipd

first_original = original_files[0]  # (file_path, waveform, sample_rate)
ipd.Audio(first_original[1], rate=first_original[2])

In [54]:
# Bring the first original and generated clips to 16 kHz.
resample_original = torchaudio.transforms.Resample(original_files[0][2], 16000)
original_waveform = resample_original(original_files[0][1])
resample_gen = torchaudio.transforms.Resample(gen_files[0][2], 16000)
gen_waveform = resample_gen(gen_files[0][1])

# Crop both to the shorter of the two so they have equal length.
min_lenth = min(original_waveform.shape[1], gen_waveform.shape[1])
original_waveform, gen_waveform = (
    original_waveform[:, :min_lenth],
    gen_waveform[:, :min_lenth],
)

# The synthesized clip is only resampled, not cropped.
resample_syn = torchaudio.transforms.Resample(syn_files[0][2], 16000)
syn_waveform = resample_syn(syn_files[0][1])
print(original_waveform.shape, gen_waveform.shape)

torch.Size([1, 57120]) torch.Size([1, 57120])


In [47]:
# Render an audio player for `original_waveform`, played back at 16 kHz.
ipd.display(ipd.Audio(original_waveform, rate=16000))

In [49]:
# Render an audio player for `gen_waveform`, played back at 16 kHz.
ipd.display(ipd.Audio(gen_waveform, rate=16000))

In [ ]:
# Load the WeSpeaker 'english' model; later cells call `model.compute_similarity`
# on pairs of wav paths.
import wespeaker
model = wespeaker.load_model('english')
# NOTE(review): the device is hard-coded to the first CUDA GPU — presumably this
# fails on a CPU-only machine; confirm before running elsewhere.
model.set_device('cuda:0')

In [68]:
# Score each original utterance against its CosyVoice counterpart, then print the
# average of the four scores.
a, b, c, d = (
    model.compute_similarity(reference_wav, cloned_wav)
    for reference_wav, cloned_wav in (
        ("trail_ds/5694/5694_1.wav", "pop/CosyVoice/5694_1.wav"),
        ("trail_ds/5694/5694_2.wav", "pop/CosyVoice/5694_2.wav"),
        ("trail_ds/6319/6319_1.wav", "pop/CosyVoice/6319_1.wav"),
        ("trail_ds/6319/6319_2.wav", "pop/CosyVoice/6319_2.wav"),
    )
)
print((a + b + c + d) / 4)

0.7145271115005016


In [65]:
# Spot-check: similarity score for a single original / CosyVoice pair
# (the value is shown as the cell output).
model.compute_similarity("trail_ds/5694/5694_1.wav", "pop/CosyVoice/5694_1.wav")

0.7223431617021561

0.6943416148424149

0.7278458327054977

In [5]:
snr_metric = SNRMetric()
pesq_metric = PESQMetric()
secs_metric = SECSMetric()

# Pair clips by file name rather than by list position: the two lists come from
# different directory scans, so zipping them could silently mispair clips or
# drop trailing ones when the lists differ in order or length.
gen_by_name = {
    os.path.basename(path): (path, waveform, rate)
    for path, waveform, rate in gen_files
}
length_tolerance = 10  # largest accepted length difference, in samples after resampling

for original_path, original_waveform, original_sample_rate in original_files:
    clip_name = os.path.basename(original_path)
    if clip_name not in gen_by_name:
        print("skipping", clip_name, "- no generated file with this name")
        continue
    gen_path, gen_waveform, gen_sample_rate = gen_by_name[clip_name]

    # resample both clips to 16 kHz
    original_waveform = torchaudio.transforms.Resample(original_sample_rate, 16000)(original_waveform)
    gen_waveform = torchaudio.transforms.Resample(gen_sample_rate, 16000)(gen_waveform)

    # The two clips must be (almost) the same length before they are cropped
    # to a common length; a larger gap is treated as an error.
    length_gap = abs(original_waveform.size(1) - gen_waveform.size(1))
    assert length_gap < length_tolerance, f"{clip_name}: lengths differ by {length_gap} samples"
    min_lenth = min(original_waveform.size(1), gen_waveform.size(1))
    original_waveform = original_waveform[:, :min_lenth]
    gen_waveform = gen_waveform[:, :min_lenth]

    # Debug output: the reference signal and the sample-wise difference.
    noise = original_waveform - gen_waveform
    print("clean:", original_waveform)
    print("noise:", noise)
    # NOTE(review): the metric updates are disabled, so the compute() calls below
    # report on metrics that never received any samples. Re-enable to get real values.
    #snr_metric.update(original_waveform, gen_waveform)
    #pesq_metric.update(original_waveform, gen_waveform)
    #secs_metric.update(original_waveform, gen_waveform)

print("snr", snr_metric.compute())
print("pesq", pesq_metric.compute())
print("secs", secs_metric.compute())

Loaded the voice encoder model on cuda in 0.03 seconds.
clean: tensor([[ 2.5369e-05,  3.1830e-05,  3.0400e-05,  ..., -1.3966e-04,
          4.5787e-05,  1.1751e-04]])
noise: tensor([[ 0.0081,  0.0019, -0.0039,  ...,  0.0412,  0.0378,  0.0410]])
clean: tensor([[0.0006, 0.0011, 0.0010,  ..., 0.0052, 0.0046, 0.0052]])
noise: tensor([[ 0.0060,  0.0087, -0.0242,  ...,  0.0315,  0.0302,  0.0304]])
clean: tensor([[-0.0001, -0.0005, -0.0007,  ..., -0.0014, -0.0012, -0.0014]])
noise: tensor([[-0.0107, -0.0013,  0.0047,  ..., -0.0141,  0.0051,  0.0054]])
clean: tensor([[ 0.0047,  0.0062,  0.0056,  ..., -0.0025, -0.0029, -0.0033]])
noise: tensor([[-0.0183, -0.0242, -0.0216,  ..., -0.0895, -0.0942, -0.0948]])
snr tensor(0.)
pesq tensor(0.)
secs tensor(0.)


In [None]:
# Directory scanned by the following cells for "ffmpeg" / "antifake" .wav files.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
folder = "/mnt/d/repo/AntiFake/antifake"

In [None]:
# All .wav files directly inside `folder`.
subfiles = [entry for entry in os.listdir(folder) if entry.endswith(".wav")]

# Split them by the tag embedded in the file name.
original = list(filter(lambda wav_name: "ffmpeg" in wav_name, subfiles))
antifake = list(filter(lambda wav_name: "antifake" in wav_name, subfiles))
def normalize_name(filename):
    """Return `filename` with every "ffmpeg" and then every "antifake" substring removed.

    Two file names that differ only in which of these tags they carry map to
    the same result, which makes the result usable as a pairing key.
    """
    stripped = filename
    for tag in ("ffmpeg", "antifake"):
        stripped = stripped.replace(tag, "")
    return stripped
# Index each variant by its tag-free name so matching clips share a key.
original_dict = dict(zip(map(normalize_name, original), original))
antifake_dict = dict(zip(map(normalize_name, antifake), antifake))

# Keep the (ffmpeg file, antifake file) pairs whose key exists in both indexes.
paired_files = [
    (original_file, antifake_dict[key])
    for key, original_file in original_dict.items()
    if key in antifake_dict
]

# Sanity check: every file found a partner and there are exactly 76 pairs.
assert len(paired_files) == len(original_dict) == len(antifake_dict) == 76

In [None]:
# Start from fresh metric objects so the numbers below cover only these pairs,
# not whatever an earlier cell (or a previous run of this cell) accumulated.
snr_metric = SNRMetric()
pesq_metric = PESQMetric()
secs_metric = SECSMetric()

# Loop variables are named so they do not overwrite the `original` / `antifake`
# file-name lists built when pairing the files.
for original_name, antifake_name in paired_files:
    clean = load_wav(f"{folder}/{original_name}", target_sr=16000)
    distorted = load_wav(f"{folder}/{antifake_name}", target_sr=16000)

    # Crop both signals to the shorter one so the metrics compare equal lengths.
    common_len = min(clean.size(1), distorted.size(1))
    clean = clean[:, :common_len]
    distorted = distorted[:, :common_len]

    snr_metric.update(clean, distorted)
    pesq_metric.update(clean, distorted)
    secs_metric.update(clean, distorted)

print("snr", snr_metric.compute())
print("pesq", pesq_metric.compute())
print("secs", secs_metric.compute())

In [None]:
# Summarise the "mos_pred" column over the rows whose "deg" value contains "antifake".
mos_data = pd.read_csv("./results/NISQA_results.csv")

antifake_rows = mos_data["deg"].str.contains("antifake")
mos_data_antifake = mos_data[antifake_rows]
antifake_mos = mos_data_antifake["mos_pred"]

mos_pred_mean = np.mean(antifake_mos)
mos_pred_std_dev = np.std(antifake_mos)

print("mos_pred_mean", mos_pred_mean)
print("mos_pred_std", mos_pred_std_dev)

In [None]:
# Fresh metric objects: this cell reports on the advspeech pairs only.
snr_metric = SNRMetric()
pesq_metric = PESQMetric()
secs_metric = SECSMetric()

folder = "./sampled_pair"

# Every sub-directory of `folder` is treated as one speaker.
speakers = [f for f in os.listdir(folder) if os.path.isdir(os.path.join(folder, f))]

for speaker in speakers:
    speaker_dir = f"{folder}/{speaker}"
    for i in range(1, 3):
        prefix = f"{speaker}_{i}"
        original = load_wav(f"{speaker_dir}/{prefix}.wav", target_sr=16000)
        # Sorted so the processing (and printed) order is reproducible.
        for subfile in sorted(os.listdir(speaker_dir)):
            if subfile.startswith(prefix) and "advspeech_ssim_only" in subfile:
                advspeech = load_wav(f"{speaker_dir}/{subfile}", target_sr=16000)
                # Crop into per-pair variables. Cropping `original` itself would
                # carry the truncation over to the next matching subfile, which
                # would then be compared against a shortened reference.
                common_len = min(original.size(1), advspeech.size(1))
                clean = original[:, :common_len]
                perturbed = advspeech[:, :common_len]
                snr_metric.update(clean, perturbed)
                pesq_metric.update(clean, perturbed)
                secs_metric.update(clean, perturbed)
                print(prefix, subfile)

print("snr", snr_metric.compute())
print("pesq", pesq_metric.compute())
print("secs", secs_metric.compute())
        


In [None]:
# Summarise the "mos_pred" column over the rows whose "deg" value contains "advspeech".
mos_data = pd.read_csv("./results/NISQA_results_advspeech.csv")

advspeech_rows = mos_data["deg"].str.contains("advspeech")
mos_data_advspeech = mos_data[advspeech_rows]
advspeech_mos = mos_data_advspeech["mos_pred"]

mos_pred_mean = np.mean(advspeech_mos)
mos_pred_std_dev = np.std(advspeech_mos)

print("mos_pred_mean", mos_pred_mean)
print("mos_pred_std", mos_pred_std_dev)

In [None]:
folder = "./sampled_pair"

# Every sub-directory of `folder` is treated as one speaker.
speakers = [entry for entry in os.listdir(folder) if os.path.isdir(os.path.join(folder, entry))]

# Map "<speaker>_1" and "<speaker>_2" to the path of the matching wav file
# inside that speaker's directory.
ref_dict = {
    prefix: f"{folder}/{speaker}/{prefix}.wav"
    for speaker in speakers
    for prefix in (f"{speaker}_{i}" for i in range(1, 3))
}
        



In [None]:
def wespeaker_similarity(input_folder, mode):
    """Score every file in ``{input_folder}/{mode}`` against its reference clip.

    A file name is expected to contain `mode` preceded by a prefix and one
    separator character (e.g. ``<prefix>_<mode>...``). The prefix is looked up
    in the notebook-level `ref_dict` to find the reference path, and the pair
    is scored with the notebook-level `model.compute_similarity`.

    Args:
        input_folder: Directory that contains one sub-directory per mode.
        mode: Name of the sub-directory to scan; also the tag searched for in
            each file name.

    Returns:
        List of similarity scores, one per scored file, in sorted file-name order.
        Files whose name does not contain `mode` after a non-empty start are skipped.

    Raises:
        KeyError: If a file's prefix has no entry in `ref_dict`.
    """
    mode_dir = f'{input_folder}/{mode}'
    res = []

    # Sorted because os.listdir returns entries in arbitrary order.
    for filename in sorted(os.listdir(mode_dir)):
        mode_index = filename.find(mode)
        if mode_index < 1:
            # -1: tag absent; 0: nothing before the tag. Slicing with
            # `mode_index - 1` would then produce a meaningless key.
            continue
        # Drop the separator character that precedes the tag.
        prefix = filename[:mode_index - 1]
        if prefix not in ref_dict:
            raise KeyError(
                f"no reference clip registered for prefix {prefix!r} (file {filename!r})"
            )

        original = ref_dict[prefix]
        synthesized = f'{mode_dir}/{filename}'
        res.append(model.compute_similarity(original, synthesized))
    return res



In [30]:
# Score the 'advspeech' files stored under the cosyvoice folder against their references.
synthesizer = "/mnt/d/voicedata/Libri_adv/cosyvoice"
advspeech_similarity = wespeaker_similarity(input_folder=synthesizer, mode='advspeech')

In [36]:
# Rank the scores from highest to lowest and read off the value 95% of the way down.
advspeech_similarity = list(advspeech_similarity)
advspeech_similarity.sort(reverse=True)
index = int(0.95 * len(advspeech_similarity))
threshold = advspeech_similarity[index]
print("threshold", threshold)

threshold 0.6063705906271935


In [None]:
# Mean and standard deviation of the advspeech similarity scores.
# NOTE(review): `np` is not imported explicitly in the visible cells — presumably
# it is provided by `from util import *`; confirm.
print("advspeech: ", np.mean(advspeech_similarity), np.std(advspeech_similarity))

In [38]:
for syn in ['xTTS', 'openvoice', 'cosyvoice']:
    synthesizer = f"/mnt/d/voicedata/Libri_adv/{syn}"

    # Threshold: the 'ffmpeg' score found 95% of the way down the descending ranking.
    original_similarity = sorted(wespeaker_similarity(synthesizer, 'ffmpeg'), reverse=True)
    index = int(0.95 * len(original_similarity))
    threshold = original_similarity[index]
    print("synthesizer", syn, "threshold", threshold)

    antifake_similarity = wespeaker_similarity(synthesizer, 'antifake')
    advspeech_similarity = wespeaker_similarity(synthesizer, 'advspeech')

    # Count the scores strictly above the threshold and report them as a fraction.
    antifake_accepted = len([value for value in antifake_similarity if value > threshold])
    advspeech_accepted = len([value for value in advspeech_similarity if value > threshold])
    print("antifake", antifake_accepted / len(antifake_similarity))
    print("advspeech", advspeech_accepted / len(advspeech_similarity))

synthesizer xTTS threshold 0.703551784157753
antifake 0.02631578947368421
advspeech 0.3888888888888889
synthesizer openvoice threshold 0.5727239400148392
antifake 0.5526315789473685
advspeech 0.7638888888888888
synthesizer cosyvoice threshold 0.8517224490642548
antifake 0.0
advspeech 0.006944444444444444
