In [17]:
from datasets import load_dataset
# Load the "yvonne66/bznsyp" dataset; later cells index the result by split name (e.g. dataset['train']).
dataset = load_dataset("yvonne66/bznsyp")

In [None]:
# Keep at most the first 3900 training rows. The bound is clamped to the split
# length so that a shorter split does not make select() fail on out-of-range indices.
# NOTE(review): 3900 presumably stays below the ~3992-row crash mentioned in the
# rate_apply cell — confirm.
max_train_rows = 3900
dataset['train'] = dataset['train'].select(range(min(max_train_rows, len(dataset['train']))))
dataset['train']

In [None]:
# Import the IPython player under an alias: a later cell imports `Audio` from
# `datasets`, and sharing one name makes the notebook depend on execution order.
from IPython.display import Audio as AudioPlayer

# Look the first training example up once instead of three separate times.
first_sample = dataset["train"][0]
print(first_sample["text"])  # transcript of the clip rendered below
AudioPlayer(first_sample["audio"]["array"], rate=first_sample["audio"]["sampling_rate"])


In [None]:
# Display the first training example (every column) as the cell output.
dataset['train'][0]

In [5]:
from datasets import Audio
from dataspeech import rate_apply, pitch_apply, snr_apply, squim_apply
import torch


In [None]:
batch_size = 2
num_workers_per_gpu_for_squim = 1
cpu_num_workers = 2
audio_column_name = "audio"

# Query the GPU count once instead of on every argument below.
gpu_count = torch.cuda.device_count()

# compute the squim quality metrics for each audio
# (the combine cell later reads the "stoi", "sdr" and "pesq" columns from this result)
squim_dataset = dataset.map(
    squim_apply,
    batched=True,
    batch_size=batch_size,
    with_rank=gpu_count > 0,  # pass the worker rank only when a GPU is available
    # scale workers with the number of GPUs, otherwise use the CPU worker count
    num_proc=gpu_count * num_workers_per_gpu_for_squim if gpu_count > 0 else cpu_num_workers,
    remove_columns=[audio_column_name],  # trick to avoid rewriting audio
    fn_kwargs={"audio_column_name": audio_column_name},
)
squim_dataset['train'][0]

In [None]:
num_workers_per_gpu_for_pitch = 1
penn_batch_size = 4096

# Query the GPU count once instead of on every argument below.
gpu_count = torch.cuda.device_count()

# compute pitch for each audio, pitch mean and pitch std
# (the audio column is cast to a 16 kHz sampling rate before pitch_apply runs)
pitch_dataset = dataset.cast_column(audio_column_name, Audio(sampling_rate=16_000)).map(
    pitch_apply,
    batched=True,
    batch_size=batch_size,
    with_rank=gpu_count > 0,  # pass the worker rank only when a GPU is available
    # scale workers with the number of GPUs, otherwise use the CPU worker count
    num_proc=gpu_count * num_workers_per_gpu_for_pitch if gpu_count > 0 else cpu_num_workers,
    remove_columns=[audio_column_name],  # trick to avoid rewriting audio
    fn_kwargs={"audio_column_name": audio_column_name, "penn_batch_size": penn_batch_size},
)
pitch_dataset['train'][0]


In [None]:
num_workers_per_gpu_for_snr = 1

# Query the GPU count once instead of on every argument below.
gpu_count = torch.cuda.device_count()

# compute snr and c50 for each audio
# (the combine cell also looks for an optional speech_duration column in this result)
snr_dataset = dataset.map(
    snr_apply,
    batched=True,
    batch_size=batch_size,
    with_rank=gpu_count > 0,  # pass the worker rank only when a GPU is available
    # scale workers with the number of GPUs, otherwise use the CPU worker count
    num_proc=gpu_count * num_workers_per_gpu_for_snr if gpu_count > 0 else cpu_num_workers,
    remove_columns=[audio_column_name],  # trick to avoid rewriting audio
    fn_kwargs={"audio_column_name": audio_column_name},
)
snr_dataset['train'][0]


In [9]:
# # The espeak backend language code for Mandarin Chinese is 'cmn'
# # reference: https://textdata.cn/blog/phonemizer/

# from phonemizer.backend import EspeakBackend
# from phonemizer.punctuation import Punctuation
# from phonemizer.separator import Separator
# import re

# text = '想到他们可能在那个时候谈论他，他觉得好笑'
# words = re.findall('[\u4e00-\u9fa5]', text)


# backend = EspeakBackend('cmn', with_stress=True)

# separator = Separator(phone=' ', word=None)

# # Build the phoneme representation of each Chinese character; the result is a dict
# lexicon = {word: backend.phonemize([word], separator=separator, strip=True)[0]
#            for word in words}

# lexicon


In [None]:
from phonemizer import phonemize
from phonemizer.backend import EspeakBackend

# NOTE(review): `backend` is not referenced by rate_apply below, which calls
# phonemize(..., backend='espeak') instead — presumably left over from an
# earlier experiment; confirm before removing.
backend = EspeakBackend('cmn')

# reference: https://github.com/huggingface/dataspeech/issues/3
# Known limitation: this approach crashes once about 3992 rows have been processed
# (reference: https://github.com/egaznep/VoicePAT/issues/1)
def rate_apply(batch, rank=None, audio_column_name="audio", text_column_name="text"):
    """Add ``speaking_rate`` and ``phonemes`` to one example or to a batch.

    This definition shadows the ``rate_apply`` imported from ``dataspeech``
    earlier in the notebook and phonemizes with espeak's ``cmn`` language.

    The speaking rate is ``len(phonemes) / duration`` where ``phonemes`` is
    the value returned by ``phonemize`` and ``duration`` is the number of
    samples divided by the sampling rate (seconds).

    Args:
        batch: Mapping holding the text and audio columns. When the audio
            column is a list the input is treated as a batch and both output
            columns are lists; otherwise it is treated as a single example.
        rank: Unused by this function.
        audio_column_name: Key of the audio entry; each audio is indexed with
            ``"array"`` and ``"sampling_rate"``.
        text_column_name: Key of the transcript.

    Returns:
        The same ``batch`` object with ``"speaking_rate"`` and ``"phonemes"`` set.

    NOTE(review): ``phonemize`` is invoked once per example; if the row-count
    crash referenced above this function comes from repeated backend creation,
    reusing a single backend instance may avoid it — confirm.
    """

    def _measure(text, audio):
        # Returns (speaking_rate, phonemes) for a single transcript/audio pair.
        phonemes = phonemize(text, language='cmn', backend='espeak', with_stress=True)
        duration = len(audio["array"].squeeze()) / audio["sampling_rate"]
        if duration == 0:
            # Empty audio would otherwise raise ZeroDivisionError and abort the
            # whole map; fall back to a small positive duration instead.
            duration = 0.01
        return len(phonemes) / duration, phonemes

    if isinstance(batch[audio_column_name], list):
        measured = [
            _measure(text, audio)
            for text, audio in zip(batch[text_column_name], batch[audio_column_name])
        ]
        batch["speaking_rate"] = [rate for rate, _ in measured]
        batch["phonemes"] = [phonemes for _, phonemes in measured]
    else:
        rate, phonemes = _measure(batch[text_column_name], batch[audio_column_name])
        batch["speaking_rate"] = rate
        batch["phonemes"] = phonemes

    return batch


cpu_writer_batch_size = 1000
text_column_name = 'text'

# compute speaking_rate, phonemes
# Disabled alternative: run rate_apply on snr_dataset instead of the raw dataset.
# rate_dataset = snr_dataset.map(
#     rate_apply,
#     with_rank=False,
#     num_proc=cpu_num_workers,
#     writer_batch_size= cpu_writer_batch_size,
#     fn_kwargs={"audio_column_name": audio_column_name, "text_column_name": text_column_name},
# )
# batched is not set here, so rate_apply receives one example per call.
rate_dataset = dataset.map(
    rate_apply,
    with_rank=False,
    num_proc=cpu_num_workers,
    writer_batch_size= cpu_writer_batch_size,
    remove_columns=[audio_column_name], # trick to avoid rewriting audio
    fn_kwargs={"audio_column_name": audio_column_name, "text_column_name": text_column_name},
)
# Show the first processed training example.
rate_dataset['train'][0]


In [None]:
# combine together
# Merge the per-metric results back into `dataset`, one split at a time.
for split in dataset.keys():
    snr_split = snr_dataset[split]
    rate_split = rate_dataset[split]
    squim_split = squim_dataset[split]

    # Start from the pitch results and append every other metric as a new column.
    merged = pitch_dataset[split].add_column("snr", snr_split["snr"]).add_column("c50", snr_split["c50"])
    # Test against the column names: a membership test on the split object itself
    # does not look at column names, so speech_duration was never carried over.
    if "speech_duration" in snr_split.column_names:
        merged = merged.add_column("speech_duration", snr_split["speech_duration"])
    merged = merged.add_column("speaking_rate", rate_split["speaking_rate"]).add_column("phonemes", rate_split["phonemes"])
    # The squim result stores the value under "sdr"; it is published as "si-sdr".
    merged = merged.add_column("stoi", squim_split["stoi"]).add_column("si-sdr", squim_split["sdr"]).add_column("pesq", squim_split["pesq"])
    dataset[split] = merged
dataset['train'][0]

In [None]:
# dataset.push_to_hub("yvonne66/bznsyp_label_small")


In [None]:

from datasets import load_dataset
# Load "yvonne66/bznsyp_label_small"; this rebinds `dataset`, replacing the
# object built in the cells above.
dataset = load_dataset("yvonne66/bznsyp_label_small")
print("1st sample:", dataset["train"][0])
# print("2nd sample:", dataset["train"][1]["text_description"])


In [None]:
# Display the first training example of the reloaded dataset as the cell output.
dataset["train"][0]