In [1]:
# import pandas as pd
# df = pd.read_parquet('/mydev/dataspeech/datasets/jenny_tts_dataset/data/train-00000-of-00010.parquet')
# df.head()

In [2]:
from datasets import load_dataset
dataset = load_dataset("ylacombe/jenny-tts-6h")

In [3]:
from IPython.display import Audio
print(dataset["train"][0]["transcription"])
Audio(dataset["train"][0]["audio"]["array"], rate=dataset["train"][0]["audio"]["sampling_rate"])


It was a bright cold day in April, and the clocks were striking thirteen.


In [4]:
dataset['train'][0]

{'file_name': 'jenny/0',
 'transcription': 'It was a bright cold day in April, and the clocks were striking thirteen.',
 'transcription_normalised': 'it was a bright cold day in april, and the clocks were striking thirteen.',
 'audio': {'path': '0.flac',
  'array': array([-6.10351562e-05, -6.10351562e-05, -3.05175781e-05, ...,
         -2.13623047e-04, -1.83105469e-04, -1.22070312e-04]),
  'sampling_rate': 48000}}

In [5]:
from datasets import Audio
from dataspeech import rate_apply, pitch_apply, snr_apply, squim_apply
import torch


In [6]:
batch_size = 2
num_workers_per_gpu_for_squim = 1
cpu_num_workers = 2
audio_column_name = "audio"

# compute rate of speech for each audio, SI-SDR, PESQ, STOI
squim_dataset = dataset.map(
    squim_apply,
    batched=True,
    batch_size=batch_size,
    with_rank=True if torch.cuda.device_count()>0 else False,
    num_proc=torch.cuda.device_count()*num_workers_per_gpu_for_squim if torch.cuda.device_count()>0 else cpu_num_workers,
    remove_columns=[audio_column_name], # tricks to avoid rewritting audio
    fn_kwargs={"audio_column_name": audio_column_name,},
)

In [7]:
squim_dataset['train'][0]

{'file_name': 'jenny/0',
 'transcription': 'It was a bright cold day in April, and the clocks were striking thirteen.',
 'transcription_normalised': 'it was a bright cold day in april, and the clocks were striking thirteen.',
 'sdr': 25.49785804748535,
 'pesq': 3.302990198135376,
 'stoi': 0.9923241138458252}

In [8]:
num_workers_per_gpu_for_pitch = 1
penn_batch_size = 4096

# compute pitch for each audio, pitch mean and pitch std
pitch_dataset = dataset.cast_column(audio_column_name, Audio(sampling_rate=16_000)).map(
    pitch_apply,
    batched=True,
    batch_size=batch_size,
    with_rank=True if torch.cuda.device_count()>0 else False,
    num_proc=torch.cuda.device_count()*num_workers_per_gpu_for_pitch if torch.cuda.device_count()>0 else cpu_num_workers,
    remove_columns=[audio_column_name], # tricks to avoid rewritting audio
    fn_kwargs={"audio_column_name": audio_column_name, "penn_batch_size": penn_batch_size},
)



In [9]:
pitch_dataset['train'][0]

{'file_name': 'jenny/0',
 'transcription': 'It was a bright cold day in April, and the clocks were striking thirteen.',
 'transcription_normalised': 'it was a bright cold day in april, and the clocks were striking thirteen.',
 'utterance_pitch_mean': 201.19784545898438,
 'utterance_pitch_std': 33.74337387084961}

In [10]:
num_workers_per_gpu_for_snr = 1

# compute snr for each audio, snr, c50, speech_duration
snr_dataset = dataset.map(
    snr_apply,
    batched=True,
    batch_size=batch_size,
    with_rank=True if torch.cuda.device_count()>0 else False,
    num_proc=torch.cuda.device_count()*num_workers_per_gpu_for_snr if torch.cuda.device_count()>0 else cpu_num_workers,
    remove_columns=[audio_column_name], # tricks to avoid rewritting audio
    fn_kwargs={"audio_column_name": audio_column_name},
)


In [11]:
snr_dataset['train'][0]

{'file_name': 'jenny/0',
 'transcription': 'It was a bright cold day in April, and the clocks were striking thirteen.',
 'transcription_normalised': 'it was a bright cold day in april, and the clocks were striking thirteen.',
 'snr': 54.58160400390625,
 'c50': 59.780765533447266,
 'speech_duration': 3.796875}

In [12]:
snr_dataset[next(iter(snr_dataset.keys()))].features

{'file_name': Value(dtype='string', id=None),
 'transcription': Value(dtype='string', id=None),
 'transcription_normalised': Value(dtype='string', id=None),
 'snr': Value(dtype='float32', id=None),
 'c50': Value(dtype='float32', id=None),
 'speech_duration': Value(dtype='float32', id=None)}

In [13]:
cpu_writer_batch_size = 1000
text_column_name = 'transcription'

# compute speaking_rate, phonemes
rate_dataset = snr_dataset.map(
    rate_apply,
    with_rank=False,
    num_proc=cpu_num_workers,
    writer_batch_size= cpu_writer_batch_size,
    fn_kwargs={"audio_column_name": audio_column_name, "text_column_name": text_column_name},
)


In [14]:
rate_dataset['train'][0]

{'file_name': 'jenny/0',
 'transcription': 'It was a bright cold day in April, and the clocks were striking thirteen.',
 'transcription_normalised': 'it was a bright cold day in april, and the clocks were striking thirteen.',
 'snr': 54.58160400390625,
 'c50': 59.780765533447266,
 'speech_duration': 3.796875,
 'speaking_rate': 18.17283950617284,
 'phonemes': 'ɪt wɑz ʌ bɹaɪt koʊld deɪ ɪn eɪpɹʌl, ʌnd ðʌ klɑks wɜ˞ stɹaɪkɪŋ θɜ˞tin.'}

In [15]:
# combine together
for split in dataset.keys():
    dataset[split] = pitch_dataset[split].add_column("snr", snr_dataset[split]["snr"]).add_column("c50", snr_dataset[split]["c50"])
    if "speech_duration" in snr_dataset[split]:
        dataset[split] = dataset[split].add_column("speech_duration", snr_dataset[split]["speech_duration"])
    dataset[split] = dataset[split].add_column("speaking_rate", rate_dataset[split]["speaking_rate"]).add_column("phonemes", rate_dataset[split]["phonemes"])
    dataset[split] = dataset[split].add_column("stoi", squim_dataset[split]["stoi"]).add_column("si-sdr", squim_dataset[split]["sdr"]).add_column("pesq", squim_dataset[split]["pesq"])



In [16]:
dataset['train'][0]

{'file_name': 'jenny/0',
 'transcription': 'It was a bright cold day in April, and the clocks were striking thirteen.',
 'transcription_normalised': 'it was a bright cold day in april, and the clocks were striking thirteen.',
 'utterance_pitch_mean': 201.19784545898438,
 'utterance_pitch_std': 33.74337387084961,
 'snr': 54.58160400390625,
 'c50': 59.780765533447266,
 'speaking_rate': 18.17283950617284,
 'phonemes': 'ɪt wɑz ʌ bɹaɪt koʊld deɪ ɪn eɪpɹʌl, ʌnd ðʌ klɑks wɜ˞ stɹaɪkɪŋ θɜ˞tin.',
 'stoi': 0.9923241138458252,
 'si-sdr': 25.49785804748535,
 'pesq': 3.302990198135376}

In [17]:
import json
path_to_text_bins = "./examples/tags_to_annotations/v02_text_bins.json"

with open(path_to_text_bins) as json_file:
    text_bins_dict = json.load(json_file)
text_bins_dict



{'speaker_rate_bins': ['very slowly',
  'slowly',
  'slightly slowly',
  'moderate speed',
  'slightly fast',
  'fast',
  'very fast'],
 'snr_bins': ['very noisy',
  'noisy',
  'slightly noisy',
  'balanced in clarity',
  'slightly clean',
  'clean',
  'very clean'],
 'reverberation_bins': ['very distant-sounding',
  'distant-sounding',
  'slightly distant-sounding',
  'slightly close-sounding',
  'very close-sounding'],
 'utterance_level_std': ['very monotone',
  'monotone',
  'slightly expressive and animated',
  'expressive and animated',
  'very expressive and animated'],
 'speaker_level_pitch_bins': ['very low-pitch',
  'low-pitch',
  'slightly low-pitch',
  'moderate pitch',
  'slightly high-pitch',
  'high-pitch',
  'very high-pitch']}

In [18]:
path_to_bin_edges = "./examples/tags_to_annotations/v02_bin_edges.json"

bin_edges_dict = {}
if path_to_bin_edges:
    with open(path_to_bin_edges) as json_file:
        bin_edges_dict = json.load(json_file)
bin_edges_dict

{'speaking_rate': [0.0,
  3.8258038258038254,
  7.651607651607651,
  11.477411477411476,
  15.303215303215302,
  19.129019129019127,
  22.95482295482295,
  26.78062678062678],
 'noise': [17.12751579284668,
  25.4012325831822,
  33.67494937351772,
  41.94866616385323,
  50.22238295418875,
  58.49609974452427,
  66.76981653485979,
  75.04353332519531],
 'reverberation': [10, 35, 45, 55, 59, 60],
 'speech_monotony': [0.0,
  20.37920924595424,
  40.75841849190848,
  70,
  90,
  142.6544647216797],
 'pitch_bins_male': [64.6531982421875,
  81.66683959960938,
  98.68048095703125,
  115.69412231445312,
  132.707763671875,
  149.72140502929688,
  166.73504638671875,
  183.74868774414062],
 'pitch_bins_female': [120.17855072021484,
  141.6242690945264,
  163.06998746883795,
  184.51570584314953,
  205.96142421746106,
  227.40714259177264,
  248.8528609660842,
  270.29857934039575],
 'si-sdr': [-17.804332733154297,
  -0.40644073486328125,
  10,
  20,
  25,
  28,
  34.38934326171875],
 'pesq': [1,

In [19]:
SPEAKER_RATE_BINS = ["very slowly", "quite slowly", "slightly slowly", "moderate speed", "slightly fast", "quite fast", "very fast"]
SNR_BINS = ["very noisy", "quite noisy", "slightly noisy", "moderate ambient sound", "slightly clear", "quite clear", "very clear"]
REVERBERATION_BINS = ["very roomy sounding", "quite roomy sounding", "slightly roomy sounding", "moderate reverberation", "slightly confined sounding", "quite confined sounding", "very confined sounding"]
UTTERANCE_LEVEL_STD = ["very monotone", "quite monotone", "slightly monotone", "moderate intonation", "slightly expressive", "quite expressive", "very expressive"]
SI_SDR_BINS = ["extremely noisy", "very noisy", "noisy", "slightly noisy", "almost no noise", "very clear"]
PESQ_BINS = ["very bad speech quality", "bad speech quality", "slightly bad speech quality", "moderate speech quality", "great speech quality", "wonderful speech quality"]

# this one is supposed to be apply to speaker-level mean pitch, and relative to gender
SPEAKER_LEVEL_PITCH_BINS = ["very low pitch", "quite low pitch", "slightly low pitch", "moderate pitch", "slightly high pitch", "quite high pitch", "very high pitch"]

speaker_level_pitch_bins = text_bins_dict.get("speaker_level_pitch_bins", SPEAKER_LEVEL_PITCH_BINS)
speaker_rate_bins = text_bins_dict.get("speaker_rate_bins", SPEAKER_RATE_BINS)
snr_bins = text_bins_dict.get("snr_bins", SNR_BINS)
reverberation_bins = text_bins_dict.get("reverberation_bins", REVERBERATION_BINS)
utterance_level_std = text_bins_dict.get("utterance_level_std", UTTERANCE_LEVEL_STD)

sdr_bins = text_bins_dict.get("sdr_bins", SI_SDR_BINS)
pesq_std = text_bins_dict.get("pesq_bins", PESQ_BINS)


In [26]:
import os
import numpy as np
import matplotlib.pyplot as plt

def visualize_bins_to_text(values_1, values_2, name_1, name_2, text_bins, save_dir, output_column_name, default_bins=100, lower_range=None):
    # Save both histograms into a single figure
    fig, axs = plt.subplots(2, figsize=(8,6), sharex=True)
    
    # Plot histogram and vertical lines for subplot 1
    axs[0].hist(values_1, bins=default_bins, color='blue', alpha=0.7)
    _, bin_edges1 = np.histogram(values_1, bins=len(text_bins), range=(lower_range, values_1.max()) if lower_range else None)
    for edge in bin_edges1:
        axs[0].axvline(x=edge, color='red', linestyle='--', linewidth=1)


    # Plot histogram and vertical lines for subplot 2
    axs[1].hist(values_2, bins=default_bins, color='green', alpha=0.7)
    _, bin_edges2 = np.histogram(values_2, bins=len(text_bins), range=(lower_range, values_2.max()) if lower_range else None)
    for edge in bin_edges2:
        axs[1].axvline(x=edge, color='red', linestyle='--', linewidth=1)

    # Add labels and title
    axs[0].set_title(name_1)
    axs[1].set_title(name_2)
    axs[0].set_yscale('log')
    axs[1].set_yscale('log')
    axs[0].set_ylabel('Frequency')
    axs[1].set_ylabel('Frequency')
    axs[1].set_xlabel(f'{output_column_name}')

    # Adjust layout
    plt.tight_layout()

    filename = f"{output_column_name}.png"
    filepath = os.path.join(save_dir, filename)
    plt.savefig(filepath)
    print(f"Plots saved at '{filename}'!")

def bins_to_text(dataset, text_bins, column_name, output_column_name, leading_split_for_bins="train", batch_size = 4, num_workers = 1, std_tolerance=5, save_dir=None, only_save_plot=False, lower_range=None, bin_edges=None):
    '''
    Compute bins of `column_name` from the splits `leading_split_for_bins` and apply text bins to every split.
    `leading_split_for_bins` can be a string or a list.
    '''
    if bin_edges is None:
        values = []
        for df in dataset:
            for split in df:
                if leading_split_for_bins is None or leading_split_for_bins in split:
                    values.extend(df[split][column_name])
        
        # filter out outliers
        values = np.array(values)
        values = values[~np.isnan(values)]
        filtered_values = values
        if std_tolerance is not None:
            filtered_values = values[np.abs(values - np.mean(values)) < std_tolerance * np.std(values)]

        if save_dir is not None:
            visualize_bins_to_text(values, filtered_values, "Before filtering", "After filtering", text_bins, save_dir, output_column_name, lower_range=lower_range)
            
        # speaking_rate can easily have outliers
        if save_dir is not None and output_column_name=="speaking_rate":
            visualize_bins_to_text(filtered_values, filtered_values, "After filtering", "After filtering", text_bins, save_dir, f"{output_column_name}_after_filtering", lower_range=lower_range)
        
        values = filtered_values
        hist, bin_edges = np.histogram(values, bins = len(text_bins), range=(lower_range, values.max()) if lower_range else None)
        
        if only_save_plot:
            return dataset, bin_edges
    else:
        print(f"Already computed bin edges have been passed for {output_column_name}. Will use: {bin_edges}.")

batch_size = 16
cpu_num_workers = 2
leading_split_for_bins = "train"
speaking_rate_std_tolerance = 4
plot_directory = '/mydev/dataspeech/datasets/jenny_tts_dataset/plots'
only_save_plot = False
speaking_rate_lower_range = None

dataset, speaking_rate_bin_edges = bins_to_text(dataset, speaker_rate_bins, "speaking_rate", "speaking_rate", batch_size=batch_size, num_workers=cpu_num_workers, leading_split_for_bins=leading_split_for_bins, std_tolerance=speaking_rate_std_tolerance, save_dir=plot_directory, only_save_plot=only_save_plot, bin_edges=bin_edges_dict.get("speaking_rate",None), lower_range=speaking_rate_lower_range)
# dataset, noise_bin_edges = bins_to_text(dataset, snr_bins, "snr", "noise", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.snr_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("noise",None), lower_range=args.snr_lower_range)
# dataset, reverberation_bin_edges = bins_to_text(dataset, reverberation_bins, "c50", "reverberation", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.reverberation_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("reverberation",None))
# dataset, speech_monotony_bin_edges = bins_to_text(dataset, utterance_level_std, "utterance_pitch_std", "speech_monotony", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.speech_monotony_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("speech_monotony",None))

# dataset, sdr_bin_edges = bins_to_text(dataset, sdr_bins, "si-sdr", "sdr_noise", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.sdr_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("si-sdr",None))
# dataset, pesq_bin_edges = bins_to_text(dataset, pesq_std, "pesq", "pesq_speech_quality", batch_size=args.batch_size, num_workers=args.cpu_num_workers, leading_split_for_bins=args.leading_split_for_bins, std_tolerance=args.pesq_std_tolerance, save_dir=args.plot_directory, only_save_plot=args.only_save_plot, bin_edges=bin_edges_dict.get("pesq",None))


Already computed bin edges have been passed for speaking_rate. Will use: [0.0, 3.8258038258038254, 7.651607651607651, 11.477411477411476, 15.303215303215302, 19.129019129019127, 22.95482295482295, 26.78062678062678].


TypeError: cannot unpack non-iterable NoneType object

In [31]:
bin_edges=bin_edges_dict.get("speaking_rate",None)
text_bins = speaker_rate_bins
output_column_name = "speaking_rate"
column_name = "speaking_rate"
num_workers = 1
df = dataset['train']

def batch_association(batch):
    index_bins = np.searchsorted(bin_edges, batch, side="left")
    # do min(max(...)) when values are outside of the main bins
    # it happens when value = min or max or have been filtered out from bins computation
    batch_bins = [text_bins[min(max(i-1, 0), len(text_bins)-1)] for i in index_bins]
    return {
        output_column_name: batch_bins
    }

dataset = [df.map(batch_association, batched=True, batch_size=batch_size, input_columns=[column_name], num_proc=num_workers) for df in dataset]


AttributeError: 'str' object has no attribute 'map'

In [36]:
dataset['train'][0]

{'file_name': 'jenny/0',
 'transcription': 'It was a bright cold day in April, and the clocks were striking thirteen.',
 'transcription_normalised': 'it was a bright cold day in april, and the clocks were striking thirteen.',
 'utterance_pitch_mean': 201.19784545898438,
 'utterance_pitch_std': 33.74337387084961,
 'snr': 54.58160400390625,
 'c50': 59.780765533447266,
 'speaking_rate': 18.17283950617284,
 'phonemes': 'ɪt wɑz ʌ bɹaɪt koʊld deɪ ɪn eɪpɹʌl, ʌnd ðʌ klɑks wɜ˞ stɹaɪkɪŋ θɜ˞tin.',
 'stoi': 0.9923241138458252,
 'si-sdr': 25.49785804748535,
 'pesq': 3.302990198135376}