# Pitch Statistics per emotion

In [1]:
# Compute the pitch mean, standard deviation, minimum and maximum for each emotion.


In [2]:
from omegaconf import OmegaConf
from hydra.utils import instantiate
import torch
from tqdm import tqdm

In [3]:
# Training manifest to read and the directory holding the supplementary data.
# NOTE(review): the manifest sits under /DataEmotionalTTS while sup_path sits
# under /DataGermanTTS -- confirm the differing roots are intentional.
manifest_path = "/DataEmotionalTTS/datasets_v2/openslr110/thorsten-emotional_v02/train_manifest_phonemes.json"
sup_path = "/DataGermanTTS/datasets_v2/openslr110/thorsten-emotional_v02/sup_data/"

# Emotion id -> human-readable emotion name, used when printing statistics.
emo_map = {
    8: "neutral",
    1: "amused",
    2: "angry",
    3: "disgusted",
    4: "drunk",
    5: "sleepy",
    6: "surprised",
    7: "whisper",
}

In [4]:
def get_pitch_stats(pitch_list):
    """Print mean, standard deviation, minimum and maximum of pooled pitch values.

    Args:
        pitch_list: list of torch tensors holding pitch values; all tensors are
            concatenated and the statistics are computed over the pooled values.

    Returns:
        None. The statistics are printed as ``PITCH_MEAN``/``PITCH_STD`` and
        ``PITCH_MIN``/``PITCH_MAX`` lines. If there are no values at all, a
        short notice is printed instead.
    """
    # torch.cat raises on an empty list and min()/max() raise on a tensor with
    # zero elements, so bail out early instead of crashing after a long pass.
    non_empty = [pitch for pitch in pitch_list if pitch.numel() > 0]
    if not non_empty:
        print("No pitch values available; skipping statistics.")
        return

    pitch_tensor = torch.cat(non_empty)
    pitch_mean, pitch_std = pitch_tensor.mean().item(), pitch_tensor.std().item()
    pitch_min, pitch_max = pitch_tensor.min().item(), pitch_tensor.max().item()
    print(f"PITCH_MEAN={pitch_mean}, PITCH_STD={pitch_std}")
    print(f"PITCH_MIN={pitch_min}, PITCH_MAX={pitch_max}")

def preprocess_ds_for_fastpitch_align(dataloader):
    """Collect non-zero pitch values per emotion id and print their statistics.

    Makes one full pass over ``dataloader``. Every batch must unpack into eight
    fields: audios, audio_lengths, tokens, tokens_lengths, align_prior_matrices,
    pitches, pitches_lengths, emotion_ids. Only ``pitches`` and ``emotion_ids``
    are used.

    Args:
        dataloader: sized iterable of batches with the layout above.
            NOTE(review): assumes ``pitches`` is (batch, frames) and that
            ``emotion_ids`` holds one id per sample -- confirm against the
            dataset's collate function.

    Returns:
        None. For each emotion id, a header line is printed followed by the
        output of ``get_pitch_stats``.
    """
    # Key the buckets on emo_map itself rather than range(1, N + 1), so the
    # code does not silently depend on the ids being exactly 1..N.
    pitch_list = {emotion: [] for emotion in emo_map}
    for batch in tqdm(dataloader, total=len(dataloader)):
        # Batch layout: audios, audio_lengths, tokens, tokens_lengths,
        # align_prior_matrices, pitches, pitches_lengths, emotion_ids.
        _, _, _, _, _, pitches, _, emotion_ids = batch
        # Walk the batch dimension so any batch size works, not only 1.
        # Zero-valued frames are dropped from every sample.
        for emotion_ext, pitch in zip(emotion_ids.flatten().tolist(), pitches):
            # setdefault keeps an id missing from emo_map from aborting the pass.
            pitch_list.setdefault(emotion_ext, []).append(pitch[pitch != 0])

    for emotion in sorted(pitch_list):
        print("Pitch statistics for emotionid: ", emotion, " = ", emo_map.get(emotion, "unknown"))
        frames = pitch_list[emotion]
        # An emotion with no samples, or with only zero pitch, has nothing to
        # summarise; skip it so the remaining emotions are still reported.
        if sum(pitch.numel() for pitch in frames) == 0:
            print("No non-zero pitch values collected; skipping statistics.")
            continue
        get_pitch_stats(frames)

In [5]:
# Load the dataset config, point it at this notebook's manifest and
# supplementary-data locations, then build the dataset object from it.
# NOTE(review): presumably cfg.dataset picks these two keys up through
# interpolation in the YAML -- verify against the config file.
cfg = OmegaConf.load(
    "scripts/dataset_processing/tts/openslr_emotional/ds_conf/ds_for_fastpitch_align.yaml"
)
cfg.manifest_filepath, cfg.sup_data_path = manifest_path, sup_path
dataset = instantiate(cfg.dataset)

    
[NeMo W 2022-09-24 00:26:55 experimental:27] Module <class 'nemo.collections.tts.torch.tts_tokenizers.IPATokenizer'> is experimental, not ready for production and is not fully supported. Use at your own risk.


[NeMo I 2022-09-24 00:26:55 tokenize_and_classify:81] Creating ClassifyFst grammars. This might take some time...
[NeMo I 2022-09-24 00:27:12 data:188] Loading dataset from /DataEmotionalTTS/datasets_v2/openslr110/thorsten-emotional_v02/train_manifest_phonemes.json.


0it [00:00, ?it/s][NeMo W 2022-09-24 00:27:12 tts_tokenizers:144] Text: [viː ʃɛtsən ziː diː ʃɑ̃sən aɪn zoː aɪnən pɾoːtsɛs tsuː ɡəvɪnən ] contains unknown char: [̃]. Symbol will be skipped.
[NeMo W 2022-09-24 00:27:12 tts_tokenizers:144] Text: [viː ʃɛtsən ziː diː ʃɑ̃sən aɪn zoː aɪnən pɾoːtsɛs tsuː ɡəvɪnən ] contains unknown char: [̃]. Symbol will be skipped.
[NeMo W 2022-09-24 00:27:12 tts_tokenizers:144] Text: [viː ʃɛtsən ziː diː ʃɑ̃sən aɪn zoː aɪnən pɾoːtsɛs tsuː ɡəvɪnən ] contains unknown char: [̃]. Symbol will be skipped.
[NeMo W 2022-09-24 00:27:12 tts_tokenizers:144] Text: [viː ʃɛtsən ziː diː ʃɑ̃sən aɪn zoː aɪnən pɾoːtsɛs tsuː ɡəvɪnən ] contains unknown char: [̃]. Symbol will be skipped.
2310it [00:00, 23094.96it/s][NeMo W 2022-09-24 00:27:12 tts_tokenizers:144] Text: [viː ʃɛtsən ziː diː ʃɑ̃sən aɪn zoː aɪnən pɾoːtsɛs tsuː ɡəvɪnən ] contains unknown char: [̃]. Symbol will be skipped.
3358it [00:00, 22794.58it/s]

[NeMo I 2022-09-24 00:27:12 data:227] Loaded dataset with 3358 files.
[NeMo I 2022-09-24 00:27:12 data:229] Dataset contains 4.15 hours.
[NeMo I 2022-09-24 00:27:12 data:327] Pruned 0 files. Final dataset contains 3358 files
[NeMo I 2022-09-24 00:27:12 data:329] Pruned 0.00 hours. Final dataset contains 4.15 hours.





In [6]:
# One sample per batch, collated with the dataset's own collate function;
# the worker count comes from the loaded config.
dataloader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=1,
    collate_fn=dataset._collate_fn,
    num_workers=cfg.dataloader_params.num_workers,
)

In [7]:
# Full pass over the dataset; prints pitch statistics for each emotion id.
# The recorded run below took about 22 minutes for 3358 files.
preprocess_ds_for_fastpitch_align(dataloader)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 3358/3358 [21:49<00:00,  2.56it/s]

Pitch statistics for emotionid:  1  =  amused
PITCH_MEAN=178.7041778564453, PITCH_STD=63.29555892944336
PITCH_MIN=70.91536712646484, PITCH_MAX=482.6029968261719
Pitch statistics for emotionid:  2  =  angry
PITCH_MEAN=224.8560028076172, PITCH_STD=62.71169662475586
PITCH_MIN=65.4063949584961, PITCH_MAX=466.16375732421875
Pitch statistics for emotionid:  3  =  disgusted
PITCH_MEAN=174.06405639648438, PITCH_STD=46.19647216796875
PITCH_MIN=65.4063949584961, PITCH_MAX=560.8067016601562
Pitch statistics for emotionid:  4  =  drunk
PITCH_MEAN=166.42800903320312, PITCH_STD=63.60173034667969
PITCH_MIN=65.4063949584961, PITCH_MAX=1680.521240234375
Pitch statistics for emotionid:  5  =  sleepy
PITCH_MEAN=135.488037109375, PITCH_STD=40.35115432739258
PITCH_MIN=65.4063949584961, PITCH_MAX=2093.004638671875
Pitch statistics for emotionid:  6  =  surprised
PITCH_MEAN=140.9545135498047, PITCH_STD=35.95205307006836
PITCH_MIN=65.4063949584961, PITCH_MAX=349.2282409667969
Pitch statistics for emotionid:  


