In [1]:
import os
# import pickle
import _pickle as cPickle
import gc
import random
from tqdm import tqdm
import numpy as np
import torch

from datasets import load_dataset, load_metric
import math
from itertools import groupby

import wandb

# GPU selection: enumerate devices in PCI bus order and expose only devices 0 and 1.
# NOTE: these variables are only honoured if they are set before CUDA is
# initialised in this process, so keep this cell ahead of any GPU work.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# Intended to switch off Weights & Biases reporting for this session.
os.environ["WANDB_DISABLED"] = "true"

# Cache locations, all derived from a single root directory.
cache_dir = "/data4/yoomcache"
model_cache_dir = os.path.join(cache_dir, 'huggingface')
data_cache_dir = os.path.join(cache_dir, 'datasets')
checkpoint_dir = os.path.join(cache_dir, 'checkpoint')

# Seed every random number generator from the same `seed` value so that
# changing it in one place reseeds `random`, NumPy and PyTorch consistently.
seed = 0
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

import IPython
import seaborn as sns
# Apply seaborn's defaults and override the `figure.figsize` rc parameter to (16, 8)
# so every figure drawn afterwards uses that size unless it sets its own.
sns.set(rc = {'figure.figsize':(16,8)})

In [80]:
data_dir = '/data4/TTS/'
# Alternative corpora; uncomment exactly one `data_fname` to switch datasets.
# data_fname = 'VCTK-Corpus/dataset-vctk-16k.pkl'
# data_fname = 'LibriTTS/train-clean-100.pkl'
data_fname = 'LibriTTS/dev-clean.pkl'

# SECURITY NOTE: unpickling can execute arbitrary code stored in the file.
# Only load pickles that were produced by a trusted pipeline.
with open(os.path.join(data_dir, data_fname), 'rb') as f:
    # The garbage collector is paused while unpickling (presumably to speed up
    # loading a large object graph). try/finally guarantees it is switched back
    # on even if the load fails, so a bad file cannot leave GC disabled for the
    # rest of the kernel session.
    gc.disable()
    try:
        dataset = cPickle.load(f)
    finally:
        gc.enable()

print('entire dataset length: {}'.format(len(dataset['text'])))

# Truncate every per-sample field to a common prefix length; 'sample_rate' is skipped.
# NOTE(review): the slice starts at len(dataset[k]), so nothing is deleted right now.
# Presumably a leftover hook for sub-sampling (e.g. int(len(...) * fraction)) - confirm.
for k in dataset.keys():
    if k == 'sample_rate': continue
    del dataset[k][int(len(dataset[k])):]
dataset_size = len(dataset['text'])


# Longest waveform (in elements) among the first `dataset_size` audio arrays;
# `default=0` keeps the result at 0 when there are no samples.
max_audio_length = max(
    (len(arr) for arr in dataset['audio_array'][:dataset_size]),
    default=0,
)
print('maximum audio length: {}'.format(max_audio_length))

entire dataset length: 5736
maximum audio length: 481280


In [3]:
# Show which fields the loaded dataset provides.
dataset.keys()

dict_keys(['page', 'index', 'audio_path', 'audio_array', 'text', 'sample_rate'])

In [4]:
import pandas as pd

In [5]:
# Build a DataFrame from the loaded dataset mapping; each key becomes a column.
# NOTE(review): 'sample_rate' is treated differently from the other keys in the loading
# cell (it is skipped there) - confirm it is a scalar or a list matching the other fields.
df = pd.DataFrame(dataset)

In [10]:
# Display the rows ordered by the 'index' column. The result is not assigned,
# so `df` itself keeps its current row order.
df.sort_values('index')

Unnamed: 0,page,index,audio_path,audio_array,text,sample_rate
29248,p225,p225_001,/data4/TTS/VCTK/VCTK-Corpus/wav48/p225/p225_00...,"[0.0038153944, 0.0069943625, 0.0059126774, 0.0...",Please call Stella.,16000
29279,p225,p225_002,/data4/TTS/VCTK/VCTK-Corpus/wav48/p225/p225_00...,"[-0.0036491496, -0.006820807, -0.0059936647, -...",Ask her to bring these things with her from th...,16000
29351,p225,p225_003,/data4/TTS/VCTK/VCTK-Corpus/wav48/p225/p225_00...,"[-0.0028066966, -0.00484558, -0.0038843188, -0...","Six spoons of fresh snow peas, five thick slab...",16000
29188,p225,p225_004,/data4/TTS/VCTK/VCTK-Corpus/wav48/p225/p225_00...,"[0.002831708, 0.0052296943, 0.0044813827, 0.00...",We also need a small plastic snake and a big t...,16000
29307,p225,p225_005,/data4/TTS/VCTK/VCTK-Corpus/wav48/p225/p225_00...,"[0.00010675909, 0.00035040374, 0.00026865466, ...",She can scoop these things into three red bags...,16000
...,...,...,...,...,...,...
29454,p376,p376_291,/data4/TTS/VCTK/VCTK-Corpus/wav48/p376/p376_29...,"[-0.0039445944, -0.0070481566, -0.0062631024, ...","""I have tonight resigned as a member of the Eu...",16000
29469,p376,p376_292,/data4/TTS/VCTK/VCTK-Corpus/wav48/p376/p376_29...,"[0.007278188, 0.013384368, 0.011897836, 0.0127...","""The move was welcomed by the Roman Catholic C...",16000
29437,p376,p376_293,/data4/TTS/VCTK/VCTK-Corpus/wav48/p376/p376_29...,"[-0.0060262484, -0.011149624, -0.009517272, -0...","""That might be a punishment but it would also ...",16000
29617,p376,p376_294,/data4/TTS/VCTK/VCTK-Corpus/wav48/p376/p376_29...,"[-0.00473665, -0.008488858, -0.0071424493, -0....","""He has retired.""",16000


In [14]:
# Distinct values of the 'page' column, in order of first appearance.
df.page.unique()

array(['p311', 'p263', 'p251', 'p326', 'p374', 'p340', 'p286', 'p246',
       'p247', 'p280', 'p302', 'p233', 'p228', 'p335', 'p237', 'p254',
       'p287', 'p336', 'p298', 'p343', 'p317', 'p273', 'p257', 'p347',
       'p281', 'p278', 'p360', 'p285', 'p236', 'p271', 'p245', 'p310',
       'p312', 'p238', 'p297', 'p303', 'p351', 'p250', 'p318', 'p274',
       'p330', 'p341', 'p306', 'p305', 'p255', 'p288', 'p259', 'p275',
       'p230', 'p227', 'p276', 'p272', 'p314', 'p268', 'p253', 'p339',
       'p334', 'p260', 'p231', 'p248', 'p345', 'p249', 'p295', 'p243',
       'p258', 'p277', 'p252', 'p362', 'p256', 'p229', 'p284', 'p225',
       'p376', 'p232', 'p262', 'p333', 'p270', 'p239', 'p267', 'p282',
       'p269', 'p261', 'p234', 'p329', 'p266', 'p364', 'p363', 'p294',
       'p292', 'p307', 'p293', 'p313', 'p300', 'p299', 'p361', 'p264',
       'p240', 'p226', 'p241', 'p308', 'p279', 'p304', 'p323', 'p301',
       'p265', 'p283', 'p244', 'p316'], dtype=object)

In [None]:
# Show 'index' and 'normalized_text' for the rows whose 'chapter' equals '64301',
# ordered by 'index'.
# NOTE(review): 'chapter' and 'normalized_text' are not among the keys printed by the
# earlier `dataset.keys()` output - confirm the loaded pickle actually provides them.
df.loc[df.chapter=='64301', ['index', 'normalized_text']].sort_values('index')

In [18]:
# Preview the first three pages: the page id, then every text of that page
# joined with single spaces, then an empty line as a separator.
for page in df['page'].unique()[:3]:
    page_text = ' '.join(df.loc[df['page'] == page, 'text'])
    print(page, page_text, '', sep='\n')

p311

p263

p251
He thought he had a month to cross the street. I think a move would create a lot of interest. But it failed. Last night was a key episode. The results are sometimes contentious. They do not work for Glasgow City Council. We have to move forward. We have to sort it out ourselves. That's the principal difference between an artist and a dog. They did not replace it with a conviction for culpable homicide. Aristotle thought that the rainbow was caused by reflection of the sun's rays by the rain.  We have a free vote on these things in our party. War is not the action. It is not either national standards or local control. The court was shown a video of the house on that day. You can see it on the training ground and around the hotel. It would still have been a good film, but very different. Despite the result, it was still a great moment. Rebuilding public transport was never going to happen overnight. You are not going in blind. It just shows the arrogance of Labour and th

In [None]:
# Print each normalized text once if it contains at least one space-separated
# token longer than one character that is unchanged by upper-casing.
for txt in df.normalized_text:
    if any(len(word) > 1 and word == word.upper() for word in txt.split(' ')):
        print(txt)

In [8]:
# Imported locally so this cell also runs on a fresh kernel executed top to bottom.
import re

# Sanity check: a single whitespace character directly before a period is removed.
# The pattern is a raw string so `\s` and `\.` reach the regex engine unchanged and
# do not trigger Python's invalid-escape-sequence warning.
re.sub(r'\s\.', '.', 'bbbawef . aaaaa')

'bbbawef. aaaaa'

In [1]:
import re

In [93]:
# (pattern, replacement) pairs for `re.sub`. Patterns are raw strings so the regex
# escapes (`\s`, `\?`, ...) are not interpreted as (invalid) Python string escapes.
_RE_REPLACE_PARENTHESIS = (r'[(){}_\[\]]', '')   # drop brackets, braces and underscores
_RE_REPLACE_QUESTIONMARK = (r'\s\?', '?')        # "word ?" -> "word?"
_RE_REPLACE_EXCLAMATIONMARK = (r'\s\!', '!')     # "word !" -> "word!"
_RE_REPLACE_DOT = (r'\s\.', '.')                 # "word ." -> "word."
_RE_COMBINE_WHITESPACE = (r'\s+', ' ')           # collapse whitespace runs to one space

# Applied in this order by `_clean_text`.
re_list = [
    _RE_REPLACE_PARENTHESIS,
    _RE_REPLACE_QUESTIONMARK,
    _RE_REPLACE_EXCLAMATIONMARK,
    _RE_REPLACE_DOT,
    _RE_COMBINE_WHITESPACE,
]


def _clean_text(txt):
    """Return a normalised copy of one transcript string.

    Steps, in order:
      1. Remove leading spaces and leading '.', ',', '?', '!' characters.
      2. If the text ends with a single or double quote, strip quote characters
         from both ends.
      3. Apply every (pattern, replacement) pair in `re_list`, trimming
         surrounding whitespace after each substitution.

    Args:
        txt: the raw transcript (a `str`).

    Returns:
        The cleaned transcript as a new `str`.
    """
    retxt = txt.lstrip(' .,?!')
    # `endswith` is safe on an empty string. `str.strip` returns a new string,
    # so the result must be assigned back.
    if retxt.endswith(('"', "'")):
        retxt = retxt.strip('"\'')
    for pattern, repl in re_list:
        retxt = re.sub(pattern, repl, retxt).strip()
    return retxt


# Clean every transcript. Texts containing exactly one double quote (i.e. an
# unbalanced quote) are printed before/after for manual inspection.
retexts = []
for txt in df.text:
    retxt = _clean_text(txt)
    if txt.count('"') == 1:
        print(txt, retxt, sep='\n', end='\n\n')
    retexts.append(retxt)

# Assign the whole column at once: positional, so it does not depend on the
# DataFrame's index labels being 0..n-1, and avoids a per-row `.loc` write.
df['retext'] = retexts

In [91]:
# Display the DataFrame to inspect the 'retext' column next to the original 'text'.
# NOTE(review): this renders the full frame; `df.head()` is usually enough for a check.
df

Unnamed: 0,page,index,audio_path,audio_array,text,sample_rate,retext
0,p311,p311_139,/data4/TTS/VCTK/VCTK-Corpus/wav48/p311/p311_13...,"[-0.0062520076, -0.011541563, -0.010065744, -0...",They got the job done.,16000,They got the job done.
1,p311,p311_375,/data4/TTS/VCTK/VCTK-Corpus/wav48/p311/p311_37...,"[-0.00071739114, -0.0012887787, -0.0010795597,...",We don't ask for much.,16000,We don't ask for much.
2,p311,p311_093,/data4/TTS/VCTK/VCTK-Corpus/wav48/p311/p311_09...,"[0.0024772577, 0.004634639, 0.0041303067, 0.00...",It is set in Paris.,16000,It is set in Paris.
3,p311,p311_251,/data4/TTS/VCTK/VCTK-Corpus/wav48/p311/p311_25...,"[-0.0021431618, -0.0037750585, -0.0032232087, ...","Mentally, you have to be tough.",16000,"Mentally, you have to be tough."
4,p311,p311_110,/data4/TTS/VCTK/VCTK-Corpus/wav48/p311/p311_11...,"[0.0016572413, 0.0032106584, 0.0028606544, 0.0...",It is not really used by many people.,16000,It is not really used by many people.
...,...,...,...,...,...,...,...
44065,p316,p316_299,/data4/TTS/VCTK/VCTK-Corpus/wav48/p316/p316_29...,"[0.0013067361, 0.0023353891, 0.0020710027, 0.0...",I am quite excited about it.,16000,I am quite excited about it.
44066,p316,p316_374,/data4/TTS/VCTK/VCTK-Corpus/wav48/p316/p316_37...,"[-0.0013757308, -0.0026514267, -0.0023597914, ...",We said it will happen one night.,16000,We said it will happen one night.
44067,p316,p316_073,/data4/TTS/VCTK/VCTK-Corpus/wav48/p316/p316_07...,"[0.008805143, 0.016143003, 0.014221777, 0.0152...",He was also a great man.,16000,He was also a great man.
44068,p316,p316_089,/data4/TTS/VCTK/VCTK-Corpus/wav48/p316/p316_08...,"[-0.00470766, -0.0086214105, -0.007555666, -0....",The driver escaped injury.,16000,The driver escaped injury.
