# Notebook for preprocessing Wikipedia (English) dataset

### Initializing phonemizer and tokenizer

In [6]:
import yaml

config_path = "Configs/config.yml"  # path to the YAML config; point it at your own file if needed
# Open the file in a context manager so the handle is closed as soon as parsing is done
# (a bare open() passed to safe_load is never closed explicitly).
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

In [7]:
from phonemize import phonemize

In [8]:
import phonemizer
# Shared espeak backend for US English, configured to keep punctuation and stress marks.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='en-us',
    preserve_punctuation=True,
    with_stress=True,
)

In [9]:
from transformers import TransfoXLTokenizer
# The pre-trained tokenizer to load is named in the config; any other tokenizer can be substituted.
tokenizer_name = config['dataset_params']['tokenizer']
tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)



In [10]:
tokenizer.tokenize("Hello, how are you?") # just to make sure it works

['Hello', ',', 'how', 'are', 'you', '?']

In [11]:
phonemize("Hello, how are you?", global_phonemizer, tokenizer)

{'input_ids': [14049, 2, 433, 37, 304, 788],
 'phonemes': ['həlˈoʊ', ',', 'hˈaʊ', 'ɑːɹ', 'juː', '?']}

### Process dataset

In [5]:
from datasets import load_dataset
# Load the "20220301.en" Wikipedia configuration and keep its 'train' split;
# another version of this dataset can be used instead.
# NOTE: trust_remote_code=True allows the dataset's loading script to run on this machine.
wikipedia = load_dataset("wikipedia", "20220301.en", trust_remote_code=True)
dataset = wikipedia['train']

In [6]:
root_directory = "./wiki_phoneme" # root directory under which every worker process writes its own "shard_<i>" folder

In [7]:
import os
num_shards = 50000  # number of slices the dataset is split into; each slice is saved to its own folder

def process_shard(i):
    """Phonemize shard ``i`` of ``dataset`` and save it to ``root_directory/shard_<i>``.

    The shard is skipped when its output folder already exists, which makes the
    surrounding pool safe to re-run until every shard has been processed.

    The result is first written to a staging folder and only renamed to its final
    location once ``save_to_disk`` has returned. A worker that is killed while
    saving (e.g. by the pool timeout) therefore never leaves a half-written
    ``shard_<i>`` folder that later runs would mistake for a finished shard.

    Args:
        i: index of the shard to process, in ``range(num_shards)``.

    Returns:
        None. The processed shard is written to disk as a side effect.
    """
    import shutil  # local import keeps this cell self-contained

    shard_name = "shard_" + str(i)
    directory = os.path.join(root_directory, shard_name)
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return
    print('Processing shard %d ...' % i)
    shard = dataset.shard(num_shards=num_shards, index=i)
    processed_dataset = shard.map(lambda t: phonemize(t['text'], global_phonemizer, tokenizer), remove_columns=['text'])

    # Stage the output outside the final location; leftovers from an earlier,
    # interrupted attempt at this shard are discarded first.
    staging_root = os.path.join(root_directory, ".incomplete")
    staging_directory = os.path.join(staging_root, shard_name)
    if os.path.exists(staging_directory):
        shutil.rmtree(staging_directory)
    os.makedirs(staging_root, exist_ok=True)
    processed_dataset.save_to_disk(staging_directory)
    # Publish the finished shard in one step.
    os.rename(staging_directory, directory)

In [8]:
from pebble import ProcessPool
from concurrent.futures import TimeoutError

#### Note: You will need to run the following cell multiple times to process all shards, because some of them will fail or time out. Depending on how fast each shard is processed on your machine, you may need to increase the timeout so that more shards can finish before their worker is killed.


In [9]:
import os

# Get the number of CPU cores on this machine
num_cores = os.cpu_count()
print(f"Số core của CPU là: {num_cores}")

Số core của CPU là: 80


In [None]:
max_workers = num_cores  # one worker per detected CPU core; lower this to leave cores free

# Each shard gets at most 60 seconds; a worker exceeding that is stopped by the pool.
# The results are consumed so that timeouts and worker errors are counted and
# reported instead of being dropped silently.
timed_out = 0
failed = 0
with ProcessPool(max_workers=max_workers) as pool:
    future = pool.map(process_shard, range(num_shards), timeout=60)
    results = future.result()
    while True:
        try:
            next(results)
        except StopIteration:
            break
        except TimeoutError:
            timed_out += 1
        except Exception as error:
            failed += 1
            if failed <= 10:  # cap the log so a systematic failure does not flood the output
                print("Shard failed: %r" % (error,))

print("%d shards timed out and %d shards failed; re-run this cell to retry them." % (timed_out, failed))

Processing shard 0 ...
Processing shard 1 ...
Processing shard 2 ...
Processing shard 3 ...
Processing shard 4 ...
Processing shard 5 ...
Processing shard 6 ...
Processing shard 7 ...Processing shard 8 ...

Processing shard 9 ...
Processing shard 10 ...
Processing shard 11 ...Processing shard 13 ...Processing shard 12 ...Processing shard 14 ...Processing shard 15 ...




Processing shard 17 ...
Processing shard 18 ...
Processing shard 16 ...
Processing shard 19 ...
Processing shard 20 ...
Processing shard 21 ...
Processing shard 22 ...Processing shard 23 ...Processing shard 24 ...
Processing shard 25 ...
Processing shard 26 ...


Processing shard 28 ...Processing shard 29 ...
Processing shard 30 ...
Processing shard 31 ...Processing shard 32 ...Processing shard 33 ...Processing shard 34 ...
Processing shard 35 ...

Processing shard 36 ...


Processing shard 38 ...Processing shard 37 ...
Processing shard 39 ...Processing shard 43 ...Processing shard 42 ...Processing shard 41 ...Processi

Map:   8%|▊         | 10/130 [00:42<08:25,  4.21s/ examples]
Map:  12%|█▏        | 15/130 [00:41<03:51,  2.01s/ examples]

Processing shard 80 ...


Map:   5%|▌         | 7/130 [00:46<14:22,  7.01s/ examples]]

Processing shard 81 ...


Map:  12%|█▏        | 15/130 [00:46<07:57,  4.15s/ examples]

Processing shard 82 ...


Map:  15%|█▌        | 20/130 [00:47<05:47,  3.16s/ examples]

Processing shard 83 ...

Map:  18%|█▊        | 23/130 [00:48<08:03,  4.52s/ examples]


Processing shard 84 ...

Map:  34%|███▍      | 44/130 [00:47<00:21,  3.96 examples/s]

Processing shard 85 ...


Map:   7%|▋         | 9/130 [00:48<09:08,  4.53s/ examples]


Processing shard 86 ...

Map:   8%|▊         | 10/130 [00:46<13:54,  6.96s/ examples]


Processing shard 87 ...Processing shard 88 ...


Map:   8%|▊         | 11/130 [00:46<07:13,  3.65s/ examples]


Processing shard 89 ...

Map:   9%|▉         | 12/130 [00:49<09:44,  4.95s/ examples]

Processing shard 90 ...

Map:  16%|█▌        | 21/130 [00:47<04:14,  2.34s/ examples]



Processing shard 91 ...

Map:  12%|█▏        | 16/130 [00:49<05:36,  2.95s/ examples]




Map:   9%|▉         | 12/130 [00:52<05:16,  2.68s/ examples]

Processing shard 92 ...
Processing shard 93 ...
Processing shard 94 ...
Processing shard 95 ...
Processing shard 96 ...
Processing shard 97 ...
Processing shard 98 ...
Processing shard 99 ...
Processing shard 100 ...
Processing shard 101 ...
Processing shard 102 ...
Processing shard 103 ...
Processing shard 104 ...
Processing shard 105 ...
Processing shard 106 ...
Processing shard 107 ...
Processing shard 108 ...
Processing shard 109 ...
Processing shard 110 ...
Processing shard 111 ...
Processing shard 112 ...
Processing shard 113 ...
Processing shard 114 ...
Processing shard 115 ...
Processing shard 116 ...
Processing shard 117 ...
Processing shard 118 ...
Processing shard 119 ...
Processing shard 120 ...
Processing shard 121 ...
Processing shard 122 ...Processing shard 123 ...

Processing shard 124 ...
Processing shard 125 ...
Processing shard 126 ...
Processing shard 127 ...
Processing shard 128 ...
Processing shard 129 ...Processing shard 130 ...
Processing shard 131 ...

Processi

Map:   0%|          | 0/130 [00:00<?, ? examples/s]

Processing shard 158 ...
Processing shard 159 ...
Processing shard 160 ...


Map:   5%|▍         | 6/130 [00:20<06:59,  3.38s/ examples]]
Map:  33%|███▎      | 43/130 [00:20<00:46,  1.88 examples/s]

Processing shard 161 ...

Map:  21%|██        | 27/130 [00:15<01:09,  1.48 examples/s]




Map:  28%|██▊       | 36/130 [00:31<03:14,  2.07s/ examples]

Processing shard 162 ...

Map:  17%|█▋        | 22/130 [00:33<04:23,  2.44s/ examples]




Map:  44%|████▍     | 57/130 [00:45<00:33,  2.20 examples/s]

Processing shard 163 ...

Map:  14%|█▍        | 18/130 [00:44<05:38,  3.02s/ examples]




Map:  15%|█▍        | 19/130 [00:41<03:00,  1.63s/ examples]

Processing shard 164 ...

Map:  36%|███▌      | 47/130 [00:43<01:39,  1.20s/ examples]


Processing shard 165 ...

Map:   8%|▊         | 11/130 [00:45<07:40,  3.87s/ examples]


Processing shard 166 ...

Map:  26%|██▌       | 34/130 [00:43<01:23,  1.15 examples/s]


Processing shard 167 ...


Map:  13%|█▎        | 17/130 [00:42<05:13,  2.78s/ examples]

Processing shard 168 ...

Map:  28%|██▊       | 36/130 [00:44<01:00,  1.55 examples/s]

Processing shard 169 ...

Map:  45%|████▍     | 58/130 [00:45<00:34,  2.06 examples/s]




Map:  47%|████▋     | 61/130 [00:46<00:23,  2.98 examples/s]




Map:  45%|████▌     | 59/130 [00:45<00:36,  1.94 examples/s]

Processing shard 170 ...

Map:  59%|█████▉    | 77/130 [00:47<00:20,  2.63 examples/s]


Processing shard 171 ...


Map:   9%|▉         | 12/130 [00:46<07:54,  4.02s/ examples]

Processing shard 172 ...

Map:   2%|▏         | 2/130 [00:00<00:21,  6.06 examples/s]


Processing shard 173 ...

Map:  27%|██▋       | 35/130 [00:44<01:21,  1.16 examples/s]




Map:   5%|▌         | 7/130 [00:46<13:34,  6.62s/ examples]


Processing shard 174 ...


Map:   9%|▉         | 12/130 [00:46<05:37,  2.86s/ examples]

Processing shard 175 ...

Map:  42%|████▏     | 54/130 [00:47<00:39,  1.91 examples/s]


Processing shard 176 ...Processing shard 177 ...

Map:  62%|██████▏   | 80/130 [00:51<00:49,  1.01 examples/s]




Map:  15%|█▌        | 20/130 [00:48<04:26,  2.43s/ examples]


Processing shard 178 ...

Map:  50%|█████     | 65/130 [00:48<00:29,  2.22 examples/s]




Map:   8%|▊         | 11/130 [00:05<01:09,  1.72 examples/s]

Processing shard 179 ...

Map:  10%|█         | 13/130 [00:48<03:59,  2.04s/ examples]

Processing shard 180 ...


Map:  67%|██████▋   | 87/130 [00:52<00:31,  1.38 examples/s]

Processing shard 181 ...


Map:  53%|█████▎    | 69/130 [00:50<00:30,  2.00 examples/s]


Processing shard 182 ...

Map:  45%|████▌     | 59/130 [00:50<00:31,  2.26 examples/s]


Processing shard 183 ...
Processing shard 184 ...

Map:  15%|█▌        | 20/130 [00:47<03:10,  1.74s/ examples]

Processing shard 185 ...
Processing shard 186 ...

Map:   9%|▉         | 12/130 [00:05<01:03,  1.87 examples/s]




Map:  12%|█▏        | 16/130 [00:52<07:30,  3.95s/ examples]


Processing shard 187 ...


Map:  27%|██▋       | 35/130 [00:47<05:07,  3.23s/ examples]

Processing shard 188 ...

Map:  68%|██████▊   | 88/130 [00:53<00:27,  1.54 examples/s]

Processing shard 189 ...


Map:  17%|█▋        | 22/130 [00:53<03:45,  2.09s/ examples]




Map:  54%|█████▍    | 70/130 [00:51<00:29,  2.03 examples/s]

Processing shard 191 ...Processing shard 190 ...

Map:  38%|███▊      | 49/130 [00:50<00:57,  1.42 examples/s]


Processing shard 192 ...

Map:  16%|█▌        | 21/130 [00:24<01:47,  1.01 examples/s]




Map:   8%|▊         | 11/130 [00:52<08:25,  4.25s/ examples]


Processing shard 193 ...


Map:  16%|█▌        | 21/130 [00:48<02:25,  1.33s/ examples]

Processing shard 194 ...

Map:  17%|█▋        | 22/130 [00:24<01:20,  1.34 examples/s]




Map:  50%|█████     | 65/130 [00:52<01:13,  1.13s/ examples]

Processing shard 195 ...Processing shard 196 ...

Processing shard 197 ...Processing shard 198 ...

Map:  10%|█         | 13/130 [00:05<01:01,  1.89 examples/s]





Map:  46%|████▌     | 60/130 [00:50<00:35,  1.99 examples/s]

Processing shard 199 ...

Map:   8%|▊         | 11/130 [00:52<09:05,  4.58s/ examples]


Processing shard 200 ...

Map:  18%|█▊        | 23/130 [00:25<01:00,  1.77 examples/s]




Map:  39%|███▉      | 51/130 [00:52<00:55,  1.42 examples/s]

Processing shard 201 ...
Processing shard 202 ...
Processing shard 203 ...
Processing shard 204 ...
Processing shard 205 ...
Processing shard 206 ...
Processing shard 207 ...
Processing shard 208 ...
Processing shard 209 ...
Processing shard 210 ...
Processing shard 211 ...
Processing shard 212 ...
Processing shard 213 ...
Processing shard 214 ...
Processing shard 215 ...
Processing shard 216 ...
Processing shard 217 ...
Processing shard 218 ...
Processing shard 219 ...
Processing shard 220 ...
Processing shard 221 ...
Processing shard 222 ...
Processing shard 223 ...
Processing shard 224 ...Processing shard 225 ...

Processing shard 226 ...
Processing shard 227 ...
Processing shard 228 ...Processing shard 229 ...

Processing shard 230 ...
Processing shard 231 ...
Processing shard 232 ...
Processing shard 233 ...
Processing shard 234 ...
Processing shard 235 ...
Processing shard 236 ...Processing shard 237 ...

Processing shard 238 ...
Processing shard 239 ...
Processing shard 240 ...


### Collect all shards to form the processed dataset

In [10]:
from datasets import load_from_disk, concatenate_datasets

# Every sub-directory of root_directory is treated as a candidate shard.
output = [dI for dI in os.listdir(root_directory) if os.path.isdir(os.path.join(root_directory,dI))]
datasets = []
for o in output:
    directory = root_directory + "/" + o
    try:
        shard = load_from_disk(directory)
    except Exception as error:
        # Name the folder that could not be loaded instead of dropping it silently.
        # Catching Exception (not a bare except) lets KeyboardInterrupt stop the loop.
        print("%s skipped: %s" % (o, error))
        continue
    datasets.append(shard)
    print("%s loaded" % o)

shard_135 loaded
shard_635 loaded
shard_638 loaded
shard_637 loaded
shard_636 loaded


In [13]:
# Merge all loaded shards into one dataset and write it to the folder named in the config.
dataset = concatenate_datasets(datasets)
data_folder = config['data_folder']
dataset.save_to_disk(data_folder)
print('Dataset saved to %s' % data_folder)

Saving the dataset (0/1 shards):   0%|          | 0/650 [00:00<?, ? examples/s]

Saving the dataset (1/1 shards): 100%|██████████| 650/650 [00:00<00:00, 29162.94 examples/s]

Dataset saved to wikipedia_20220301.en.processed





In [14]:
# check the dataset size
dataset

Dataset({
    features: ['id', 'url', 'title', 'input_ids', 'phonemes'],
    num_rows: 650
})

In [55]:
dataset[0]['id'], dataset[0]['url'], dataset[0]['title']

('35266', 'https://en.wikipedia.org/wiki/0s%20BC', '0s BC')

In [56]:
# First example: number of token ids, the first ten ids, and the words they decode to.
first_input_ids = dataset[0]['input_ids']
print(len(first_input_ids))
print(first_input_ids[:10])
print([tokenizer.decode([t]) for t in first_input_ids[:10]])

582
[13, 6158, 2024, 28, 1, 354, 83, 769, 2024, 5]
['The', 'zero', 'BC', 'were', 'the', 'period', 'between', 'nine', 'BC', 'and']


In [57]:
# First example: number of phoneme strings (expected to match the token count) and the first ten.
first_phonemes = dataset[0]['phonemes']
print(len(first_phonemes))
print(first_phonemes[:10])

582
['ðə', 'zˈiəɹoʊ', 'bˌiːsˈiː', 'wɜː', 'ðə', 'pˈiəɹɪəd', 'bᵻtwˈiːn', 'nˈaɪn', 'bˌiːsˈiː', 'ænd']


### Remove unnecessary tokens from the pre-trained tokenizer
The pre-trained tokenizer contains many tokens that never appear in our dataset, so we remove them. We also predict words in lowercase, because letter case matters little for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch.

In [15]:
from simple_loader import FilePathDataset, build_dataloader

# Wrap the processed dataset so it can be iterated in batches. FilePathDataset and
# build_dataloader come from the project-local simple_loader module.
file_data = FilePathDataset(dataset)
# NOTE(review): num_workers=32 and batch_size=128 are hard-coded — adjust to the machine if needed.
loader = build_dataloader(file_data, num_workers=32, batch_size=128)

In [17]:
# Word-separator token id taken from the config; it seeds the list of tokens to keep.
special_token = config['dataset_params']['word_separator']
special_token

3039

In [18]:
# get all unique tokens in the entire dataset

from tqdm import tqdm

# Start from the word separator so that it is always kept.
unique_index = [special_token]
for batch in tqdm(loader):
    unique_index.extend(batch)
    # De-duplicate after every batch so the list never grows beyond the distinct tokens seen.
    unique_index = list(set(unique_index))

100%|██████████| 5/5 [00:01<00:00,  3.55it/s]


In [20]:
# get each token's lower case

lower_tokens = []
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    lowered = word.lower()
    # Only tokens whose text changes when lower-cased are swapped for the lower-case token id.
    if lowered != word:
        t = tokenizer.encode([lowered])[0]
    lower_tokens.append(t)

100%|██████████| 15024/15024 [00:00<00:00, 17429.79it/s]


In [21]:
# Drop duplicates: several original tokens can collapse onto the same lower-case token.
lower_tokens = list(set(lower_tokens))

In [22]:
# Re-map every original token id onto the smaller, lower-cased vocabulary.

# Position of each token inside lower_tokens, computed once so that the loop below
# does a constant-time lookup instead of a linear lower_tokens.index() scan per token.
# setdefault keeps the first position of a repeated token, exactly like list.index().
lower_token_positions = {}
for position, token in enumerate(lower_tokens):
    lower_token_positions.setdefault(token, position)

token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t]).lower()
    new_t = tokenizer.encode([word])[0]
    # A token missing from lower_tokens raises KeyError here (list.index raised ValueError).
    token_maps[t] = {'word': word, 'token': lower_token_positions[new_t]}

100%|██████████| 15024/15024 [00:04<00:00, 3639.40it/s]


In [24]:
token_maps

{262144: {'word': 'pomroy', 'token': 23},
 1: {'word': 'the', 'token': 0},
 2: {'word': ',', 'token': 1},
 3: {'word': '.', 'token': 2},
 4: {'word': 'of', 'token': 3},
 5: {'word': 'and', 'token': 4},
 6: {'word': 'to', 'token': 5},
 7: {'word': 'in', 'token': 6},
 8: {'word': 'a', 'token': 7},
 9: {'word': '=', 'token': 8},
 10: {'word': '"', 'token': 9},
 11: {'word': 'was', 'token': 10},
 32774: {'word': 'amidst', 'token': 6882},
 13: {'word': 'the', 'token': 0},
 14: {'word': "'s", 'token': 13},
 15: {'word': 'on', 'token': 14},
 16: {'word': 'that', 'token': 15},
 17: {'word': 'for', 'token': 16},
 18: {'word': 'as', 'token': 17},
 19: {'word': 'with', 'token': 18},
 20: {'word': 'by', 'token': 19},
 21: {'word': ')', 'token': 20},
 22: {'word': '(', 'token': 21},
 23: {'word': 'is', 'token': 22},
 24: {'word': '<unk>', 'token': 23},
 25: {'word': 'his', 'token': 24},
 26: {'word': 'from', 'token': 25},
 27: {'word': 'at', 'token': 26},
 28: {'word': 'were', 'token': 27},
 29: {'

In [25]:
import pickle
# Write the token mapping to the path named in the config.
token_maps_path = config['dataset_params']['token_maps']
with open(token_maps_path, 'wb') as handle:
    pickle.dump(token_maps, handle)
print('Token mapper saved to %s' % token_maps_path)

Token mapper saved to token_maps.pkl


### Test the dataset with dataloader


In [26]:
from dataloader import build_dataloader

# NOTE(review): this build_dataloader is imported from `dataloader` and rebinds the name
# that was imported from `simple_loader` earlier in the notebook.
train_loader = build_dataloader(dataset, batch_size=32, num_workers=0, dataset_config=config['dataset_params'])

177


In [61]:
from dataloader import build_dataloader

# NOTE(review): rebinds train_loader, this time without dataset_config, so build_dataloader
# presumably falls back to its own defaults — confirm which of the two calls is intended.
train_loader = build_dataloader(dataset, batch_size=32, num_workers=0)

177


In [62]:
# Pull the first batch from the loader and unpack its five components.
words, labels, phonemes, input_lengths, masked_indices = next(iter(train_loader))

In [31]:
words.shape

torch.Size([32, 512])

In [None]:
labels.shape # labels are the original phoneme tokens

torch.Size([32, 512])

In [None]:
phonemes.shape # phonemes are the phoneme tokens after masking

torch.Size([32, 512])

In [39]:
len(input_lengths)

32

In [41]:
len(masked_indices)

32