# Notebook for preprocessing the Wikipedia dataset (Hindi dump `20231101.hi`)

### Initializing phonemizer and tokenizer

In [1]:
import yaml

config_path = "Configs/config.yml" # you can change it to anything else
# Parse the YAML configuration inside a context manager so the file handle is
# closed as soon as loading finishes instead of being left open.
with open(config_path, encoding="utf-8") as config_file:
    config = yaml.safe_load(config_file)

In [2]:
from phonemize import phonemize

In [3]:
import phonemizer
# eSpeak backend instance that process_shard() passes to phonemize().
# NOTE(review): the voice is 'en-us' while the dataset cell loads the Hindi
# Wikipedia config ("20231101.hi") — confirm this is the intended voice.
global_phonemizer = phonemizer.backend.EspeakBackend(
    language='en-us',
    preserve_punctuation=True,
    with_stress=True,
    language_switch="remove-flags",
)

In [None]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(config['dataset_params']['tokenizer']) # you can use any other tokenizers if you want to

In [64]:
root_directory = "/mnt/data/wiki_phoneme"


### Process dataset

In [5]:
from datasets import load_dataset
# Load the train split of the Wikipedia dump selected by the config name.
# you can use other version of this dataset
dataset = load_dataset(
    "wikimedia/wikipedia",
    "20231101.hi",
    split="train",
)

In [5]:
from multiprocessing import Pool
from tqdm import tqdm


In [6]:
import os
num_shards = 50000

def process_shard(i):
    """Phonemize shard ``i`` of the global ``dataset`` and save it to disk.

    The output goes to ``<root_directory>/shard_<i>``. If that path already
    exists, a message is printed and the shard is skipped.

    Args:
        i: Index of the shard, passed to ``dataset.shard`` together with the
            module-level ``num_shards``.
    """
    directory = f"{root_directory}/shard_{i}"

    # Guard clause: nothing to do when the output path is already there.
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return

    print('Processing shard %d ...' % i)

    def _to_phonemes(example):
        # Per-example transform applied by Dataset.map below.
        return phonemize(example['text'], global_phonemizer, tokenizer)

    raw_shard = dataset.shard(num_shards=num_shards, index=i)
    processed_dataset = raw_shard.map(_to_phonemes, remove_columns=['text'])

    os.makedirs(directory, exist_ok=True)
    processed_dataset.save_to_disk(directory)

In [9]:
def process_shards_parallel(num_processes=96):
    """Run ``process_shard`` over every shard index in a pool of processes.

    Args:
        num_processes: Number of worker processes in the pool.
    """
    print(f"Starting processing with {num_processes} processes")

    shard_indices = range(num_shards)
    with Pool(processes=num_processes) as pool:
        finished = pool.imap_unordered(process_shard, shard_indices)
        # process_shard returns nothing, so the iterator is drained only to
        # advance the progress bar as shards complete (in arbitrary order).
        with tqdm(total=num_shards, desc="Processing shards") as progress:
            for _ in finished:
                progress.update(1)


In [None]:
process_shards_parallel(num_processes=96)  

### Collect all shards to form the processed dataset

In [65]:
from datasets import load_from_disk, concatenate_datasets
import os 

# Every sub-directory of root_directory is treated as a candidate shard.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# order of the concatenated dataset reproducible between runs.
output = sorted(
    dI for dI in os.listdir(root_directory)
    if os.path.isdir(os.path.join(root_directory, dI))
)
datasets = []
for o in output:
    directory = os.path.join(root_directory, o)
    try:
        shard = load_from_disk(directory)
    except Exception as err:
        # Best effort: a directory that cannot be loaded is skipped, but the
        # message now names it and the reason. Unlike a bare ``except:``,
        # KeyboardInterrupt and SystemExit are allowed to propagate.
        print("Continued: could not load %s (%s)" % (directory, err))
        continue
    datasets.append(shard)

In [None]:
# Merge all loaded shards into a single dataset.
dataset = concatenate_datasets(datasets)
# Saving to disk is currently commented out. Re-enable the two lines below
# together so the "saved" message is printed only when the dataset was
# actually written.
# dataset.save_to_disk(config['data_folder'])
# print('Dataset saved to %s' % config['data_folder'])
print('Concatenated %d shards into a dataset with %d rows' % (len(datasets), len(dataset)))

In [None]:
import os

# Do not hard-code the Hugging Face access token in the notebook: read it
# from the HF_TOKEN environment variable. When the variable is unset, None is
# passed and the hub client uses the credentials stored by a previous login.
# NOTE(review): the literal token that used to be written here is part of the
# file history and should be revoked.
dataset.push_to_hub(
    "wasimmadha/plbert-dataset-hindi",  # replace with your desired repository name
    private=True,  # set to True if you want it private
    token=os.environ.get("HF_TOKEN"),
)


In [None]:
# check the dataset size
dataset

### Remove unnecessary tokens from the pre-trained tokenizer
The pre-trained tokenizer contains many tokens that never appear in our dataset, so we remove them. We also predict words in lowercase, because letter case matters little for TTS. Pruning the tokenizer is much faster than training a new tokenizer from scratch.

In [11]:
# from datasets import load_dataset
# from simple_loader import FilePathDataset, build_dataloader
    
# # Load dataset from hub
# dataset = load_dataset("wasimmadha/plbert-dataset", token=os.environ.get("HF_TOKEN"))  # never commit a literal access token


In [None]:
from simple_loader import FilePathDataset, build_dataloader

# Later cells refer to both ``file_data`` and ``file_data1``, but only
# ``file_data1`` used to be defined, so building the loader raised NameError.
# Bind both names to the same wrapped dataset.
file_data = FilePathDataset(dataset)
file_data1 = file_data
loader = build_dataloader(file_data, num_workers=32, batch_size=128)

In [84]:
loader = build_dataloader(file_data1, num_workers=32, batch_size=128)

In [None]:
# The loader is iterated batch by batch elsewhere in this notebook, so
# enumerate() is needed to get the (index, batch) pairs this loop unpacks.
for i, batch in enumerate(loader):
    print(batch)

In [None]:
file_data[0]['input_ids']

In [None]:
file_data1[0]['phonemes']

In [72]:
import pickle

# Load the pickled token map from disk.
# NOTE: unpickling can execute arbitrary code — only load files you trust.
token_maps_path = "/home/ubuntu/PL-BERT/multilingual-pl-bert/token_maps.pkl"
with open(token_maps_path, mode='rb') as pkl_file:
    token_maps = pickle.load(pkl_file)

In [None]:
token_maps[50100]

In [None]:
file_data[0]

In [None]:
tokenizer.encode(text="कन्हैयालाल सेठिया")

In [58]:
special_token = config['dataset_params']['word_separator']

In [None]:
# get all unique tokens in the entire dataset

from tqdm import tqdm

# Collect token ids in a set so duplicates are dropped as batches arrive.
# The previous ``list(unique_index)`` only copied the list on every batch:
# nothing was de-duplicated and the list grew with every token seen.
unique_tokens = {special_token}
for batch in tqdm(loader):
    unique_tokens.update(batch)
unique_index = list(unique_tokens)

In [None]:
# get each token's lower case

# For each token, record the id of its lower-cased form (or the token itself
# when it is already lower case).
lower_tokens = []
for t in tqdm(unique_index):
    word = tokenizer.decode([t])
    lowered = word.lower()
    if lowered != word:
        # Keep a single id, using the same decode/encode expressions as the
        # token-map cell below so its ``lower_tokens`` lookup can find it.
        # The previous code appended the whole list returned by encode(),
        # which is unhashable and broke the set() de-duplication that follows.
        # NOTE(review): whether encode([...])[0] is the intended id depends on
        # the tokenizer loaded from the config (e.g. one that prepends special
        # tokens) — confirm.
        t = tokenizer.encode([lowered])[0]
    lower_tokens.append(t)

In [None]:
lower_tokens = (list(set(lower_tokens)))

In [None]:
lower_tokens

In [None]:
# redo the mapping for lower number of tokens

# Position of every pruned token id, built once so the loop below does not
# rescan ``lower_tokens`` with list.index() for each token (quadratic).
# Iterating in reverse keeps the first position if an id occurs more than
# once, which is what list.index() returned.
lower_token_positions = {
    tok: pos for pos, tok in reversed(list(enumerate(lower_tokens)))
}

token_maps = {}
for t in tqdm(unique_index):
    word = tokenizer.decode([t]).lower()
    new_t = tokenizer.encode([word])[0]
    # An id missing from lower_tokens now raises KeyError (was ValueError).
    token_maps[t] = {'word': word, 'token': lower_token_positions[new_t]}

In [None]:
import pickle
# Write the token map to the path configured under dataset_params.token_maps.
token_maps_file = config['dataset_params']['token_maps']
with open(token_maps_file, 'wb') as out_file:
    pickle.dump(token_maps, out_file)
print('Token mapper saved to %s' % token_maps_file)

### Test the dataset with dataloader


In [None]:
from dataloader import build_dataloader

train_loader = build_dataloader(dataset, batch_size=32, num_workers=0, dataset_config=config['dataset_params'])

In [20]:
_, (words, labels, phonemes, input_lengths, masked_indices) = next(enumerate(train_loader))

In [None]:
words

In [None]:
labels

In [None]:
phonemes

In [None]:
input_lengths

In [None]:
masked_indices