Used in Chinese #6

Closed
mayfool opened this issue Apr 18, 2023 · 5 comments
mayfool commented Apr 18, 2023

For Chinese datasets, should the English phonemes used in the paper be replaced with Chinese pinyin? Are any other changes needed?

yl4579 (Owner) commented Apr 18, 2023

You can use pinyin, but I prefer IPA since it is more general. You can refer to yl4579/StyleTTS#10 for more details about how to phonemize Chinese characters into IPA. You can make the graphemes either word-level or character-level; character-level is probably easier since no alignment is needed. I will release the preprocessing recipe for Japanese, which is word-level and involves alignment, later.
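
For the character-level route, a minimal sketch of the idea (this assumes pypinyin, which is my own choice here rather than the tool from the linked issue; the output would still need a pinyin-to-IPA table):

from pypinyin import pinyin, Style

def char_level_pinyin(text):
    # one pinyin syllable per character, so no grapheme-phoneme alignment is needed
    return [s[0] for s in pinyin(text, style=Style.TONE3)]

print(char_level_pinyin("你好世界"))  # ['ni3', 'hao3', 'shi4', 'jie4']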

yl4579 closed this as completed Apr 19, 2023
mayfool (Author) commented Apr 24, 2023

Thanks for the reply, I'll give it a try later as you suggested.

innnky commented Nov 7, 2023

I will release the preprocessing recipe for Japanese later at the word level that involves alignment.

Could you please share the code for aligning Japanese word-level graphemes with phoneme-level output, or some simple ideas and code snippets?

yl4579 (Owner) commented Nov 7, 2023

@innnky Here are some code snippets I used for Japanese (note that this is not IPA; for IPA you need to convert the kana using the mapping here: yl4579/StyleTTS#10 (comment)).
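
The kana-to-IPA pass itself is then just a table lookup; here is a minimal sketch, with two illustrative entries of my own rather than the actual table from the linked comment:

def kana_to_ipa(kana, table):
    # table maps kana to IPA, e.g. {"カ": "ka", "シ": "ɕi"}; digraphs such as キャ
    # would need longest-match lookup rather than this per-character pass
    return ''.join(table.get(ch, ch) for ch in kana)

The snippets below stay at the kana level.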

from datasets import load_dataset
# wiki40b is preprocessed with Apache Beam; DirectRunner executes it locally
dataset = load_dataset("wiki40b", "ja", split='train', beam_runner='DirectRunner')

from transformers import AutoTokenizer
# Japanese BERT tokenizer; its token ids serve as the word-level grapheme inputs
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v2")

Some utility functions:

def alphabetreading(text):
    # Replace Latin letters with their katakana readings,
    # e.g. alphabetreading("ABC社") -> "エイビーシー社"
    alphabet_dict = {"A": "エイ",
                     "B": "ビー",
                     "C": "シー",
                     "D": "ディー",
                     "E": "イー",
                     "F": "エフ",
                     "G": "ジー",
                     "H": "エイチ",
                     "I": "アイ",
                     "J": "ジェイ",
                     "K": "ケイ",
                     "L": "エル",
                     "M": "エム",
                     "N": "エヌ",
                     "O": "オー",
                     "P": "ピー",
                     "Q": "キュー",
                     "R": "アール",
                     "S": "エス",
                     "T": "ティー",
                     "U": "ユー",
                     "V": "ヴィー",
                     "W": "ダブリュー",
                     "X": "エックス",
                     "Y": "ワイ",
                     "Z": "ゼッド"}
    text = text.upper()
    text_ret = ""
    for t in text:
        if t in alphabet_dict:
            text_ret += alphabet_dict[t]
        else:
            text_ret += t
    return text_ret

import unicodedata
import pyopenjtalk
import jaconv

def extract_features(text):
    # Older pyopenjtalk API (pre-0.3): run_frontend returns the NJD features as
    # comma-separated strings plus the full-context labels
    NJD_print, JPCommon_make_label = pyopenjtalk.run_frontend(unicodedata.normalize('NFKC', text).replace('\n', ''))

    words = []

    for n in NJD_print:
        k = n.split(',')
        kana = unicodedata.normalize("NFKC", k[9])    # field 9: katakana pronunciation
        tokens = unicodedata.normalize("NFKC", k[0])  # field 0: surface form

        if k[1] == "記号":  # POS "symbol": no reading, so spell out any Latin letters
            words.append({'tokens': tokenizer.encode([tokens])[1], 'words': tokens, 'kana': alphabetreading(tokens)})
        else:
            if kana == "ュ" or kana == "ャ" or kana == "ョ":
                # stray small kana belong to the previous word, so merge them
                words[-1]['words'] += k[0]
                words[-1]['kana'] += kana
            else:
                # encode([tokens]) treats the word as pre-tokenized; index 1 skips [CLS]
                words.append({'tokens': tokenizer.encode([tokens])[1], 'words': tokens, 'kana': kana})

    return words

def phonemize(text):
    # process one sentence at a time; the split drops the '。', so restore it
    texts = text.split('。')
    words = []
    for text in texts:
        if text == "":
            continue
        if text[-1] != "。":
            text += "。"
        words.extend(extract_features(text))
    input_ids = [w['tokens'] for w in words]
    phonemes = [w['kana'] for w in words]

    return {'input_ids': input_ids, 'phonemes': phonemes}
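
A quick usage sketch (the exact token ids and kana depend on your pyopenjtalk and tokenizer versions, so the point here is the shape, not the values):

out = phonemize("こんにちは。元気ですか。")
# one BERT token id per word, paired with that word's kana reading
assert len(out['input_ids']) == len(out['phonemes'])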

Then process the dataset in shards:

import os
def process_shard(i):
    directory = "./wiki_phoneme/shard_" + str(i)
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return
    print('Processing shard %d ...' % i)
    shard = dataset.shard(num_shards=1000, index=i)
    # keep only the body text after the _START_PARAGRAPH_ markers, then phonemize it
    processed_dataset = shard.map(lambda t: phonemize(''.join(t['text'].split('_START_PARAGRAPH_')[1:])), remove_columns=['text'])
    os.makedirs(directory, exist_ok=True)
    processed_dataset.save_to_disk(directory)

from pebble import ProcessPool
from concurrent.futures import TimeoutError

# each shard gets up to 120 s; anything slower is terminated and left for a re-run
with ProcessPool(max_workers=32) as pool:
    pool.map(process_shard, range(1000), timeout=120)
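
If you want to see which shards hit the timeout, pebble's map returns a future whose result iterator raises TimeoutError per stuck item; this is the documented pebble pattern rather than what I ran, and timed-out shards are simply picked up on the next run since process_shard skips finished directories:

with ProcessPool(max_workers=32) as pool:
    future = pool.map(process_shard, range(1000), timeout=120)
    results = future.result()
    while True:
        try:
            next(results)
        except StopIteration:
            break
        except TimeoutError:
            print("a shard timed out and will be retried on the next run")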

Lastly, put them together:

directory = "wiki_phoneme"
import os
output = [dI for dI in os.listdir(directory) if os.path.isdir(os.path.join(directory,dI))]

from datasets import load_from_disk, concatenate_datasets

datasets = []

for o in output:
    directory = "./wiki_phoneme/" + o
    try:
        shard = load_from_disk(directory)
        datasets.append(shard)
        print("%s loaded" % o)
    except:
        continue
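
The final concatenation call is not shown above; given the concatenate_datasets import, the last step would presumably be:

# merge the per-shard datasets into one (assumes the loop above populated `datasets`)
dataset = concatenate_datasets(datasets)
print(dataset)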

innnky commented Nov 7, 2023

Thanks for the reply!! ❤️❤️
