Used in Chinese #6

Closed
mayfool opened this issue Apr 18, 2023 · 5 comments
mayfool commented Apr 18, 2023

For Chinese datasets, should the English phonemes used in the paper be replaced with Chinese pinyin? Are any other changes needed?

yl4579 (Owner) commented Apr 18, 2023

You can use pinyin, but I prefer IPA since it is more general. You can refer to yl4579/StyleTTS#10 for more details about how to phonemize Chinese characters into IPA. You can make the graphemes either word-level or character-level; character-level is probably easier since no alignment is needed. I will release the preprocessing recipe for Japanese, which is word-level and involves alignment, later.
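
For the character-level route, a minimal sketch of the idea (this assumes pypinyin, which is my own choice here rather than the tool from the linked issue; the output would still need a pinyin-to-IPA table):

from pypinyin import pinyin, Style

def char_level_pinyin(text):
    # one pinyin syllable per character, so no grapheme-phoneme alignment is needed
    return [s[0] for s in pinyin(text, style=Style.TONE3)]

print(char_level_pinyin("你好世界"))  # ['ni3', 'hao3', 'shi4', 'jie4']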

yl4579 closed this as completed Apr 19, 2023
mayfool (Author) commented Apr 24, 2023

Thanks for the reply, I'll give it a try later as you suggested.

innnky commented Nov 7, 2023

I will release the preprocessing recipe for Japanese later at the word level that involves alignment.

Could you please share the code for aligning Japanese word-level graphemes with phoneme-level output, or some simple ideas and code snippets?

yl4579 (Owner) commented Nov 7, 2023

@innnky Here are some code snippets I used for Japanese (note that this is not IPA; for IPA you need to convert the kana using the mapping here: yl4579/StyleTTS#10 (comment)).
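
The kana-to-IPA pass itself is then just a table lookup; here is a minimal sketch, with two illustrative entries of my own rather than the actual table from the linked comment:

def kana_to_ipa(kana, table):
    # table maps kana to IPA, e.g. {"カ": "ka", "シ": "ɕi"}; digraphs such as キャ
    # would need longest-match lookup rather than this per-character pass
    return ''.join(table.get(ch, ch) for ch in kana)

The snippets below stay at the kana level.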

from datasets import load_dataset
# wiki40b is preprocessed with Apache Beam; DirectRunner executes it locally
dataset = load_dataset("wiki40b", "ja", split='train', beam_runner='DirectRunner')

from transformers import AutoTokenizer
# Japanese BERT tokenizer; its token ids serve as the word-level grapheme inputs
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-v2")

Some utility functions:

def alphabetreading(text):
    # Replace Latin letters with their katakana readings,
    # e.g. alphabetreading("ABC社") -> "エイビーシー社"
    alphabet_dict = {"A": "エイ",
                     "B": "ビー",
                     "C": "シー",
                     "D": "ディー",
                     "E": "イー",
                     "F": "エフ",
                     "G": "ジー",
                     "H": "エイチ",
                     "I": "アイ",
                     "J": "ジェイ",
                     "K": "ケイ",
                     "L": "エル",
                     "M": "エム",
                     "N": "エヌ",
                     "O": "オー",
                     "P": "ピー",
                     "Q": "キュー",
                     "R": "アール",
                     "S": "エス",
                     "T": "ティー",
                     "U": "ユー",
                     "V": "ヴィー",
                     "W": "ダブリュー",
                     "X": "エックス",
                     "Y": "ワイ",
                     "Z": "ゼッド"}
    text = text.upper()
    text_ret = ""
    for t in text:
        if t in alphabet_dict:
            text_ret += alphabet_dict[t]
        else:
            text_ret += t
    return text_ret

import unicodedata
import pyopenjtalk
import jaconv

def extract_features(text):
    # Older pyopenjtalk API (pre-0.3): run_frontend returns the NJD features as
    # comma-separated strings plus the full-context labels
    NJD_print, JPCommon_make_label = pyopenjtalk.run_frontend(unicodedata.normalize('NFKC', text).replace('\n', ''))

    words = []

    for n in NJD_print:
        k = n.split(',')
        kana = unicodedata.normalize("NFKC", k[9])    # field 9: katakana pronunciation
        tokens = unicodedata.normalize("NFKC", k[0])  # field 0: surface form

        if k[1] == "記号":  # POS "symbol": no reading, so spell out any Latin letters
            words.append({'tokens': tokenizer.encode([tokens])[1], 'words': tokens, 'kana': alphabetreading(tokens)})
        else:
            if kana == "ュ" or kana == "ャ" or kana == "ョ":
                # stray small kana belong to the previous word, so merge them
                words[-1]['words'] += k[0]
                words[-1]['kana'] += kana
            else:
                # encode([tokens]) treats the word as pre-tokenized; index 1 skips [CLS]
                words.append({'tokens': tokenizer.encode([tokens])[1], 'words': tokens, 'kana': kana})

    return words

def phonemize(text):
    # process one sentence at a time; the split drops the '。', so restore it
    texts = text.split('。')
    words = []
    for text in texts:
        if text == "":
            continue
        if text[-1] != "。":
            text += "。"
        words.extend(extract_features(text))
    input_ids = [w['tokens'] for w in words]
    phonemes = [w['kana'] for w in words]

    return {'input_ids': input_ids, 'phonemes': phonemes}
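
A quick usage sketch (the exact token ids and kana depend on your pyopenjtalk and tokenizer versions, so the point here is the shape, not the values):

out = phonemize("こんにちは。元気ですか。")
# one BERT token id per word, paired with that word's kana reading
assert len(out['input_ids']) == len(out['phonemes'])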

Then process the dataset in shards:

import os
def process_shard(i):
    directory = "./wiki_phoneme/shard_" + str(i)
    if os.path.exists(directory):
        print("Shard %d already exists!" % i)
        return
    print('Processing shard %d ...' % i)
    shard = dataset.shard(num_shards=1000, index=i)
    # keep only the body text after the _START_PARAGRAPH_ markers, then phonemize it
    processed_dataset = shard.map(lambda t: phonemize(''.join(t['text'].split('_START_PARAGRAPH_')[1:])), remove_columns=['text'])
    os.makedirs(directory, exist_ok=True)
    processed_dataset.save_to_disk(directory)

from pebble import ProcessPool
from concurrent.futures import TimeoutError

# each shard gets up to 120 s; anything slower is terminated and left for a re-run
with ProcessPool(max_workers=32) as pool:
    pool.map(process_shard, range(1000), timeout=120)
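
If you want to see which shards hit the timeout, pebble's map returns a future whose result iterator raises TimeoutError per stuck item; this is the documented pebble pattern rather than what I ran, and timed-out shards are simply picked up on the next run since process_shard skips finished directories:

with ProcessPool(max_workers=32) as pool:
    future = pool.map(process_shard, range(1000), timeout=120)
    results = future.result()
    while True:
        try:
            next(results)
        except StopIteration:
            break
        except TimeoutError:
            print("a shard timed out and will be retried on the next run")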

Lastly, put them together:

directory = "wiki_phoneme"
import os
output = [dI for dI in os.listdir(directory) if os.path.isdir(os.path.join(directory,dI))]

from datasets import load_from_disk, concatenate_datasets

datasets = []

for o in output:
    directory = "./wiki_phoneme/" + o
    try:
        shard = load_from_disk(directory)
        datasets.append(shard)
        print("%s loaded" % o)
    except:
        continue
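
The final concatenation call is not shown above; given the concatenate_datasets import, the last step would presumably be:

# merge the per-shard datasets into one (assumes the loop above populated `datasets`)
dataset = concatenate_datasets(datasets)
print(dataset)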

innnky commented Nov 7, 2023

Thanks for the reply!! ❤️❤️
