In [1]:
import os
import re
import glob
import pandas as pd
import librosa
import soundfile as sf
from tqdm import tqdm
import subprocess
from collections import Counter
import MeCab
# import pyopenjtalk

import torch

from transformers import (
    Wav2Vec2ForCTC, 
    Wav2Vec2CTCTokenizer, 
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor)

hira_list = [
    # Gojuon letters
    'あ','い','う','え','お' # a-row
    'か','き','く','け','こ', # ka-row
    'さ','し','す','せ','そ',# sa-row
    'た','ち','つ','て','と', # ta-row
    'な','に','ぬ','ね','の',# na-row
    'は','ふ','へ','ほ','ひ', # ha-row
    'ま','め','も','み','む',# ma-row
    'や','よ','ゆ', # ya-row
    'ら','り','る','れ','ろ', # ra-row
    'わ','を', # wa-row
    'ん', # n-row

    # Dakuon letters
    'が','ぎ','ぐ','げ','ご', # ga-row
    'ざ','ぜ','ず','じ','ぞ', # za-row
    'だ','ど','づ','で','ぢ', # da-row
    'ぼ','び','ぶ','べ','ば', # ba-row

    # Han-Dakuon letters
    'ぱ','ぴ','ぷ','ぺ','ぽ',

    # Yoon letters
    'ょ','ゅ','ゃ',
    'ぇ','ぁ','ぉ','ぃ','っ'] # half-width

In [None]:
class ASRDataset:
    def __init__(self):
        self.main_dir = "Datasets/ASR-dataset"
        self.data = pd.concat([
            self.get_kokoro(),
            self.get_meian(),
            self.get_jsut(),
            self.get_commonvoice()
        ], ignore_index=True)
        self.data = self.resample(self.data).dropna()
        self.data['sentence'] = self.data['sentence'].apply(self.CleanKanji)
        self.data = self.data.dropna().reset_index(drop=True)
        # self.data['phonemes'] = self.data['sentence'].apply(pyopenjtalk.g2p) 
        self.data.to_csv(f"{self.main_dir}/ASRDataset.csv", encoding="utf-8", index=False)

    def get_kokoro(self):
        data = pd.read_csv(
            "Datasets\KOKORO-dataset\metadata.csv", 
            sep="|", encoding="utf-8", header=None)
        data.columns = ["path", "sentence", "transliteration"]
        data = data[["path", "sentence"]]
        data['path'] = data['path'].apply(
            lambda x: r"Datasets\KOKORO-dataset\wav/" + x + ".wav")
        return data

    def get_meian(self):
        data = pd.read_csv(r"Datasets\MEIAN-dataset\transcript.txt", sep="|", header=None)
        data.columns = ["path", "sentence", "tranliteration", "duration"]
        data = data[["path", "sentence"]]
        data['path'] = data['path'].apply(
            lambda x: r"Datasets\MEIAN-dataset\wav/" + x.split("/")[-1])
        return data

    def get_jsut(self):
        filenames, sentences = [], []
        for transcript in glob.glob(r"Datasets\JSUT-dataset\*\transcript_utf8.txt"):
            file_path = transcript.rsplit("\\", 1)[0]
            with open(transcript, "r", encoding="utf-8") as f:
                lines = f.readlines()
                for line in lines: 
                    filename, sentence = line.split(":")
                    filenames.append(os.path.join(file_path, "wav", filename) + ".wav")
                    sentences.append(sentence.strip("\n"))
        data = pd.DataFrame({'path': filenames, 'sentence': sentences}) 
        return data 

    def get_commonvoice(self):
        data = pd.read_csv(r"Datasets\CommonVoice-dataset\validated.tsv", sep="\t")
        data = data[['path', 'sentence']]    
        data['path'] = data['path'].apply(
            lambda x: r"Datasets\CommonVoice-dataset\mp3/" + x)
        return data

    def resample(self, data):
        for i, in_path in tqdm(enumerate(data['path']), total=len(data['path'])):
            in_path = in_path.replace("\\", "/")
            out_path = f"{self.main_dir}/wav_cleaned"
            filename = in_path.rsplit("/", 1)[-1]
            if in_path.endswith("mp3"):
                filename = filename.replace("mp3", "wav")
                out_path = os.path.join(out_path, filename)
                if not os.path.exists(out_path):
                    subprocess.call([
                        "ffmpeg", "-i", in_path,"-acodec", "pcm_s16le", 
                        "-ar", "16000", out_path])
            else:
                sample_rate = librosa.get_samplerate(in_path)
                out_path = os.path.join(out_path, filename)
                if not os.path.exists(out_path):
                    if sample_rate != 16000:
                        subprocess.call([
                            "ffmpeg", "-i", in_path, "-ar", "16000", out_path])
            data['path'][i] = filename  
        return data       

    def CleanKanji(self, sentence):
        wakati = MeCab.Tagger("-Owakati")
        symbols = r"[（.*?）！-～.,;..._。、-〿・■（）：ㇰ-ㇿ㈠-㉃㊀-㋾㌀-㍿「」『』→ー -~‘–※π—ゐ’“”]"
        sentence = re.sub(symbols, "", sentence)
        sentence = wakati.parse(sentence).strip("\n")              
        return sentence

# data = ASRDataset().data
# data

In [125]:
def clean_phonemes(phonemes):   
    for i, phoneme in enumerate(phonemes):
        if i < len(phonemes) - 1:
            if phonemes[i] == "cl":
                if phonemes[i+1] == "pau":
                    phonemes = phonemes[:i] + [phonemes[i+2]] + phonemes[i+2:]
                else:
                    phonemes[i] = phonemes[i+1]
            if phonemes[i] == "pau":
                if phonemes[i+1] == "N":
                    phonemes = phonemes[:i] + phonemes[i+1:]
    for i, phoneme in enumerate(phonemes):
        phonemes[i] = phonemes[i].replace("U", "u")
        phonemes[i] = phonemes[i].replace("I", "i")
        phonemes[i] = phonemes[i].replace("pau", " ")
    return phonemes

data = pd.read_csv("Datasets\ASR-dataset\ASRDataset.csv", encoding="utf-8").dropna()
data = data[~data['phonemes'].str.contains("ty")]
data['phonemes'] = data['phonemes'].str.split()
data['phonemes'] = data['phonemes'].apply(clean_phonemes)
data['hiragana'] = data['phonemes'].apply("".join)
data

Unnamed: 0,path,sentence,phonemes,hiragana
0,kusamakura-by-soseki-natsume-00001.wav,草枕 夏目 漱石,"[k, u, s, a, m, a, k, u, r, a, , n, a, ts, u,...",kusamakura natsume sooseki
1,kusamakura-by-soseki-natsume-00002.wav,やま みち を 登り ながら こう 考え たち に 働け ば かど が 立つ,"[y, a, m, a, , m, i, ch, i, , o, , n, o, b,...",yama michi o nobori nagara kou kaNgae tachi ni...
2,kusamakura-by-soseki-natsume-00003.wav,じょう に さおさせ ば 流さ れる 意地 を とおせ ば きゅうくつ だ,"[j, o, o, , n, i, , s, a, o, s, a, s, e, , ...",joo ni saosase ba ryuusa reru iji o toose ba k...
3,kusamakura-by-soseki-natsume-00004.wav,とかく に 人 の 世 は 住み にくい 住み にく さ が こうじる と,"[t, o, k, a, k, u, , n, i, , h, i, t, o, , ...",tokaku ni hito no yo wa sumi nikui sumi niku s...
4,kusamakura-by-soseki-natsume-00005.wav,安い 所 へ 引き越し たく なる どこ へ 越し て も,"[y, a, s, u, i, , t, o, k, o, r, o, , e, , ...",yasui tokoro e hikikoshi taku naru doko e kosh...
...,...,...,...,...
44367,common_voice_ja_27369273.wav,月並み な アイデア から ヒント を 見つけ だす,"[ts, u, k, i, n, a, m, i, , n, a, , a, i, d,...",tsukinami na aidea kara hiNto o mitsuke dasu
44368,common_voice_ja_27388440.wav,直観 と は ただ 我 の 自己 が 世界 の 形成 作用 と し て 世界 の 中 に 含...,"[ch, o, k, k, a, N, , t, o, , w, a, , t, a,...",chokkaN to wa tada waga no jiko ga sekai no ke...
44369,common_voice_ja_27388441.wav,斯く あっ た から 斯く す べし と し て,"[k, a, k, u, , a, t, t, a, , k, a, r, a, , ...",kaku atta kara kaku su beshi to shi te
44370,common_voice_ja_27388443.wav,いつ も 自己 自身 の 中 に 自己 を 越え た もの 超越 的 なる もの を 含む ...,"[i, ts, u, , m, o, , j, i, k, o, , j, i, sh...",itsu mo jiko jishiN no naka ni jiko o koe ta m...


In [116]:
unique_phonemes = set()
for sentence in data['phonemes']:
    for phoneme in sentence:
        unique_phonemes.add(phoneme)

vowels = ['a', 'e', 'i', 'o', 'u']
markers = ['pau', 'cl']
special = ['N']
consonants = [phoneme for phoneme in unique_phonemes if 
    phoneme not in (vowels + markers + special)]

print("Vowels:\n", vowels)
print("Consonants:\n", consonants)
print("Special:\n", special)
print("Markers:\n", markers)

Vowels:
 ['a', 'e', 'i', 'o', 'u']
Consonants:
 ['ch', 'm', 'd', 'my', 'hy', 'sh', 'p', 't', 'k', 'r', 'b', 's', 'gy', 'py', 'g', 'n', 'h', 'f', 'ts', 'y', 'v', 'j', 'w', 'ny', 'ky', 'dy', 'by', 'z', ' ', 'ry']
Special:
 ['N']
Markers:
 ['pau', 'cl']


In [None]:
len(unique_phonemes)

In [None]:
def phoa(in_path):
    lines = []
    with open(in_path, 'r') as f:
        text = f.read().split("\n")
        i = 0
        while True:
            start_1, end_1, phoneme_1 = text[i].split(' ')
            if phoneme_1 in consonants: 
                start_2, end_2, phoneme_2 = text.pop(i+1).split(' ')
                mora =  (phoneme_1 + phoneme_2).lower()
                line = " ".join([str(start_1), str(end_2), mora])
                lines.append(line)
            else:
                if phoneme_1 == 'o':
                    phoneme_1 = 'wo'
                line = " ".join([str(start_1), str(end_1), phoneme_1])
                lines.append(line)
            if text[i] == text[-1]:
                break
            i += 1
    lines = "\n".join(lines)
    return lines

def phoneme2mora(phonemes):
    for i, phoneme in enumerate(phonemes):
        if phoneme in consonants:
            mora = (phoneme)

    return phonemes
    


phoneme2mora(data['phonemes'][0])

In [None]:
def get_mora(in_path):
    lines = []
    with open(in_path, 'r') as f:
        text = f.read().split("\n")
        i = 0
        while True:
            start_1, end_1, phoneme_1 = text[i].split(' ')
            if phoneme_1 in consonants: 
                start_2, end_2, phoneme_2 = text.pop(i+1).split(' ')
                mora =  (phoneme_1 + phoneme_2).lower()
                lines.append(mora)
            else:
                if phoneme_1 == 'o':
                    phoneme_1 = 'wo'
                lines.append(phoneme_1)
            if text[i] == text[-1]:
                break
            i += 1
    return lines

In [None]:
vocab = []
for sentence in data['phonemes']:
    for kana in sentence:
        vocab.append(kana)

counter = Counter(vocab)
list(counter.most_common()[::-1])

In [None]:
len(counter)

In [None]:
feature_extractor = Wav2Vec2FeatureExtractor(
    feature_size=1,
    sampling_rate=16000,
    padding_value=0.0,
    do_normalize=True,
    return_attention_mask=True
)

tokenizer = Wav2Vec2CTCTokenizer("./vocab.json")

processor = Wav2Vec2Processor(
    feature_extractor=feature_extractor,
    tokenizer=tokenizer)

processor

In [None]:
model = Wav2Vec2ForCTC.from_pretrained(
    "facebook/wav2vec2-large-xlsr-53",
    attention_dropout=0.1,
    hidden_dropout=0.1,
    feat_proj_dropout=0.0,
    mask_time_prob=0.05,
    layerdrop=0.1,
    gradient_checkpointing=True,
    ctc_loss_reduction="mean",
    pad_token_id=processor.tokenizer.pad_token_id,
    vocab_size=len(processor.tokenizer)
)

model