# 音素のlabファイルから、形態素の単位のlabファイルを作成する

1. JMAC のふりがな部分を除いて `pyopenjtalk.run_frontend` で形態素ごとに読み情報を得る
2. JMAC のふりがなと比べて，異なっているところは結果を上書きする（無声音を表す `'` に注意する） 
3. 上記のルールでカタカナ表記の発音を取り出す
4. jaconv などでカタカナ→ひらがなに変換し，さらに jaconv の `hiragana2julius` で音素列に変換する
5. 形態素ごとに音素列を保存しておく
6. 音素列をつなげたものを Julius に入力してアラインメント結果を得る
7. 形態素ごとの音素列と比較して，形態素ごとに時間情報を得る

In [None]:
# conda activate pyopenjtalk_julius

# set path
import sys

sys.path.append("/home/takeshun256/PausePrediction")

# import standard library
from pprint import pprint
from pathlib import Path
import yaml
import pandas as pd
import pyopenjtalk
from pyopenjtalk import run_frontend, g2p
import jaconv
import re
from tqdm import tqdm

# import own library
from config import DATA_DIR, DATA_TAKESHUN256_DIR, SRC_DIR, DATA_IN_ROOT_DIR
from src.analyze_jmac.text_preprocessing import (
    AudiobookScriptPreprocessor as Preprocessor,
)
from src.analyze_jmac.mecab import mecab_wakati_generator, mecab_detailed_generator

# Experiment locations: corpus / experiment names and the paths derived from them.
corpus_name, exp_name = "jmac", "03_VAD_Adjusted"

exp_dir = Path(DATA_TAKESHUN256_DIR).joinpath(corpus_name, exp_name)
yaml_file_path = Path(DATA_IN_ROOT_DIR).joinpath(corpus_name, "text_audio_dict_new.yaml")

# Fail fast when the experiment directory or the source YAML is missing.
assert exp_dir.exists()
assert yaml_file_path.exists()

In [None]:
# Probe: run the OpenJTalk frontend on a raw sentence that still contains the
# [kanji|furigana] annotations and a stray "『", to inspect the returned features.
run_frontend("これは、[私|わたし]が小さい『ときに、村の[茂平|もへい]というおじいさんからきいたお話です。")

In [None]:
# テキストの前処理関数を定義


class TextPreprocessor:
    """Turn one JMAC transcript sentence into per-morpheme Julius phoneme lists.

    Steps performed by :meth:`preprocess_text`:

    1. Replace every ``[kanji|furigana]`` annotation by its kanji part and run
       ``pyopenjtalk.run_frontend`` on the normalized text to obtain one
       feature dict per morpheme.
    2. When a morpheme's surface string equals an annotated kanji, use the
       pronunciation derived from the furigana instead of the frontend's own
       (the original notes say to watch out for the mark that denotes
       devoiced sounds).
    3. Remove the "’" marks and keep the katakana pronunciation.
    4. Convert katakana to hiragana (``jaconv.kata2hira``) and then to a Julius
       phoneme sequence (``jaconv.hiragana2julius``).
    5. Keep the phoneme sequence of every morpheme.

    Feeding the joined phonemes to Julius and deriving per-morpheme timing
    (steps 6 and 7 of the overall workflow) are not done by this class.

    Usage:
        >>> text_preprocessor = TextPreprocessor()
        >>> text_preprocessor.preprocess_text(text)

    Input:
        text (str): JMAC text, e.g., 'これは、[私|わたし]が小さいときに、村の[茂平|もへい]というおじいさんからきいたお話です。'

    NOTE(review): an annotation only overrides a pronunciation when a single
    morpheme's surface string is exactly the annotated kanji, and when the
    same kanji is annotated more than once the last furigana wins.
    """

    # One "[kanji|furigana]" annotation: group 1 is the kanji, group 2 the furigana.
    _ANNOTATION_PATTERN = re.compile(r"\[(.+?)\|(.+?)\]")

    def __init__(self):
        # Handles to the MeCab helper generators imported at the top of the file.
        self.mecab_wakati_generator = mecab_wakati_generator
        self.mecab_detailed_generator = mecab_detailed_generator

    @staticmethod
    def remove_brackets_to_kanji(x):
        """Replace each annotation with its kanji part.

        Example:
            in:  "お[菓子|かし]がひとつ"  # [kanji|furigana]
            out: "お菓子がひとつ"
        """
        return TextPreprocessor._ANNOTATION_PATTERN.sub(r"\1", x)

    @staticmethod
    def remove_brackets_to_furigana(x):
        """Replace each annotation with its furigana part.

        Example:
            in:  "お[菓子|かし]がひとつ"  # [kanji|furigana]
            out: "おかしがひとつ"
        """
        return TextPreprocessor._ANNOTATION_PATTERN.sub(r"\2", x)

    def normalize(self, text):
        """Return ``jaconv.normalize(text)``."""
        return jaconv.normalize(text)

    def get_jmac_blacket_dict(self, text):
        """Map every annotated kanji in ``text`` to the pronunciation of its furigana.

        The pronunciation is the concatenation of the ``"pron"`` fields that
        ``run_frontend`` returns for the normalized furigana.
        """
        kanji_to_furigana = dict(self._ANNOTATION_PATTERN.findall(text))
        kanji_to_pron = {}
        for kanji, furigana in kanji_to_furigana.items():
            features = run_frontend(self.normalize(furigana))
            kanji_to_pron[kanji] = "".join(feature["pron"] for feature in features)
        return kanji_to_pron

    def preprocess_text(self, text):
        """Run the whole pipeline on ``text``.

        Intermediate results are kept on the instance (``original_text``,
        ``blacket_removed_kanji_text``, ``normalized_text``, ``njd_features``,
        ``jmac_blacket_dict``, ``morp_pron_list``, ``morp_phons_list``).

        Returns:
            dict with keys ``"morp_join"`` (surface strings concatenated),
            ``"phons_join"`` (all phonemes joined by single spaces) and
            ``"morp_phons_list"`` (``[[morpheme, [phoneme, ...]], ...]``).
        """
        self.original_text = text
        self.blacket_removed_kanji_text = self.remove_brackets_to_kanji(text)
        self.normalized_text = self.normalize(self.blacket_removed_kanji_text)
        self.njd_features = run_frontend(self.normalized_text)

        self.jmac_blacket_dict = self.get_jmac_blacket_dict(self.original_text)

        # Per-morpheme katakana pronunciations: [[morpheme, katakana], ...]
        self.morp_pron_list = []
        for feature in self.njd_features:
            if feature["pos"] == "記号":
                # Among symbols only "、" is kept; every other symbol is dropped.
                if feature["string"] != "、":
                    continue
                reading = "、"
            else:
                # Prefer the furigana-derived pronunciation when the surface
                # string is one of the annotated kanji.
                reading = self.jmac_blacket_dict.get(
                    feature["string"], feature["pron"]
                )
            # Remove the "’" marks from the pronunciation.
            reading = reading.replace("’", "")
            self.morp_pron_list.append([feature["string"], reading])

        # Katakana -> hiragana -> Julius phonemes; "、" becomes a short pause "sp".
        # Result: [[morpheme, [phoneme, ...]], ...]
        self.morp_phons_list = [
            [
                surface,
                ["sp"]
                if reading == "、"
                else jaconv.hiragana2julius(jaconv.kata2hira(reading)).split(" "),
            ]
            for surface, reading in self.morp_pron_list
        ]

        morp_join = "".join(surface for surface, _ in self.morp_phons_list)
        phons_join = " ".join(
            phoneme for _, phonemes in self.morp_phons_list for phoneme in phonemes
        )

        return {
            "morp_join": morp_join,
            "phons_join": phons_join,
            "morp_phons_list": self.morp_phons_list,
        }

    def __call__(
        self,
        text,
    ):
        """Shorthand for :meth:`preprocess_text`."""
        return self.preprocess_text(text)

`orig` は、`string` の活用形を原形（基本形）に戻したもの（例: いた -> いる）。

In [None]:
# Smoke test on a deliberately noisy sentence (stray half-/full-width spaces,
# "...", "!?", an unmatched "『") to inspect the preprocessing result.
text_preprocessor = TextPreprocessor()
s = " これは、[私|わたし]が小さ　いときに、村の[茂平|もへい]  という...おじ い さーん!?からき『   いた お話です。"
text_preprocessor.preprocess_text(s)

やること
1. 対応表を作る
2. 音素列をファイルで保存する
3. Juliusをかける(手直しできたら)
4. labを得る
5. fix_align.pyでlabからlab2を作る
6. lab2から、形態素単位のlabを作る
7. 音声データから無音区間の情報を得て保存する
8. 各形態素間に無音区間があるかどうかを確認する
9. リストで、(形態素, True, 形態素, False, ...)のデータを作る
10. token化する
11. BERTで学習する
12. 精度を確認する, テストデータでも確認する

#### 1.対応表を作る

In [None]:
# Build the sentence -> morpheme/phoneme correspondence table and save it as YAML.

# define path
corpus_name = "jmac"
exp_name = "03_VAD_Adjusted"

exp_dir = Path(DATA_TAKESHUN256_DIR) / corpus_name / exp_name
yaml_file_path = Path(DATA_IN_ROOT_DIR) / corpus_name / "text_audio_dict_new.yaml"

output_yaml_path = exp_dir / "text_audio_dict_new_with_morp_phons.yaml"

assert exp_dir.exists()
assert yaml_file_path.exists()

# The transcripts are Japanese text: read them as UTF-8 instead of relying on
# the locale's default encoding.
with open(yaml_file_path, "r", encoding="utf-8") as f:
    yaml_data = yaml.safe_load(f)


transcript_dict = {}
for audiobook_name, info in tqdm(yaml_data.items()):
    # Re-key the chapter list by zero-padded index ("000", "001", ...) and
    # attach the preprocessing result to every chapter entry (updated in place).
    text_dict = {}
    for chapter_idx, chapter_info in enumerate(info["text"]):
        morp_phons_dict = text_preprocessor(chapter_info["sent"])
        for key in ("morp_join", "phons_join", "morp_phons_list"):
            chapter_info[key] = morp_phons_dict[key]
        text_dict[str(chapter_idx).zfill(3)] = chapter_info

    # Copy the audiobook metadata and swap the chapter list for the keyed dict.
    transcript_dict[audiobook_name] = {
        "author": info["author"],
        "book": info["book"],
        "mp3": info["mp3"],
        "url": info["url"],
        "wav": info["wav"],
        "text": text_dict,
    }

pprint(transcript_dict["audiobook_0"])

with open(output_yaml_path, "w", encoding="utf-8") as f:
    yaml.dump(transcript_dict, f, allow_unicode=True)

print(f"output_yaml_path: {output_yaml_path}")

In [None]:
# Scratch check of str.strip.
# NOTE(review): the argument is an empty string, so no character is stripped and
# the value comes back unchanged; strip("、") may have been intended — confirm.
"アイス、".strip("")

#### 2.音素列をファイルで保存する

In [None]:
morp_phons_yaml_path = exp_dir / "text_audio_dict_new_with_morp_phons.yaml"
output_dict_path = Path(DATA_TAKESHUN256_DIR) / "jmac_split_and_added_lab"

# The YAML holds Japanese text: read it as UTF-8 regardless of the locale.
with open(morp_phons_yaml_path, "r", encoding="utf-8") as f:
    morp_phons_yaml_data = yaml.safe_load(f)

# Write each chapter's space-separated phoneme string (phons_join) to
# <output_dict_path>/<audiobook>/<audiobook>_<chapter>.txt
for audiobook_name, info in tqdm(morp_phons_yaml_data.items()):
    for chapter_name, chapter_info in info["text"].items():
        phons_join = chapter_info["phons_join"]
        if not isinstance(phons_join, str):
            raise ValueError("phons_join is not str")
        output_path = (
            output_dict_path / audiobook_name / f"{audiobook_name}_{chapter_name}.txt"
        )
        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(phons_join, encoding="utf-8")
    print(f"[INFO] {audiobook_name} is saved.")

#### 3.Juliusをかける(手直しできたら)

- BERTの評価後に手直しする(無限ループを1ファイル単位でスキップできるようにする)
- `./all_audiobook_julius_segment.sh`でjuliusをかける
- ログ取るなら、`./all_audiobook_julius_segment.sh > /home/takeshun256/PausePrediction/logs/all_audiobook_julius_segment_2023-11-01_0630.log 2>&1`

- 大きなファイルが生成されるので、以下のコマンドで確認・削除する
    - `find /data2/takeshun256 -type f -size +100M`
    - `find /data2/takeshun256 -type f -size +100M -delete`

#### 4.labを得る

- 3番で生成完了している
- `check_file_counts_eq_julius_segment.sh` で生成されていないファイルがどれだけあるか確認する

```bash
🟢 audiobook_0 lab:178 wav:178 txt:178 log:178
🟢 audiobook_1 lab:178 wav:178 txt:178 log:178
🟢 audiobook_10 lab:112 wav:112 txt:112 log:112
🟢 audiobook_11 lab:112 wav:112 txt:112 log:112
🟢 audiobook_12 lab:112 wav:112 txt:112 log:112
🟢 audiobook_13 lab:112 wav:112 txt:112 log:112
🟢 audiobook_14 lab:112 wav:112 txt:112 log:112
🟢 audiobook_15 lab:134 wav:134 txt:134 log:134
🟢 audiobook_16 lab:134 wav:134 txt:134 log:134
🟢 audiobook_17 lab:355 wav:355 txt:355 log:355
🟢 audiobook_18 lab:311 wav:311 txt:311 log:311
🟢 audiobook_19 lab:103 wav:103 txt:103 log:103
🟢 audiobook_2 lab:178 wav:178 txt:178 log:178
🟢 audiobook_20 lab:103 wav:103 txt:103 log:103
🟢 audiobook_21 lab:471 wav:471 txt:471 log:471
🟢 audiobook_22 lab:471 wav:471 txt:471 log:471
🟢 audiobook_23 lab:471 wav:471 txt:471 log:471
🟢 audiobook_24 lab:471 wav:471 txt:471 log:471
🟢 audiobook_25 lab:160 wav:160 txt:160 log:160
🟢 audiobook_26 lab:164 wav:164 txt:164 log:164
🟢 audiobook_27 lab:164 wav:164 txt:164 log:164
❌ audiobook_28 lab:8 wav:472 txt:472 log:9
❌ audiobook_29 lab:9 wav:472 txt:472 log:10
🟢 audiobook_3 lab:178 wav:178 txt:178 log:178
❌ audiobook_30 lab:37 wav:472 txt:472 log:38
🟢 audiobook_31 lab:167 wav:167 txt:167 log:167
🟢 audiobook_32 lab:167 wav:167 txt:167 log:167
🟢 audiobook_33 lab:167 wav:167 txt:167 log:167
🟢 audiobook_34 lab:167 wav:167 txt:167 log:167
🟢 audiobook_35 lab:167 wav:167 txt:167 log:167
🟢 audiobook_36 lab:167 wav:167 txt:167 log:167
🟢 audiobook_37 lab:266 wav:266 txt:266 log:266
🟢 audiobook_38 lab:266 wav:266 txt:266 log:266
🟢 audiobook_39 lab:266 wav:266 txt:266 log:266
🟢 audiobook_4 lab:178 wav:178 txt:178 log:178
🟢 audiobook_40 lab:266 wav:266 txt:266 log:266
🟢 audiobook_41 lab:266 wav:266 txt:266 log:266
🟢 audiobook_42 lab:241 wav:241 txt:241 log:241
🟢 audiobook_43 lab:241 wav:241 txt:241 log:241
🟢 audiobook_44 lab:241 wav:241 txt:241 log:241
🟢 audiobook_45 lab:241 wav:241 txt:241 log:241
🟢 audiobook_46 lab:241 wav:241 txt:241 log:241
🟢 audiobook_47 lab:165 wav:165 txt:165 log:165
🟢 audiobook_48 lab:165 wav:165 txt:165 log:165
🟢 audiobook_49 lab:64 wav:64 txt:64 log:64
🟢 audiobook_5 lab:178 wav:178 txt:178 log:178
🟢 audiobook_50 lab:64 wav:64 txt:64 log:64
🟢 audiobook_51 lab:64 wav:64 txt:64 log:64
🟢 audiobook_52 lab:64 wav:64 txt:64 log:64
🟢 audiobook_53 lab:64 wav:64 txt:64 log:64
🟢 audiobook_54 lab:64 wav:64 txt:64 log:64
🟢 audiobook_55 lab:64 wav:64 txt:64 log:64
🟢 audiobook_56 lab:64 wav:64 txt:64 log:64
🟢 audiobook_57 lab:188 wav:188 txt:188 log:188
🟢 audiobook_58 lab:188 wav:188 txt:188 log:188
🟢 audiobook_59 lab:188 wav:188 txt:188 log:188
🟢 audiobook_6 lab:178 wav:178 txt:178 log:178
🟢 audiobook_60 lab:188 wav:188 txt:188 log:188
🟢 audiobook_61 lab:179 wav:179 txt:179 log:179
🟢 audiobook_62 lab:472 wav:472 txt:472 log:472
🟢 audiobook_63 lab:472 wav:472 txt:472 log:472
🟢 audiobook_64 lab:472 wav:472 txt:472 log:472
🟢 audiobook_65 lab:472 wav:472 txt:472 log:472
🟢 audiobook_66 lab:207 wav:207 txt:207 log:207
🟢 audiobook_67 lab:207 wav:207 txt:207 log:207
🟢 audiobook_68 lab:207 wav:207 txt:207 log:207
🟢 audiobook_69 lab:207 wav:207 txt:207 log:207
🟢 audiobook_7 lab:178 wav:178 txt:178 log:178
🟢 audiobook_70 lab:111 wav:111 txt:111 log:111
🟢 audiobook_71 lab:111 wav:111 txt:111 log:111
🟢 audiobook_72 lab:111 wav:111 txt:111 log:111
🟢 audiobook_73 lab:111 wav:111 txt:111 log:111
🟢 audiobook_8 lab:112 wav:112 txt:112 log:112
🟢 audiobook_9 lab:112 wav:112 txt:112 log:112
🟢 text lab:0 wav:0 txt:0 log:0
```

#### 5.fix_align.pyでlabからlab2を作る

- `./fix_align_all.sh > "/home/takeshun256/PausePrediction/logs/fix_align_all_2023-11-01_20:10.log" 2>&1` でlab2を作る

In [None]:
# Peek at one alignment (.lab) file: print its lines with surrounding whitespace removed.
s = "/data2/takeshun256/jmac_split_and_added_lab/audiobook_30/audiobook_30_171.lab"
with open(s, "r") as f:
    s = [line.strip() for line in f]
print(s)

#### 6.lab2から、形態素単位のlabを作る


1. 音素単位のlabファイルから、形態素単位のlabファイルを作成する
2. それぞれのlabファイルを、yamlファイルに保存する

In [None]:
# 音素単位のlabから、morphome単位のlabを作成する
def get_morp_lab_from_phon_lab(phon_lab: list, morp_phons_list: list):
    """Collapse a phoneme-level lab into a morpheme-level lab.

    Args:
        phon_lab: lab lines of the form ``"<start> <end> <label>"``. The first
            and the last line are copied to the output as they are (the silB /
            silE entries in the samples used in this notebook).
        morp_phons_list: ``[[morpheme, [phoneme, ...]], ...]``. A "、" entry at
            the very beginning and/or end is ignored. The argument itself is
            not modified.

    Returns:
        list[str]: the first lab line, one ``"<start> <end> <morpheme>"`` line
        per remaining morpheme, and the next unread lab line (expected to be
        silE). The start / end fields keep the exact text they had in
        ``phon_lab``.

    Raises:
        StopIteration: ``phon_lab`` runs out before every morpheme is matched.
        AssertionError: a "、" morpheme does not line up with an "sp" line.
        ValueError: a lab line does not split into exactly three fields.
    """
    # Ignore "、" at the very beginning / end of the sentence. The emptiness
    # checks keep an empty (or "、"-only) list from raising IndexError.
    if morp_phons_list and morp_phons_list[0][0] == "、":
        morp_phons_list = morp_phons_list[1:]
    if morp_phons_list and morp_phons_list[-1][0] == "、":
        morp_phons_list = morp_phons_list[:-1]

    # Walk through the lab lines exactly once, never going back.
    lab_lines = iter([line.strip() for line in phon_lab])
    morp_lab = [next(lab_lines)]  # silB

    for morp, phons in morp_phons_list:
        starts = []
        ends = []
        if morp == "、":
            start, end, phon = next(lab_lines).split()
            starts.append(start)
            ends.append(end)
            assert phon == "sp", f"phon: {phon}"
        else:
            matched = 0
            while True:
                start, end, phon = next(lab_lines).split()
                # Every consumed line widens the morpheme's span, including
                # "sp" lines and lines that do not match the expected phoneme.
                starts.append(start)
                ends.append(end)
                if phon == "sp":
                    continue
                if phon == phons[matched]:
                    matched += 1
                if matched == len(phons):
                    break

        # Compare the timestamps numerically: comparing the strings would order
        # "10.0000000" before "9.9000000" and corrupt spans that cross such a
        # boundary. key=float still returns the original text.
        morp_lab.append(f"{min(starts, key=float)} {max(ends, key=float)} {morp}")

    morp_lab.append(next(lab_lines))  # silE

    return morp_lab


phon_lab = [
    "0.0000000 0.4051875 silB\n",
    "0.4051875 0.5051875 k\n",
    "0.5051875 0.5351875 o\n",
    "0.5351875 0.5751875 r\n",
    "0.5751875 0.6451875 e\n",
    "0.6451875 0.8351875 w\n",
    "0.8351875 0.9751875 a\n",
    "0.9751875 1.6269375 sp\n",
    "1.6269375 1.7169375 w\n",
    "1.7169375 1.7769375 a\n",
    "1.7769375 1.8369375 t\n",
    "1.8369375 1.8669375 a\n",
    "1.8669375 2.0169375 sh\n",
    "2.0169375 2.0669375 i\n",
    "2.0669375 2.1069375 g\n",
    "2.1069375 2.1969375 a\n",
    "2.1969375 2.3369375 ch\n",
    "2.3369375 2.4169375 i:\n",
    "2.4169375 2.5169375 s\n",
    "2.5169375 2.5769375 a\n",
    "2.5769375 2.6469375 i\n",
    "2.6469375 2.6869375 t\n",
    "2.6869375 2.7469375 o\n",
    "2.7469375 2.8269375 k\n",
    "2.8269375 2.8769375 i\n",
    "2.8769375 2.9469375 n\n",
    "2.9469375 3.1169375 i\n",
    "3.1169375 3.7069375 sp\n",
    "3.7069375 3.7869375 m\n",
    "3.7869375 3.8269375 u\n",
    "3.8269375 3.8769375 r\n",
    "3.8769375 3.9569375 a\n",
    "3.9569375 4.0369375 n\n",
    "4.0369375 4.1069375 o\n",
    "4.1069375 4.1769375 m\n",
    "4.1769375 4.2669375 o\n",
    "4.2669375 4.3569375 h\n",
    "4.3569375 4.5069375 e\n",
    "4.5069375 4.5469375 i\n",
    "4.5469375 4.6069375 t\n",
    "4.6069375 4.6969375 o\n",
    "4.6969375 4.7269375 i\n",
    "4.7269375 5.0569375 u\n",
    "5.0569375 5.1669375 o\n",
    "5.1669375 5.2669375 j\n",
    "5.2669375 5.3269375 i:\n",
    "5.3269375 5.4469375 s\n",
    "5.4469375 5.5069375 a\n",
    "5.5069375 5.5969375 N\n",
    "5.5969375 5.6569375 k\n",
    "5.6569375 5.7169375 a\n",
    "5.7169375 5.7469375 r\n",
    "5.7469375 5.8169375 a\n",
    "5.8169375 5.9669375 k\n",
    "5.9669375 6.1069375 i:\n",
    "6.1069375 6.1469375 t\n",
    "6.1469375 6.1969375 a\n",
    "6.1969375 6.2369375 o\n",
    "6.2369375 6.3869375 h\n",
    "6.3869375 6.4169375 a\n",
    "6.4169375 6.4769375 n\n",
    "6.4769375 6.5169375 a\n",
    "6.5169375 6.6369375 sh\n",
    "6.6369375 6.6769375 i\n",
    "6.6769375 6.7269375 d\n",
    "6.7269375 6.7869375 e\n",
    "6.7869375 6.9969375 s\n",
    "6.9969375 7.0269375 u\n",
    "7.0269375 7.2850000 silE\n",
]
morp_phons_list = [
    ["これ", ["k", "o", "r", "e"]],
    ["は", ["w", "a"]],
    ["、", ["sp"]],
    ["私", ["w", "a", "t", "a", "sh", "i"]],
    ["が", ["g", "a"]],
    ["小さい", ["ch", "i:", "s", "a", "i"]],
    ["とき", ["t", "o", "k", "i"]],
    ["に", ["n", "i"]],
    ["、", ["sp"]],
    ["村", ["m", "u", "r", "a"]],
    ["の", ["n", "o"]],
    ["茂平", ["m", "o", "h", "e", "i"]],
    ["と", ["t", "o"]],
    ["いう", ["i", "u"]],
    ["おじいさん", ["o", "j", "i:", "s", "a", "N"]],
    ["から", ["k", "a", "r", "a"]],
    ["きい", ["k", "i:"]],
    ["た", ["t", "a"]],
    ["お話", ["o", "h", "a", "n", "a", "sh", "i"]],
    ["です", ["d", "e", "s", "u"]],
    ["、", ["sp"]],
]

# Convert the sample phoneme alignment defined above into morpheme-level lab lines.
get_morp_lab_from_phon_lab(phon_lab, morp_phons_list)

In [None]:
phon_lab = [
    "0.0000000 0.4051875 silB\n",
    "0.4051875 0.5051875 k\n",
    "0.5051875 0.5351875 o\n",
    "0.5351875 0.5751875 r\n",
    "0.5751875 0.6451875 e\n",
    "0.6451875 0.8351875 w\n",
    "0.8351875 0.9751875 a\n",
    "0.9751875 1.6269375 sp\n",
    "1.6269375 1.7169375 w\n",
    "1.7169375 1.7769375 a\n",
    "1.7769375 1.8369375 t\n",
    "1.8369375 1.8669375 a\n",
    "1.8669375 2.0169375 sh\n",
    "2.0169375 2.0669375 i\n",
    "2.0669375 2.1069375 g\n",
    "2.1069375 2.1969375 a\n",
    "2.1969375 2.3369375 ch\n",
    "2.3369375 2.4169375 i:\n",
    "2.4169375 2.5169375 s\n",
    "2.5169375 2.5769375 a\n",
    "2.5769375 2.6469375 i\n",
    "2.6469375 2.6869375 t\n",
    "2.6869375 2.7469375 o\n",
    "2.7469375 2.8269375 k\n",
    "2.8269375 2.8769375 i\n",
    "2.8769375 2.9469375 n\n",
    "2.9469375 3.1169375 i\n",
    "3.1169375 3.7069375 sp\n",
    "3.7069375 3.7869375 m\n",
    "3.7869375 3.8269375 u\n",
    "3.8269375 3.8769375 r\n",
    "3.8769375 3.9569375 a\n",
    "3.9569375 4.0369375 n\n",
    "4.0369375 4.1069375 o\n",
    "4.1069375 4.1769375 m\n",
    "4.1769375 4.2669375 o\n",
    "4.2669375 4.3569375 h\n",
    "4.3569375 4.5069375 e\n",
    "4.5069375 4.5469375 i\n",
    "4.5469375 4.6069375 t\n",
    "4.6069375 4.6969375 o\n",
    "4.6969375 4.7269375 i\n",
    "4.7269375 5.0569375 u\n",
    "5.0569375 5.1669375 o\n",
    "5.1669375 5.2669375 j\n",
    "5.2669375 5.3269375 i:\n",
    "5.3269375 5.4469375 s\n",
    "5.4469375 5.5069375 a\n",
    "5.5069375 5.5969375 N\n",
    "5.5969375 5.6569375 k\n",
    "5.6569375 5.7169375 a\n",
    "5.7169375 5.7469375 r\n",
    "5.7469375 5.8169375 a\n",
    "5.8169375 5.9669375 k\n",
    "5.9669375 6.1069375 i:\n",
    "6.1069375 6.1469375 t\n",
    "6.1469375 6.1969375 a\n",
    "6.1969375 6.2369375 o\n",
    "6.2369375 6.3869375 h\n",
    "6.3869375 6.4169375 a\n",
    "6.4169375 6.4769375 n\n",
    "6.4769375 6.5169375 a\n",
    "6.5169375 6.6369375 sh\n",
    "6.6369375 6.6769375 i\n",
    "6.6769375 6.7269375 d\n",
    "6.7269375 6.7869375 e\n",
    "6.7869375 6.9969375 s\n",
    "6.9969375 7.0269375 u\n",
    "7.0269375 7.2850000 silE\n",
]
# Remove the surrounding whitespace (trailing newline) of every lab line, then display.
phon_lab = list(map(str.strip, phon_lab))
phon_lab

In [None]:
phon_lab = [
    "0.0000000 0.3808750 silB\n",
    "0.3808750 0.4713125 t\n",
    "0.4713125 0.5213125 o\n",
    "0.5213125 0.6713125 s\n",
    "0.6713125 0.7212500 a\n",
    "0.7212500 0.8613125 k\n",
    "0.8613125 0.9113125 e\n",
    "0.9113125 1.1313125 N\n",
    "1.1313125 1.1613125 d\n",
    "1.1613125 1.3775000 e\n",
    "1.3775000 2.1078750 sp\n",
    "2.1078750 2.1778750 m\n",
    "2.1778750 2.3178750 e\n",
    "2.3178750 2.4378750 o\n",
    "2.4378750 2.5578750 o\n",
    "2.5578750 2.7078750 s\n",
    "2.7078750 2.7978750 a\n",
    "2.7978750 2.9178750 e\n",
    "2.9178750 3.0078750 n\n",
    "3.0078750 3.1178750 a\n",
    "3.1178750 3.1678750 g\n",
    "3.1678750 3.2578750 a\n",
    "3.2578750 3.3078750 r\n",
    "3.3078750 3.5244375 a\n",
    "3.5244375 3.8215625 sp\n",
    "3.8215625 3.9963750 k\n",
    "3.9963750 4.1910625 sp\n",
    "4.1910625 4.3510625 a:\n",
    "4.3510625 4.4810625 s\n",
    "4.4810625 4.5910625 a\n",
    "4.5910625 4.7110625 N\n",
    "4.7110625 4.7410625 k\n",
    "4.7410625 4.8810625 i\n",
    "4.8810625 4.9610625 ts\n",
    "4.9610625 5.0110625 u\n",
    "5.0110625 5.0810625 n\n",
    "5.0810625 5.1410000 e\n",
    "5.1410000 5.2410625 n\n",
    "5.2410625 5.3310625 o\n",
    "5.3310625 5.4310625 t\n",
    "5.4310625 5.5010625 o\n",
    "5.5010625 5.6110625 k\n",
    "5.6110625 5.7010625 o\n",
    "5.7010625 5.7710625 r\n",
    "5.7710625 5.8610625 o\n",
    "5.8610625 6.0610625 e\n",
    "6.0610625 6.5841875 k\n",
    "6.5841875 6.6241875 o\n",
    "6.6241875 6.7141875 r\n",
    "6.7141875 6.8041875 o\n",
    "6.8041875 6.8841875 g\n",
    "6.8841875 6.9991875 e\n",
    "6.9991875 7.0393125 t\n",
    "7.0393125 7.1416875 e\n",
    "7.1416875 7.2360000 k\n",
    "7.2360000 7.2760000 i\n",
    "7.2760000 7.3460000 m\n",
    "7.3460000 7.4260000 a\n",
    "7.4260000 7.5660000 sh\n",
    "7.5660000 7.5960000 i\n",
    "7.5960000 7.6883125 t\n",
    "7.6883125 7.9083125 a\n",
    "7.9083125 8.2800000 silE\n",
]
morp_phons_list = [
    ["と", ["t", "o"]],
    ["叫ん", ["s", "a", "k", "e", "N"]],
    ["で", ["d", "e"]],
    ["、", ["sp"]],
    ["眼", ["m", "e"]],
    ["を", ["o"]],
    ["抑え", ["o", "s", "a", "e"]],
    ["ながら", ["n", "a", "g", "a", "r", "a"]],
    ["、", ["sp"]],
    ["母さん", ["k", "a:", "s", "a", "N"]],
    ["狐", ["k", "i", "ts", "u", "n", "e"]],
    ["の", ["n", "o"]],
    ["ところ", ["t", "o", "k", "o", "r", "o"]],
    ["へ", ["e"]],
    ["ころげ", ["k", "o", "r", "o", "g", "e"]],
    ["て", ["t", "e"]],
    ["来", ["k", "i"]],
    ["まし", ["m", "a", "sh", "i"]],
    ["た", ["t", "a"]],
]
# Number of aligned lab lines vs. the expected phoneme count (+2 for silB / silE).
print(len(phon_lab))
print(sum(len(phons) for _, phons in morp_phons_list) + 2)

# Show both phoneme sequences side by side: aligned labels (without the first
# and last line) and the expected phonemes.
print(" ".join(line.strip().split(" ")[2] for line in phon_lab[1:-1]))
print(" ".join(phoneme for _, phons in morp_phons_list for phoneme in phons))
get_morp_lab_from_phon_lab(phon_lab, morp_phons_list)

In [None]:
# 1. 音素単位のlabファイルから、形態素単位のlabファイルを作成する
morp_phons_yaml_path = exp_dir / "text_audio_dict_new_with_morp_phons.yaml"
lab_dict_path = Path(DATA_TAKESHUN256_DIR) / "jmac_split_and_added_lab"

# The YAML holds Japanese text: read it as UTF-8 regardless of the locale.
with open(morp_phons_yaml_path, "r", encoding="utf-8") as f:
    morp_phons_yaml_data = yaml.safe_load(f)

In [None]:
# Number of top-level entries (one per audiobook key) in the loaded YAML.
print(len(morp_phons_yaml_data))

In [None]:
# 1. Create a morpheme-level lab file (.labm) from each phoneme-level lab file (.lab2).
# Relies on morp_phons_yaml_data, lab_dict_path and get_morp_lab_from_phon_lab
# defined in earlier cells.
for audiobook_name, info in morp_phons_yaml_data.items():
    print(f"[INFO] audiobook_name: {audiobook_name}")
    audiobook_lab_dir = lab_dict_path / audiobook_name
    for chapter_name, chapter_info in info["text"].items():
        phon_lab_path = audiobook_lab_dir / f"{audiobook_name}_{chapter_name}.lab2"
        morp_lab_path = audiobook_lab_dir / f"{audiobook_name}_{chapter_name}.labm"
        print(f"[INFO] phon_lab_path: {phon_lab_path}")
        if not phon_lab_path.exists():
            print(f"[INFO] {phon_lab_path} is not exist.")
            continue

        with open(phon_lab_path, "r", encoding="utf-8") as f:
            phon_lab = f.readlines()

        if phon_lab == []:
            # An empty alignment yields an empty .labm file.
            print(f"[INFO] {phon_lab_path} is empty.")
            morp_lab_path.write_text("", encoding="utf-8")
            continue

        morp_phons_list = chapter_info["morp_phons_list"]

        # Skip the chapter when the number of lab lines is far from the
        # expected phoneme count (+2 for silB / silE).
        expected_line_count = sum(len(phons) for _, phons in morp_phons_list) + 2
        if abs(len(phon_lab) - expected_line_count) > 10:
            print(f"[INFO] {phon_lab_path} is too different.")
            continue

        # Skip the chapter when the lab has no more than two lines
        # (only silB / silE).
        if len(phon_lab) <= 2:
            print(f"[INFO] {phon_lab_path} is only silB silE.")
            continue

        morp_lab = get_morp_lab_from_phon_lab(phon_lab, morp_phons_list)

        # The morpheme labels are Japanese text: write them as UTF-8 instead of
        # relying on the locale's default encoding.
        with open(morp_lab_path, "w", encoding="utf-8") as f:
            f.writelines(f"{s}\n" for s in morp_lab)

    print(f"[INFO] {audiobook_name} is saved.")

In [None]:
# 2. Load every lab file and attach it to its chapter as phon_lab / morp_lab.
morp_phons_yaml_path = exp_dir / "text_audio_dict_new_with_morp_phons.yaml"
lab_dict_path = Path(DATA_TAKESHUN256_DIR) / "jmac_split_and_added_lab"
output_yaml_path = exp_dir / "text_audio_dict_new_with_morp_phons_and_lab.yaml"

# The YAML and the .labm files hold Japanese text: use UTF-8 explicitly
# instead of the locale's default encoding.
with open(morp_phons_yaml_path, "r", encoding="utf-8") as f:
    morp_phons_yaml_data = yaml.safe_load(f)

# The output maps audiobook -> chapter -> chapter_info; the audiobook-level
# metadata (everything except "text") is not carried over.
new_morp_phons_yaml_data = {}
for audiobook_name, info in tqdm(morp_phons_yaml_data.items()):
    audiobook_dict = {}
    for chapter_name, chapter_info in info["text"].items():
        phon_lab_path = (
            lab_dict_path / audiobook_name / f"{audiobook_name}_{chapter_name}.lab2"
        )
        morp_lab_path = (
            lab_dict_path / audiobook_name / f"{audiobook_name}_{chapter_name}.labm"
        )

        # Each lab is optional: a missing file is reported and its key is left
        # out, but the chapter itself is still kept.
        for lab_key, lab_path in (
            ("phon_lab", phon_lab_path),
            ("morp_lab", morp_lab_path),
        ):
            if not lab_path.exists():
                print(f"[INFO] {lab_path} is not exist.")
                continue
            with open(lab_path, "r", encoding="utf-8") as f:
                chapter_info[lab_key] = [line.strip() for line in f]

        audiobook_dict[chapter_name] = chapter_info
    new_morp_phons_yaml_data[audiobook_name] = audiobook_dict
    print(f"[INFO] {audiobook_name} is saved.")

with open(output_yaml_path, "w", encoding="utf-8") as f:
    yaml.dump(new_morp_phons_yaml_data, f, allow_unicode=True)

print(f"output_yaml_path: {output_yaml_path}")

# After running get_pause_ranges, resume execution from here.

#### 7.音声データから無音区間の情報を得て保存する

- `get_pause_ranges.ipynb` で無音区間の情報を得て保存する
- `exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause.yaml"`

#### 8.各形態素間に無音区間があるかどうかを確認する

In [None]:
morp_phons_yaml_path_small = (
    # Earlier input: exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_small.yaml"
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_small_fix_runencode.yaml"
)

# The YAML holds Japanese text: read it as UTF-8 regardless of the locale.
# (A pickle-based loader for the same path was tried here before and is disabled.)
with open(morp_phons_yaml_path_small, "r", encoding="utf-8") as f:
    morp_phons_yaml_data_small = yaml.safe_load(f)

In [None]:
# Display the entry stored under "audiobook_0" / "000" to check its structure.
morp_phons_yaml_data_small["audiobook_0"]["000"]

In [None]:
# for debug
# Debug run of the pause-insertion logic on a single chapter.
chap = morp_phons_yaml_data_small["audiobook_0"]["000"]
morp_lab, pause_ranges_str = chap["morp_lab"], chap["pause_ranges_str"]
pprint(morp_lab)
print(pause_ranges_str)

sr = 24000

# Each pause entry is "start end length"; every field is divided by sr.
pause_ranges_str = [entry.split(" ") for entry in pause_ranges_str]
pause_range = []
for start, end, length in pause_ranges_str:
    pause_range.append([float(start) / sr, float(end) / sr, float(length) / sr])
pprint(pause_range)

morp_pause_list = []
for lab_line in morp_lab:
    start_text, end_text, morp = lab_line.strip().split(" ")
    start, end = float(start_text), float(end_text)

    print(f"start: {start}, end: {end}, morp: {morp}")
    # A pause range containing the morpheme's start puts [PAUSE] before it ...
    for pause_start, pause_end, _ in pause_range:
        if pause_start <= start <= pause_end:
            print(f"pause_start: {pause_start}, pause_end: {pause_end}")
            morp_pause_list.append("[PAUSE]")
    morp_pause_list.append(morp)
    # ... and one containing its end puts [PAUSE] after it.
    morp_pause_list.extend(
        "[PAUSE]"
        for pause_start, pause_end, _ in pause_range
        if pause_start <= end <= pause_end
    )
    print(morp_pause_list)

print(morp_pause_list)

In [None]:
output_yaml_path = (
    # Earlier output: exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_small_str.yaml"
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_small_fix_encode_str.yaml"
)
# Pause ranges are divided by sr before being compared with the lab times.
# NOTE(review): presumably the audio sampling rate in Hz — confirm.
sr = 24000


output_yaml_data = {}
# For every chapter, decide for each morpheme boundary whether it falls inside
# a pause range and build the token sequence with [PAUSE] markers.
for audiobook_name, info in tqdm(morp_phons_yaml_data_small.items()):
    for chapter_name, chapter_info in info.items():
        morp_lab = chapter_info["morp_lab"]
        pause_ranges_str = [s.split(" ") for s in chapter_info["pause_ranges_str"]]
        pause_ranges = [
            [float(start) / sr, float(end) / sr, float(length) / sr]
            for start, end, length in pause_ranges_str
        ]

        # Put [PAUSE] before a morpheme whose start lies inside a pause range
        # and after a morpheme whose end lies inside one.
        morp_pause_list = []
        for s in morp_lab:
            start, end, morp = s.strip().split(" ")
            start = float(start)
            end = float(end)

            for pause_start, pause_end, _ in pause_ranges:
                if pause_start <= start <= pause_end:
                    morp_pause_list.append("[PAUSE]")
            morp_pause_list.append(morp)
            for pause_start, pause_end, _ in pause_ranges:
                if pause_start <= end <= pause_end:
                    morp_pause_list.append("[PAUSE]")

        # Clean up the token list:
        #   rule 1. consecutive [PAUSE] tokens are merged into one
        #   rule 2. silB / silE are removed
        # "、" is kept as it is; converting it to [PAUSE] is not applied.
        morp_pause_clip = []
        for token in morp_pause_list:
            if (
                token == "[PAUSE]"
                and morp_pause_clip
                and morp_pause_clip[-1] == "[PAUSE]"
            ):
                continue
            if token in ("silB", "silE"):
                continue
            morp_pause_clip.append(token)
        morp_pause_str = "".join(morp_pause_clip)

        # chapter_info is the object stored in info, so updating it in place
        # is enough; no re-assignment into info is needed.
        chapter_info["morp_pause_str"] = morp_pause_str
        chapter_info["morp_pause_clip"] = morp_pause_clip
    output_yaml_data[audiobook_name] = info
    print(f"[INFO] {audiobook_name} is saved.")

# The output holds Japanese text: write it as UTF-8 regardless of the locale.
with open(output_yaml_path, "w", encoding="utf-8") as f:
    yaml.dump(output_yaml_data, f, allow_unicode=True)

In [None]:
# Display the processed entry stored under "audiobook_0" / "000" to check the
# morp_pause_str / morp_pause_clip fields that were just added.
output_yaml_data["audiobook_0"]["000"]

### ポーズ長のデータも作成する

In [None]:
output_yaml_path = (
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_with_length_small_fix_encode_str.yaml"
)
# Pause boundaries are divided by this value before being compared with the
# morpheme times (presumably the sampling rate in Hz, with morpheme times in
# seconds — TODO confirm).
sr = 24000

# For every chapter, build the morpheme sequence with "[PAUSE <length>]" tokens
# inserted wherever a silent interval touches a morpheme boundary:
#   * a pause containing a morpheme's start time is emitted before the morpheme;
#   * a pause containing a morpheme's end time is emitted after the morpheme;
#   * a pause lying entirely inside a "、" is emitted before that "、";
#   * a chapter is skipped when any morpheme other than "、"/silB/silE lies
#     entirely inside a pause range.
output_yaml_data = {}
for audiobook_name, info in tqdm(morp_phons_yaml_data_small.items()):
    for chapter_name, chapter_info in info.items():
        # Each pause line is "start end length".
        pause_ranges = [
            [float(p_start) / sr, float(p_end) / sr, float(p_length) / sr]
            for p_start, p_end, p_length in (
                line.split(" ") for line in chapter_info["pause_ranges_str"]
            )
        ]

        # Parse every "start end morpheme" line once instead of once per pause.
        morphemes = []
        for line in chapter_info["morp_lab"]:
            start, end, morp = line.strip().split(" ")
            morphemes.append((float(start), float(end), morp))

        # Report every morpheme that is swallowed whole by a pause range.
        continue_break = False
        for pause_start, pause_end, _ in pause_ranges:
            for start, end, morp in morphemes:
                swallowed = (
                    pause_start <= start <= pause_end
                    and pause_start <= end <= pause_end
                )
                if swallowed and morp not in ("、", "silB", "silE"):
                    print(f"pause_start: {pause_start}, pause_end: {pause_end}, morp: {morp}")
                    continue_break = True
        if continue_break:
            # chapter_info is mutated in place, so labels written by an earlier
            # pass over the same data would otherwise be exported for a chapter
            # that is reported as skipped. Readers of the YAML drop chapters
            # that lack these keys.
            chapter_info.pop("morp_pause_str", None)
            chapter_info.pop("morp_pause_clip", None)
            print(f"[INFO] {audiobook_name} {chapter_name} is skipped.")
            continue

        # Interleave the morphemes with the pauses that touch their boundaries.
        morp_pause_list = []
        for start, end, morp in morphemes:
            for pause_start, pause_end, pause_length in pause_ranges:
                if pause_start <= start <= pause_end:
                    morp_pause_list.append(f"[PAUSE {pause_length}]")

            if morp == "、":
                # Only the first pause fully contained in the comma is used.
                for pause_start, pause_end, pause_length in pause_ranges:
                    if start <= pause_start <= end and start <= pause_end <= end:
                        morp_pause_list.append(f"[PAUSE {pause_length}]")
                        break

            morp_pause_list.append(morp)
            for pause_start, pause_end, pause_length in pause_ranges:
                if pause_start <= end <= pause_end:
                    morp_pause_list.append(f"[PAUSE {pause_length}]")

        # Clean up: drop silB/silE and keep only the first of consecutive pauses.
        morp_pause_clip = []
        for token in morp_pause_list:
            if token in ("silB", "silE"):
                continue
            if "PAUSE" in token and morp_pause_clip and "PAUSE" in morp_pause_clip[-1]:
                continue
            morp_pause_clip.append(token)
        morp_pause_str = "".join(morp_pause_clip)

        # Store the result on the chapter (in place).
        chapter_info["morp_pause_str"] = morp_pause_str
        chapter_info["morp_pause_clip"] = morp_pause_clip
    output_yaml_data[audiobook_name] = info
    print(f"[INFO] {audiobook_name} is saved.")

# allow_unicode=True writes the Japanese text unescaped, so open as UTF-8
# explicitly instead of relying on the locale's default encoding.
with open(output_yaml_path, "w", encoding="utf-8") as f:
    yaml.dump(output_yaml_data, f, allow_unicode=True)

## 促音のポーズは入れない計測もする

In [None]:
output_yaml_path = (
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_with_length_small_fix_encode_str_wo_sokuon.yaml"
)
# Pause boundaries are divided by this value before being compared with the
# morpheme times (presumably the sampling rate in Hz, with morpheme times in
# seconds — TODO confirm).
sr = 24000

# For every chapter, build the morpheme sequence with "[PAUSE <length>]" tokens
# inserted wherever a silent interval touches a morpheme boundary:
#   * a pause containing a morpheme's start time is emitted before the morpheme;
#   * a pause containing a morpheme's end time is emitted after the morpheme;
#   * a pause lying entirely inside a "、" is emitted before that "、";
#   * a chapter is skipped when any morpheme other than "、"/silB/silE lies
#     entirely inside a pause range;
#   * a pause directly after a morpheme ending in a sokuon (っ/ッ) is removed.
output_yaml_data = {}
for audiobook_name, info in tqdm(morp_phons_yaml_data_small.items()):
    for chapter_name, chapter_info in info.items():
        # Each pause line is "start end length".
        pause_ranges = [
            [float(p_start) / sr, float(p_end) / sr, float(p_length) / sr]
            for p_start, p_end, p_length in (
                line.split(" ") for line in chapter_info["pause_ranges_str"]
            )
        ]

        # Parse every "start end morpheme" line once instead of once per pause.
        morphemes = []
        for line in chapter_info["morp_lab"]:
            start, end, morp = line.strip().split(" ")
            morphemes.append((float(start), float(end), morp))

        # Report every morpheme that is swallowed whole by a pause range.
        continue_break = False
        for pause_start, pause_end, _ in pause_ranges:
            for start, end, morp in morphemes:
                swallowed = (
                    pause_start <= start <= pause_end
                    and pause_start <= end <= pause_end
                )
                if swallowed and morp not in ("、", "silB", "silE"):
                    print(f"pause_start: {pause_start}, pause_end: {pause_end}, morp: {morp}")
                    continue_break = True
        if continue_break:
            # chapter_info is mutated in place, so labels written by an earlier
            # pass over the same data would otherwise be exported for a chapter
            # that is reported as skipped. Readers of the YAML drop chapters
            # that lack these keys.
            chapter_info.pop("morp_pause_str", None)
            chapter_info.pop("morp_pause_clip", None)
            print(f"[INFO] {audiobook_name} {chapter_name} is skipped.")
            continue

        # Interleave the morphemes with the pauses that touch their boundaries.
        morp_pause_list = []
        for start, end, morp in morphemes:
            for pause_start, pause_end, pause_length in pause_ranges:
                if pause_start <= start <= pause_end:
                    morp_pause_list.append(f"[PAUSE {pause_length}]")

            if morp == "、":
                # Only the first pause fully contained in the comma is used.
                for pause_start, pause_end, pause_length in pause_ranges:
                    if start <= pause_start <= end and start <= pause_end <= end:
                        morp_pause_list.append(f"[PAUSE {pause_length}]")
                        break

            morp_pause_list.append(morp)
            for pause_start, pause_end, pause_length in pause_ranges:
                if pause_start <= end <= pause_end:
                    morp_pause_list.append(f"[PAUSE {pause_length}]")

        # Clean up: drop silB/silE and keep only the first of consecutive pauses.
        morp_pause_clip = []
        for token in morp_pause_list:
            if token in ("silB", "silE"):
                continue
            if "PAUSE" in token and morp_pause_clip and "PAUSE" in morp_pause_clip[-1]:
                continue
            morp_pause_clip.append(token)

        # Remove a pause whose preceding token ends with a sokuon (っ/ッ).
        # endswith() is also safe on an empty token.
        morp_pause_clip_new = []
        for i, token in enumerate(morp_pause_clip):
            if "PAUSE" in token and i != 0 and morp_pause_clip[i - 1].endswith(("っ", "ッ")):
                print(f"[INFO] {morp_pause_clip[i-1]} == 促音のため、PAUSEを削除します。")
                continue
            morp_pause_clip_new.append(token)

        morp_pause_str = "".join(morp_pause_clip_new)

        # Store the result on the chapter (in place).
        chapter_info["morp_pause_str"] = morp_pause_str
        chapter_info["morp_pause_clip"] = morp_pause_clip_new
    output_yaml_data[audiobook_name] = info
    print(f"[INFO] {audiobook_name} is saved.")

# allow_unicode=True writes the Japanese text unescaped, so open as UTF-8
# explicitly instead of relying on the locale's default encoding.
with open(output_yaml_path, "w", encoding="utf-8") as f:
    yaml.dump(output_yaml_data, f, allow_unicode=True)

#### 9.リストで、(形態素, True, 形態素, False, ...)のデータを作る

In [None]:
morp_phons_yaml_path_small = (
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_small_fix_encode_str.yaml"
)

# The YAML holds unescaped Japanese text, so decode it as UTF-8 explicitly
# instead of relying on the locale's default encoding.
with open(morp_phons_yaml_path_small, "r", encoding="utf-8") as f:
    morp_phons_yaml_data_small = yaml.safe_load(f)

# One row per chapter; chapters without pause annotations are left out.
rows = [
    [
        audiobook_name,
        chapter_name,
        chapter_info["morp_pause_str"],
        chapter_info["morp_pause_clip"],
    ]
    for audiobook_name, info in tqdm(morp_phons_yaml_data_small.items())
    for chapter_name, chapter_info in info.items()
    if "morp_pause_str" in chapter_info and "morp_pause_clip" in chapter_info
]

df = pd.DataFrame(
    rows, columns=["audiobook_name", "chapter_name", "morp_pause_str", "morp_pause_clip"]
)

df.head()

In [None]:
pprint(df["morp_pause_clip"].iloc[0])

In [None]:
def insert_no_pause(ss):
    """Return a copy of ``ss`` with "[NO_PAUSE]" marking every gap without a pause.

    A "[NO_PAUSE]" token is placed between two neighbouring tokens when neither
    of them is exactly "[PAUSE]", before the first token when it is not
    "[PAUSE]", and after the last token when it is not "[PAUSE]". An empty
    input yields an empty list.
    """
    labelled = []
    # Nothing precedes the first token, so the start counts as "not a pause".
    previous_is_pause = False
    for token in ss:
        current_is_pause = token == "[PAUSE]"
        if not previous_is_pause and not current_is_pause:
            labelled.append("[NO_PAUSE]")
        labelled.append(token)
        previous_is_pause = current_is_pause
    # Close the sequence unless it is empty or already ends with a pause.
    if labelled and not previous_is_pause:
        labelled.append("[NO_PAUSE]")
    return labelled

# insert_no_pause(df["morp_pause_clip"].iloc[0])

df["morp_pause_clip_no_pause"] = df["morp_pause_clip"].apply(insert_no_pause)

df.head()

In [None]:
exp_dir

In [None]:
# Save the table as CSV (without the index column).
# NOTE(review): the list-valued columns are presumably written as their string
# form in the CSV — confirm before parsing them back.
df.to_csv(exp_dir / "bert_traindata_pause_position.csv", index=False)

# Save the same DataFrame as a pickle file as well.
# The `pickle` name imported here is not referenced by the statements in this cell.
import pickle
df.to_pickle(exp_dir / "bert_traindata_pause_position.pkl")

In [None]:
# Hand-written sample sequence; it is immediately replaced by the first row of
# df below, so it only serves as a reference of the expected token format.
ss = [
    "[PAUSE]",
    "これ",
    "は",
    "[PAUSE]",
    "私",
    "が",
    "小さい",
    "とき",
    "に",
    "[PAUSE]",
    "村",
    "の",
    "茂平",
    "と",
    "いう",
    "おじいさん",
    "から",
    "きい",
    "た",
    "お話",
    "[PAUSE]",
    "です",
]
ss = df["morp_pause_clip"].iloc[0]

# Walk the list and insert [NO_PAUSE] wherever there is no [PAUSE]
result = []
for i in range(len(ss)):
    # Branches that also put [NO_PAUSE] at the head and tail of the list
    if i == 0 and ss[i] != "[PAUSE]":
        result.append("[NO_PAUSE]")
    elif i > 0 and ss[i-1] != "[PAUSE]" and ss[i] != "[PAUSE]":
        result.append("[NO_PAUSE]")
    result.append(ss[i])
    if i == len(ss) - 1 and ss[i] != "[PAUSE]":
        result.append("[NO_PAUSE]")

print(result)

# Map [PAUSE] to 1 and [NO_PAUSE] to 0; every other token is kept as is
final_list = [1 if word == "[PAUSE]" else 0 if word == "[NO_PAUSE]" else word for word in result]
print(final_list)


In [None]:
ss = [
    "[PAUSE]",
    "これ",
    "は",
    "[PAUSE]",
    "私",
    "が",
    "小さい",
    "とき",
    "に",
    "[PAUSE]",
    "村",
    "の",
    "茂平",
    "と",
    "いう",
    "おじいさん",
    "から",
    "きい",
    "た",
    "お話",
    "[PAUSE]",
    "です",
]
print(ss)

In [None]:
# morp_pause_clipについて、各形態素の間に[PAUSE]があるかどうかを判定するためのデータを作成する
df.loc[0, "morp_pause_clip"]

#### 9.リストで、(形態素, True, 形態素, False, ...)のデータを作る lengthも

In [None]:
morp_phons_yaml_path_small = (
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_with_length_small_fix_encode_str.yaml"
)

# The YAML holds unescaped Japanese text, so decode it as UTF-8 explicitly
# instead of relying on the locale's default encoding.
with open(morp_phons_yaml_path_small, "r", encoding="utf-8") as f:
    morp_phons_yaml_data_small = yaml.safe_load(f)

# One row per chapter; chapters without pause annotations are left out.
rows = [
    [
        audiobook_name,
        chapter_name,
        chapter_info["morp_pause_str"],
        chapter_info["morp_pause_clip"],
    ]
    for audiobook_name, info in tqdm(morp_phons_yaml_data_small.items())
    for chapter_name, chapter_info in info.items()
    if "morp_pause_str" in chapter_info and "morp_pause_clip" in chapter_info
]

df = pd.DataFrame(
    rows, columns=["audiobook_name", "chapter_name", "morp_pause_str", "morp_pause_clip"]
)

df.head()

In [None]:
df

In [None]:
pprint(df["morp_pause_clip"].iloc[11])

In [None]:
def insert_no_pause(ss):
    """Return a copy of ``ss`` with "[NO_PAUSE]" marking every gap without a pause.

    A token counts as a pause when it contains the substring "PAUSE"
    (e.g. "[PAUSE 0.5]"). "[NO_PAUSE]" is placed between two neighbouring
    non-pause tokens, before a non-pause first token and after a non-pause
    last token. An empty input yields an empty list.
    """
    is_pause = ["PAUSE" in token for token in ss]
    result = []
    for idx, token in enumerate(ss):
        # Nothing precedes the first token, so its "previous" is not a pause.
        before_is_pause = is_pause[idx - 1] if idx else False
        if not (before_is_pause or is_pause[idx]):
            result.append("[NO_PAUSE]")
        result.append(token)
    # Close the sequence unless it is empty or already ends with a pause.
    if is_pause and not is_pause[-1]:
        result.append("[NO_PAUSE]")
    return result

# insert_no_pause(df["morp_pause_clip"].iloc[0])

df["morp_pause_clip_no_pause"] = df["morp_pause_clip"].apply(insert_no_pause)

df.head()

In [None]:
# Save the table as CSV (without the index column).
# NOTE(review): the list-valued columns are presumably written as their string
# form in the CSV — confirm before parsing them back.
df.to_csv(exp_dir / "bert_traindata_pause_position_with_length.csv", index=False)

# Save the same DataFrame as a pickle file as well.
# The `pickle` name imported here is not referenced by the statements in this cell.
import pickle
df.to_pickle(exp_dir / "bert_traindata_pause_position_with_length.pkl")

In [None]:
df.iloc[0]["morp_pause_clip_no_pause"]

#### 9.リストで、(形態素, True, 形態素, False, ...)のデータを作る lengthも, 促音のポーズは入れない計測もする

In [None]:
morp_phons_yaml_path_small = (
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_with_length_small_fix_encode_str_wo_sokuon.yaml"
)

# The YAML holds unescaped Japanese text, so decode it as UTF-8 explicitly
# instead of relying on the locale's default encoding.
with open(morp_phons_yaml_path_small, "r", encoding="utf-8") as f:
    morp_phons_yaml_data_small = yaml.safe_load(f)

# One row per chapter; chapters without pause annotations are left out.
rows = [
    [
        audiobook_name,
        chapter_name,
        chapter_info["morp_pause_str"],
        chapter_info["morp_pause_clip"],
    ]
    for audiobook_name, info in tqdm(morp_phons_yaml_data_small.items())
    for chapter_name, chapter_info in info.items()
    if "morp_pause_str" in chapter_info and "morp_pause_clip" in chapter_info
]

df = pd.DataFrame(
    rows, columns=["audiobook_name", "chapter_name", "morp_pause_str", "morp_pause_clip"]
)

df.head()

In [None]:
df

In [None]:
pprint(df["morp_pause_clip"].iloc[11])

In [None]:
def insert_no_pause(ss):
    """Return a copy of ``ss`` with "[NO_PAUSE]" marking every gap without a pause.

    A token counts as a pause when it contains the substring "PAUSE"
    (e.g. "[PAUSE 0.5]"). "[NO_PAUSE]" is placed between two neighbouring
    non-pause tokens, before a non-pause first token and after a non-pause
    last token. An empty input yields an empty list.
    """
    is_pause = ["PAUSE" in token for token in ss]
    result = []
    for idx, token in enumerate(ss):
        # Nothing precedes the first token, so its "previous" is not a pause.
        before_is_pause = is_pause[idx - 1] if idx else False
        if not (before_is_pause or is_pause[idx]):
            result.append("[NO_PAUSE]")
        result.append(token)
    # Close the sequence unless it is empty or already ends with a pause.
    if is_pause and not is_pause[-1]:
        result.append("[NO_PAUSE]")
    return result

# insert_no_pause(df["morp_pause_clip"].iloc[0])

df["morp_pause_clip_no_pause"] = df["morp_pause_clip"].apply(insert_no_pause)

df.head()

In [None]:
# Save the table as CSV (without the index column).
# NOTE(review): the list-valued columns are presumably written as their string
# form in the CSV — confirm before parsing them back.
df.to_csv(exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon.csv", index=False)

# Save the same DataFrame as a pickle file as well.
# The `pickle` name imported here is not referenced by the statements in this cell.
import pickle
df.to_pickle(exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon.pkl")

In [None]:
df.iloc[0]["morp_pause_clip_no_pause"]

## 80msのtime閾値のポーズデータを用いて、促音のポーズは入れない学習データ作成

In [None]:
# Input: pause annotations produced with the 80 ms threshold (per the file name).
morp_phons_yaml_path_small = (
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_small_fix_runencode_80ms.yaml"
)

# Decode as UTF-8 explicitly instead of relying on the locale's default encoding.
with open(morp_phons_yaml_path_small, "r", encoding="utf-8") as f:
    morp_phons_yaml_data_small = yaml.safe_load(f)

# picke
# import pickle
# with open(morp_phons_yaml_path_small, "rb") as f:
#     morp_phons_yaml_data_small = pickle.load(f)

In [None]:
# Total number of chapters over all audiobooks (the progress bar advances once
# per audiobook).
c = sum(len(info) for info in tqdm(morp_phons_yaml_data_small.values()))
print(c)

In [None]:
output_yaml_path = (
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_with_length_small_fix_encode_str_wo_sokuon_80ms.yaml"
)
# Pause boundaries are divided by this value before being compared with the
# morpheme times (presumably the sampling rate in Hz, with morpheme times in
# seconds — TODO confirm).
sr = 24000

# For every chapter, build the morpheme sequence with "[PAUSE <length>]" tokens
# inserted wherever a silent interval touches a morpheme boundary:
#   * a pause containing a morpheme's start time is emitted before the morpheme;
#   * a pause containing a morpheme's end time is emitted after the morpheme;
#   * a pause lying entirely inside a "、" is emitted before that "、";
#   * a chapter is skipped when any morpheme other than "、"/silB/silE lies
#     entirely inside a pause range;
#   * a pause directly after a morpheme ending in a sokuon (っ/ッ) is removed.
output_yaml_data = {}
for audiobook_name, info in tqdm(morp_phons_yaml_data_small.items()):
    for chapter_name, chapter_info in info.items():
        # Each pause line is "start end length".
        pause_ranges = [
            [float(p_start) / sr, float(p_end) / sr, float(p_length) / sr]
            for p_start, p_end, p_length in (
                line.split(" ") for line in chapter_info["pause_ranges_str"]
            )
        ]

        # Parse every "start end morpheme" line once instead of once per pause.
        morphemes = []
        for line in chapter_info["morp_lab"]:
            start, end, morp = line.strip().split(" ")
            morphemes.append((float(start), float(end), morp))

        # Report every morpheme that is swallowed whole by a pause range.
        continue_break = False
        for pause_start, pause_end, _ in pause_ranges:
            for start, end, morp in morphemes:
                swallowed = (
                    pause_start <= start <= pause_end
                    and pause_start <= end <= pause_end
                )
                if swallowed and morp not in ("、", "silB", "silE"):
                    print(f"pause_start: {pause_start}, pause_end: {pause_end}, morp: {morp}")
                    continue_break = True
        if continue_break:
            # chapter_info is mutated in place, so labels already present on
            # the chapter would otherwise be exported for a chapter that is
            # reported as skipped. Readers of the YAML drop chapters that lack
            # these keys.
            chapter_info.pop("morp_pause_str", None)
            chapter_info.pop("morp_pause_clip", None)
            print(f"[INFO] {audiobook_name} {chapter_name} is skipped.")
            continue

        # Interleave the morphemes with the pauses that touch their boundaries.
        morp_pause_list = []
        for start, end, morp in morphemes:
            for pause_start, pause_end, pause_length in pause_ranges:
                if pause_start <= start <= pause_end:
                    morp_pause_list.append(f"[PAUSE {pause_length}]")

            if morp == "、":
                # Only the first pause fully contained in the comma is used.
                for pause_start, pause_end, pause_length in pause_ranges:
                    if start <= pause_start <= end and start <= pause_end <= end:
                        morp_pause_list.append(f"[PAUSE {pause_length}]")
                        break

            morp_pause_list.append(morp)
            for pause_start, pause_end, pause_length in pause_ranges:
                if pause_start <= end <= pause_end:
                    morp_pause_list.append(f"[PAUSE {pause_length}]")

        # Clean up: drop silB/silE and keep only the first of consecutive pauses.
        morp_pause_clip = []
        for token in morp_pause_list:
            if token in ("silB", "silE"):
                continue
            if "PAUSE" in token and morp_pause_clip and "PAUSE" in morp_pause_clip[-1]:
                continue
            morp_pause_clip.append(token)

        # Remove a pause whose preceding token ends with a sokuon (っ/ッ).
        # endswith() is also safe on an empty token.
        morp_pause_clip_new = []
        for i, token in enumerate(morp_pause_clip):
            if "PAUSE" in token and i != 0 and morp_pause_clip[i - 1].endswith(("っ", "ッ")):
                print(f"[INFO] {morp_pause_clip[i-1]} == 促音のため、PAUSEを削除します。")
                continue
            morp_pause_clip_new.append(token)

        morp_pause_str = "".join(morp_pause_clip_new)

        # Store the result on the chapter (in place).
        chapter_info["morp_pause_str"] = morp_pause_str
        chapter_info["morp_pause_clip"] = morp_pause_clip_new
    output_yaml_data[audiobook_name] = info
    print(f"[INFO] {audiobook_name} is saved.")

# allow_unicode=True writes the Japanese text unescaped, so open as UTF-8
# explicitly instead of relying on the locale's default encoding.
with open(output_yaml_path, "w", encoding="utf-8") as f:
    yaml.dump(output_yaml_data, f, allow_unicode=True)

In [None]:
morp_phons_yaml_path_small = (
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_with_length_small_fix_encode_str_wo_sokuon_80ms.yaml"
)

# The YAML holds unescaped Japanese text, so decode it as UTF-8 explicitly
# instead of relying on the locale's default encoding.
with open(morp_phons_yaml_path_small, "r", encoding="utf-8") as f:
    morp_phons_yaml_data_small = yaml.safe_load(f)

# One row per chapter; chapters without pause annotations are left out.
rows = [
    [
        audiobook_name,
        chapter_name,
        chapter_info["morp_pause_str"],
        chapter_info["morp_pause_clip"],
    ]
    for audiobook_name, info in tqdm(morp_phons_yaml_data_small.items())
    for chapter_name, chapter_info in info.items()
    if "morp_pause_str" in chapter_info and "morp_pause_clip" in chapter_info
]

df = pd.DataFrame(
    rows, columns=["audiobook_name", "chapter_name", "morp_pause_str", "morp_pause_clip"]
)

df.head()

In [None]:
def insert_no_pause(ss):
    """Return a copy of ``ss`` with "[NO_PAUSE]" marking every gap without a pause.

    A token counts as a pause when it contains the substring "PAUSE"
    (e.g. "[PAUSE 0.5]"). "[NO_PAUSE]" is placed between two neighbouring
    non-pause tokens, before a non-pause first token and after a non-pause
    last token. An empty input yields an empty list.
    """
    is_pause = ["PAUSE" in token for token in ss]
    result = []
    for idx, token in enumerate(ss):
        # Nothing precedes the first token, so its "previous" is not a pause.
        before_is_pause = is_pause[idx - 1] if idx else False
        if not (before_is_pause or is_pause[idx]):
            result.append("[NO_PAUSE]")
        result.append(token)
    # Close the sequence unless it is empty or already ends with a pause.
    if is_pause and not is_pause[-1]:
        result.append("[NO_PAUSE]")
    return result

# insert_no_pause(df["morp_pause_clip"].iloc[0])

df["morp_pause_clip_no_pause"] = df["morp_pause_clip"].apply(insert_no_pause)

df.head()

In [None]:
# Save the table as CSV (without the index column).
# NOTE(review): the list-valued columns are presumably written as their string
# form in the CSV — confirm before parsing them back.
df.to_csv(exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon_80ms.csv", index=False)

# Save the same DataFrame as a pickle file as well.
# The `pickle` name imported here is not referenced by the statements in this cell.
import pickle
df.to_pickle(exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon_80ms.pkl")

## 100msのtime閾値のポーズデータを用いて、促音のポーズは入れない学習データ作成

In [None]:
# Input: pause annotations produced with the 100 ms threshold (per the file name).
morp_phons_yaml_path_small = (
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_small_fix_runencode_100ms.yaml"
)

# Decode as UTF-8 explicitly instead of relying on the locale's default encoding.
with open(morp_phons_yaml_path_small, "r", encoding="utf-8") as f:
    morp_phons_yaml_data_small = yaml.safe_load(f)

# picke
# import pickle
# with open(morp_phons_yaml_path_small, "rb") as f:
#     morp_phons_yaml_data_small = pickle.load(f)

In [None]:
# Total number of chapters over all audiobooks (the progress bar advances once
# per audiobook).
c = sum(len(info) for info in tqdm(morp_phons_yaml_data_small.values()))
print(c)

In [None]:
output_yaml_path = (
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_with_length_small_fix_encode_str_wo_sokuon_100ms.yaml"
)
# Pause boundaries are divided by this value before being compared with the
# morpheme times (presumably the sampling rate in Hz, with morpheme times in
# seconds — TODO confirm).
sr = 24000

# For every chapter, build the morpheme sequence with "[PAUSE <length>]" tokens
# inserted wherever a silent interval touches a morpheme boundary:
#   * a pause containing a morpheme's start time is emitted before the morpheme;
#   * a pause containing a morpheme's end time is emitted after the morpheme;
#   * a pause lying entirely inside a "、" is emitted before that "、";
#   * a chapter is skipped when any morpheme other than "、"/silB/silE lies
#     entirely inside a pause range;
#   * a pause directly after a morpheme ending in a sokuon (っ/ッ) is removed.
output_yaml_data = {}
for audiobook_name, info in tqdm(morp_phons_yaml_data_small.items()):
    for chapter_name, chapter_info in info.items():
        # Each pause line is "start end length".
        pause_ranges = [
            [float(p_start) / sr, float(p_end) / sr, float(p_length) / sr]
            for p_start, p_end, p_length in (
                line.split(" ") for line in chapter_info["pause_ranges_str"]
            )
        ]

        # Parse every "start end morpheme" line once instead of once per pause.
        morphemes = []
        for line in chapter_info["morp_lab"]:
            start, end, morp = line.strip().split(" ")
            morphemes.append((float(start), float(end), morp))

        # Report every morpheme that is swallowed whole by a pause range.
        continue_break = False
        for pause_start, pause_end, _ in pause_ranges:
            for start, end, morp in morphemes:
                swallowed = (
                    pause_start <= start <= pause_end
                    and pause_start <= end <= pause_end
                )
                if swallowed and morp not in ("、", "silB", "silE"):
                    print(f"pause_start: {pause_start}, pause_end: {pause_end}, morp: {morp}")
                    continue_break = True
        if continue_break:
            # chapter_info is mutated in place, so labels already present on
            # the chapter would otherwise be exported for a chapter that is
            # reported as skipped. Readers of the YAML drop chapters that lack
            # these keys.
            chapter_info.pop("morp_pause_str", None)
            chapter_info.pop("morp_pause_clip", None)
            print(f"[INFO] {audiobook_name} {chapter_name} is skipped.")
            continue

        # Interleave the morphemes with the pauses that touch their boundaries.
        morp_pause_list = []
        for start, end, morp in morphemes:
            for pause_start, pause_end, pause_length in pause_ranges:
                if pause_start <= start <= pause_end:
                    morp_pause_list.append(f"[PAUSE {pause_length}]")

            if morp == "、":
                # Only the first pause fully contained in the comma is used.
                for pause_start, pause_end, pause_length in pause_ranges:
                    if start <= pause_start <= end and start <= pause_end <= end:
                        morp_pause_list.append(f"[PAUSE {pause_length}]")
                        break

            morp_pause_list.append(morp)
            for pause_start, pause_end, pause_length in pause_ranges:
                if pause_start <= end <= pause_end:
                    morp_pause_list.append(f"[PAUSE {pause_length}]")

        # Clean up: drop silB/silE and keep only the first of consecutive pauses.
        morp_pause_clip = []
        for token in morp_pause_list:
            if token in ("silB", "silE"):
                continue
            if "PAUSE" in token and morp_pause_clip and "PAUSE" in morp_pause_clip[-1]:
                continue
            morp_pause_clip.append(token)

        # Remove a pause whose preceding token ends with a sokuon (っ/ッ).
        # endswith() is also safe on an empty token.
        morp_pause_clip_new = []
        for i, token in enumerate(morp_pause_clip):
            if "PAUSE" in token and i != 0 and morp_pause_clip[i - 1].endswith(("っ", "ッ")):
                print(f"[INFO] {morp_pause_clip[i-1]} == 促音のため、PAUSEを削除します。")
                continue
            morp_pause_clip_new.append(token)

        morp_pause_str = "".join(morp_pause_clip_new)

        # Store the result on the chapter (in place).
        chapter_info["morp_pause_str"] = morp_pause_str
        chapter_info["morp_pause_clip"] = morp_pause_clip_new
    output_yaml_data[audiobook_name] = info
    print(f"[INFO] {audiobook_name} is saved.")

# allow_unicode=True writes the Japanese text unescaped, so open as UTF-8
# explicitly instead of relying on the locale's default encoding.
with open(output_yaml_path, "w", encoding="utf-8") as f:
    yaml.dump(output_yaml_data, f, allow_unicode=True)

In [None]:
morp_phons_yaml_path_small = (
    exp_dir / "text_audio_dict_new_with_morp_phons_and_lab_with_pause_with_length_small_fix_encode_str_wo_sokuon_100ms.yaml"
)

# The YAML holds unescaped Japanese text, so decode it as UTF-8 explicitly
# instead of relying on the locale's default encoding.
with open(morp_phons_yaml_path_small, "r", encoding="utf-8") as f:
    morp_phons_yaml_data_small = yaml.safe_load(f)

# One row per chapter; chapters without pause annotations are left out.
rows = [
    [
        audiobook_name,
        chapter_name,
        chapter_info["morp_pause_str"],
        chapter_info["morp_pause_clip"],
    ]
    for audiobook_name, info in tqdm(morp_phons_yaml_data_small.items())
    for chapter_name, chapter_info in info.items()
    if "morp_pause_str" in chapter_info and "morp_pause_clip" in chapter_info
]

df = pd.DataFrame(
    rows, columns=["audiobook_name", "chapter_name", "morp_pause_str", "morp_pause_clip"]
)

df.head()

In [None]:
def insert_no_pause(ss):
    """Return a copy of ``ss`` with "[NO_PAUSE]" marking every gap without a pause.

    A token counts as a pause when it contains the substring "PAUSE"
    (e.g. "[PAUSE 0.5]"). "[NO_PAUSE]" is placed between two neighbouring
    non-pause tokens, before a non-pause first token and after a non-pause
    last token. An empty input yields an empty list.
    """
    is_pause = ["PAUSE" in token for token in ss]
    result = []
    for idx, token in enumerate(ss):
        # Nothing precedes the first token, so its "previous" is not a pause.
        before_is_pause = is_pause[idx - 1] if idx else False
        if not (before_is_pause or is_pause[idx]):
            result.append("[NO_PAUSE]")
        result.append(token)
    # Close the sequence unless it is empty or already ends with a pause.
    if is_pause and not is_pause[-1]:
        result.append("[NO_PAUSE]")
    return result

# insert_no_pause(df["morp_pause_clip"].iloc[0])

df["morp_pause_clip_no_pause"] = df["morp_pause_clip"].apply(insert_no_pause)

df.head()

In [None]:
# Save the table as CSV (without the index column).
# NOTE(review): the list-valued columns are presumably written as their string
# form in the CSV — confirm before parsing them back.
df.to_csv(exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon_100ms.csv", index=False)

# Save the same DataFrame as a pickle file as well.
# The `pickle` name imported here is not referenced by the statements in this cell.
import pickle
df.to_pickle(exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon_100ms.pkl")