# 文間ポーズの学習データを作成する

- オーディオブックの文章テキストから、文と文の間のポーズの長さを学習する

- BERTのモデルを使って、文章の間のポーズの長さを予測する

## ライブラリのインポート

In [None]:
import csv
import yaml
import os
import glob
import sys


sys.path.append("/home/takeshun256/PausePrediction")

sys.path.append("/home/takeshun256/PausePrediction/src/analyze_jmac")
sys.path.append("/home/takeshun256/PausePrediction/src/vad_tool")
from audiobook_yaml_parser import extract_yaml_data
from py_webrtcvad_test import getVadSection
from vad_tool import VAD_Segmenter
from audiobook_dataset_builder import AudiobookDatasetBuilder
from audiobook_script_extractor import AudiobookScriptExtractor


from julius_lab_analysis import JuliusLabAnalyzer

import numpy as np
import scipy
import scipy.io
from scipy.io import wavfile
import scipy.io.wavfile
import scipy.ndimage
import scipy.signal

%matplotlib inline
from matplotlib import pyplot as plt
import japanize_matplotlib
import webrtcvad
from pprint import pprint
import pandas as pd
import seaborn as sns
import librosa
import struct
import librosa.display
from IPython.display import Audio
from tqdm import tqdm
from typing import List, Tuple, Dict, Any, Union
import soundfile as sf
from pathlib import Path
from sklearn.model_selection import train_test_split

from config import DATA_DIR, DATA_TAKESHUN256_DIR, SRC_DIR, DATA_IN_ROOT_DIR

# Identify the corpus and the experiment whose outputs are processed below.
corpus_name = "jmac"
exp_name = "03_VAD_Adjusted"

exp_dir = Path(DATA_TAKESHUN256_DIR).joinpath(corpus_name, exp_name)
yaml_file_path = Path(DATA_IN_ROOT_DIR).joinpath(corpus_name, "text_audio_dict_new.yaml")

# Stop immediately when the expected inputs are not on disk.
assert exp_dir.exists()
assert yaml_file_path.exists()


# Input files: the audiobook YAML (same file as yaml_file_path) and the two
# speaker CSVs.
audiobook_yaml_path = Path(DATA_IN_ROOT_DIR).joinpath(corpus_name, "text_audio_dict_new.yaml")
speaker_csv_path = "/home/takeshun256/PausePrediction/data_pub/jmac/bookdata-speaker.csv"
speaker_gender_csv_path =  "/home/takeshun256/PausePrediction/data_pub/jmac/speaker_gender.csv"

# Audiobook metadata, parsed with the safe YAML loader.
with audiobook_yaml_path.open("rb") as yaml_stream:
    audiobook_dict = yaml.safe_load(yaml_stream)

# Speaker tables.
df_speaker = pd.read_csv(speaker_csv_path)
df_gender = pd.read_csv(speaker_gender_csv_path)

# 80msのtime閾値のデータ作成

## データの読み込み

In [None]:
# Rule-based pause detections for the "-30_0.08" setting (the 80 ms threshold).
pause_data_path = exp_dir.joinpath(
    "pause_ranges",
    "rule_based",
    "df_lab_attached_morph_pause_rule_based_detected_-30_0.08.pkl",
)

df = pd.read_pickle(pause_data_path)

# Keep only the columns used downstream, then give them threshold-independent
# names ("phoneme" holds the morpheme strings that are joined later).
df = df[
    [
        "phoneme",
        "lab_filepath",
        "audiobook_id",
        "audiobook_id_int",
        "chapter_id",
        "chapter_id_int",
        "ruled_former_pause_-30_0.08",
        "ruled_latter_pause_-30_0.08",
    ]
]
df = df.rename(
    columns={
        "ruled_former_pause_-30_0.08": "former_pause",
        "ruled_latter_pause_-30_0.08": "latter_pause",
        "phoneme": "morp",
    }
)
df

In [None]:
# ============== 話者 ==============
def extract_speaker_label(audiobook_data: dict, speaker_df: pd.DataFrame):
    """Return one row per audiobook with its speaker and book.

    Args:
        audiobook_data: Mapping from audiobook id to a dict that has a "wav"
            entry holding a file path.
        speaker_df: Table with at least the columns "speaker", "wav" and
            "book". It is not modified.

    Returns:
        DataFrame with the columns "audiobook_id", "speaker" and "book".

    Raises:
        AssertionError: If the join does not yield exactly one row per entry
            of ``audiobook_data``.
    """
    # The join key is the wav file name (the mp3 files are split, so the wav
    # name is the one that identifies an audiobook).
    df_audio = pd.DataFrame(
        {
            "audiobook_id": list(audiobook_data.keys()),
            "wav": [Path(info["wav"]).name for info in audiobook_data.values()],
        }
    )

    # When several speakers are listed (comma separated), keep the first one.
    first_speaker = speaker_df["speaker"].apply(lambda names: names.split(",")[0])
    df_speaker_one = speaker_df.assign(speaker=first_speaker)[["speaker", "wav", "book"]]
    display(df_speaker_one.head())

    df_audio_speaker = df_audio.merge(df_speaker_one, on="wav", how="inner")
    # The inner join must neither drop nor duplicate audiobooks.
    assert len(df_audio) == len(df_audio_speaker)
    return df_audio_speaker.drop(columns=["wav"])

# Look up speaker / book per audiobook and attach them to every pause row.
speaker_label_df = extract_speaker_label(audiobook_data=audiobook_dict, speaker_df=df_speaker)
display(speaker_label_df.head())

df = df.merge(speaker_label_df, on="audiobook_id")
display(df.head())

## データの前処理

In [None]:
# (audiobook_id, chapter_id) is the key of one text unit.
# Collect the morphemes of each unit and join them with single spaces.
df_morp = (
    df[["audiobook_id", "chapter_id", "morp"]]
    .groupby(["audiobook_id", "chapter_id"])
    .agg(list)
    .sort_values(["audiobook_id", "chapter_id"])
    .reset_index()
)
df_morp["morp"] = df_morp["morp"].map(" ".join)
print(df_morp.shape)
display(df_morp.head())

# Every other column is expected to be constant within a key, so dropping
# duplicates leaves one row per (audiobook_id, chapter_id).
df_other = (
    df.drop(columns=["morp"])
    .drop_duplicates()
    .sort_values(["audiobook_id", "chapter_id"])
    .reset_index(drop=True)
)
print(df_other.shape)
display(df_other.head())

In [None]:
def _strip_sil_markers(joined: str) -> str:
    """Remove one leading "silB" and one trailing "silE" marker from *joined*.

    ``str.lstrip("silB")`` / ``str.rstrip("silE")`` treat their argument as a
    set of characters, so they would also eat real text that begins with any
    of "s", "i", "l", "B" or ends with any of "s", "i", "l", "E". Only the
    exact markers are removed here.
    """
    if joined.startswith("silB"):
        joined = joined[len("silB"):]
    if joined.endswith("silE"):
        joined = joined[: -len("silE")]
    return joined


# Join the per-unit morpheme string back onto the remaining columns.
df_merged = pd.merge(df_other, df_morp, on=["audiobook_id", "chapter_id"])
# "text": morphemes concatenated without spaces and without the silence markers.
df_merged["text"] = df_merged["morp"].map(lambda x: _strip_sil_markers(x.replace(" ", "")))
df_merged = df_merged.rename(columns={"morp": "morp_join"})
# "morp_join_no_sil": space-separated morphemes without the silence markers.
df_merged["morp_join_no_sil"] = df_merged["morp_join"].map(lambda x: _strip_sil_markers(x).strip())
print(df_merged.shape)
display(df_merged.head())

句点はどこかで削除されている。
読点はのこっている。

In [None]:
# Persist the merged table built for the 80 ms threshold.
output_path = exp_dir.joinpath("bert_traindata_pause_between_sentences_80ms_-30db.pkl")
df_merged.to_pickle(output_path)

# 100msのtime閾値のデータ作成

## データの読み込み

In [None]:
# Rule-based pause detections for the "-30_0.1" setting (the 100 ms threshold).
pause_data_path = exp_dir.joinpath(
    "pause_ranges",
    "rule_based",
    "df_lab_attached_morph_pause_rule_based_detected_-30_0.1.pkl",
)

df = pd.read_pickle(pause_data_path)

# Keep only the columns used downstream, then give them threshold-independent
# names ("phoneme" holds the morpheme strings that are joined later).
df = df[
    [
        "phoneme",
        "lab_filepath",
        "audiobook_id",
        "audiobook_id_int",
        "chapter_id",
        "chapter_id_int",
        "ruled_former_pause_-30_0.1",
        "ruled_latter_pause_-30_0.1",
    ]
]
df = df.rename(
    columns={
        "ruled_former_pause_-30_0.1": "former_pause",
        "ruled_latter_pause_-30_0.1": "latter_pause",
        "phoneme": "morp",
    }
)
df

In [None]:
# ============== 話者 ==============
def extract_speaker_label(audiobook_data: dict, speaker_df: pd.DataFrame):
    """Return one row per audiobook with its speaker and book.

    Args:
        audiobook_data: Mapping from audiobook id to a dict that has a "wav"
            entry holding a file path.
        speaker_df: Table with at least the columns "speaker", "wav" and
            "book". It is not modified.

    Returns:
        DataFrame with the columns "audiobook_id", "speaker" and "book".

    Raises:
        AssertionError: If the join does not yield exactly one row per entry
            of ``audiobook_data``.
    """
    # The join key is the wav file name (the mp3 files are split, so the wav
    # name is the one that identifies an audiobook).
    df_audio = pd.DataFrame(
        {
            "audiobook_id": list(audiobook_data.keys()),
            "wav": [Path(info["wav"]).name for info in audiobook_data.values()],
        }
    )

    # When several speakers are listed (comma separated), keep the first one.
    first_speaker = speaker_df["speaker"].apply(lambda names: names.split(",")[0])
    df_speaker_one = speaker_df.assign(speaker=first_speaker)[["speaker", "wav", "book"]]
    display(df_speaker_one.head())

    df_audio_speaker = df_audio.merge(df_speaker_one, on="wav", how="inner")
    # The inner join must neither drop nor duplicate audiobooks.
    assert len(df_audio) == len(df_audio_speaker)
    return df_audio_speaker.drop(columns=["wav"])

# Look up speaker / book per audiobook and attach them to every pause row.
speaker_label_df = extract_speaker_label(audiobook_data=audiobook_dict, speaker_df=df_speaker)
display(speaker_label_df.head())

df = df.merge(speaker_label_df, on="audiobook_id")
display(df.head())

## データの前処理

In [None]:
# (audiobook_id, chapter_id) is the key of one text unit.
# Collect the morphemes of each unit and join them with single spaces.
df_morp = (
    df[["audiobook_id", "chapter_id", "morp"]]
    .groupby(["audiobook_id", "chapter_id"])
    .agg(list)
    .sort_values(["audiobook_id", "chapter_id"])
    .reset_index()
)
df_morp["morp"] = df_morp["morp"].map(" ".join)
print(df_morp.shape)
display(df_morp.head())

# Every other column is expected to be constant within a key, so dropping
# duplicates leaves one row per (audiobook_id, chapter_id).
df_other = (
    df.drop(columns=["morp"])
    .drop_duplicates()
    .sort_values(["audiobook_id", "chapter_id"])
    .reset_index(drop=True)
)
print(df_other.shape)
display(df_other.head())

In [None]:
def _strip_sil_markers(joined: str) -> str:
    """Remove one leading "silB" and one trailing "silE" marker from *joined*.

    ``str.lstrip("silB")`` / ``str.rstrip("silE")`` treat their argument as a
    set of characters, so they would also eat real text that begins with any
    of "s", "i", "l", "B" or ends with any of "s", "i", "l", "E". Only the
    exact markers are removed here.
    """
    if joined.startswith("silB"):
        joined = joined[len("silB"):]
    if joined.endswith("silE"):
        joined = joined[: -len("silE")]
    return joined


# Join the per-unit morpheme string back onto the remaining columns.
df_merged = pd.merge(df_other, df_morp, on=["audiobook_id", "chapter_id"])
# "text": morphemes concatenated without spaces and without the silence markers.
df_merged["text"] = df_merged["morp"].map(lambda x: _strip_sil_markers(x.replace(" ", "")))
df_merged = df_merged.rename(columns={"morp": "morp_join"})
# "morp_join_no_sil": space-separated morphemes without the silence markers.
df_merged["morp_join_no_sil"] = df_merged["morp_join"].map(lambda x: _strip_sil_markers(x).strip())
print(df_merged.shape)
display(df_merged.head())

In [None]:
# Persist the merged table built for the 100 ms threshold.
output_path = exp_dir.joinpath("bert_traindata_pause_between_sentences_100ms_-30db.pkl")
df_merged.to_pickle(output_path)

# ポーズ長の正規化

In [None]:
# Count the distinct audiobooks, speakers and books.
# When the cells run top to bottom, df_merged was last assigned in the 100 ms
# section, so the heading names that data set (it previously said "80ms").
print("100ms")
print("audiobook_name のユニーク数:", df_merged["audiobook_id"].nunique())
print("speaker のユニーク数:", df_merged["speaker"].nunique())
print("book のユニーク数:", df_merged["book"].nunique())


In [None]:
# Largest and smallest group size per category (a group with fewer than 3 rows
# would become unseen data once the set is split).
for column, tag in (
    ("audiobook_id", "audiobook_count"),
    ("speaker", "speaker_count"),
    ("book", "book_count"),
):
    sizes = df_merged[column].value_counts()
    print(f"{tag}: {sizes.max()}〜{sizes.min()}")

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# One bar chart of rows per category code: speaker, audiobook id and book
# title. axes[1, 0] is not drawn on.
for column, position in (
    ("speaker", (0, 0)),
    ("audiobook_id", (0, 1)),
    ("book", (1, 1)),
):
    rows_per_code = df_merged[column].astype("category").cat.codes.value_counts().sort_index()
    rows_per_code.plot(kind="bar", ax=axes[position], title=column)

plt.tight_layout()
plt.show()


In [None]:
# Grid of data sets to generate.
pause_time_threshold_mss = [80, 100]
# The "narrative" and "audiobook_narrative" variants are currently not generated.
preprocess_types = ["none", "all", "audiobook", "speaker", "book"]
num_labels = [1, 2]

# Root of the generated BERT data sets; it must already exist.
output_dir = exp_dir / "data_bert"
assert output_dir.exists()

# Create one directory per (threshold, preprocessing type) combination.
for pause_time_threshold_ms in pause_time_threshold_mss:
    threshold_dir = output_dir / f"{pause_time_threshold_ms}ms"
    for preprocess_type in preprocess_types:
        output_dir_each = threshold_dir / preprocess_type
        output_dir_each.mkdir(parents=True, exist_ok=True)
        print(output_dir_each)

In [None]:
# 前処理
def preprocess_data_between_sentence(df_input: pd.DataFrame, preprocess_type, num_labels) -> pd.DataFrame:
    """Build the data set for predicting the pause between two sentences.

    Each row is paired with the row that follows it when that row is the next
    chapter of the same audiobook; the two texts are joined with " [SEP] " and
    ``latter_pause`` becomes the label. Rows without a pair, labels outside
    [0, 10] and audiobooks with fewer than 10 remaining rows are dropped. The
    label can then be standardised with the mean/variance of the training part
    of an internal stratified train/val/test split.

    Args:
        df_input: Frame with at least the columns ``audiobook_id``,
            ``chapter_id``, ``chapter_id_int``, ``speaker``, ``book``,
            ``text`` and ``latter_pause``. It is not modified.
        preprocess_type: "none" (label unchanged, means=0 / vars=1), or
            "all", "audiobook", "speaker", "book" to standardise the label
            with the training statistics of that grouping.
        num_labels: 1 returns the (possibly standardised) pause as ``labels``;
            2 returns the binary class (1 when the raw pause is <= 0.2) as
            ``labels``; any other value keeps both ``label`` and
            ``label_class``.

    Returns:
        Frame with ``audiobook_id``, ``chapter_id``, ``texts``, the label
        column(s), ``means``, ``vars`` and the id columns ``id_audiobook``,
        ``id_speaker``, ``id_book``, ``id_none``, ``id_all``. Rows containing
        missing values are removed.

    Raises:
        ValueError: If ``preprocess_type`` is not one of the supported values.

    Side effects:
        Prints progress information, displays the head of the result and
        writes the id lookup tables to ``id_dict/between_sentence/id_*.csv``
        relative to the current working directory.
    """
    # Reject an unknown mode before doing any work.
    if preprocess_type not in ("none", "all", "audiobook", "speaker", "book"):
        raise ValueError("preprocess_type is invalid")
    # Suffix of the mean_*/var_* columns that each standardising mode reads.
    stat_suffix_by_type = {
        "all": "all",
        "audiobook": "audiobook_id",
        "speaker": "speaker",
        "book": "book",
    }

    df = df_input.copy()
    # Appending a missing full stop to each text is deliberately not done:
    # some texts are questionable as sentences.

    # Pair each text with the following one. The next row only counts when it
    # belongs to the same audiobook and its chapter number is exactly one
    # higher. shift() works by position, so the index labels do not matter.
    # NOTE(review): rows are paired in their given order; presumably the input
    # is sorted by chapter within each audiobook - confirm against the caller.
    same_audiobook = df["audiobook_id"].shift(-1) == df["audiobook_id"]
    next_chapter = df["chapter_id_int"].shift(-1) == df["chapter_id_int"] + 1
    df["next_text"] = df["text"].shift(-1).where(same_audiobook & next_chapter)

    # Report how many rows found a following text.
    print("次のテキストが見つかった割合: ", len(df[df["next_text"].notnull()]),  "/" ,len(df))

    # Join the two texts with [SEP]; rows without a following text stay missing.
    df["concat_text"] = df["text"] + " [SEP] " + df["next_text"]

    # Drop rows without a pair and rows where both texts are empty.
    print(f"concat_textの長さが0のものを除外前のdf.shape: {df.shape}")
    has_pair = df["concat_text"].notnull() & (df["concat_text"] != " [SEP] ")
    df = df[has_pair].reset_index(drop=True)
    print(f"concat_textの長さが0のものを除外後のdf.shape: {df.shape}")

    # The pause after the first text is the regression label.
    df["label"] = df["latter_pause"].values

    # Report how many labels fall at or below the 0.2 class boundary.
    print("ラベルの割合: ", len(df[df["label"] <= 0.2]),  "/" ,len(df))

    # Binary label for classification: 1 when the pause is <= 0.2, else 0.
    df["label_class"] = (df["label"] <= 0.2).astype("int64")

    # Report and remove outliers (labels below 0 or above 10).
    is_outlier = (df["label"] < 0) | (df["label"] > 10)
    print("外れ値の割合: ", len(df[is_outlier]),  "/" ,len(df))
    df = df[(df["label"] >= 0) & (df["label"] <= 10)]

    # Keep only the columns that are needed from here on.
    df = df[["audiobook_id", "chapter_id", "speaker", "book", "concat_text", "label", "label_class"]]
    df = df.dropna().reset_index(drop=True)

    print(f"はずれ値除去後のdf.shape: {df.shape}")

    # Drop audiobooks that have fewer than `threshold` rows left.
    threshold = 10
    audiobook_counts = df["audiobook_id"].value_counts()
    print(f"audiobookのグループ数が{threshold}未満のaudiobook_id: {audiobook_counts[audiobook_counts < threshold]}")
    audiobook_counts = audiobook_counts[audiobook_counts >= threshold]
    df = df[df["audiobook_id"].isin(audiobook_counts.index)]

    def train_val_test_split(frame: pd.DataFrame, test_size: float = 0.2, val_size: float = 0.25) -> pd.DataFrame:
        """Return *frame* with a "train_val_test" column, stratified by audiobook_id."""
        from sklearn.model_selection import train_test_split

        part_train, part_test = train_test_split(
            frame, test_size=test_size, random_state=42, stratify=frame["audiobook_id"]
        )
        # Stratify the second split on the remaining training rows.
        part_train, part_val = train_test_split(
            part_train, test_size=val_size, random_state=42, stratify=part_train["audiobook_id"]
        )
        part_train = part_train.assign(train_val_test="train")
        part_val = part_val.assign(train_val_test="val")
        part_test = part_test.assign(train_val_test="test")
        assert len(set(part_train.index) & set(part_val.index)) == 0, "trainとvalに共通のインデックスが存在します"
        assert len(set(part_train.index) & set(part_test.index)) == 0, "trainとtestに共通のインデックスが存在します"
        assert len(set(part_val.index) & set(part_test.index)) == 0, "valとtestに共通のインデックスが存在します"
        return pd.concat([part_train, part_val, part_test])

    df = train_val_test_split(df)

    def count_groups_in_train_val_test(frame: pd.DataFrame) -> pd.DataFrame:
        """Count the distinct audiobooks, speakers and books in each split."""
        by_split = frame.groupby("train_val_test")
        return pd.DataFrame({
            "audiobook": by_split["audiobook_id"].nunique(),
            "speaker": by_split["speaker"].nunique(),
            "book": by_split["book"].nunique()
        })

    group_counts = count_groups_in_train_val_test(df)
    print(group_counts)

    # Normalisation statistics come from the training rows only.
    df_train = df[df["train_val_test"] == "train"]
    mean_all = df_train["label"].mean()
    var_all = df_train["label"].var()
    df["mean_all"] = mean_all
    df["var_all"] = var_all
    print(f"mean_all: {mean_all}")
    print(f"var_all: {var_all}")

    # Per-audiobook, per-speaker and per-book training mean/variance, attached
    # as mean_<key>/var_<key>. Groups absent from the training rows get NaN.
    for key in ("audiobook_id", "speaker", "book"):
        group_stats = (
            df_train.groupby(key)["label"]
            .agg(["mean", "var"])
            .rename(columns={"mean": f"mean_{key}", "var": f"var_{key}"})
            .reset_index()
        )
        df = pd.merge(df, group_stats, on=key, how="left")

    print(f"df.shape: {df.shape}")
    print(f"分類用のラベルの割合: {len(df[df['label_class'] == 1])}", "/", f"{len(df)}")

    # Standardise the label and record the statistics that were applied.
    if preprocess_type == "none":
        df["means"] = 0
        df["vars"] = 1
    else:
        suffix = stat_suffix_by_type[preprocess_type]
        df["means"] = df[f"mean_{suffix}"]
        df["vars"] = df[f"var_{suffix}"]
        df["label"] = (df["label"] - df["means"]) / np.sqrt(df["vars"])

    # Integer ids for embeddings, plus the id -> original value lookup tables.
    id_dict = {}
    for name, column in (("audiobook", "audiobook_id"), ("speaker", "speaker"), ("book", "book")):
        as_category = df[column].astype("category")
        df[f"id_{name}"] = as_category.cat.codes
        id_dict[name] = dict(enumerate(as_category.cat.categories))
    df["id_none"] = 0
    df["id_all"] = 0

    # Save the lookup tables as CSV under the current working directory,
    # creating the target directory when it does not exist yet.
    id_dict_dir = Path("id_dict/between_sentence")
    id_dict_dir.mkdir(parents=True, exist_ok=True)
    for key, value in id_dict.items():
        pd.DataFrame(list(value.items()), columns=["id", key]).to_csv(id_dict_dir / f"id_{key}.csv", index=False)

    # Select the output columns. The train_val_test column is not part of the
    # result, so a caller that splits the result again gets a partition that
    # differs from the one the statistics above were computed on.
    key_columns = ["audiobook_id", "chapter_id", "texts"]
    trailing_columns = ["means", "vars", "id_audiobook", "id_speaker", "id_book", "id_none", "id_all"]
    df = df.rename(columns={"concat_text": "texts"})
    if num_labels == 1:
        df = df[key_columns + ["label"] + trailing_columns].rename(columns={"label": "labels"})
    elif num_labels == 2:
        df = df[key_columns + ["label_class"] + trailing_columns].rename(columns={"label_class": "labels"})
    else:
        df = df[key_columns + ["label", "label_class"] + trailing_columns]

    df = df.dropna().reset_index(drop=True)
    print(f"df.shape: {df.shape}")
    display(df.head())

    return df

## 80msの場合

- 正規化方法を考える
  - none: 正規化しない
  - all: 全体のデータを使って標準化
  - audiobook: オーディオブックごとに標準化
  - speaker: 話者ごとに標準化
  - book: 本ごとに標準化

In [None]:
# Try the preprocessing once on the 80 ms data: no normalisation, regression label.
df = pd.read_pickle(exp_dir.joinpath("bert_traindata_pause_between_sentences_80ms_-30db.pkl"))
preprocess_data_between_sentence(df, preprocess_type="none", num_labels=1)


正規化する前に、テキストの結合、ポーズの割り当て、外れ値除去を行う。句点がなぜか削除されてしまっているが、句点を付け足す処理は現在コメントアウトしている。

In [None]:
# Source tables for both pause-duration thresholds.
df_train_80ms = pd.read_pickle(exp_dir.joinpath("bert_traindata_pause_between_sentences_80ms_-30db.pkl"))
df_train_100ms = pd.read_pickle(exp_dir.joinpath("bert_traindata_pause_between_sentences_100ms_-30db.pkl"))

In [None]:
# Generate every (threshold, preprocessing type, label count) data set and
# save the full table together with its train / val / test parts.
for pause_time_threshold_ms in pause_time_threshold_mss:
    for preprocess_type in preprocess_types:
        for num_label in num_labels:
            print(f"pause_time_threshold_ms: {pause_time_threshold_ms}, preprocess_type: {preprocess_type}", f"num_label: {num_label}")
            if pause_time_threshold_ms not in (80, 100):
                raise ValueError("pause_time_threshold_msが不正です")
            source_table = df_train_80ms if pause_time_threshold_ms == 80 else df_train_100ms
            df_train = source_table.copy()

            target_dir = output_dir / f"{pause_time_threshold_ms}ms" / preprocess_type
            file_stem = f"bert_traindata_BetweenSentences_{num_label}label"

            df_train_preprocessed = preprocess_data_between_sentence(df_train, preprocess_type, num_label)
            df_train_preprocessed.to_pickle(target_dir / f"{file_stem}.pkl")

            # Split into train / val / test.
            # NOTE(review): preprocess_data_between_sentence computes its
            # normalisation statistics on its own stratified split, which it
            # does not return. The unstratified split below is a different
            # partition, so rows that contributed to those statistics can end
            # up in val/test.
            test_size = 0.2
            val_size = 0.25
            train_val_df, test_df = train_test_split(df_train_preprocessed, test_size=test_size, random_state=42)
            train_df, val_df = train_test_split(train_val_df, test_size=val_size, random_state=42)
            for split_name, split_df in (("train", train_df), ("val", val_df), ("test", test_df)):
                split_df.to_pickle(target_dir / f"{file_stem}_{split_name}.pkl")

            print("train_df.shape: ", train_df.shape, "val_df.shape: ", val_df.shape, "test_df.shape: ", test_df.shape)
            print("done")

# 以下は未実行(2024/02/03)

In [None]:
# Keep the raw pause lengths next to the normalised variants.
df["former_pause_none"] = df["former_pause"]
df["latter_pause_none"] = df["latter_pause"]

# Global mean and standard deviation, computed without outliers
# (rows where either pause is 30 or more are ignored).
mask = (df["former_pause"] < 30) & (df["latter_pause"] < 30)
mean_former_pause = df.loc[mask, "former_pause_none"].mean()
mean_latter_pause = df.loc[mask, "latter_pause_none"].mean()
std_former_pause = df.loc[mask, "former_pause_none"].std()
std_latter_pause = df.loc[mask, "latter_pause_none"].std()
print(f"mean_former_pause: {mean_former_pause}")
print(f"mean_latter_pause: {mean_latter_pause}")
print(f"std_former_pause: {std_former_pause}")
print(f"std_latter_pause: {std_latter_pause}")

# Standardise with the global statistics (vectorised column arithmetic
# instead of a Python lambda per element).
df["former_pause_all"] = (df["former_pause"] - mean_former_pause) / std_former_pause
df["latter_pause_all"] = (df["latter_pause"] - mean_latter_pause) / std_latter_pause

# Per-audiobook mean and standard deviation, again without outliers.
df_audiobook_mean = (
    df[mask]
    .groupby("audiobook_id")
    .agg({"former_pause": "mean", "latter_pause": "mean"})
    .rename(columns={"former_pause": "mean_former_pause_audiobook", "latter_pause": "mean_latter_pause_audiobook"})
)
df_audiobook_std = (
    df[mask]
    .groupby("audiobook_id")
    .agg({"former_pause": "std", "latter_pause": "std"})
    .rename(columns={"former_pause": "std_former_pause_audiobook", "latter_pause": "std_latter_pause_audiobook"})
)

# Standardise with the per-audiobook statistics. The inner joins drop
# audiobooks that have no non-outlier rows.
df = pd.merge(df, df_audiobook_mean, on="audiobook_id")
df = pd.merge(df, df_audiobook_std, on="audiobook_id")
df["former_pause_audiobook"] = (df["former_pause"] - df["mean_former_pause_audiobook"]) / df["std_former_pause_audiobook"]
df["latter_pause_audiobook"] = (df["latter_pause"] - df["mean_latter_pause_audiobook"]) / df["std_latter_pause_audiobook"]
df = df.drop(columns=["mean_former_pause_audiobook", "mean_latter_pause_audiobook", "std_former_pause_audiobook", "std_latter_pause_audiobook"])

display(df.head())


In [None]:
# Save the frame with the raw and normalised pause columns (80 ms threshold).
df.to_pickle(exp_dir.joinpath("bert_traindata_pause_between_sentences_80ms_-30db_normalized.pkl"))

## 100msの場合

In [None]:
# Reload the 100 ms table and show its first rows.
df = pd.read_pickle(exp_dir.joinpath("bert_traindata_pause_between_sentences_100ms_-30db.pkl"))
df.head()

In [None]:
# Keep the raw pause lengths next to the normalised variants.
df["former_pause_none"] = df["former_pause"]
df["latter_pause_none"] = df["latter_pause"]

# Global mean and standard deviation, computed without outliers
# (rows where either pause is 30 or more are ignored).
mask = (df["former_pause"] < 30) & (df["latter_pause"] < 30)
mean_former_pause = df.loc[mask, "former_pause_none"].mean()
mean_latter_pause = df.loc[mask, "latter_pause_none"].mean()
std_former_pause = df.loc[mask, "former_pause_none"].std()
std_latter_pause = df.loc[mask, "latter_pause_none"].std()
print(f"mean_former_pause: {mean_former_pause}")
print(f"mean_latter_pause: {mean_latter_pause}")
print(f"std_former_pause: {std_former_pause}")
print(f"std_latter_pause: {std_latter_pause}")

# Standardise with the global statistics (vectorised column arithmetic
# instead of a Python lambda per element).
df["former_pause_all"] = (df["former_pause"] - mean_former_pause) / std_former_pause
df["latter_pause_all"] = (df["latter_pause"] - mean_latter_pause) / std_latter_pause

# Per-audiobook mean and standard deviation, again without outliers.
df_audiobook_mean = (
    df[mask]
    .groupby("audiobook_id")
    .agg({"former_pause": "mean", "latter_pause": "mean"})
    .rename(columns={"former_pause": "mean_former_pause_audiobook", "latter_pause": "mean_latter_pause_audiobook"})
)
df_audiobook_std = (
    df[mask]
    .groupby("audiobook_id")
    .agg({"former_pause": "std", "latter_pause": "std"})
    .rename(columns={"former_pause": "std_former_pause_audiobook", "latter_pause": "std_latter_pause_audiobook"})
)

# Standardise with the per-audiobook statistics. The inner joins drop
# audiobooks that have no non-outlier rows.
df = pd.merge(df, df_audiobook_mean, on="audiobook_id")
df = pd.merge(df, df_audiobook_std, on="audiobook_id")
df["former_pause_audiobook"] = (df["former_pause"] - df["mean_former_pause_audiobook"]) / df["std_former_pause_audiobook"]
df["latter_pause_audiobook"] = (df["latter_pause"] - df["mean_latter_pause_audiobook"]) / df["std_latter_pause_audiobook"]
df = df.drop(columns=["mean_former_pause_audiobook", "mean_latter_pause_audiobook", "std_former_pause_audiobook", "std_latter_pause_audiobook"])

display(df.head())


In [None]:
# Save the frame with the raw and normalised pause columns (100 ms threshold).
df.to_pickle(exp_dir.joinpath("bert_traindata_pause_between_sentences_100ms_-30db_normalized.pkl"))

# 正規化前後の分布の比較

## 80msの場合

In [None]:
# Load the normalised 80 ms table and show its summary and first rows.
df = pd.read_pickle(exp_dir.joinpath("bert_traindata_pause_between_sentences_80ms_-30db_normalized.pkl"))
display(df.describe())
display(df.head())

# Keep only the rows where both pauses are below 30 (outlier removal).
mask = (df["former_pause"] < 30) & (df["latter_pause"] < 30)
df = df.loc[mask]

In [None]:
# Histogram of the raw (not normalised) pause length.
plt.figure(figsize=(10, 5))
plt.title("Pause length distribution not normalized (80ms)", fontsize=20)
plt.hist(df["former_pause_none"], bins=100)

# Mean and standard deviation of the plotted values.
mean_former_pause = df["former_pause_none"].mean()
std_former_pause = df["former_pause_none"].std()
print(f"mean_former_pause: {mean_former_pause}")
print(f"std_former_pause: {std_former_pause}")

# Mark the mean (red) and one standard deviation on either side (green).
plt.axvline(mean_former_pause, color="red", linestyle="dashed", label=f"mean: {mean_former_pause:.2f}")
for sign, offset in (("+", std_former_pause), ("-", -std_former_pause)):
    plt.axvline(
        mean_former_pause + offset,
        color="green",
        linestyle="dashed",
        label=f"mean {sign} std: {mean_former_pause + offset:.2f}",
    )

plt.legend()
plt.show()


In [None]:
# Histograms of the pause length after each normalisation (80 ms threshold).
plt.figure(figsize=(20, 10))
plt.suptitle("Pause Length Between Sentences Distributions 80ms", fontsize=20)

# (subplot position, title, column, bar colour) per normalisation variant.
for position, title, column, color in (
    (211, "normalized by all", "former_pause_all", "skyblue"),
    (212, "normalized by audiobook", "former_pause_audiobook", "lightgreen"),
):
    plt.subplot(position)
    plt.title(title, fontsize=20)
    # The histogram carries a label so that plt.legend() has an entry to
    # show; without one the legend is empty and matplotlib emits a warning.
    plt.hist(df[column], bins=100, color=color, edgecolor="black", label=column)
    plt.legend()

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

## 100msの場合

In [None]:
# Load the normalised 100 ms table and show its summary.
df = pd.read_pickle(exp_dir.joinpath("bert_traindata_pause_between_sentences_100ms_-30db_normalized.pkl"))
display(df.describe())

# Keep only the rows where both pauses are below 30 (outlier removal).
mask = (df["former_pause"] < 30) & (df["latter_pause"] < 30)
df = df.loc[mask]

In [None]:
# Histogram of the raw (not normalised) pause length.
plt.figure(figsize=(10, 5))
plt.title("Pause length distribution not normalized (100ms)", fontsize=20)
plt.hist(df["former_pause_none"], bins=100)

# Mean and standard deviation of the plotted values.
mean_former_pause = df["former_pause_none"].mean()
std_former_pause = df["former_pause_none"].std()
print(f"mean_former_pause: {mean_former_pause}")
print(f"std_former_pause: {std_former_pause}")

# Mark the mean (red) and one standard deviation on either side (green).
plt.axvline(mean_former_pause, color="red", linestyle="dashed", label=f"mean: {mean_former_pause:.2f}")
for sign, offset in (("+", std_former_pause), ("-", -std_former_pause)):
    plt.axvline(
        mean_former_pause + offset,
        color="green",
        linestyle="dashed",
        label=f"mean {sign} std: {mean_former_pause + offset:.2f}",
    )

plt.legend()
plt.show()


In [None]:
# Histograms of the pause length after each normalisation (100 ms threshold).
plt.figure(figsize=(20, 10))
plt.suptitle("Pause Length Between Sentences Distributions 100ms", fontsize=20)

# (subplot position, title, column, bar colour) per normalisation variant.
for position, title, column, color in (
    (211, "normalized by all", "former_pause_all", "skyblue"),
    (212, "normalized by audiobook", "former_pause_audiobook", "lightgreen"),
):
    plt.subplot(position)
    plt.title(title, fontsize=20)
    # The histogram carries a label so that plt.legend() has an entry to
    # show; without one the legend is empty and matplotlib emits a warning.
    plt.hist(df[column], bins=100, color=color, edgecolor="black", label=column)
    plt.legend()

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

80msと100msはほぼ同じ分布になっている。
=> これは、閾値が影響するのは silBとsilEのみであるためと考えられる。