# BERT学習用に作成された形態素/PAUSE/形態素のリストを読み込み、PAUSEの長さを正規化する

ポーズ長の正規化（z_normalize_pause_length.ipynb）
- (それぞれの値と、正規化後の分布を可視化): 優先度中
- 全体
- オーディオブック（=話者x文章作品）ごと
- 地の文か否か
- オーディオブックごと x 地の文か否か
- 話者ごと
- 文章作品ごと(あれば)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from pathlib import Path
import pickle
import yaml
from pprint import pprint
import sys

sys.path.append("/home/takeshun256/PausePrediction")

# import own library
from config import DATA_DIR, DATA_TAKESHUN256_DIR, SRC_DIR, DATA_IN_ROOT_DIR

# Define input paths, load the corpus metadata and the BERT training frames,
# then print a quick overview of everything that was loaded.
corpus_name = "jmac"
exp_name = "03_VAD_Adjusted"
exp_dir = Path(DATA_TAKESHUN256_DIR) / corpus_name / exp_name
audiobook_yaml_path = Path(DATA_IN_ROOT_DIR) / corpus_name / "text_audio_dict_new.yaml"
speaker_csv_path = "/home/takeshun256/PausePrediction/data_pub/jmac/bookdata-speaker.csv"
speaker_gender_csv_path =  "/home/takeshun256/PausePrediction/data_pub/jmac/speaker_gender.csv"

# Fail early if the experiment directory or the audiobook YAML is missing.
# NOTE(review): the two CSV paths are not checked here; pd.read_csv below raises if they are absent.
assert exp_dir.exists()
assert audiobook_yaml_path.exists()

# audio book data
with open(audiobook_yaml_path, "rb") as f:
    audiobook_dict = yaml.safe_load(f)

# speaker data
df_speaker = pd.read_csv(speaker_csv_path)
df_gender = pd.read_csv(speaker_gender_csv_path)


# 80ms
# NOTE: pd.read_pickle unpickles arbitrary objects; only load files from a trusted source.
df_train_80ms = pd.read_pickle(exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon_80ms.pkl")
# These two columns are dropped right after loading; the rest of the notebook
# works on "morp_pause_clip_no_pause".
df_train_80ms.drop(columns=["morp_pause_str", "morp_pause_clip"], inplace=True)

# 100ms
df_train_100ms = pd.read_pickle(exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon_100ms.pkl")
df_train_100ms.drop(columns=["morp_pause_str", "morp_pause_clip"], inplace=True)

# Overview: number of audiobooks, the first audiobook entry, and the heads/shapes of each table.
print("audio book data")
print(len(audiobook_dict))
pprint(audiobook_dict[list(audiobook_dict.keys())[0]])
print("speaker data")
display(df_speaker.head())
print("speaker gender data")
display(df_gender.head())
print("80ms")
print(df_train_80ms.shape)
print("100ms")
print(df_train_100ms.shape)
display(df_train_80ms.head())

## BERT学習データに、発話ごとに地の文かどうかのラベルを付与する

- 発話ごとに地の文かどうかのラベルを付与する
- オーディオブックごとに、話者情報, 本の情報を付与する

In [None]:
# audiobook_dictからキャラクター情報を抽出し、新しいラベル列を作成する関数

# ============== 地の文かどうか ==============
# Narration: 1, Others: 0
def extract_narrative_label(audiobook_data: dict):
    """Build a per-chapter narration flag table from the audiobook dictionary.

    Args:
        audiobook_data: Mapping of audiobook name to an info dict whose
            "text" entry is a sequence of chapter dicts with a "character" key.

    Returns:
        DataFrame with columns "audiobook_name", "chapter_name" (the chapter
        position, zero-padded to three digits) and "is_narrative"
        (1 when the chapter's character is 'narrative', otherwise 0).
    """
    rows = [
        [book_name, f"{position:03}", 1 if chapter['character'] == 'narrative' else 0]
        for book_name, book_info in audiobook_data.items()
        for position, chapter in enumerate(book_info["text"])
    ]
    return pd.DataFrame(rows, columns=["audiobook_name", "chapter_name", "is_narrative"])

# Attach the per-chapter narration flag to every training row.
narrative_label_df = extract_narrative_label(audiobook_dict)
display(narrative_label_df.head())
# 80ms: left join keeps every training row; rows without a matching
# (audiobook_name, chapter_name) pair end up with NaN in "is_narrative".
df_train_80ms_labeled = pd.merge(df_train_80ms, narrative_label_df, on=["audiobook_name", "chapter_name"], how="left")
# 100ms: same join for the 100ms data set
df_train_100ms_labeled = pd.merge(df_train_100ms, narrative_label_df, on=["audiobook_name", "chapter_name"], how="left")


# ============== 話者 ==============
def extract_speaker_label(audiobook_data: dict, speaker_df: pd.DataFrame):
    """Map every audiobook to its (first) speaker and its book.

    The join key is the wav file name: the audiobook dictionary stores a wav
    path per audiobook and the speaker table has a "wav" column.

    Args:
        audiobook_data: Mapping of audiobook name to an info dict with a "wav" path.
        speaker_df: Table with at least the columns "speaker", "wav" and "book".
            "speaker" may hold several comma-separated names; only the first is kept.

    Returns:
        DataFrame with columns "audiobook_name", "speaker" and "book".

    Raises:
        AssertionError: if the inner join does not yield exactly one row per audiobook.
    """
    # One row per audiobook, keyed by the bare wav file name.
    df_audio = pd.DataFrame({
        "audiobook_name": list(audiobook_data),
        "wav": [Path(info["wav"]).name for info in audiobook_data.values()],
    })

    # Keep only the first of possibly several comma-separated speakers.
    first_speaker = speaker_df["speaker"].apply(lambda names: names.split(",")[0])
    df_speaker_one = speaker_df.assign(speaker=first_speaker)[["speaker", "wav", "book"]]
    display(df_speaker_one.head())

    df_audio_speaker = pd.merge(df_audio, df_speaker_one, on="wav", how="inner")
    # The inner join must neither lose nor duplicate audiobooks.
    assert len(df_audio) == len(df_audio_speaker)
    return df_audio_speaker.drop(columns=["wav"])

# Attach speaker and book to every training row (one speaker/book per audiobook).
speaker_label_df = extract_speaker_label(audiobook_dict, df_speaker)
display(speaker_label_df.head())
# 80ms
df_train_80ms_labeled = pd.merge(df_train_80ms_labeled, speaker_label_df, on=["audiobook_name"], how="left")
# 100ms
df_train_100ms_labeled = pd.merge(df_train_100ms_labeled, speaker_label_df, on=["audiobook_name"], how="left")

# ============== check ==============
# Shapes after labeling and the number of rows that did not receive an
# "is_narrative" label in the left join above.


print("80ms")
print(df_train_80ms_labeled.shape)
print(f"isna: {df_train_80ms_labeled['is_narrative'].isna().sum()}")
print("100ms")
print(df_train_100ms_labeled.shape)
print(f"isna: {df_train_100ms_labeled['is_narrative'].isna().sum()}")
display(df_train_80ms_labeled.head())

In [None]:
# Number of distinct audiobooks, speakers and books.
# For each column: header line, then the 80ms count, then the 100ms count.
for column in ("audiobook_name", "speaker", "book"):
    print(column)
    print(df_train_80ms_labeled[column].nunique())
    print(df_train_100ms_labeled[column].nunique())


In [None]:
# Example sentence: the morpheme/pause token list of the first 80ms training row
print(df_train_80ms_labeled["morp_pause_clip_no_pause"].values[0])

## ポーズ長の外れ値を除去する

In [None]:
def delete_outlier(sentence_elements: list, max_boundary_pause: float = 15, max_inner_pause: float = 3) -> list:
    """Replace implausible pause tokens with "[NO_PAUSE]".

    A token of the form "[PAUSE <seconds>]" is treated as an outlier when its
    length is negative or strictly greater than the applicable limit. The
    first and last elements (sentence-boundary pauses) use
    ``max_boundary_pause``; every element in between uses ``max_inner_pause``.
    Tokens that do not start with "[PAUSE" are never touched.

    The input list is NOT modified; a new list is returned. (The list objects
    stored in a DataFrame column are shared with every frame derived from it,
    so in-place edits would silently change those frames as well.)

    Args:
        sentence_elements: Alternating pause tokens and morphemes of one sentence.
        max_boundary_pause: Largest accepted length (seconds) for the first/last element.
        max_inner_pause: Largest accepted length (seconds) for all other elements.

    Returns:
        A new list of the same length with outlier pauses replaced by "[NO_PAUSE]".
    """
    no_pause = "[NO_PAUSE]"

    def pause_length(element: str) -> float:
        # "[PAUSE 0.435]" -> 0.435; any other token counts as 0 and is therefore kept.
        if element.startswith("[PAUSE"):
            return float(element.split()[1][:-1])
        return 0

    last_index = len(sentence_elements) - 1
    cleaned = []
    for index, element in enumerate(sentence_elements):
        limit = max_boundary_pause if index in (0, last_index) else max_inner_pause
        length = pause_length(element)
        cleaned.append(no_pause if length < 0 or length > limit else element)
    return cleaned

# Sanity check for delete_outlier on a hand-made sentence: it contains negative
# pauses, an in-sentence pause above 3 s (5.0) and a sentence-final pause above
# 15 s (18.0); all of these are expected to come back as "[NO_PAUSE]".
sample_sentence = [
    '[PAUSE -0.1]', 'これ', '[NO_PAUSE]', 'は', '[PAUSE -0.1]', '、', '[PAUSE 5.0]', '私', '[NO_PAUSE]', 'が', 
    '[NO_PAUSE]', '小さい', '[NO_PAUSE]', 'とき', '[NO_PAUSE]', 'に', '[PAUSE 0.6171666666666666]', '、', '[NO_PAUSE]', 
    '村', '[NO_PAUSE]', 'の', '[NO_PAUSE]', '茂平', '[NO_PAUSE]', 'と', '[NO_PAUSE]', 'いう', '[NO_PAUSE]', 'おじいさん', 
    '[NO_PAUSE]', 'から', '[NO_PAUSE]', 'きい', '[NO_PAUSE]', 'た', '[NO_PAUSE]', 'お話', '[NO_PAUSE]', 'です', 
    '[PAUSE 18.0]'
]

print(delete_outlier(sample_sentence))
    

In [None]:
# Remove outlier pauses from the token lists of both data sets (80ms, then 100ms).
for labeled_frame in (df_train_80ms_labeled, df_train_100ms_labeled):
    labeled_frame["morp_pause_clip_no_pause"] = labeled_frame["morp_pause_clip_no_pause"].apply(delete_outlier)


## ポーズ長の正規化：全体、オーディオブックごと、地の文か否か、オーディオブックごと x 地の文か否か、話者ごと、文章作品ごと

In [None]:
# 文中のポーズの長さを抽出する関数
def extract_pause_lengths_in_sentence(sentence_elements):
    """Return the lengths of all within-sentence pauses as floats.

    The first and last elements (the sentence-boundary pauses) are ignored.
    From the remaining elements, every token starting with "[PAUSE", e.g.
    "[PAUSE 0.435]", contributes its numeric part; all other tokens are skipped.
    """
    inner_elements = sentence_elements[1:-1]
    return [
        float(token.split()[1][:-1])  # "[PAUSE 0.435]" -> "0.435]" -> 0.435
        for token in inner_elements
        if token.startswith("[PAUSE")
    ]

# Extract the within-sentence pause lengths from a sample sentence.
# The boundary pauses (first and last element) are skipped, so this prints
# [0.6785, 0.6785, 0.6171666666666666].
sample_sentence = [
    '[PAUSE 0.435]', 'これ', '[NO_PAUSE]', 'は', '[PAUSE 0.6785]', '、', '[PAUSE 0.6785]', '私', '[NO_PAUSE]', 'が', 
    '[NO_PAUSE]', '小さい', '[NO_PAUSE]', 'とき', '[NO_PAUSE]', 'に', '[PAUSE 0.6171666666666666]', '、', '[NO_PAUSE]', 
    '村', '[NO_PAUSE]', 'の', '[NO_PAUSE]', '茂平', '[NO_PAUSE]', 'と', '[NO_PAUSE]', 'いう', '[NO_PAUSE]', 'おじいさん', 
    '[NO_PAUSE]', 'から', '[NO_PAUSE]', 'きい', '[NO_PAUSE]', 'た', '[NO_PAUSE]', 'お話', '[NO_PAUSE]', 'です', 
    '[PAUSE 0.2939583333333333]'
]

pause_lengths = extract_pause_lengths_in_sentence(sample_sentence)
print(pause_lengths)

In [None]:
# 入力されたdf全体のポーズ長の平均と分散を計算する関数
def calc_mean_var_pause_length(df, pause_length_column="morp_pause_clip_no_pause"):
    """Compute mean and variance of all within-sentence pause lengths in ``df``.

    Args:
        df: DataFrame whose ``pause_length_column`` holds one token list per row.
        pause_length_column: Name of the column with the token lists.

    Returns:
        Tuple ``(mean, var)`` over the pooled pause lengths of all rows. The
        variance is numpy's default population variance. When the selection
        contains no within-sentence pause at all, ``(nan, nan)`` is returned
        (the same values numpy would produce, but without its
        "Mean of empty slice" RuntimeWarning).
    """
    # Iterate the column directly instead of indexing .values for every row.
    pause_lengths = np.array(
        [
            length
            for sentence_elements in df[pause_length_column]
            for length in extract_pause_lengths_in_sentence(sentence_elements)
        ],
        dtype=float,
    )
    if pause_lengths.size == 0:
        return np.nan, np.nan
    return pause_lengths.mean(), pause_lengths.var()

# 80ms: mean and variance of the within-sentence pause lengths of "audiobook_0" only
print("80ms")
print(calc_mean_var_pause_length(df_train_80ms_labeled[df_train_80ms_labeled["audiobook_name"] == "audiobook_0"]))

In [None]:
# Mean/variance of within-sentence pause lengths for several groupings,
# always reported for the 80ms data set first and the 100ms data set second.

def _mean_var_table(df, key, extra_mask=None):
    """Return one (key value, mean, var) row per distinct value of ``df[key]``.

    ``extra_mask`` is an optional boolean Series that is AND-ed with the
    per-value selection (used to restrict to narrative / non-narrative rows).
    """
    rows = []
    for value in df[key].unique():
        selector = df[key] == value
        if extra_mask is not None:
            selector = selector & extra_mask
        rows.append([value, *calc_mean_var_pause_length(df[selector])])
    return pd.DataFrame(rows, columns=[key, "mean", "var"])


# whole data set
print("80ms all")
print(calc_mean_var_pause_length(df_train_80ms_labeled))
print("100ms all")
print(calc_mean_var_pause_length(df_train_100ms_labeled))

# per audiobook
print("80ms audiobook")
df_80ms_audiobook_mean_var = _mean_var_table(df_train_80ms_labeled, "audiobook_name")
display(df_80ms_audiobook_mean_var.head())

print("100ms audiobook")
df_100ms_audiobook_mean_var = _mean_var_table(df_train_100ms_labeled, "audiobook_name")
display(df_100ms_audiobook_mean_var.head())

# narrative vs. non-narrative
print("80ms narrative")
print(calc_mean_var_pause_length(df_train_80ms_labeled[df_train_80ms_labeled["is_narrative"] == 1]))
print("80ms non-narrative")
print(calc_mean_var_pause_length(df_train_80ms_labeled[df_train_80ms_labeled["is_narrative"] == 0]))

print("100ms narrative")
print(calc_mean_var_pause_length(df_train_100ms_labeled[df_train_100ms_labeled["is_narrative"] == 1]))
print("100ms non-narrative")
print(calc_mean_var_pause_length(df_train_100ms_labeled[df_train_100ms_labeled["is_narrative"] == 0]))

# per audiobook, split into narrative / non-narrative
print("80ms audiobook narrative vs non-narrative")
df_80ms_audiobook_mean_var_narrative = _mean_var_table(
    df_train_80ms_labeled, "audiobook_name", df_train_80ms_labeled["is_narrative"] == 1
)
df_80ms_audiobook_mean_var_non_narrative = _mean_var_table(
    df_train_80ms_labeled, "audiobook_name", df_train_80ms_labeled["is_narrative"] == 0
)
display(df_80ms_audiobook_mean_var_narrative.head())
display(df_80ms_audiobook_mean_var_non_narrative.head())

print("100ms audiobook narrative vs non-narrative")
df_100ms_audiobook_mean_var_narrative = _mean_var_table(
    df_train_100ms_labeled, "audiobook_name", df_train_100ms_labeled["is_narrative"] == 1
)
df_100ms_audiobook_mean_var_non_narrative = _mean_var_table(
    df_train_100ms_labeled, "audiobook_name", df_train_100ms_labeled["is_narrative"] == 0
)
display(df_100ms_audiobook_mean_var_narrative.head())
display(df_100ms_audiobook_mean_var_non_narrative.head())

# per speaker
print("80ms speaker")
df_80ms_speaker_mean_var = _mean_var_table(df_train_80ms_labeled, "speaker")
display(df_80ms_speaker_mean_var.head())

print("100ms speaker")
df_100ms_speaker_mean_var = _mean_var_table(df_train_100ms_labeled, "speaker")
display(df_100ms_speaker_mean_var.head())

# per book
print("80ms book")
df_80ms_book_mean_var = _mean_var_table(df_train_80ms_labeled, "book")
display(df_80ms_book_mean_var.head())

print("100ms book")
df_100ms_book_mean_var = _mean_var_table(df_train_100ms_labeled, "book")
display(df_100ms_book_mean_var.head())


In [None]:
# 各ポーズ長について、与えられた平均と分散をもとに正規化する関数
def normalize_pause_length(pause_length: float, mean: float, var: float) -> float:
    """Z-normalize a single pause length with the given mean and variance.

    Returns ``(pause_length - mean) / sqrt(var)``.

    Raises:
        AssertionError: if ``var`` is not greater than 0, or if
            ``pause_length`` or ``mean`` is below 0.
    """
    assert var > 0, "var must be positive"
    assert pause_length >= 0, "pause_length must be positive"
    assert mean >= 0, "mean must be positive"
    deviation = pause_length - mean
    return deviation / np.sqrt(var)

# [PAUSE 0.4352]のポーズ長を正規化する関数
def normalize_pause_length_in_text(text: str, mean: float, var: float) -> str:
    """Z-normalize the number inside a "[PAUSE <seconds>]" token.

    Tokens that do not start with "[PAUSE" are returned unchanged. For pause
    tokens the normalized value is written back with four decimals,
    e.g. "[PAUSE -0.6500]".
    """
    if not text.startswith("[PAUSE"):
        return text
    raw_length = text.split()[1][:-1]  # "[PAUSE 0.4352]" -> "0.4352"
    z_score = normalize_pause_length(float(raw_length), mean, var)
    return f"[PAUSE {z_score:.4f}]"

# PAUSE入りのテキストリスト内の文中ポーズ長を正規化する関数
def normalize_pause_lengths_in_texts(texts: list, mean: float, var: float) -> list:
    """Z-normalize the within-sentence pauses of one token list.

    The first and last elements (sentence-boundary pauses) are copied
    unchanged; every element in between is passed through
    ``normalize_pause_length_in_text``.

    Args:
        texts: Alternating pause tokens and morphemes of one sentence.
        mean: Mean used for the normalization.
        var: Variance used for the normalization.

    Returns:
        A new list with the same length as ``texts``.
    """
    # With fewer than two elements there is nothing between the boundaries.
    # Returning a plain copy also keeps a one-element list from being
    # duplicated (its single element is both the first and the last one).
    if len(texts) < 2:
        return list(texts)
    inner = [normalize_pause_length_in_text(text, mean, var) for text in texts[1:-1]]
    return [texts[0], *inner, texts[-1]]

# Normalize the pause lengths of a sample sentence with a hand-picked mean/variance.
sample_sentence = [
    '[PAUSE 0.435]', 'これ', '[NO_PAUSE]', 'は', '[PAUSE 0.6785]', '、', '[PAUSE 0.6785]', '私', '[NO_PAUSE]', 'が', 
    '[NO_PAUSE]', '小さい', '[NO_PAUSE]', 'とき', '[NO_PAUSE]', 'に', '[PAUSE 0.6171666666666666]', '、', '[NO_PAUSE]', 
    '村', '[NO_PAUSE]', 'の', '[NO_PAUSE]', '茂平', '[NO_PAUSE]', 'と', '[NO_PAUSE]', 'いう', '[NO_PAUSE]', 'おじいさん', 
    '[NO_PAUSE]', 'から', '[NO_PAUSE]', 'きい', '[NO_PAUSE]', 'た', '[NO_PAUSE]', 'お話', '[NO_PAUSE]', 'です', 
    '[PAUSE 0.2939583333333333]'
]
sample_mean = 0.5
sample_var = 0.01

# Three levels: a single value, a single token, and a whole token list.
# In the list-level call the first and last tokens stay as they are.
print(normalize_pause_length(0.435, sample_mean, sample_var))
print(normalize_pause_length_in_text(sample_sentence[0], sample_mean, sample_var))
pprint(normalize_pause_lengths_in_texts(sample_sentence, sample_mean, sample_var))


In [None]:
# グループを指定して正規化する関数
def normalize_pause_length_with_group(
    df: pd.DataFrame,
    group: "str | list[str]",
    added_col: str,
    pause_length_column: str = "morp_pause_clip_no_pause",
) -> pd.DataFrame:
    """Z-normalize within-sentence pause lengths per group.

    Mean and variance are computed with ``calc_mean_var_pause_length`` over
    the whole frame (``group == ""``) or separately for every group, and each
    row's token list is normalized with the statistics of its own group.

    Args:
        df: Labeled training frame; it is not modified.
        group: ``""`` for global statistics, a column name, or a list of
            column names to group by.
        added_col: Name of the new column receiving the normalized token lists.
        pause_length_column: Column holding the token lists to normalize.

    Returns:
        A new DataFrame with ``added_col`` plus the statistics that were used,
        stored as ``mean_<suffix>`` / ``var_<suffix>``. The suffix comes from a
        fixed short-name table ("" -> "all", "audiobook_name" -> "audiobook",
        "is_narrative" -> "narrative", ...); columns not in the table use
        their own name, and several group columns are joined with "_".
        For a non-empty ``group`` the result comes from ``pd.merge`` and
        therefore has a fresh index.
    """
    df_normalized = df.copy()
    if isinstance(group, str) and group == "":
        mean, var = calc_mean_var_pause_length(df, pause_length_column)
        df_normalized["mean"] = mean
        df_normalized["var"] = var
    else:
        # groupby accepts both a single column name and a list of names.
        # NOTE(review): rows whose group key is NaN presumably get NaN
        # statistics here (groupby drops NaN keys by default) - confirm.
        df_mean_var = (
            df.groupby(group)
            .apply(lambda x: pd.Series([*calc_mean_var_pause_length(x, pause_length_column)], index=["mean", "var"]))
            .reset_index()
        )
        df_normalized = pd.merge(df_normalized, df_mean_var, on=group, how="left")

    df_normalized[added_col] = df_normalized.apply(
        lambda x: normalize_pause_lengths_in_texts(x[pause_length_column], x["mean"], x["var"]),
        axis=1,
    )

    # Keep the statistics, but rename them so that results of different
    # groupings can be told apart.
    rename_dict = {"": "all", "audiobook_name": "audiobook", "is_narrative": "narrative", "audiobook_name_is_narrative": "audiobook_narrative", "speaker": "speaker", "book": "book"}
    group_columns = [group] if isinstance(group, str) else list(group)
    # Fall back to the column name itself so an unmapped group does not fail
    # with a KeyError after all the work above has been done.
    rename_group = "_".join(rename_dict.get(g, g) for g in group_columns)
    df_normalized.rename(columns={"mean": f"mean_{rename_group}", "var": f"var_{rename_group}"}, inplace=True)
    return df_normalized

# 80ms: normalize per audiobook (the un-normalized frame is shown first for comparison)
display(df_train_80ms_labeled.head())
display(normalize_pause_length_with_group(df_train_80ms_labeled, "audiobook_name", "morp_pause_clip_no_pause_normalized").head())

# 80ms: normalize per (audiobook, narrative / non-narrative) combination
display(normalize_pause_length_with_group(df_train_80ms_labeled, "audiobook_name is_narrative".split(), "morp_pause_clip_no_pause_normalized").head())

# 80ms: normalize per speaker
display(normalize_pause_length_with_group(df_train_80ms_labeled, "speaker", "morp_pause_clip_no_pause_normalized").head())

# 80ms: normalize per book
display(normalize_pause_length_with_group(df_train_80ms_labeled, "book", "morp_pause_clip_no_pause_normalized").head())

# 80ms: normalize with the statistics of the whole data set; this result is kept
df_train_80ms_labeled_normalized = normalize_pause_length_with_group(df_train_80ms_labeled, "", "morp_pause_clip_no_pause_normalized")

## 平均と分散を可視化, それぞれの値と、正規化後の分布を可視化

In [None]:
# 80ms: histogram of the within-sentence pause lengths before (left) and after
# (right) normalization with the global statistics; the mean and mean +/- sqrt(var)
# of the plotted values are drawn as vertical lines.

plt.figure(figsize=(12, 5))
plt.suptitle("80ms pauses normalized by all", fontsize=20)

df_train_80ms_labeled_normalized = normalize_pause_length_with_group(df_train_80ms_labeled, "", "morp_pause_clip_no_pause_normalized")
panels = (
    (121, "before normalization", df_train_80ms_labeled, "morp_pause_clip_no_pause"),
    (122, "after normalization", df_train_80ms_labeled_normalized, "morp_pause_clip_no_pause_normalized"),
)
for position, panel_title, frame, column in panels:
    plt.subplot(position)
    plt.title(panel_title)
    plt.xlabel("pause length")
    plt.ylabel("frequency")
    plt.hist(frame[column].apply(extract_pause_lengths_in_sentence).explode().astype(float), bins=100)
    # mark mean and one standard deviation to either side
    mean, var = calc_mean_var_pause_length(frame, column)
    plt.axvline(mean, color="red", label=f"mean = {mean:.4f}")
    plt.axvline(mean + np.sqrt(var), color="green", label=f"mean + sqrt(var) = {mean + np.sqrt(var):.4f}")
    plt.axvline(mean - np.sqrt(var), color="green", label=f"mean - sqrt(var) = {mean - np.sqrt(var):.4f}")
    plt.legend()
plt.show()


In [None]:
# 100ms: histogram of the within-sentence pause lengths before (left) and after
# (right) normalization with the global statistics; the mean and mean +/- sqrt(var)
# of the plotted values are drawn as vertical lines.

plt.figure(figsize=(12, 5))
plt.suptitle("100ms pauses normalized by all", fontsize=20)

df_train_100ms_labeled_normalized = normalize_pause_length_with_group(df_train_100ms_labeled, "", "morp_pause_clip_no_pause_normalized")
panels = (
    (121, "before normalization", df_train_100ms_labeled, "morp_pause_clip_no_pause"),
    (122, "after normalization", df_train_100ms_labeled_normalized, "morp_pause_clip_no_pause_normalized"),
)
for position, panel_title, frame, column in panels:
    plt.subplot(position)
    plt.title(panel_title)
    plt.xlabel("pause length")
    plt.ylabel("frequency")
    plt.hist(frame[column].apply(extract_pause_lengths_in_sentence).explode().astype(float), bins=100)
    # mark mean and one standard deviation to either side
    mean, var = calc_mean_var_pause_length(frame, column)
    plt.axvline(mean, color="red", label=f"mean = {mean:.4f}")
    plt.axvline(mean + np.sqrt(var), color="green", label=f"mean + sqrt(var) = {mean + np.sqrt(var):.4f}")
    plt.axvline(mean - np.sqrt(var), color="green", label=f"mean - sqrt(var) = {mean - np.sqrt(var):.4f}")
    plt.legend()
plt.show()


In [None]:
# 100ms: overlay the pause-length histograms of narrative and non-narrative
# utterances, before (left) and after (right) normalization per is_narrative class.
# Only one figure is created: an extra plt.figure() before this one would be
# shown as an empty canvas. The title names the data set and comparison plotted here.
plt.figure(figsize=(12, 5))
plt.suptitle("100ms pauses is_narrative vs not is_narrative", fontsize=20)
plt.subplot(121)
plt.title("before normalization")
plt.xlabel("pause length")
plt.ylabel("frequency")
# pause lengths of narrative utterances
narrative_pause_lengths = df_train_100ms_labeled[df_train_100ms_labeled["is_narrative"] == 1]["morp_pause_clip_no_pause"].apply(extract_pause_lengths_in_sentence).explode().astype(float)
plt.hist(narrative_pause_lengths, bins=100, alpha=0.5, label='narrative')

# pause lengths of non-narrative utterances
non_narrative_pause_lengths = df_train_100ms_labeled[df_train_100ms_labeled["is_narrative"] == 0]["morp_pause_clip_no_pause"].apply(extract_pause_lengths_in_sentence).explode().astype(float)
plt.hist(non_narrative_pause_lengths, bins=100, alpha=0.5, label='non-narrative')
plt.legend()

plt.subplot(122)
plt.title("after normalization")
plt.xlabel("pause length")
plt.ylabel("frequency")

df_train_100ms_labeled_normalized = normalize_pause_length_with_group(df_train_100ms_labeled, "is_narrative", "morp_pause_clip_no_pause_normalized")
# normalized pause lengths of narrative utterances
narrative_pause_lengths_normalized = df_train_100ms_labeled_normalized[df_train_100ms_labeled_normalized["is_narrative"] == 1]["morp_pause_clip_no_pause_normalized"].apply(extract_pause_lengths_in_sentence).explode().astype(float)
plt.hist(narrative_pause_lengths_normalized, bins=100, alpha=0.5, label='narrative')

# normalized pause lengths of non-narrative utterances
non_narrative_pause_lengths_normalized = df_train_100ms_labeled_normalized[df_train_100ms_labeled_normalized["is_narrative"] == 0]["morp_pause_clip_no_pause_normalized"].apply(extract_pause_lengths_in_sentence).explode().astype(float)
plt.hist(non_narrative_pause_lengths_normalized, bins=100, alpha=0.5, label='non-narrative')
plt.legend()

plt.show()

In [None]:
# オーディオブックごとのx=平均、y=分散の散布図をプロットする関数
def plot_mean_var_scatter(df: pd.DataFrame, group: str, x_col: str, y_col: str):
    """Scatter-plot per-group pause statistics (one point per group).

    Mean and variance of the within-sentence pause lengths are computed for
    every value of ``group``; ``x_col`` / ``y_col`` select which of the two
    resulting columns ("mean", "var") go on which axis. A red y = x reference
    segment is drawn from 0 to 0.5 and both axes are limited to [0, 1].
    """
    def _stats(rows):
        # mean/var of the pooled pause lengths of one group
        return pd.Series([*calc_mean_var_pause_length(rows)], index=["mean", "var"])

    group_stats = df.groupby(group).apply(_stats).reset_index()

    plt.figure(figsize=(8, 8))
    plt.title(f"mean-var scatter plot by {group}")
    plt.xlabel("mean")
    plt.ylabel("var")
    plt.scatter(group_stats[x_col], group_stats[y_col])

    # y = x reference line
    diagonal = [0, 0.5]
    plt.plot(diagonal, diagonal, color="red")

    plt.xlim(0, 1)
    plt.ylim(0, 1)
    plt.show()

# 80ms: mean (x) vs. variance (y), one point per audiobook and one point per is_narrative class
plot_mean_var_scatter(df_train_80ms_labeled, "audiobook_name", "mean", "var")
plot_mean_var_scatter(df_train_80ms_labeled, "is_narrative", "mean", "var")

In [None]:
# 80ms: scatter of (mean, var) per group for every grouping, colour-coded.
# The red lines mark the mean and variance of the whole data set.

plt.figure(figsize=(12, 6))
plt.suptitle("80ms mean-var scatter plot groupby", fontsize=20)
plt.subplot(111)
plt.title("80ms groupby")
plt.xlabel("mean")
plt.ylabel("var")
plt.scatter(df_80ms_audiobook_mean_var["mean"], df_80ms_audiobook_mean_var["var"])
mean, var = calc_mean_var_pause_length(df_train_80ms_labeled)
plt.axvline(mean, color="red", label=f"all mean = {mean:.4f} var = {var:.4f}")
plt.axhline(var, color="red")

# (group key, legend label, colour) in plotting order
groupings = [
    ("audiobook_name", "groupby audiobook", "orange"),
    ("is_narrative", "groupby is_narrative", "green"),
    ("audiobook_name is_narrative".split(), "groupby (audiobook, is_narrative)", "blue"),
    ("speaker", "groupby speaker", "purple"),
    ("book", "groupby book", "brown"),
]
for group_key, legend_label, point_color in groupings:
    df_train_80ms_labeled_normalized = normalize_pause_length_with_group(df_train_80ms_labeled, group_key, "morp_pause_clip_no_pause_normalized")
    # NOTE: calc_mean_var_pause_length is called with its default column, so the
    # points show the statistics of the un-normalized pause lengths of each group.
    df_mean_var = df_train_80ms_labeled_normalized.groupby(group_key).apply(lambda x: pd.Series([*calc_mean_var_pause_length(x)], index=["mean", "var"])).reset_index()
    plt.scatter(df_mean_var["mean"], df_mean_var["var"], label=legend_label, color=point_color)

plt.legend()
plt.show()

In [None]:
# 100ms: scatter of (mean, var) per group for every grouping, colour-coded.
# The red lines mark the mean and variance of the whole data set.

plt.figure(figsize=(12, 6))
plt.suptitle("100ms mean-var scatter plot groupby", fontsize=20)
plt.subplot(111)
plt.title("100ms groupby")
plt.xlabel("mean")
plt.ylabel("var")
plt.scatter(df_100ms_audiobook_mean_var["mean"], df_100ms_audiobook_mean_var["var"])
mean, var = calc_mean_var_pause_length(df_train_100ms_labeled)
plt.axvline(mean, color="red", label=f"all mean = {mean:.4f} var = {var:.4f}")
plt.axhline(var, color="red")

# (group key, legend label, colour) in plotting order
groupings = [
    ("audiobook_name", "groupby audiobook", "orange"),
    ("is_narrative", "groupby is_narrative", "green"),
    ("audiobook_name is_narrative".split(), "groupby (audiobook, is_narrative)", "blue"),
    ("speaker", "groupby speaker", "purple"),
    ("book", "groupby book", "brown"),
]
for group_key, legend_label, point_color in groupings:
    df_train_100ms_labeled_normalized = normalize_pause_length_with_group(df_train_100ms_labeled, group_key, "morp_pause_clip_no_pause_normalized")
    # NOTE: calc_mean_var_pause_length is called with its default column, so the
    # points show the statistics of the un-normalized pause lengths of each group.
    df_mean_var = df_train_100ms_labeled_normalized.groupby(group_key).apply(lambda x: pd.Series([*calc_mean_var_pause_length(x)], index=["mean", "var"])).reset_index()
    plt.scatter(df_mean_var["mean"], df_mean_var["var"], label=legend_label, color=point_color)

plt.legend()
plt.show()

In [None]:
# 80ms: distribution of the normalized within-sentence pause lengths,
# one stacked panel per normalization unit, all on the same x range.

lims = (-2.5, 2.5)

plt.figure(figsize=(8, 12))

# (panel title, group passed to the normalizer, histogram label, colour), top to bottom
panels = [
    ("normalized by all", "", "all", "orange"),
    ("normalized by audiobook", "audiobook_name", "audiobook", "green"),
    ("normalized by narrative", "is_narrative", "is_narrative", "blue"),
    ("normalized by audiobook-narrative", "audiobook_name is_narrative".split(), "(audiobook, is_narrative)", "red"),
    ("normalized by speaker", "speaker", "speaker", "purple"),
    ("normalized by book", "book", "book", "brown"),
]
for row, (panel_title, group_key, hist_label, hist_color) in enumerate(panels, start=1):
    plt.subplot(6, 1, row)
    plt.title(panel_title, fontsize=24)
    df_train_80ms_labeled_normalized = normalize_pause_length_with_group(df_train_80ms_labeled, group_key, "morp_pause_clip_no_pause_normalized")
    pause_lengths = df_train_80ms_labeled_normalized["morp_pause_clip_no_pause_normalized"].apply(extract_pause_lengths_in_sentence).explode().astype(float)
    plt.hist(pause_lengths, bins=100, alpha=0.5, label=hist_label, color=hist_color)
    plt.xlim(*lims)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

# the x label is only put on the bottom panel
plt.xlabel("pause duration (s)", fontsize=20)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
# Plot histograms of the normalized pause lengths (100ms data), one panel per
# normalization grouping, all sharing the same x-range.

lims = (-2.5, 2.5)

# (subplot position, panel title, grouping handed to the normalizer, hist label, color)
panel_specs = [
    (611, "normalized by all", "", "all", "orange"),
    (612, "normalized by audiobook", "audiobook_name", "audiobook", "green"),
    (613, "normalized by narrative", "is_narrative", "is_narrative", "blue"),
    (614, "normalized by audiobook-narrative", ["audiobook_name", "is_narrative"], "(audiobook, is_narrative)", "red"),
    (615, "normalized by speaker", "speaker", "speaker", "purple"),
    (616, "normalized by book", "book", "book", "brown"),
]

plt.figure(figsize=(8, 12))
for panel_position, panel_title, panel_group, panel_label, panel_color in panel_specs:
    plt.subplot(panel_position)
    plt.title(panel_title, fontsize=24)
    # Re-normalize the labeled frame for this grouping, then flatten the
    # per-sentence pause values into one float Series for the histogram.
    df_train_100ms_labeled_normalized = normalize_pause_length_with_group(
        df_train_100ms_labeled, panel_group, "morp_pause_clip_no_pause_normalized"
    )
    pause_lengths = (
        df_train_100ms_labeled_normalized["morp_pause_clip_no_pause_normalized"]
        .apply(extract_pause_lengths_in_sentence)
        .explode()
        .astype(float)
    )
    plt.hist(pause_lengths, bins=100, alpha=0.5, label=panel_label, color=panel_color)
    plt.xlim(*lims)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

# The x-label is attached to the last (bottom) panel only.
plt.xlabel("pause duration (s) ", fontsize=24)

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

In [None]:
# Plot histograms of the normalized pause lengths (100ms data) for three
# groupings only (all / audiobook / speaker) and save the figure as SVG.
# The narrative, audiobook-narrative and book panels are intentionally left out here.

lims = (-2.5, 2.5)

# (subplot position, panel title, grouping handed to the normalizer, hist label, color)
panel_specs = [
    (311, "normalized by all", "", "all", "orange"),
    (312, "normalized by audiobook", "audiobook_name", "audiobook", "green"),
    (313, "normalized by speaker", "speaker", "speaker", "purple"),
]

plt.figure(figsize=(8, 6))
for panel_position, panel_title, panel_group, panel_label, panel_color in panel_specs:
    plt.subplot(panel_position)
    plt.title(panel_title, fontsize=24)
    df_train_100ms_labeled_normalized = normalize_pause_length_with_group(
        df_train_100ms_labeled, panel_group, "morp_pause_clip_no_pause_normalized"
    )
    pause_lengths = (
        df_train_100ms_labeled_normalized["morp_pause_clip_no_pause_normalized"]
        .apply(extract_pause_lengths_in_sentence)
        .explode()
        .astype(float)
    )
    plt.hist(pause_lengths, bins=100, alpha=0.5, label=panel_label, color=panel_color)
    plt.xlim(*lims)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

# The x-label is attached to the last (bottom) panel only.
plt.xlabel("pause duration (s) ", fontsize=24)

plt.tight_layout()
# Written to the current working directory instead of being shown inline.
plt.savefig("100ms_normalized_pause_distribution.svg")

In [None]:
# Plot histograms of the normalized pause lengths (100ms data) for every
# grouping, with legends and a wider x-range of [-6, 6].

# (subplot position, panel title, grouping handed to the normalizer, hist label, color)
panel_specs = [
    (611, "100ms normalized by all", "", "all", "orange"),
    (612, "100ms normalized by audiobook", "audiobook_name", "audiobook", "green"),
    (613, "100ms normalized by is_narrative", "is_narrative", "is_narrative", "blue"),
    (614, "100ms normalized by (audiobook, is_narrative)", ["audiobook_name", "is_narrative"], "(audiobook, is_narrative)", "red"),
    (615, "100ms normalized by speaker", "speaker", "speaker", "purple"),
    (616, "100ms normalized by book", "book", "book", "brown"),
]

plt.figure(figsize=(12, 12))
plt.suptitle("100ms pauses normalized", fontsize=20)
for panel_position, panel_title, panel_group, panel_label, panel_color in panel_specs:
    plt.subplot(panel_position)
    plt.title(panel_title)
    df_train_100ms_labeled_normalized = normalize_pause_length_with_group(
        df_train_100ms_labeled, panel_group, "morp_pause_clip_no_pause_normalized"
    )
    pause_lengths = (
        df_train_100ms_labeled_normalized["morp_pause_clip_no_pause_normalized"]
        .apply(extract_pause_lengths_in_sentence)
        .explode()
        .astype(float)
    )
    plt.hist(pause_lengths, bins=100, alpha=0.5, label=panel_label, color=panel_color)
    plt.xlim(-6, 6)
    plt.legend()

plt.show()

## BERT学習データに、正規化したポーズ長のデータを追加する

In [None]:
# Drop (audiobook, is_narrative) groups that contain too few rows.
def exclude_few_audiobook_narratives(df: pd.DataFrame, threshold: int = 10) -> pd.DataFrame:
    """Remove rows belonging to rare (audiobook_name, is_narrative) groups.

    A string key ``"<audiobook_name>_<is_narrative>"`` is stored in the
    ``audiobook_narrative`` column of the returned frame. Groups whose row
    count is strictly below ``threshold`` are removed.

    Args:
        df: Frame with ``audiobook_name`` and ``is_narrative`` columns.
            It is not modified.
        threshold: Minimum number of rows a group needs in order to be kept.

    Returns:
        A new, independent DataFrame containing only the rows of sufficiently
        large groups, with the ``audiobook_narrative`` key column added.
    """
    # Build the key on a copy so the caller's frame does not silently gain a column.
    keyed = df.assign(
        audiobook_narrative=df["audiobook_name"] + "_" + df["is_narrative"].astype(str)
    )
    group_sizes = keyed["audiobook_narrative"].value_counts()
    audiobook_narratives_to_exclude = group_sizes[group_sizes < threshold].index
    print(f"audiobook_narratives_to_exclude: {audiobook_narratives_to_exclude}")
    keep_mask = ~keyed["audiobook_narrative"].isin(audiobook_narratives_to_exclude)
    # Explicit copy: later column assignments on the result must not warn or
    # write through to an intermediate frame.
    return keyed[keep_mask].copy()

# Split rows into train / val / test and record the split in a new column.
def train_val_test_split(df: pd.DataFrame, test_size: float = 0.2, val_size: float = 0.25, random_state: int = 42) -> pd.DataFrame:
    """Assign every row to "train", "val" or "test" with stratified sampling.

    The split is done in two steps, both stratified on the combined
    ``"<audiobook_name>_<is_narrative>"`` key: first ``test_size`` of all rows
    is held out as test, then ``val_size`` of the *remaining* rows becomes
    val. With the defaults (0.2 and 0.25) this gives roughly 60/20/20.

    Args:
        df: Frame with ``audiobook_name`` and ``is_narrative`` columns.
        test_size: Fraction of all rows placed in the test split.
        val_size: Fraction of the non-test rows placed in the val split.
        random_state: Seed forwarded to both sklearn splits.

    Returns:
        The train, val and test rows concatenated in that order, with a
        ``train_val_test`` column holding the split name.

    Raises:
        AssertionError: If the index labels of the three splits overlap
            (for example when ``df`` has duplicate index labels).
    """
    from sklearn.model_selection import train_test_split

    def _strata(frame: pd.DataFrame) -> pd.Series:
        # The stratification key must be rebuilt for each frame being split.
        return frame["audiobook_name"] + "_" + frame["is_narrative"].astype(str)

    df_rest, df_test = train_test_split(
        df, test_size=test_size, random_state=random_state, stratify=_strata(df)
    )
    df_train, df_val = train_test_split(
        df_rest, test_size=val_size, random_state=random_state, stratify=_strata(df_rest)
    )

    # assign() returns new frames, so no chained-assignment warning is raised
    # on the subsets handed back by sklearn.
    df_train = df_train.assign(train_val_test="train")
    df_val = df_val.assign(train_val_test="val")
    df_test = df_test.assign(train_val_test="test")

    assert len(set(df_train.index) & set(df_val.index)) == 0, "train and val have common index"
    assert len(set(df_train.index) & set(df_test.index)) == 0, "train and test have common index"
    assert len(set(df_val.index) & set(df_test.index)) == 0, "val and test have common index"
    return pd.concat([df_train, df_val, df_test])

# Normalize pause lengths using statistics computed from the training rows only.
def normalize_pause_length_with_train(df_train: pd.DataFrame, df: pd.DataFrame, group: "str | list[str]", added_col: str) -> pd.DataFrame:
    """Add a normalized pause column to ``df`` using statistics from ``df_train``.

    Args:
        df_train: Rows used to compute the statistics (the training split).
        df: Rows to normalize; not modified.
        group: Normalization scheme. ``"none"`` uses the constants 0 and 1,
            ``"all"`` uses one (mean, var) pair computed over ``df_train``,
            and any other column name, or list of column names, computes one
            pair per group of ``df_train``.
        added_col: Name of the column that receives the normalized values.

    Returns:
        A copy of ``df`` with ``added_col`` plus the statistics that were
        used, stored as ``mean_<suffix>`` / ``var_<suffix>``.

    Raises:
        KeyError: If ``group`` (or one of its elements) is not a supported
            grouping name.
    """
    rename_dict = {"none": "none", "all": "all", "audiobook_name": "audiobook", "is_narrative": "narrative", "audiobook_name_is_narrative": "audiobook_narrative", "speaker": "speaker", "book": "book"}
    # Resolve the column suffix first so an unsupported grouping fails fast,
    # before any statistics are computed or rows are normalized.
    if isinstance(group, str):
        rename_group = rename_dict[group]
    else:
        rename_group = "_".join(rename_dict[g] for g in group)

    df_normalized = df.copy()
    if group == "none":
        # Constants 0 / 1 are passed as the statistics for this scheme.
        df_normalized["mean"] = 0
        df_normalized["var"] = 1
    elif group == "all":
        mean, var = calc_mean_var_pause_length(df_train)
        df_normalized["mean"] = mean
        df_normalized["var"] = var
    else:
        df_mean_var = df_train.groupby(group).apply(lambda x: pd.Series([*calc_mean_var_pause_length(x)], index=["mean", "var"])).reset_index()
        # Left join: groups that never occur in df_train end up with NaN statistics.
        # NOTE: pd.merge returns a fresh default integer index, so the index
        # of df is not carried over in this branch.
        df_normalized = pd.merge(df_normalized, df_mean_var, on=group, how="left")

    df_normalized[added_col] = df_normalized.apply(lambda x: normalize_pause_lengths_in_texts(x["morp_pause_clip_no_pause"], x["mean"], x["var"]), axis=1)
    # Rename the temporary columns so repeated calls with other groupings do not collide.
    df_normalized.rename(columns={"mean": f"mean_{rename_group}", "var": f"var_{rename_group}"}, inplace=True)
    return df_normalized

# Count the number of distinct groups present in each of train / val / test.
def count_groups_in_train_val_test(df: pd.DataFrame) -> pd.DataFrame:
    """Count distinct grouping values per split.

    Args:
        df: Frame with ``train_val_test``, ``audiobook_name``,
            ``is_narrative``, ``speaker`` and ``book`` columns. Not modified.

    Returns:
        A DataFrame indexed by the ``train_val_test`` value with one column
        per grouping (``audiobook_name``, ``is_narrative``,
        ``audiobook_name_is_narrative``, ``speaker``, ``book``), each holding
        the number of distinct values seen in that split.
    """
    counted_columns = ["audiobook_name", "is_narrative", "audiobook_name_is_narrative", "speaker", "book"]
    # Only the needed columns are copied; the combined key is added on that copy.
    keyed = df[["train_val_test", "audiobook_name", "is_narrative", "speaker", "book"]].assign(
        audiobook_name_is_narrative=df["audiobook_name"] + "_" + df["is_narrative"].astype(str)
    )
    # A single grouped nunique replaces five separate groupby/apply passes.
    return keyed.groupby("train_val_test")[counted_columns].nunique()

# Remove rows whose text (morpheme/pause list) is empty.
def drop_zero_length_texts(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows whose ``morp_pause_clip_no_pause`` entry has length 0.

    The number of dropped rows is printed.

    Args:
        df: Frame with a ``morp_pause_clip_no_pause`` column of sized objects.
            It is not modified.

    Returns:
        A new DataFrame containing only the rows with a non-empty entry.
    """
    # Keep the lengths in a local Series instead of a temporary column, so the
    # caller's frame is left untouched.
    text_lengths = df["morp_pause_clip_no_pause"].apply(len)
    print(f"length 0 texts: {int((text_lengths == 0).sum())}")
    return df[text_lengths > 0].copy()


In [None]:
output_80ms_path = exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon_80ms_normalized.pkl"
output_100ms_path = exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon_100ms_normalized.pkl"

# Remove rows whose text has length 0.
df_train_80ms_labeled = drop_zero_length_texts(df_train_80ms_labeled)
df_train_100ms_labeled = drop_zero_length_texts(df_train_100ms_labeled)

# Remove (audiobook, is_narrative) groups with too few rows.
df_train_80ms_labeled = exclude_few_audiobook_narratives(df_train_80ms_labeled)
df_train_100ms_labeled = exclude_few_audiobook_narratives(df_train_100ms_labeled)

# Add the train / val / test column.
df_train_80ms_labeled = train_val_test_split(df_train_80ms_labeled)
df_train_100ms_labeled = train_val_test_split(df_train_100ms_labeled)

# Report how many distinct groups each split contains.
print("group counts")
print("80ms")
display(count_groups_in_train_val_test(df_train_80ms_labeled))
print("100ms")
display(count_groups_in_train_val_test(df_train_100ms_labeled))

# (grouping handed to the normalizer, suffix of the resulting column name)
normalization_schemes = [
    ("none", "none"),
    ("all", "all"),
    ("audiobook_name", "audiobook"),
    ("is_narrative", "narrative"),
    (["audiobook_name", "is_narrative"], "audiobook_narrative"),
    ("speaker", "speaker"),
    ("book", "book"),
]


def _normalize_with_all_schemes(df_labeled, threshold_tag):
    """Apply every normalization scheme in turn, accumulating the new columns.

    Statistics always come from the training rows only. Normalizing with
    statistics from all rows was disabled on 2024-02-08 because it leaks
    information from the other splits.
    """
    df_accumulated = df_labeled
    for scheme_group, scheme_suffix in normalization_schemes:
        df_accumulated = normalize_pause_length_with_train(
            df_labeled[df_labeled["train_val_test"] == "train"],
            df_accumulated,
            scheme_group,
            f"morp_pause_clip_no_pause_normalized_{threshold_tag}_{scheme_suffix}",
        )
    return df_accumulated


# 80ms: normalize and save.
df_train_80ms_labeled_normalized = _normalize_with_all_schemes(df_train_80ms_labeled, "80ms")
display(df_train_80ms_labeled_normalized.head())
df_train_80ms_labeled_normalized.to_pickle(output_80ms_path)
print(f"save: {output_80ms_path}")

# 100ms: normalize and save.
df_train_100ms_labeled_normalized = _normalize_with_all_schemes(df_train_100ms_labeled, "100ms")
display(df_train_100ms_labeled_normalized.head())
df_train_100ms_labeled_normalized.to_pickle(output_100ms_path)
print(f"save: {output_100ms_path}")


In [None]:
# Check the number of rows in each split.
print("row counts")
for threshold_tag, normalized_frame in (
    ("80ms", df_train_80ms_labeled_normalized),
    ("100ms", df_train_100ms_labeled_normalized),
):
    print(threshold_tag)
    display(normalized_frame["train_val_test"].value_counts())

In [None]:
# Show the column names in sorted order.
sorted_column_names = df_train_80ms_labeled_normalized.columns.sort_values()
display(sorted_column_names)

In [None]:
# Inspect the morpheme/pause lists: show the "none" scheme (statistics 0 and 1)
# next to the raw column so the two can be compared by eye.
for inspected_column in (
    "morp_pause_clip_no_pause_normalized_80ms_none",
    "morp_pause_clip_no_pause",
):
    display(df_train_80ms_labeled_normalized[inspected_column].head())

In [None]:
# Look at rows whose normalized morpheme/pause list has length 0.
normalized_column_prefix = "morp_pause_clip_no_pause_normalized_80ms_"


def _zero_length_entries(scheme_suffix):
    """Return the entries of one normalized column whose length is 0."""
    column_name = normalized_column_prefix + scheme_suffix
    is_zero_length = df_train_80ms_labeled_normalized[column_name].apply(len) == 0
    return df_train_80ms_labeled_normalized[is_zero_length][column_name]


display(
    df_train_80ms_labeled_normalized[
        df_train_80ms_labeled_normalized[normalized_column_prefix + "none"].apply(len) == 0
    ]
)
# Check that the zero-length entries of "none" equal those of every other scheme.
for other_suffix in ("all", "audiobook", "narrative", "audiobook_narrative", "speaker", "book"):
    display(_zero_length_entries("none").equals(_zero_length_entries(other_suffix)))
