# Train.pyで現在行っているデータの前処理を、先にデータとして作成しておく

z_normalize_pause_length.ipynbのノートブックの続き

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from pathlib import Path
import pickle
import yaml
from pprint import pprint
import sys
from sklearn.model_selection import train_test_split

# Make the project root importable so the project-level `config` module resolves.
sys.path.append("/home/takeshun256/PausePrediction")

# Path constants provided by the project.
from config import DATA_DIR, DATA_TAKESHUN256_DIR, SRC_DIR, DATA_IN_ROOT_DIR

# Experiment directory and audiobook metadata file.
corpus_name = "jmac"
exp_name = "03_VAD_Adjusted"
exp_dir = Path(DATA_TAKESHUN256_DIR, corpus_name, exp_name)
audiobook_yaml_path = Path(DATA_IN_ROOT_DIR, corpus_name, "text_audio_dict_new.yaml")

# Stop immediately when one of the inputs is missing.
assert exp_dir.exists()
assert audiobook_yaml_path.exists()

# Audiobook metadata.
with open(audiobook_yaml_path, "rb") as yaml_file:
    audiobook_dict = yaml.safe_load(yaml_file)

# Grid of configurations handled by this notebook.
pause_time_threshold_mss = [80, 100]
preprocess_types = ["none", "all", "audiobook", "narrative", "audiobook_narrative", "speaker", "book"]
num_labels = [1, 2]

# Normalized tables, one per pause threshold (80 ms and 100 ms).
df_train_80ms = pd.read_pickle(exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon_80ms_normalized.pkl")
df_train_100ms = pd.read_pickle(exp_dir / "bert_traindata_pause_position_with_length_wo_sokuon_100ms_normalized.pkl")

# Root of the generated datasets; it has to exist already.
output_dir = exp_dir / "data_bert"
assert output_dir.exists()

# One sub-directory per (threshold, preprocess type) pair.
for pause_time_threshold_ms in pause_time_threshold_mss:
    threshold_dir = output_dir / f"{pause_time_threshold_ms}ms"
    for preprocess_type in preprocess_types:
        output_dir_each = threshold_dir / preprocess_type
        output_dir_each.mkdir(parents=True, exist_ok=True)
        print(output_dir_each)

# Quick overview of what was loaded.
print("audio book data", len(audiobook_dict), sep="\n")
first_audiobook_key = list(audiobook_dict)[0]
pprint(audiobook_dict[first_audiobook_key])
print("80ms", df_train_80ms.shape, sep="\n")
print("100ms", df_train_100ms.shape, sep="\n")
display(df_train_80ms.head())

In [None]:
# List the columns of the 80 ms table.
df_train_80ms.columns


In [None]:
# Row counts of the 80 ms and 100 ms tables, side by side.
len(df_train_80ms), len(df_train_100ms)

In [None]:
# Count the rows that contain at least one NaN.
# `any(axis=1)` flags a row once no matter how many of its cells are NaN,
# so the sum is a number of rows rather than a number of NaN cells.
print("80ms")
print(df_train_80ms.isnull().any(axis=1).sum())
print("100ms")
print(df_train_100ms.isnull().any(axis=1).sum())


In [None]:
# Rows that exist in the 80 ms table but not in the 100 ms table,
# compared on the (audiobook_name, chapter_name) pair.
key_columns = ["audiobook_name", "chapter_name"]
keys_80ms = df_train_80ms.set_index(key_columns).index
keys_100ms = df_train_100ms.set_index(key_columns).index
df_unique_to_80ms = df_train_80ms[~keys_80ms.isin(keys_100ms)]
display(df_unique_to_80ms)

In [None]:
# Summarise the total number of morphemes and of detected pauses in the train and test data,
# for 80 ms and 100 ms, in a single figure.

# NOTE(review): this list is not referenced anywhere else in the visible part of the notebook — confirm before removing.
data_type = [["train", "valid"], "test"]

def count_pause(x):
    """Return how many items of ``x`` contain the substring ``"[PAUSE"``."""
    return sum(1 for item in x if "[PAUSE" in item)
def count_not_pause(x):
    """Return how many items of ``x`` contain the substring ``"[NO_PAUSE"``."""
    return sum(1 for item in x if "[NO_PAUSE" in item)
def count_mora(x):
    """Return how many items of ``x`` contain neither ``"[NO_PAUSE"`` nor ``"[PAUSE"``."""
    tag_markers = ("[NO_PAUSE", "[PAUSE")
    return sum(1 for item in x if not any(marker in item for marker in tag_markers))

# Per-row statistics of the tagged sequences in `morp_pause_clip_no_pause`:
#   total_mora     - items that carry neither tag (counted by count_mora)
#   total_pause    - items containing "[PAUSE"
#   total_no_pause - items containing "[NO_PAUSE"
count_columns = ["total_mora", "total_pause", "total_no_pause"]
for frame in (df_train_80ms, df_train_100ms):
    tagged_sequences = frame["morp_pause_clip_no_pause"]
    frame["total_mora"] = tagged_sequences.apply(count_mora)
    frame["total_pause"] = tagged_sequences.apply(count_pause)
    frame["total_no_pause"] = tagged_sequences.apply(count_not_pause)

# Sanity check: the three counters must add up to the length of every sequence,
# i.e. each item falls into exactly one of the three categories.
sumlen_80ms = (df_train_80ms["total_mora"] + df_train_80ms["total_pause"] + df_train_80ms["total_no_pause"]).values
sumlen_100ms = (df_train_100ms["total_mora"] + df_train_100ms["total_pause"] + df_train_100ms["total_no_pause"]).values

# Validation of the 80 ms table.
expected_len_80ms = df_train_80ms["morp_pause_clip_no_pause"].apply(len).values
actual_len_80ms = sumlen_80ms
assert (actual_len_80ms == expected_len_80ms).all(), f"80ms: {actual_len_80ms} != {expected_len_80ms}"

# Validation of the 100 ms table.
expected_len_100ms = df_train_100ms["morp_pause_clip_no_pause"].apply(len).values
actual_len_100ms = sumlen_100ms
assert (actual_len_100ms == expected_len_100ms).all(), f"100ms: {actual_len_100ms} != {expected_len_100ms}"

# Collapse train/val/test into train/test: validation rows are counted as training data.
df_train_80ms["data_type"] = df_train_80ms["train_val_test"].replace({"val": "train"})
df_train_100ms["data_type"] = df_train_100ms["train_val_test"].replace({"val": "train"})

print(df_train_80ms["data_type"].value_counts())

# Totals of the three counters per data type, for each threshold.
df_train_80ms_total = df_train_80ms.groupby("data_type")[count_columns].sum().reset_index()
df_train_100ms_total = df_train_100ms.groupby("data_type")[count_columns].sum().reset_index()

# One bar chart per threshold showing total_mora and total_pause only.
fig, axes = plt.subplots(1, 2, figsize=(20, 5))
for ax, threshold_label, totals in zip(axes, ("80ms", "100ms"), (df_train_80ms_total, df_train_100ms_total)):
    totals.plot(kind="bar", x="data_type", y=["total_mora", "total_pause"], ax=ax)
    # The categories (data_type) are on the x axis and the counts on the y axis.
    ax.set_xlabel("Data type")
    ax.set_ylabel("Number of mora/pause")
    ax.set_title(f"{threshold_label}: Number of mora and pause in train and test data")

plt.tight_layout()
plt.show()


In [None]:
# Show the aggregated totals for the 80 ms table, then for the 100 ms table.
display(df_train_80ms_total)
display(df_train_100ms_total)

In [None]:
# Spot-check the three counters on the row labelled 0 of the 80 ms table.
aa = df_train_80ms["morp_pause_clip_no_pause"].loc[0]
print(aa)
print(count_mora(aa))
print(count_pause(aa))
print(count_not_pause(aa))


In [None]:
# Number of distinct values per grouping column, for both thresholds.
grouping_columns = ["audiobook_name", "is_narrative", "speaker", "book"]

print("80ms")
for column in grouping_columns:
    print(f"{column} のユニーク数:", df_train_80ms[column].nunique())

print("100ms")
for column in grouping_columns:
    print(df_train_100ms[column].nunique())

In [None]:
# Largest and smallest group size for every grouping column
# (a category with fewer than 3 rows would end up unseen in some split).
range_labels = {
    "audiobook_name": "audiobook_count",
    "is_narrative": "narrative_count",
    "speaker": "speaker_count",
    "book": "book_count",
}
for column, label in range_labels.items():
    group_sizes = df_train_80ms[column].value_counts()
    print(f"{label}: {group_sizes.max()}〜{group_sizes.min()}")

# Combined audiobook x is_narrative key.
df_train_80ms["audiobook_is_narrative"] = df_train_80ms["audiobook_name"] + "_" + df_train_80ms["is_narrative"].astype(str)
combined_sizes = df_train_80ms["audiobook_is_narrative"].value_counts()
print(df_train_80ms["audiobook_is_narrative"].nunique())
print(f"audiobook_is_narrative_count: {combined_sizes.max()}〜{combined_sizes.min()}")


fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# One panel per grouping column: rows per category, with categories shown as integer codes.
panels = [
    ("speaker", axes[0, 0]),
    ("audiobook_name", axes[0, 1]),
    ("is_narrative", axes[1, 0]),
    ("book", axes[1, 1]),
]
for column, ax in panels:
    category_codes = df_train_80ms[column].astype("category").cat.codes
    category_codes.value_counts().sort_index().plot(kind="bar", ax=ax, title=column)

plt.tight_layout()
plt.show()


## データの前処理

In [None]:
def preprocess_data(df, pause_time_threshold_ms, preprocess_type, num_labels, id_dict_dir="id_dict/in_sentence"):
    """
    Build the token/label table for one (threshold, preprocess type, label kind) setting.

    Every cell of the selected ``morp_pause_clip_no_pause_normalized_*`` column is
    treated as a sequence that starts with a pause tag and then alternates token,
    tag, token, tag, ...  The leading tag is dropped and the trailing tag (the pause
    between sentences) is replaced by ``"[NO_PAUSE]"``.  Rows that yield no tokens
    (empty sequences, or sequences holding only the leading tag) are removed.

    The input frame is left untouched; a new frame is returned.  As a side effect
    one ``id_<key>.csv`` mapping file per embedding key is written to ``id_dict_dir``.

    :param df: table holding the normalized sequences plus the metadata columns
        ``audiobook_name``, ``chapter_name``, ``is_narrative``, ``speaker``, ``book``,
        ``mean_<preprocess_type>``, ``var_<preprocess_type>`` and ``train_val_test``
    :param pause_time_threshold_ms: pause threshold in milliseconds (part of the column name)
    :param preprocess_type: one of "none", "all", "narrative", "audiobook",
        "audiobook_narrative", "speaker", "book"
    :param num_labels: 1 for regression (pause value as float, 0 for no pause),
        2 for binary classification (1 for pause, 0 for no pause)
    :param id_dict_dir: directory receiving the id mapping CSV files; created if missing
    :return: new data frame with ``texts``, ``labels``, ``labels_str``, the ``id_*``
        columns and the mean/variance columns renamed to ``means`` / ``vars``
    :raises KeyError: if ``preprocess_type`` is unknown or a required column is missing
    :raises ValueError: if ``num_labels`` is neither 1 nor 2
    """
    # Column holding the tagged sequences for the requested normalization.
    known_types = ("none", "all", "narrative", "audiobook", "audiobook_narrative", "speaker", "book")
    column_dict = {
        name: f"morp_pause_clip_no_pause_normalized_{pause_time_threshold_ms}ms_{name}"
        for name in known_types
    }
    column_name = column_dict[preprocess_type]
    print(f"Using {column_name} ...")

    # Reject unsupported label kinds before doing any work.
    if num_labels not in (1, 2):
        raise ValueError("num_labels must be 1, 2")

    # Split every sequence into tokens and their following pause tags.
    texts = []
    labels_str = []
    for tagged in df[column_name].values:
        # Drop the leading tag; list() copies, so the stored sequence is never modified
        # (slicing a NumPy array would otherwise give a view onto the original data).
        tokens = list(tagged[1:])
        if not tokens:
            texts.append([])
            labels_str.append([])
            continue
        # The last tag is the pause between sentences: always treated as no pause.
        tokens[-1] = "[NO_PAUSE]"
        row_texts = tokens[::2]
        row_labels = tokens[1::2]
        assert len(row_texts) == len(row_labels), f"{len(row_texts)}, {row_texts} != {len(row_labels)}, {row_labels}"
        texts.append(row_texts)
        labels_str.append(row_labels)

    def to_numeric(tag):
        # "[NO_PAUSE]" -> 0; "[PAUSE 0.5]" -> 1 (binary) or 0.5 (regression).
        # Normalized values may be negative, e.g. "[PAUSE -0.5]".
        if not tag.startswith("[PAUSE"):
            return 0
        if num_labels == 2:
            return 1
        return float(tag.split()[1][:-1])

    # Keep only the rows that produced at least one token.
    keep_row = [len(row_texts) > 0 for row_texts in texts]
    print(f"削除される行数: {len(keep_row) - sum(keep_row)}")

    mean_col = f"mean_{preprocess_type}"
    var_col = f"var_{preprocess_type}"
    base_cols = ["audiobook_name", "chapter_name", "is_narrative", "speaker", "book", mean_col, var_col, "train_val_test"]
    # Selecting into a fresh frame keeps the caller's frame free of helper columns.
    out = df.loc[keep_row, base_cols].reset_index(drop=True)
    out["texts"] = [row for row, keep in zip(texts, keep_row) if keep]
    out["labels_str"] = [row for row, keep in zip(labels_str, keep_row) if keep]
    out["labels"] = out["labels_str"].apply(lambda tags: [to_numeric(tag) for tag in tags])

    # Integer ids for the embedding layers, together with the id -> value tables.
    # The combined audiobook x is_narrative key is only needed to derive its id.
    out["audiobook_narrative"] = out["audiobook_name"] + "_" + out["is_narrative"].astype(str)
    id_sources = (
        ("audiobook", "audiobook_name"),
        ("speaker", "speaker"),
        ("book", "book"),
        ("narrative", "is_narrative"),
        ("audiobook_narrative", "audiobook_narrative"),
    )
    id_dict = {}
    for key, source_col in id_sources:
        as_category = out[source_col].astype("category")
        out[f"id_{key}"] = as_category.cat.codes
        id_dict[key] = dict(enumerate(as_category.cat.categories))
    out["id_none"] = 0
    out["id_all"] = 0

    # Save the id mapping tables as CSV, creating the target directory when needed.
    id_dict_dir = Path(id_dict_dir)
    id_dict_dir.mkdir(parents=True, exist_ok=True)
    for key, mapping in id_dict.items():
        pd.DataFrame(list(mapping.items()), columns=["id", key]).to_csv(id_dict_dir / f"id_{key}.csv", index=False)

    # "none" is stored like the other types; check that it really is the identity
    # normalization (mean 0, variance 1, no missing values).
    if preprocess_type == "none":
        assert out["mean_none"].isnull().sum() == 0
        assert out["var_none"].isnull().sum() == 0
        assert out["mean_none"].mean() == 0
        assert out["var_none"].mean() == 1

    cols = ["audiobook_name", "chapter_name", "is_narrative", "speaker", "book", "texts", "labels", "labels_str", "id_audiobook", "id_speaker", "id_book", "id_narrative", "id_audiobook_narrative", "id_none", "id_all", mean_col, var_col, "train_val_test"]
    out = out[cols]
    out = out.rename(columns={mean_col: "means", var_col: "vars"})

    return out

In [None]:
# Smoke test: 80 ms table, "none" normalization, regression labels (num_labels=1).
# NOTE(review): the variable name says "all" although preprocess_type is "none" — consider renaming.
df_train_80ms_all_1label = preprocess_data(df_train_80ms, 80, "none", 1)
display(df_train_80ms_all_1label.head())

In [None]:
# Smoke test: 80 ms table, "narrative" normalization, binary labels (num_labels=2).
# NOTE(review): the variable name says "all_1label" although this call uses "narrative" with 2 labels — consider renaming.
df_train_80ms_all_1label = preprocess_data(df_train_80ms, 80, "narrative", 2)
display(df_train_80ms_all_1label.head())

In [None]:
# Source table for every supported pause threshold.
frames_by_threshold = {80: df_train_80ms, 100: df_train_100ms}

# Generate and store one dataset per (threshold, preprocess type, label kind).
for pause_time_threshold_ms in pause_time_threshold_mss:
    for preprocess_type in preprocess_types:
        for num_label in num_labels:
            print(f"pause_time_threshold_ms: {pause_time_threshold_ms}, preprocess_type: {preprocess_type}, num_label: {num_label}")
            if pause_time_threshold_ms not in frames_by_threshold:
                raise ValueError("pause_time_threshold_ms must be 80 or 100")
            df_train = frames_by_threshold[pause_time_threshold_ms]

            df_train_preprocessed = preprocess_data(df_train, pause_time_threshold_ms, preprocess_type, num_label)

            # Complete dataset first.
            target_dir = output_dir / f"{pause_time_threshold_ms}ms" / preprocess_type
            df_train_preprocessed.to_pickle(target_dir / f"bert_traindata_{num_label}label.pkl")

            # Then the three splits, taken from the precomputed train_val_test column
            # rather than from a random split.
            split_of_row = df_train_preprocessed["train_val_test"]
            train_df = df_train_preprocessed[split_of_row == "train"]
            val_df = df_train_preprocessed[split_of_row == "val"]
            test_df = df_train_preprocessed[split_of_row == "test"]
            train_df.to_pickle(target_dir / f"bert_traindata_{num_label}label_train.pkl")
            val_df.to_pickle(target_dir / f"bert_traindata_{num_label}label_val.pkl")
            test_df.to_pickle(target_dir / f"bert_traindata_{num_label}label_test.pkl")

            print("done")