# BERTモデルの学習のために、最適なmax_lengthを計算する

## 文中ポーズのテキストのサブトークンの長さを計算する

データ作成：/home/takeshun256/PausePrediction/research/preprocess_jmac/学習データの作成.ipynb

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from pathlib import Path
import pickle
import yaml
from pprint import pprint
import sys

sys.path.append("/home/takeshun256/PausePrediction")

# import own library
from config import DATA_DIR, DATA_TAKESHUN256_DIR, SRC_DIR, DATA_IN_ROOT_DIR

# Locations of the experiment directory and the audiobook metadata file.
corpus_name = "jmac"
exp_name = "03_VAD_Adjusted"
exp_dir = Path(DATA_TAKESHUN256_DIR).joinpath(corpus_name, exp_name)
audiobook_yaml_path = Path(DATA_IN_ROOT_DIR).joinpath(
    corpus_name, "text_audio_dict_new.yaml"
)

# Stop right away if either input location is missing.
assert exp_dir.exists()
assert audiobook_yaml_path.exists()

# Audiobook metadata, parsed from the YAML file (opened in binary mode).
with audiobook_yaml_path.open("rb") as yaml_file:
    audiobook_dict = yaml.safe_load(yaml_file)

# Enumerations of the dataset variants
# (presumably pause thresholds in ms, preprocessing types, label counts).
pause_time_threshold_mss = [80, 100]
preprocess_types = ["none", "all", "audiobook", "narrative", "audiobook_narrative"]
num_labels = [1, 2]

# Directory holding the BERT training data; it must already exist.
output_dir = exp_dir / "data_bert"
assert output_dir.exists()


# Quick look at the audiobook metadata: number of entries and the first entry.
print("audio book data")
print(len(audiobook_dict))
first_key = list(audiobook_dict)[0]
pprint(audiobook_dict[first_key])

In [None]:
# Load the within-sentence training data for the 80 ms / "none" / 1-label
# variant (only the 80 ms data is read here) and preview it.
bert_traindata_path = output_dir / "80ms" / "none" / "bert_traindata_1label.pkl"
df = pd.read_pickle(bert_traindata_path)
print(df.shape)
display(df.head())

In [None]:
# Count the sub-tokens of every sample.
# Length = [CLS] + tokenized text + [SEP]; inputs shorter than max_length
# would be padded.
from transformers import BertJapaneseTokenizer

tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

# Each entry of df["texts"] is iterated and every element is tokenized on its
# own (assumes an entry is a sequence of text segments -- TODO confirm).
text_length_list = [
    sum(len(tokenizer.tokenize(segment)) for segment in segments) + 2  # +2: [CLS], [SEP]
    for segments in tqdm(df["texts"])
]

# Distribution of the sequence lengths.
plt.hist(text_length_list, bins=100)

# 95th percentile of the sequence lengths.
percentile_95 = np.percentile(text_length_list, 95)
print(f"95%: {percentile_95}")

# Longest sequence.
longest = max(text_length_list)
print(f"max: {longest}")


## 文間ポーズのテキストのサブトークンの長さを計算する

In [None]:
# Load the between-sentence training data for the 80 ms / "none" / 1-label
# variant (only the 80 ms data is read here) and preview it.
between_sentences_path = output_dir / "80ms" / "none" / "bert_traindata_BetweenSentences_1label.pkl"
df = pd.read_pickle(between_sentences_path)
print(df.shape)
display(df.head())

In [None]:
# Count the sub-tokens of every sentence pair.
# Length = tokens of both sentences + three special tokens ([CLS], [SEP], [SEP]);
# inputs shorter than max_length would be padded.
from transformers import BertJapaneseTokenizer

tokenizer = BertJapaneseTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")

text_length_list = []
for texts in tqdm(df["texts"]):
    # Every sample must contain exactly one "[SEP]" separating the two sentences.
    sep_count = texts.count("[SEP]")
    if sep_count == 0:
        raise ValueError(f"not found [SEP] in {texts}")
    if sep_count > 1:
        # Without this check the two-way unpacking below would fail with an
        # uninformative "too many values to unpack" error.
        raise ValueError(f"expected exactly one [SEP] but found {sep_count} in {texts}")
    text1, text2 = texts.split("[SEP]")
    text_length = len(tokenizer.tokenize(text1)) + len(tokenizer.tokenize(text2))
    text_length += 3  # [CLS], [SEP], [SEP]
    text_length_list.append(text_length)

# Distribution of the sequence lengths.
plt.hist(text_length_list, bins=100)

# 95th percentile of the sequence lengths.
print(f"95%: {np.percentile(text_length_list, 95)}")

# Longest sequence.
print(f"max: {max(text_length_list)}")