In [40]:
import json, os, re
import pandas as pd

In [41]:
def remove_unwanted_linebreaks(text, keep_after_list=None):
    """Remove line breaks that fall in the middle of a sentence.

    The text is split into paragraphs on blank lines (two consecutive line
    breaks); those paragraph separators are always preserved. Inside a
    paragraph a single line break is deleted when either:

    * the character before it is not sentence-ending punctuation
      (one of 。！？.!?), or
    * it sits between "。" and a full-width opening parenthesis "（".

    In both cases the line break is kept when the following line starts with
    one of the strings in ``keep_after_list`` or with a date written as
    one or two digits + 月 + one or two digits + 日 (e.g. 1月1日, 12月25日).

    Only line breaks are ever removed; surrounding characters (including the
    "。" in front of "（") are left intact.

    Args:
        text: Text to clean; line endings are expected to be LF. Values that
            are not ``str`` (e.g. None or NaN) are returned unchanged.
        keep_after_list: Optional iterable of literal strings. A line break
            directly in front of any of them is kept. Empty strings are
            ignored.

    Returns:
        The cleaned text, or ``text`` itself when it is not a string.
    """
    # Missing values can reach this function when it is mapped over a pandas
    # column; pass them through instead of failing on .split().
    if not isinstance(text, str):
        return text

    if keep_after_list is None:
        keep_after_list = []

    # Literal strings that must stay at the start of their own line. Empty
    # strings are skipped: an empty alternative matches at every position and
    # would silently disable every removal.
    keep_alternatives = [re.escape(s) for s in keep_after_list if s]
    # Dates such as 1月1日 or 12月25日 also stay at the start of their line.
    keep_alternatives.append(r'\d{1,2}月\d{1,2}日')
    combined_keep = "(?:" + "|".join(keep_alternatives) + ")"

    # Built once; the pattern does not depend on the paragraph.
    linebreak_pattern = re.compile(
        r'(?:'
        # General rule: break not preceded by sentence-ending punctuation.
        r'(?<![。！？.!?])\n(?!\n)'
        r'|'
        # Exception: break between "。" and "（". A lookbehind is used so
        # that only the line break is removed and the "。" survives.
        r'(?<=。)\n(?=（)'
        r')'
        # Never remove a break in front of a keep string or a date.
        r'(?!' + combined_keep + r')'
    )

    cleaned_paragraphs = [
        linebreak_pattern.sub('', paragraph) for paragraph in text.split("\n\n")
    ]
    return "\n\n".join(cleaned_paragraphs)



In [None]:
# Load the report records. The handle is only needed for parsing, so the
# file is closed before any transformation runs.
json_file_path = os.path.join("reports-data", "001325488.json")

with open(json_file_path, "r", encoding='utf-8') as f:
    data = json.load(f)

df = pd.DataFrame(data)

# Line breaks directly in front of these strings are kept.
keep_after_list = ["併用薬", "生活の場"]

# Normalize CRLF to LF, then strip mid-sentence line breaks. Non-string cells
# (e.g. a null in the JSON) are left untouched rather than being handed to
# the cleaner, which expects a string.
df['pre_existing_conditions'] = df["pre_existing_conditions"].map(
    lambda value: remove_unwanted_linebreaks(
        value.replace("\r\n", "\n"), keep_after_list=keep_after_list
    )
    if isinstance(value, str)
    else value
)

In [43]:
# Serialize before opening the file for writing: if serialization raises,
# the existing file at json_file_path has not been truncated yet.
df_dict = [
    {
        # NaN is not valid JSON (json.dumps would emit the bare token NaN);
        # write missing float values as null instead. `value != value` is
        # true only for NaN.
        column: None if isinstance(value, float) and value != value else value
        for column, value in record.items()
    }
    for record in df.to_dict("records")
]
df_string = json.dumps(df_dict, ensure_ascii=False, indent=2)

# Overwrites the file at json_file_path; newline="\n" keeps LF line endings
# regardless of platform.
with open(json_file_path, encoding='utf-8', mode='w', newline="\n") as f:
    f.write(df_string)