In [15]:
from __future__ import annotations
from pathlib import Path
import re

import matplotlib.pyplot as plt
from sudachipy import tokenizer
from sudachipy import dictionary
from wordcloud import WordCloud

# === Input / output folders ===
# Raw string literal, so each backslash is written exactly once (doubling
# them inside r"..." would put literal "\\" pairs into the path).
diary_root = Path(r"C:\Users\fuben\github\obsidian\日記")
# Word-cloud images go into a subfolder of the diary folder; deriving it from
# diary_root keeps the two locations in sync.
output_dir = diary_root / "ワードクラウド"
# Create the output folder (and any missing parents); no error if it exists.
output_dir.mkdir(parents=True, exist_ok=True)

# === Morphological analyzer setup ===
# Split mode handed to every tokenize() call further down in this cell.
mode = tokenizer.Tokenizer.SplitMode.C
# A single tokenizer instance, created from Dictionary() with no arguments,
# is shared by all diary files.
tokenizer_obj = dictionary.Dictionary().create()

# === Word cloud settings ===
# Font file handed to WordCloud. Raw string literal, so each backslash is
# written exactly once (doubling them inside r"..." would put literal "\\"
# pairs into the path).
# NOTE(review): presumably Meiryo was chosen because it contains Japanese
# glyphs — confirm the font exists on the machine running this notebook.
font_path = r"C:\Windows\Fonts\meiryo.ttc"
# Keyword arguments shared by every WordCloud(...) instance built in this cell.
wc_config = dict(
    width=1200,
    height=800,
    background_color="white",
    font_path=font_path,
    collocations=False,
)

def _split_oversized_line(line: str, max_bytes: int) -> list[str]:
    """Cut one over-long line into pieces of at most *max_bytes* UTF-8 bytes.

    Cuts are made between characters, never inside a multi-byte sequence, so
    joining the returned pieces reproduces *line* exactly.
    """
    pieces: list[str] = []
    buffer: list[str] = []
    buffer_size = 0
    for char in line:
        char_size = len(char.encode("utf-8"))
        # Flush before the piece would overflow. The `buffer` guard guarantees
        # progress even when a single character is wider than max_bytes.
        if buffer and buffer_size + char_size > max_bytes:
            pieces.append("".join(buffer))
            buffer = []
            buffer_size = 0
        buffer.append(char)
        buffer_size += char_size
    if buffer:
        pieces.append("".join(buffer))
    return pieces


def split_text_by_bytes(text: str, max_bytes: int = 48000) -> list[str]:
    """Split *text* into chunks whose UTF-8 encoding fits in *max_bytes*.

    Lines (as produced by ``str.splitlines``) are packed greedily into chunks
    joined by ``"\\n"``. A line that is longer than *max_bytes* on its own is
    cut between characters, so no character is ever dropped or corrupted.

    Args:
        text: The text to split.
        max_bytes: Upper bound, in UTF-8 bytes, for each returned chunk.

    Returns:
        The chunks in original order; an empty list for empty input. A chunk
        can exceed *max_bytes* only when a single character is itself wider
        than *max_bytes* (possible only for ``max_bytes < 4``).

    Raises:
        ValueError: If *max_bytes* is smaller than 1.
    """
    if max_bytes < 1:
        raise ValueError("max_bytes must be a positive integer")

    chunks: list[str] = []
    current = ""
    current_size = 0
    for line in text.splitlines():
        line_size = len(line.encode("utf-8"))

        if current:
            # +1 accounts for the "\n" that joins the line to the chunk.
            if current_size + 1 + line_size <= max_bytes:
                current = f"{current}\n{line}"
                current_size += 1 + line_size
                continue
            chunks.append(current)
            current = ""
            current_size = 0

        if line_size <= max_bytes:
            current = line
            current_size = line_size
        else:
            # The line alone is too large: split it at character boundaries,
            # whether or not a chunk was just flushed before it.
            chunks.extend(_split_oversized_line(line, max_bytes))

    if current:
        chunks.append(current)
    return chunks

# Matches the "### ＜一言＞" heading line (trailing spaces allowed), then lazily
# captures everything up to the next line starting with "### " or the end of
# the text. Only the heading's own line break is consumed, and the terminator
# is a lookahead, so an empty section cannot run into the following one.
_HITOKOTO_SECTION_RE = re.compile(
    r"^### \s*＜一言＞[^\S\n]*\n([\s\S]*?)(?=^### |\Z)",
    flags=re.MULTILINE,
)


def extract_hitokoto(text: str) -> str:
    """Return the body of the first ``### ＜一言＞`` section in *text*.

    Args:
        text: Full Markdown text of one diary note.

    Returns:
        The section body with surrounding whitespace stripped, or ``""`` when
        the heading is absent or the section is empty.
    """
    match = _HITOKOTO_SECTION_RE.search(text)
    return match.group(1).strip() if match else ""

# Paths of every PNG written by this cell, in creation order.
output_paths = []
# Words collected per group, keyed by the name of each note's parent folder.
yearly_words: dict[str, list[str]] = {}
# Pass 1: one word cloud per Markdown note found (recursively) under diary_root.
for diary_path in sorted(diary_root.glob("**/*.md")):
    text = diary_path.read_text(encoding="utf-8")
    # Only the "### ＜一言＞" section is analysed; notes without it yield "".
    text = extract_hitokoto(text)

    # === Preprocessing (strip URLs, markup symbols, digits) ===
    text = re.sub(r"https?://\S+", "", text)  # drop http/https URLs
    text = re.sub(r"[#*<>`\[\]()]", " ", text)  # listed ASCII markup chars -> space
    text = re.sub(r"[0-9]+", " ", text)  # runs of ASCII digits -> space
    text = re.sub(r"\s+", " ", text).strip()  # collapse whitespace; text is now one line

    # === Morphological analysis (keep nouns, verbs, adjectives) ===
    words = []
    for chunk in split_text_by_bytes(text):
        for token in tokenizer_obj.tokenize(chunk, mode):
            # First element of the part-of-speech tuple is the top-level category.
            part_of_speech = token.part_of_speech()[0]
            if part_of_speech in {"名詞", "動詞", "形容詞"}:
                # Use the dictionary form so inflected variants are counted together.
                base_form = token.dictionary_form()
                # Skip the very common words listed here.
                if base_form not in {"する", "いる", "ある", "なる", "こと", "もの"}:
                    words.append(base_form)

    if words:
        # NOTE(review): the key is the immediate parent folder name, so notes
        # stored directly in diary_root are grouped under that folder's name —
        # confirm the vault layout is <year folder>/<note>.md.
        year_key = diary_path.parent.name
        yearly_words.setdefault(year_key, []).extend(words)

    word_text = " ".join(words)
    if not word_text:
        # Nothing extracted from this note: no image is produced.
        continue

    # === Word cloud generation ===
    wc = WordCloud(**wc_config)
    # Frequencies are computed by WordCloud.process_text from the
    # space-separated words.
    frequencies = wc.process_text(word_text)
    if not frequencies:
        continue
    wc.generate_from_frequencies(frequencies)

    # File name is derived from the note's file name without its extension.
    output_path = output_dir / f"wordcloud_{diary_path.stem}.png"
    wc.to_file(output_path.as_posix())
    output_paths.append(output_path)

# Pass 2: one word cloud per parent-folder group, built from the words
# accumulated during pass 1.
# NOTE(review): a group image would overwrite a per-note image if a note's
# stem equals its folder name — confirm this cannot happen.
for year_key, words in sorted(yearly_words.items()):
    word_text = " ".join(words)
    if not word_text:
        continue

    wc = WordCloud(**wc_config)
    frequencies = wc.process_text(word_text)
    if not frequencies:
        continue
    wc.generate_from_frequencies(frequencies)

    output_path = output_dir / f"wordcloud_{year_key}.png"
    wc.to_file(output_path.as_posix())
    output_paths.append(output_path)

# Bare expression: as the last statement of a notebook cell it displays the list.
output_paths


[WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2022年11月.png'),
 WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2022年12月.png'),
 WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2023年10月.png'),
 WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2023年11月.png'),
 WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2023年12月.png'),
 WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2023年1月.png'),
 WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2023年2月.png'),
 WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2023年3月.png'),
 WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2023年4月.png'),
 WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2023年5月.png'),
 WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2023年6月.png'),
 WindowsPath('C:/Users/fuben/github/obsidian/日記/ワードクラウド/wordcloud_2023年7月.png'),
 WindowsPath('C:/Users/