In [None]:
from __future__ import annotations

import json
import os
import pickle
from dataclasses import dataclass
from typing import Final

from matplotlib import pyplot as plt
from tqdm import tqdm

import numpy as np

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from PIL import Image, ImageFilter, ImageFont

import character_utility as charutil
import config
from ipywidgets_helper import render_images
from kvg import Kvg
from utility import pathstr, char2code, create_vertical_stack_image

In [None]:
@dataclass
class PreparedKvg:
    """Per-character bundle of a Kvg tree and the data derived from its components.

    All three dictionaries are keyed by component ``kvgid``.
    """

    # Root node of the character's Kvg tree.
    root: Final[Kvg]
    # Components selected from the tree, indexed by kvgid.
    decomposition: Final[dict[str, Kvg]]
    # One feature vector per selected component.
    features: Final[dict[str, np.ndarray]]
    # One sample image per selected component.
    images_for_sample: Final[dict[str, Image.Image]]


def prepare_kvg(char: str, image_size: int, stroke_width: float, blur: float, log=False) -> tuple[PreparedKvg, dict[str, Image.Image]]:
    """Load the Kvg tree of ``char`` plus the pre-rendered images of its components.

    Reads ``<charcode>.json`` from ``config.output_main_kvg_path(charcode)``,
    selects components from the tree (see ``dfs`` below) and, for each selected
    component, reads two PNG files from the same directory: the image used to
    build the feature vector and a fixed-format sample image.

    Args:
        char: The character to load.
        image_size: Size token of the feature image file name
            (``"{image_size}x,pad=0,sw={stroke_width} {kvgid}.png"``).
        stroke_width: Stroke-width token of the feature image file name.
        blur: Radius passed to ``ImageFilter.GaussianBlur`` for the feature image.
        log: When true, print the loaded tree as indented JSON.

    Returns:
        ``(pkvg, images_for_features)`` where ``pkvg`` is the ``PreparedKvg``
        bundle and ``images_for_features`` maps kvgid to the blurred image the
        feature vector was computed from.

    Both images are fully read and their files closed before returning.
    ``Image.open`` is lazy and would otherwise keep one file handle open per
    sample image for as long as the image object lives.
    """
    charcode = char2code(char)
    directory_path = config.output_main_kvg_path(charcode)
    with open(pathstr(directory_path, f"{charcode}.json")) as f:
        root_kvg = Kvg.from_dict(json.load(f))

    if log:
        print(json.dumps(root_kvg.to_dict(), ensure_ascii=False, indent=2))

    def dfs(kvg: Kvg) -> list[Kvg]:
        """Return the components that stand for ``kvg``.

        A node with an empty ``svg`` is replaced by the concatenated results of
        its children, unless any child yields nothing, in which case the
        replacement is abandoned. If nothing was collected, the node itself is
        returned when it has a name; otherwise the result is empty.
        """
        ret: list[Kvg] = []
        if len(kvg.svg) == 0:
            for kvg0 in kvg.children:
                ret0 = dfs(kvg0)
                if len(ret0) == 0:
                    # One child cannot be represented: give up on splitting this node.
                    ret.clear()
                    break
                ret += ret0
        if len(ret) == 0:
            if kvg.name is not None:
                ret.append(kvg)

        return ret

    decomposition = {kvg.kvgid: kvg for kvg in dfs(root_kvg)}

    images_for_features: dict[str, Image.Image] = {}
    features: dict[str, np.ndarray] = {}
    images_for_sample: dict[str, Image.Image] = {}
    for kvgid in decomposition:
        feature_path = pathstr(
            directory_path,
            f"{image_size}x,pad=0,sw={stroke_width} {kvgid}.png",
        )
        # filter() reads the pixels and returns a new image, so the source file
        # can be closed as soon as the blurred copy exists.
        with Image.open(feature_path) as raw_image:
            blurred_image = raw_image.filter(ImageFilter.GaussianBlur(blur))
        images_for_features[kvgid] = blurred_image

        pixels = np.array(blurred_image)
        # Concatenate the flattened image with its flattened transpose: with a single
        # scan direction, e.g. "kanmuri" (top) and "tare" (hanging) radicals are hard
        # to tell apart.
        features[kvgid] = np.concatenate((pixels.reshape(-1), pixels.transpose().reshape(-1)))

        sample_path = pathstr(
            directory_path,
            f"64x,pad=4,sw=2 {kvgid}.png",  # rough: the sample image format is hard-coded
        )
        # Force the pixel data into memory and close the file; the image object
        # stays usable after the `with` block.
        with Image.open(sample_path) as sample_image:
            sample_image.load()
        images_for_sample[kvgid] = sample_image

    pkvg = PreparedKvg(
        root=root_kvg,
        decomposition=decomposition,
        features=features,
        images_for_sample=images_for_sample,
    )
    return pkvg, images_for_features


def test():
    """Load one character and render its sample and feature images for visual inspection."""
    pkvg, images_for_features = prepare_kvg("遠", image_size=16, stroke_width=2, blur=1, log=True)

    # Sample images first, then the blurred feature images; each is paired with its kvgid.
    labelled_images = [(image, kvgid) for kvgid, image in pkvg.images_for_sample.items()]
    labelled_images += [(image, kvgid) for kvgid, image in images_for_features.items()]

    # Both groups have one entry per component, so `columns` equals the size of one group.
    return render_images(labelled_images, columns=(len(labelled_images) // 2))


# Smoke run on a single character; as the last expression of the cell, the value returned by test() is what the cell displays.
test()

In [None]:
@dataclass(frozen=True)
class KMeansResult:
    """Frozen record of one clustering run: the inputs, the labels and the fitted model."""

    # Clustered components, in the same order as the rows of `features`.
    kvgs: list[Kvg]
    # Stacked feature vectors, one row per entry of `kvgs`.
    features: np.ndarray
    # Cluster label assigned to each entry of `kvgs`.
    labels: list[int]
    # The fitted KMeans estimator.
    kmeans: KMeans


def train_kmeans(pkvgs: list[PreparedKvg], n_clusters, log=False, plot=False, n_init=4, random_state=None) -> KMeansResult:
    """Cluster the feature vectors of every component in ``pkvgs`` with KMeans.

    Args:
        pkvgs: Prepared characters; every entry of each ``features`` dict
            becomes one sample.
        n_clusters: Number of clusters to fit.
        log: When true, print the number of samples.
        plot: When true, draw a 2-D PCA projection of the samples, coloured by
            label and annotated with each component's name.
        n_init: Forwarded to ``KMeans(n_init=...)``. Defaults to 4.
        random_state: Forwarded to ``KMeans(random_state=...)``. The default
            ``None`` leaves the run unseeded; pass an int for reproducible
            clusters.

    Returns:
        A ``KMeansResult`` holding the components, the stacked features, the
        label of each component and the fitted estimator.

    Raises:
        ValueError: If ``pkvgs`` contributes no feature vectors at all.
    """
    kvgs: list[Kvg] = []
    feature_rows: list[np.ndarray] = []
    for pkvg in pkvgs:
        for kvgid, feature in pkvg.features.items():
            kvgs.append(pkvg.decomposition[kvgid])
            feature_rows.append(feature)

    # np.stack([]) would also raise ValueError, but with a message that does not
    # point at the real cause.
    if not feature_rows:
        raise ValueError("train_kmeans: no features to cluster (pkvgs is empty or has no components)")
    features = np.stack(feature_rows)

    if log:
        print(f"data size: {len(kvgs)}")

    kmeans = KMeans(n_init=n_init, n_clusters=n_clusters, init="k-means++", random_state=random_state)
    labels = kmeans.fit_predict(features).tolist()

    if plot:
        # The PCA projection is used only for this figure; clustering ran on the full features.
        pca = PCA(n_components=2)
        points2d = pca.fit_transform(features)
        print(f"{pca.explained_variance_ratio_=}")

        plt.figure(figsize=(16, 16))
        plt.scatter(points2d[:, 0], points2d[:, 1], c=labels)
        for kvg, xy in zip(kvgs, points2d):
            assert kvg.name is not None
            plt.annotate(kvg.name, xy)

    return KMeansResult(kvgs=kvgs, features=features, labels=labels, kmeans=kmeans)


def test(characters, n_clusters, image_size, stroke_width, blur):
    """Prepare every character, then run one logged and plotted KMeans training."""
    pkvgs = []
    for character in characters:
        # prepare_kvg also returns the feature images; only the prepared bundle is needed here.
        pkvg, _ = prepare_kvg(character, image_size, stroke_width, blur)
        pkvgs.append(pkvg)
    train_kmeans(pkvgs, n_clusters=n_clusters, log=True, plot=True)


# Cluster the characters from charutil.kanjis.education() into 64 groups and draw the PCA scatter plot.
test(charutil.kanjis.education(), n_clusters=64, image_size=16, stroke_width=2, blur=1)

In [None]:
def plot_elbow(characters, n_clusters_candidates, image_size, stroke_width, blur):
    """Plot KMeans distortion against the number of clusters.

    Args:
        characters: Characters whose components are clustered.
        n_clusters_candidates: Iterable of cluster counts to try. It may be a
            one-shot iterator; it is materialized once internally.
        image_size, stroke_width, blur: Forwarded to ``prepare_kvg``.

    One KMeans model is trained per candidate. The plotted distortion is the
    negated ``KMeans.score`` of the training features.
    """
    # The candidates are needed twice (training loop and x axis); a generator
    # would be exhausted after the first pass.
    candidates = list(n_clusters_candidates)

    pkvgs = [prepare_kvg(c, image_size, stroke_width, blur)[0] for c in characters]

    distortions = []
    for n_clusters in tqdm(candidates):
        result = train_kmeans(pkvgs, n_clusters=n_clusters, log=False)
        distortions.append(-result.kmeans.score(result.features))

    plt.plot(candidates, distortions, marker="o")
    plt.xlabel("n_clusters")
    plt.ylabel("distortion")


# Sweep n_clusters = 16, 32, ..., 512 over charutil.kanjis.all(); each candidate trains a fresh KMeans model.
plot_elbow(charutil.kanjis.all(), range(16, 512 + 1, 16), image_size=16, stroke_width=2, blur=1)

In [None]:
def save(characters, dataset_name, n_clusters, image_size, stroke_width, blur, n_sample_images):
    """Train KMeans on the components of ``characters`` and write the results to disk.

    Three files are written into
    ``config.output_radical_clustering_path(dataset_name, n_clusters, image_size, stroke_width, blur)``:

    * ``kmeans.pickle``: the fitted estimator.
    * ``label2kvgids.json``: for each label, the sorted kvgids assigned to it.
    * ``cluster-samples.png``: per cluster, a header line followed by the
      cluster-centre image and up to ``n_sample_images`` member images.

    Args:
        characters: Characters whose components are clustered.
        dataset_name: Forwarded to the output-path helper.
        n_clusters: Number of clusters.
        image_size, stroke_width, blur: Forwarded to ``prepare_kvg``; ``image_size``
            is also used to reshape the cluster centres back into images.
        n_sample_images: Maximum number of member images shown per cluster.

    Raises:
        FileExistsError: If the output directory already exists.
    """
    directory_path = config.output_radical_clustering_path(dataset_name, n_clusters, image_size, stroke_width, blur)
    # exist_ok=False: raise rather than write into an existing result directory.
    os.makedirs(directory_path, exist_ok=False)

    pkvgs = [prepare_kvg(c, image_size, stroke_width, blur)[0] for c in characters]
    result = train_kmeans(pkvgs, n_clusters=n_clusters, log=True)

    with open(pathstr(directory_path, "kmeans.pickle"), "wb") as f:
        pickle.dump(result.kmeans, f)

    # label2kvgids
    label2kvgids: list[list[str]] = [[] for _ in range(n_clusters)]
    for kvg, label in zip(result.kvgs, result.labels):
        label2kvgids[label].append(kvg.kvgid)
    for kvgids in label2kvgids:
        kvgids.sort()

    with open(pathstr(directory_path, "label2kvgids.json"), "w") as f:
        json.dump(label2kvgids, f)

    # cluster_samples_image
    kvgid2label = {kvg.kvgid: label for kvg, label in zip(result.kvgs, result.labels)}

    label2images: list[list[Image.Image]] = [[] for _ in range(n_clusters)]
    for pkvg in pkvgs:
        for kvgid, image in pkvg.images_for_sample.items():
            label2images[kvgid2label[kvgid]].append(image)

    # Size used for the centre image of a cluster that received no members, so
    # that such a cluster no longer fails with IndexError on `label2images[i][0]`.
    fallback_size = next((images[0].size for images in label2images if images), None)

    n_clusters_digit = len(str(n_clusters))
    n_pixels = image_size ** 2

    image_data_list = []
    for i in range(n_clusters):
        cluster_images = label2images[i]

        # A feature is the flattened image followed by its flattened transpose;
        # undo the transpose on the second half and average the two halves.
        center = result.kmeans.cluster_centers_[i]
        first_half = center[:n_pixels].reshape(image_size, image_size)
        second_half = center[n_pixels:].reshape(image_size, image_size).transpose()
        center_image = Image.fromarray((first_half + second_half) / 2)

        # Scale the centre image to match the sample images shown next to it.
        target_size = cluster_images[0].size if cluster_images else fallback_size
        if target_size is not None:
            center_image = center_image.resize(target_size)

        sample_images = cluster_images[:n_sample_images]

        image_data_list.append("    ".join((str(i).zfill(n_clusters_digit), f"radicals: {len(cluster_images)}")))
        image_data_list.append([center_image, *sample_images])

    cluster_samples_image = create_vertical_stack_image(
        image_data_list,
        gap=8,
        font=ImageFont.truetype(config.font_path, size=24, index=0),
    )
    cluster_samples_image.save(pathstr(directory_path, "cluster-samples.png"))

# Small trial run, kept for reference. NOTE: stale — it omits the `blur` argument that save() requires; add one before re-enabling.
# save(charutil.kanjis.jis_row(16), "test", n_clusters=16, image_size=16, stroke_width=2, n_sample_images=16)
# Full run. save() creates its output directory with exist_ok=False, so this line raises FileExistsError when that directory already exists.
save(charutil.kanjis.all(), "edu+jis_l1,2", n_clusters=512, image_size=16, stroke_width=2, blur=2, n_sample_images=16)