In [None]:
"""
This project uses the following resources:
- An XML dictionary file from the Kanjidic project: http://www.edrdg.org/wiki/index.php/KANJIDIC_Project
- kanjidic2.xml can be downloaded at: http://www.edrdg.org/kanjidic/kanjidic2.xml.gz

- An XML file from the Kanji Vector Graphics project: https://github.com/KanjiVG/
- kanjivg.xml can be downloaded at: https://github.com/KanjiVG/kanjivg/releases/
- Explanation of the KanjiVG format can be found here: https://kanjivg.tagaini.net/svg-format.html
"""

In [None]:
import io
import json
import os
import re
import shutil
import xml.etree.ElementTree as ET

import cairosvg
import requests
from datasets import load_dataset
from PIL import Image


"""
Generates SVG files from kanjivg.xml in folder kanji_svg
"""

tree = ET.parse("kanjivg.xml")

root = tree.getroot()

svg_folder = "kanji_svg"

# exist_ok keeps the cell re-runnable without a separate existence check.
os.makedirs(svg_folder, exist_ok=True)

kanji_header = '<svg xmlns="http://www.w3.org/2000/svg" width="128" height="128" viewBox="0 0 128 128">'
# Note: this fragment deliberately ends with ">" because it closes the opening
# <g ...> tag that wraps each serialised stroke group below.
kanji_style = 'style="fill:none;stroke:#000000;stroke-width:3;stroke-linecap:round;stroke-linejoin:round;">'

# Each child of the root is expected to carry an "id" attribute; the id with
# its "kvg:kanji_" prefix removed becomes the SVG file name.
for kanji in root:
    kanji_id = kanji.attrib.get("id")
    if not kanji_id:
        continue

    svg_parts = [kanji_header]
    # Serialise only the direct <g> children. ET.tostring() already includes
    # every descendant, so iterating ".//g" would write each nested group (and
    # its strokes) again for every level it is nested in.
    for g in kanji.findall("g"):
        g_str = ET.tostring(g, encoding="utf-8", method="xml").decode("utf-8")
        svg_parts.append(f"<g {kanji_style}{g_str}</g>")
    svg_parts.append("</svg>")
    svg_content = "\n".join(svg_parts)

    # Strip the prefix from the id itself rather than from the joined path, so
    # the folder name can never be affected by the replacement.
    file_stem = kanji_id.replace("kvg:kanji_", "")
    svg_file_path = os.path.join(svg_folder, f"{file_stem}.svg")

    with open(svg_file_path, "w", encoding="utf-8") as svg_file:
        svg_file.write(svg_content)

In [None]:
"""
Convert SVG files to PNG using cairosvg and then to JPG using PIL
"""

jpg_folder = 'kanji_jpg'
png_folder = 'kanji_png'

# exist_ok keeps the cell re-runnable without separate existence checks.
os.makedirs(jpg_folder, exist_ok=True)
os.makedirs(png_folder, exist_ok=True)

for svg_file in os.listdir(svg_folder):
    if not svg_file.endswith('.svg'):
        continue

    # Build every output path from its own folder and the file stem. Deriving
    # the PNG path with str.replace('svg', 'png') on the full SVG path rewrites
    # every "svg" substring and only works while the folder names happen to
    # differ by exactly that substring.
    stem = os.path.splitext(svg_file)[0]
    svg_file_path = os.path.join(svg_folder, svg_file)
    png_file_path = os.path.join(png_folder, f'{stem}.png')
    jpg_file_path = os.path.join(jpg_folder, f'{stem}.jpg')

    cairosvg.svg2png(url=svg_file_path, write_to=png_file_path)

    # Convert PNG to JPG with a white background
    with Image.open(png_file_path) as img:
        # The image is used as its own paste mask, which requires an alpha
        # channel; convert explicitly so a PNG without alpha cannot fail here.
        rgba = img.convert('RGBA')
        with Image.new('RGB', rgba.size, 'WHITE') as background:
            background.paste(rgba, (0, 0), rgba)
            background.save(jpg_file_path, 'JPEG')

In [None]:
"""
Map the kanji filename to the kanji literal using kanjivg.xml
"""

kanji_id_pattern = re.compile(r'id="([^"]+)"')
kvg_element_pattern = re.compile(r'kvg:element="([^"]+)"')
lit2name = {}
is_above_kanji = False
current_name = None

with open('kanjivg.xml', 'r', encoding='utf-8') as kanjivg:
    for line in kanjivg:
        if '<kanji' in line:
            # Take the file name from the <kanji> line itself. Lines that
            # contain "<kanji" but no id="..." attribute are ignored.
            kanji_id = kanji_id_pattern.search(line)
            current_name = (
                kanji_id.group(1).replace('kvg:kanji_', '') if kanji_id else None
            )
            is_above_kanji = current_name is not None
        if is_above_kanji and '<g' in line:
            # Only the first <g> after a <kanji> line is inspected. If it has
            # no kvg:element the entry is skipped; otherwise a nested
            # component group further down would be recorded under its own
            # id, which does not correspond to any generated image file.
            lit = kvg_element_pattern.search(line)
            if lit:
                lit2name[lit.group(1)] = current_name
            is_above_kanji = False

In [None]:
"""
Map the kanji filename to the English kanji meaning using kanjidic2.xml and write them in metadata.jsonl
"""

root = ET.parse('kanjidic2.xml').getroot()

metadata_file_path = os.path.join(jpg_folder, 'metadata.jsonl')

# Explicit UTF-8 so the file does not depend on the platform's default encoding.
with open(metadata_file_path, 'w', encoding='utf-8') as metadata:
    for character in root.findall("character"):
        lit = character.find("literal").text
        # Characters without a matching image file are not written at all.
        if lit not in lit2name:
            continue

        # Only English meanings (entries without an 'm_lang' attribute); drop
        # the attribute checks to keep all languages. Entries with empty text
        # are skipped because they would make the join below fail.
        meanings = [
            meaning.text
            for meaning in character.findall(".//reading_meaning/rmgroup/meaning")
            if 'r_type' not in meaning.attrib
            and 'm_lang' not in meaning.attrib
            and meaning.text
        ]
        concat_meanings = ", ".join(meanings)

        # json.dumps escapes quotes and backslashes inside the meanings, which
        # hand-built f-string JSON would emit verbatim and thereby corrupt the
        # line.
        record = {
            "file_name": f"{lit2name[lit]}.jpg",
            "text": f"a Kanji meaning {concat_meanings}",
        }
        metadata.write(json.dumps(record, ensure_ascii=False) + "\n")

In [None]:
"""
Remove images in the JPG folder that are not in metadata.jsonl
"""
# Read the metadata once and collect the exact file names it lists, instead of
# re-reading the whole file for every image and testing for a raw substring
# (which would also match text inside the captions).
file_name_pattern = re.compile(r'"file_name":\s*"([^"]+)"')
listed_files = set()
# Only the file names are needed here, so undecodable bytes in the captions
# are replaced rather than allowed to abort the clean-up.
with open(metadata_file_path, 'r', encoding='utf-8', errors='replace') as metadata:
    for line in metadata:
        match = file_name_pattern.search(line)
        if match:
            listed_files.add(match.group(1))

for jpg_file in os.listdir(jpg_folder):
    if jpg_file.endswith('.jpg') and jpg_file not in listed_files:
        os.remove(os.path.join(jpg_folder, jpg_file))

In [None]:
"""
Push the dataset to the Hugging Face Hub for versioning, sharing and easy access
"""

# NOTE: if a "train" directory already exists, shutil.move places jpg_folder
# inside it instead of renaming; run this cell once from a clean state.
shutil.move(jpg_folder, "train")
# The second positional parameter of load_dataset is the configuration name,
# not the data location, so the image folder is passed explicitly as data_dir.
dataset = load_dataset("imagefolder", data_dir="train", split="train")
dataset.push_to_hub("sylvainlapeyrade/kanji_english_meaning")

In [None]:
"""
Train on Google Colab using the Hugging Face Diffusers library


os.chdir("drive/MyDrive")
!git clone https://github.com/huggingface/diffusers
%pip install git+https://github.com/huggingface/diffusers
os.chdir("diffusers/examples/text_to_image/")
%pip install -r requirements.txt
!huggingface-cli login

!accelerate launch train_text_to_image_lora.py \
  --pretrained_model_name_or_path="CompVis/stable-diffusion-v1-4" \
  --dataset_name="sylvainlapeyrade/kanji_english_meaning" --caption_column="text" \
  --resolution=512 --random_flip \
  --train_batch_size=1 \
  --num_train_epochs=1 --checkpointing_steps=500 \
  --learning_rate=1e-04 --lr_scheduler="constant" --lr_warmup_steps=0 \
  --seed=42 \
  --output_dir="kanji2english" \
  --validation_prompt="A kanji meaning Elon Musk" \
  --push_to_hub
"""

In [None]:
"""
Query the model to generate an image from a Kanji meaning
"""

API_URL = "https://api-inference.huggingface.co/models/sylvainlapeyrade/kanji2english"

# The token is read from the HF_TOKEN environment variable so that no
# credential is stored in the notebook, and it is sent with the "Bearer"
# scheme. Without a token the request is sent with no Authorization header.
_hf_token = os.environ.get("HF_TOKEN", "")
headers = {"Authorization": f"Bearer {_hf_token}"} if _hf_token else {}


def query(payload):
    """Post ``payload`` as JSON to ``API_URL`` and return the raw response body.

    Args:
        payload: JSON-serialisable dict, e.g. ``{"inputs": "<prompt>"}``.

    Returns:
        The response body as bytes; the caller below treats it as encoded
        image data.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status, so an
            error body is never handed to the image decoder.
        requests.Timeout: if the server does not answer within the timeout.
    """
    response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    response.raise_for_status()
    return response.content


image_bytes = query({
    "inputs": "a Kanji meaning Elon Musk",
})

image = Image.open(io.BytesIO(image_bytes))
image.show()