# Chinese glyph image data generator

In [1]:
import unicodedata

def is_chinese_character(ch):
    """Return True when the Unicode name of *ch* marks it as a CJK unified ideograph.

    Characters that have no entry in the Unicode name database are
    reported as not Chinese.
    """
    # Supplying a default avoids the ValueError raised for unnamed characters;
    # the empty string can never match the prefix below.
    unicode_name = unicodedata.name(ch, "")
    return unicode_name.startswith("CJK UNIFIED IDEOGRAPH")

# Inclusive code-point bounds of the basic range holding the common Chinese
# characters; widen them if rarer characters are needed.
start = 0x4E00
end = 0x9FFF

# Keep every character in the range that passes the Unicode-name check.
chinese_characters = [
    candidate
    for candidate in map(chr, range(start, end + 1))
    if is_chinese_character(candidate)
]

# Show a 100-character sample, then the total number of characters found.
print(chinese_characters[:100])
print(len(chinese_characters))

['一', '丁', '丂', '七', '丄', '丅', '丆', '万', '丈', '三', '上', '下', '丌', '不', '与', '丏', '丐', '丑', '丒', '专', '且', '丕', '世', '丗', '丘', '丙', '业', '丛', '东', '丝', '丞', '丟', '丠', '両', '丢', '丣', '两', '严', '並', '丧', '丨', '丩', '个', '丫', '丬', '中', '丮', '丯', '丰', '丱', '串', '丳', '临', '丵', '丶', '丷', '丸', '丹', '为', '主', '丼', '丽', '举', '丿', '乀', '乁', '乂', '乃', '乄', '久', '乆', '乇', '么', '义', '乊', '之', '乌', '乍', '乎', '乏', '乐', '乑', '乒', '乓', '乔', '乕', '乖', '乗', '乘', '乙', '乚', '乛', '乜', '九', '乞', '也', '习', '乡', '乢', '乣']
20989


64x64 generator

In [4]:
import os
import unicodedata
from PIL import Image, ImageDraw, ImageFont

def create_image(character, font_path, image_size=(64, 64), font_size=50):
    """
    Create a grayscale image of a single Unicode character, centered.

    Parameters:
    character (str): The Unicode character to render.
    font_path (str): Path to the .ttf font file to use.
    image_size (tuple): Size of the output image (width, height).
    font_size (int): Size of the font.

    Returns:
    Image: The generated image (mode 'L', black glyph on a white background).
    """
    # Create a blank image with white background
    image = Image.new('L', image_size, 'white')
    draw = ImageDraw.Draw(image)

    # Load the font
    font = ImageFont.truetype(font_path, font_size)

    # textbbox returns (left, top, right, bottom) relative to the origin that
    # is passed in. left/top are generally non-zero, so the ink size is
    # (right - left) x (bottom - top), not (right, bottom).
    left, top, right, bottom = draw.textbbox((0, 0), character, font=font)
    ink_width = right - left
    ink_height = bottom - top

    # Center the ink box, then subtract left/top so that the ink itself (not
    # the text origin) lands on the centered position. This holds for any
    # font size without a hand-tuned pixel offset.
    x = (image_size[0] - ink_width) / 2 - left
    y = (image_size[1] - ink_height) / 2 - top

    # Draw the character
    draw.text((x, y), character, fill='black', font=font)

    return image

def is_chinese_character(ch):
    """Tell whether *ch* carries a CJK unified ideograph Unicode name."""
    try:
        unicode_name = unicodedata.name(ch)
    except ValueError:
        # No name in the Unicode database, so it cannot be an ideograph.
        return False
    else:
        return unicode_name.find("CJK UNIFIED IDEOGRAPH") != -1

def generate_and_save_images(start, end, font_path, directory="data"):
    """
    Generate and save images for each Chinese character in the given Unicode range.

    Every code point in the range that passes is_chinese_character is rendered
    with create_image and saved as <directory>/<CODEPOINT>.png, where CODEPOINT
    is the code point in upper-case hexadecimal, zero-padded to four digits.

    Parameters:
    start (int): Starting Unicode code point (inclusive).
    end (int): Ending Unicode code point (inclusive).
    font_path (str): Path to the font file.
    directory (str): Directory to save images; created when missing. An empty
        string saves into the current working directory.
    """
    # os.makedirs("") raises FileNotFoundError, so only create a named
    # directory. exist_ok avoids the check-then-create race and makes
    # re-running the cell harmless.
    if directory:
        os.makedirs(directory, exist_ok=True)

    for code_point in range(start, end + 1):
        character = chr(code_point)
        if not is_chinese_character(character):
            continue
        img = create_image(character, font_path)
        img.save(os.path.join(directory, f"{code_point:04X}.png"))

# Parameters
start_range = 0x4E00  # First code point of the CJK Unified Ideographs block
end_range = 0x9FFF    # Last code point of the CJK Unified Ideographs block
# Point this at the Chinese font file that should be rendered.
font_path = 'fonts/Noto_Sans_SC/static/NotoSansSC-Regular.ttf'

# Render every matching character; no directory is passed, so the function's
# default output directory is used.
generate_and_save_images(start=start_range, end=end_range, font_path=font_path)


128x128 generator

In [None]:
import os
import unicodedata
from PIL import Image, ImageDraw, ImageFont

def create_image(character, font_path, image_size=(128, 128), font_size=100):
    """
    Create a grayscale image of a single Unicode character, centered.

    Parameters:
    character (str): The Unicode character to render.
    font_path (str): Path to the .ttf font file to use.
    image_size (tuple): Size of the output image (width, height).
    font_size (int): Size of the font.

    Returns:
    Image: The generated image (mode 'L', black glyph on a white background).
    """
    # Create a blank image with white background
    image = Image.new('L', image_size, 'white')
    draw = ImageDraw.Draw(image)

    # Load the font
    font = ImageFont.truetype(font_path, font_size)

    # The bounding box is (left, top, right, bottom) measured from the text
    # origin; the ink does not start at the origin, so width and height must
    # be taken as differences rather than read from right/bottom directly.
    left, top, right, bottom = draw.textbbox((0, 0), character, font=font)
    canvas_width, canvas_height = image_size

    # Place the origin so the ink box is centered on the canvas. Subtracting
    # left/top compensates for the gap between the origin and the ink, which
    # grows with the font size.
    x = (canvas_width - (right - left)) / 2 - left
    y = (canvas_height - (bottom - top)) / 2 - top

    # Draw the character
    draw.text((x, y), character, fill='black', font=font)

    return image

def is_chinese_character(ch):
    """Return whether the Unicode name of *ch* contains 'CJK UNIFIED IDEOGRAPH'."""
    # Unnamed characters yield the empty default, which never contains the marker.
    return "CJK UNIFIED IDEOGRAPH" in unicodedata.name(ch, "")

def generate_and_save_images(start, end, font_path, directory=""):
    """
    Generate and save images for each Chinese character in the given Unicode range.

    Every code point in the range that passes is_chinese_character is rendered
    with create_image and saved as <directory>/<CODEPOINT>.png, where CODEPOINT
    is the code point in upper-case hexadecimal, zero-padded to four digits.

    Parameters:
    start (int): Starting Unicode code point (inclusive).
    end (int): Ending Unicode code point (inclusive).
    font_path (str): Path to the font file.
    directory (str): Directory to save images; created when missing. The
        default empty string saves into the current working directory.
    """
    # The default directory is "", and os.makedirs("") raises
    # FileNotFoundError, so directory creation is skipped for an empty path.
    # exist_ok makes re-runs against an existing directory harmless.
    if directory:
        os.makedirs(directory, exist_ok=True)

    for code_point in range(start, end + 1):
        character = chr(code_point)
        if not is_chinese_character(character):
            continue
        img = create_image(character, font_path)
        img.save(os.path.join(directory, f"{code_point:04X}.png"))

# Parameters
start_range = 0x4E00  # Start of CJK Unified Ideographs
end_range = 0x9FFF   # End of CJK Unified Ideographs
# Alternative font:
# font_path = 'fonts/Noto_Sans_SC/static/NotoSansSC-Regular.ttf'
# Forward slashes are used on purpose: '\M' is not a valid string escape
# sequence, and back-slash separators only resolve on Windows.
font_path = 'fonts/Ma_Shan_Zheng/MaShanZheng-Regular.ttf'

# Generate and save the images
generate_and_save_images(start_range, end_range, font_path, directory="msz128")


With blank detector

In [1]:
import os
from PIL import Image, ImageDraw, ImageFont
import unicodedata

def create_image(character, font, image_size=(128, 128)):
    """
    Create an RGB image of a single Unicode character, centered.

    Parameters:
    character (str): The Unicode character to render.
    font: Font object passed through to the text drawing calls.
    image_size (tuple): Size of the output image (width, height).

    Returns:
    Image: The generated image (black glyph on a white background).
    """
    image = Image.new('RGB', image_size, 'white')
    draw = ImageDraw.Draw(image)

    # textbbox returns (left, top, right, bottom) relative to the origin;
    # left/top are generally non-zero, so the ink size is the difference of
    # the edges rather than the right/bottom values themselves.
    left, top, right, bottom = draw.textbbox((0, 0), character, font=font)

    # Center the ink box and shift by left/top so the ink, not the text
    # origin, ends up centered regardless of font size.
    x = (image_size[0] - (right - left)) / 2 - left
    y = (image_size[1] - (bottom - top)) / 2 - top

    draw.text((x, y), character, fill='black', font=font)
    return image

def is_character_supported(character, font, image_size=(128, 128)):
    """
    Report whether the font draws something distinct for the character.

    The character and U+FFFD (the replacement character) are both rendered
    with create_image; the character counts as supported when the raw pixel
    bytes of the two renderings differ.
    """
    rendered_bytes = create_image(character, font, image_size).tobytes()
    # NOTE(review): presumably the font has no dedicated U+FFFD glyph, so this
    # rendering stands in for the font's "missing glyph" box — verify per font.
    placeholder_bytes = create_image("�", font, image_size).tobytes()
    return rendered_bytes != placeholder_bytes

def generate_and_save_images(start, end, font_path, directory="data"):
    """
    Generate and save images for each Chinese character in the given Unicode range.

    A code point is rendered only when it passes is_chinese_character and
    is_character_supported for the loaded font. Images are saved as
    <directory>/<CODEPOINT>.png, where CODEPOINT is the code point in
    upper-case hexadecimal, zero-padded to four digits.

    Parameters:
    start (int): Starting Unicode code point (inclusive).
    end (int): Ending Unicode code point (inclusive).
    font_path (str): Path to the font file.
    directory (str): Directory to save images; created when missing. An empty
        string saves into the current working directory.
    """
    # os.makedirs("") raises FileNotFoundError, so only create a named
    # directory. exist_ok avoids the check-then-create race and makes
    # re-running the cell harmless.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # The font is loaded once and shared by every rendering call below.
    font_size = 100
    font = ImageFont.truetype(font_path, font_size)

    for code_point in range(start, end + 1):
        character = chr(code_point)
        if not (is_chinese_character(character) and is_character_supported(character, font)):
            continue
        img = create_image(character, font)
        img.save(os.path.join(directory, f"{code_point:04X}.png"))

def is_chinese_character(ch):
    """
    Decide whether a character is Chinese by inspecting its Unicode name.

    Returns True when the name contains 'CJK UNIFIED IDEOGRAPH' and False
    otherwise, including for characters that have no Unicode name.
    """
    unicode_name = unicodedata.name(ch, None)
    # None means the character is unnamed; the original lookup would raise.
    return unicode_name is not None and "CJK UNIFIED IDEOGRAPH" in unicode_name

# Parameters
start_range = 0x4E00  # Start of CJK Unified Ideographs
end_range = 0x9FFF   # End of CJK Unified Ideographs
# Alternative font:
# font_path = 'fonts/Noto_Sans_SC/static/NotoSansSC-Regular.ttf'
# Forward slashes are used on purpose: '\M' is not a valid string escape
# sequence, and back-slash separators only resolve on Windows.
font_path = 'fonts/Ma_Shan_Zheng/MaShanZheng-Regular.ttf'  # ~7000 glyphs

# Generate and save the images
generate_and_save_images(start_range, end_range, font_path, directory="msz128")


Yi script

In [6]:
import os
from PIL import Image, ImageDraw, ImageFont
import unicodedata

def create_image(character, font, image_size=(128, 128)):
    """
    Create an RGB image of a single Unicode character, centered.

    Parameters:
    character (str): The Unicode character to render.
    font: Font object passed through to the text drawing calls.
    image_size (tuple): Size of the output image (width, height).

    Returns:
    Image: The generated image (black glyph on a white background).
    """
    image = Image.new('RGB', image_size, 'white')
    draw = ImageDraw.Draw(image)

    # The bounding box is (left, top, right, bottom) measured from the text
    # origin, and the ink rarely starts at the origin itself.
    left, top, right, bottom = draw.textbbox((0, 0), character, font=font)
    canvas_width, canvas_height = image_size

    # Choose the origin so the ink box sits in the middle of the canvas;
    # subtracting left/top removes the origin-to-ink gap for any font size.
    x = (canvas_width - (right - left)) / 2 - left
    y = (canvas_height - (bottom - top)) / 2 - top

    draw.text((x, y), character, fill='black', font=font)
    return image

def is_character_supported(character, font, image_size=(128, 128)):
    """
    Check whether the font renders the character differently from U+FFFD.

    Both the character and the replacement character are drawn with
    create_image and their raw pixel bytes are compared; identical bytes
    mean the character is treated as unsupported.
    """
    candidate = create_image(character, font, image_size)
    fallback = create_image("�", font, image_size)  # U+FFFD, the replacement character

    if candidate.tobytes() == fallback.tobytes():
        return False
    return True

def generate_and_save_images(start, end, font_path, directory="data"):
    """
    Generate and save images for each supported character in the given Unicode range.

    No script-specific filter is applied: every code point in the range is
    rendered as long as is_character_supported accepts it for the loaded
    font. Images are saved as <directory>/<CODEPOINT>.png, where CODEPOINT is
    the code point in upper-case hexadecimal, zero-padded to four digits.

    Parameters:
    start (int): Starting Unicode code point (inclusive).
    end (int): Ending Unicode code point (inclusive).
    font_path (str): Path to the font file.
    directory (str): Directory to save images; created when missing. An empty
        string saves into the current working directory.
    """
    # os.makedirs("") raises FileNotFoundError, so only create a named
    # directory. exist_ok avoids the check-then-create race and makes
    # re-running the cell harmless.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # The font is loaded once and shared by every rendering call below.
    font_size = 100
    font = ImageFont.truetype(font_path, font_size)

    for code_point in range(start, end + 1):
        character = chr(code_point)
        if not is_character_supported(character, font):
            continue
        img = create_image(character, font)
        img.save(os.path.join(directory, f"{code_point:04X}.png"))

# Parameters
start_range = 0xA000  # First code point of the Yi Syllables block
end_range = 0xA48F    # Last code point of the Yi Syllables block
# Font used by the Chinese cells, kept for reference:
# font_path = 'fonts/Noto_Sans_SC/static/NotoSansSC-Regular.ttf'
font_path = 'fonts/Noto_Sans_Yi/NotoSansYi-Regular.ttf'  # ~1000 glyphs

# Render the supported characters of the range into the "yi128" directory.
generate_and_save_images(start_range, end_range, font_path=font_path, directory="yi128")
