# Generate a Chinese glyph and definition dataset
We use the unicode consortium's [unihan database](https://github.com/unicode-org/unihan-database) for this. We can take the kDefinition text file to extract both characters and labels in the form of definitions.

In [8]:
import pandas as pd
import os
import re

# Definitions are of the form "(same as U+4E18 丘) hillock or mound"
# Need to clean the definition of unicode references, Chinese characters and parentheses

def clean_definition(definition):
    # Remove any parentheses containing a Chinese character
    definition = re.sub(r'\(.*?[/u3400-\u9FFF]+.*?\)', '', definition)
    # Remove clauses containing a Chinese character separated by commas or semicolons
    definition = re.sub(r',.*?[\u3400-\u9FFF]+.*?(,|$)', '', definition)
    definition = re.sub(r';.*?[\u3400-\u9FFF]+.*?(;|$)', '', definition)
    # Remove all remaining Chinese characters and non-standard characters
    definition = re.sub(r'[\u3400-\u9FFF]', '', definition)
    # Remove Unicode codes
    definition = re.sub(r'U\+\w+', '', definition)
    # Keep only ASCII
    definition = re.sub(r'[^\x00-\x7F]', '', definition)
    # Remove specific set of punctuation at start or end of string
    definition = re.sub(r'^[!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+|[!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+$', '', definition)
    # Remove isolated punctuation (surrounded by spaces)
    definition = re.sub(r'\s[!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+\s', ' ', definition)
    # Trim extra spaces
    definition = re.sub(r'\s+', ' ', definition).strip()
    return definition

# Initialize lists to store the data
unicodes = []
characters = []
definitions = []

# Read the file line by line
file = "Unihan-kDefinition.txt"
with open(file, 'r', encoding='utf-8') as file:
    for line in file:
        # Extract unicode, character, and definition
        match = re.match(r'U\+([0-9A-F]+)\s([\u3400-\u9FFF])\tkDefinition\t(.+)', line)
        if match:
            unicodes.append(match.group(1))
            characters.append(match.group(2))
            cleaned_definition = clean_definition(match.group(3))
            definitions.append(cleaned_definition)

# Load into a DataFrame
df = pd.DataFrame({
    'Unicode': unicodes,
    'Character': characters,
    'Definition': definitions
})

# Remove rows with empty definitions
df = df[df['Definition'] != '']
df.head()

Unnamed: 0,Unicode,Character,Definition
0,3400,㐀,hillock or mound
1,3401,㐁,"to lick; to taste, a mat, bamboo bark"
2,3402,㐂,"non-standard form of , to like, love, enjoy; a..."
3,3405,㐅,five
4,3406,㐆,"to follow, to trust to; to put confidence in; ..."


Define functions to generate a dataset

In [9]:
import pandas as pd
import os
from PIL import Image, ImageDraw, ImageFont

def create_image(character, font, image_size=(128, 128)):
    """
    Create an image of a single Unicode character.
    """
    image = Image.new('RGB', image_size, 'white')
    draw = ImageDraw.Draw(image)
    text_width, text_height = draw.textbbox((0, 0), character, font=font)[2:]
    x = (image_size[0] - text_width) / 2
    y = (image_size[1] - text_height) / 2 - 16
    draw.text((x, y), character, fill='black', font=font)
    return image

def is_character_supported(character, font, image_size=(128, 128)):
    """
    Check if the character is supported by the font.
    """
    image_char = create_image(character, font, image_size)
    image_unknown = create_image("�", font, image_size)  # U+FFFD is the replacement character

    return not image_char.tobytes() == image_unknown.tobytes()

def generate_and_save_images_from_df(df, font_path, column_name="Unicode", directory="data"):
    """
    Generate and save images for each character specified in the DataFrame's column.
    """
    if not os.path.exists(directory):
        os.makedirs(directory)

    font_size = 100
    font = ImageFont.truetype(font_path, font_size)

    for unicode_str in df[column_name]:
        code_point = int(unicode_str, 16)  # Convert hex string to integer
        character = chr(code_point)
        if is_character_supported(character, font):
            img = create_image(character, font)
            filename = os.path.join(directory, f"{code_point:04X}.png")
            img.save(filename)

# Assuming 'df' is your DataFrame and it is already defined and loaded
font_path = './fonts/NotoSansSC-Regular.ttf'
# Update the function call accordingly
generate_and_save_images_from_df(df, font_path, column_name="Unicode", directory="data")


Generate metadata.jsonl

In [13]:
# Change "Definition" to "caption" column name
df = df.rename(columns={'Definition': 'caption'})

# Generate file names based on the 'Character Unicode' column
df['file_name'] = df['Unicode'].apply(lambda x: x + '.png')

# # If you need to split your dataset
# train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# # Assign 'split' labels
# train_df['split'] = 'train'
# test_df['split'] = 'test'

# # Combine back if needed
# df_combined = pd.concat([train_df, test_df])

# Ensure the DataFrame is sorted or in the desired order
# This step is optional and can be customized
# df_combined = df_combined.sort_values('some_column')

# Convert to JSON Lines format
jsonl_string_with_splits = df.to_json(orient='records', lines=True)

# Write to metadata.jsonl file
with open('metadata.jsonl', 'w') as file:
    file.write(jsonl_string_with_splits)

# move metadata.jsonl to the data folder for training purposes