Dependencies:
- qahirah
- freetype
- `vis_gen.py` from `source/vis_gen.py`

## Download UnicodeData.txt file

In [1]:
import requests

url = "https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt"
response = requests.get(url)

# Save to file
with open("UnicodeData.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

print("Downloaded UnicodeData.txt")

Downloaded UnicodeData.txt


## Download Blocks.txt file

In [2]:
import requests

url = "https://www.unicode.org/Public/UCD/latest/ucd/Blocks.txt"
response = requests.get(url)

# Save to file
with open("Blocks.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

print("Downloaded Blocks.txt")

Downloaded Blocks.txt


## Extract the left-to-right scripts

In [3]:
alphabets = {}

with open('/content/Blocks.txt', 'r') as infile:
  for line in infile.readlines():
    # Skip the comments and new line
    if line[0]=='#' or line[0]=='\n':
      continue

    # Extract the codepoints and the script name
    parts = line.split(";")
    codepoints = parts[0]
    script = parts[1].strip()

    # Separate the starting and ending codepoints
    start_codepoint = codepoints.split('..')[0]
    end_codepoint = codepoints.split('..')[1]

    if any(key in script.lower() for key in ('latin', 'cyrillic', 'armenian',
                                             'greek', 'coptic', 'ipa',
                                             'spacing', 'diacritical')):
      alphabets[script] = {
          'start': start_codepoint,
          'end': end_codepoint
      }

# Sort the dictionary
alphabets = dict(sorted(alphabets.items()))

In [4]:
alphabets

{'Ancient Greek Musical Notation': {'start': '1D200', 'end': '1D24F'},
 'Ancient Greek Numbers': {'start': '10140', 'end': '1018F'},
 'Armenian': {'start': '0530', 'end': '058F'},
 'Basic Latin': {'start': '0000', 'end': '007F'},
 'Combining Diacritical Marks': {'start': '0300', 'end': '036F'},
 'Combining Diacritical Marks Extended': {'start': '1AB0', 'end': '1AFF'},
 'Combining Diacritical Marks Supplement': {'start': '1DC0', 'end': '1DFF'},
 'Combining Diacritical Marks for Symbols': {'start': '20D0', 'end': '20FF'},
 'Coptic': {'start': '2C80', 'end': '2CFF'},
 'Coptic Epact Numbers': {'start': '102E0', 'end': '102FF'},
 'Cyrillic': {'start': '0400', 'end': '04FF'},
 'Cyrillic Extended-A': {'start': '2DE0', 'end': '2DFF'},
 'Cyrillic Extended-B': {'start': 'A640', 'end': 'A69F'},
 'Cyrillic Extended-C': {'start': '1C80', 'end': '1C8F'},
 'Cyrillic Extended-D': {'start': '1E030', 'end': '1E08F'},
 'Cyrillic Supplement': {'start': '0500', 'end': '052F'},
 'Greek Extended': {'start': 

## Extract the codepoints

In [5]:
import os

# Acceptable general categories
renderable_categories = {
    'Lu', 'Ll', 'Lt', 'Lm', 'Lo',  # Letters
    'Mn', 'Mc', 'Me',              # Marks
    'Nd', 'Nl', 'No',              # Numbers
    'Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po',  # Punctuation
    'Sm', 'Sc', 'Sk', 'So',        # Symbols
    # 'Zs'                           # Space separator
}

scripts_dir = '/content/scripts'
os.makedirs(scripts_dir, exist_ok=True)

for script, range_info in alphabets.items():
  start_codepoint = range_info['start']
  end_codepoint = range_info['end']

  start_int = int(start_codepoint, 16)
  end_int = int(end_codepoint, 16)

  with open('/content/UnicodeData.txt' ,'r') as infile, open(os.path.join(scripts_dir,f'{script}.txt'),'w') as outfile:
    for line in infile.readlines():
      parts = line.split(';')

      if len(parts)<3:
        continue

      codepoint, name, category = parts[0], parts[1], parts[2]
      cp_int = int(codepoint, 16)

      if start_int<=cp_int<=end_int:
        if category in renderable_categories:
          outfile.write('U+'+codepoint+'\n')

In [6]:
# Shows the fonts installed in the system
!fc-list

/usr/share/fonts/truetype/liberation/LiberationSansNarrow-Italic.ttf: Liberation Sans Narrow:style=Italic
/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf: Liberation Sans:style=Regular
/usr/share/fonts/truetype/liberation/LiberationMono-BoldItalic.ttf: Liberation Mono:style=Bold Italic
/usr/share/fonts/truetype/liberation/LiberationSerif-Italic.ttf: Liberation Serif:style=Italic
/usr/share/fonts/truetype/liberation/LiberationMono-Bold.ttf: Liberation Mono:style=Bold
/usr/share/fonts/truetype/liberation/LiberationSansNarrow-Regular.ttf: Liberation Sans Narrow:style=Regular
/usr/share/fonts/truetype/liberation/LiberationSerif-Bold.ttf: Liberation Serif:style=Bold
/usr/share/fonts/truetype/liberation/LiberationMono-Regular.ttf: Liberation Mono:style=Regular
/usr/share/fonts/truetype/liberation/LiberationSans-Italic.ttf: Liberation Sans:style=Italic
/usr/share/fonts/truetype/liberation/LiberationSerif-BoldItalic.ttf: Liberation Serif:style=Bold Italic
/usr/share/fonts/truet

In [7]:
# Codepoints which couldn't be rendered before

excluded_codepoints = [
    # Combining Diacritical Marks Extended
    *["U+{:04X}".format(cp) for cp in range(0x1AC1, 0x1ACF)],

    # Combining Diacritical Marks for Symbols
    *["U+{:04X}".format(cp) for cp in range(0x20D0, 0x2100)],
    "U+1DFA",

    # Ancient Greek Musical Notation
    *["U+{:04X}".format(cp) for cp in range(0x1D200, 0x1D245)],

    # Ancient Greek Numbers
    *["U+{:04X}".format(cp) for cp in range(0x10140, 0x1018B)],

    # Armenian
    *["U+{:04X}".format(cp) for cp in range(0x0530, 0x0590)],

    # Coptic
    *["U+{:04X}".format(cp) for cp in range(0x2C80, 0x2D00)],

    # Coptic Epact Numbers
    *["U+{:04X}".format(cp) for cp in range(0x102E0, 0x10300)],

    # Cyrillic Extended-C
    "U+1C89", "U+1C8A",

    # Cyrillic Extended-D
    *["U+{:04X}".format(cp) for cp in range(0x1E030, 0x1E08F)],

    # Latin Extended-D (some scattered, some range)
    "U+A7C0", "U+A7C1",
    *["U+{:04X}".format(cp) for cp in range(0xA7CB, 0xA7F4)],

    # Latin Extended-F
    *["U+{:04X}".format(cp) for cp in range(0x10780, 0x107C0)],

    # Latin Extended-G (assumed typo above, but it's Latin Extended-G)
    *["U+{:04X}".format(cp) for cp in range(0x1DF00, 0x1DF1F)],
    *["U+{:04X}".format(cp) for cp in range(0x1DF25, 0x1DF2A)],
]

len(excluded_codepoints)

703

## Download the noto fonts

In [8]:
!git clone https://github.com/notofonts/notofonts.github.io.git

Cloning into 'notofonts.github.io'...
remote: Enumerating objects: 45290, done.[K
remote: Counting objects: 100% (105/105), done.[K
remote: Compressing objects: 100% (30/30), done.[K
remote: Total 45290 (delta 86), reused 75 (delta 75), pack-reused 45185 (from 3)[K
Receiving objects: 100% (45290/45290), 1.33 GiB | 23.34 MiB/s, done.
Resolving deltas: 100% (30187/30187), done.
Updating files: 100% (11977/11977), done.


## Download the Microsoft fonts

In [9]:
# !git clone https://github.com/pjobson/Microsoft-Fonts.git

## Extract the zip files

In [10]:
# import os
# import gzip
# import shutil

# windows_font_dir = '/content/Microsoft-Fonts/2021 - Windows 11/ttf'

# for filename in os.listdir(windows_font_dir):
#     if filename.endswith('.ttf.gz'):
#         gz_path = os.path.join(windows_font_dir, filename)
#         ttf_path = os.path.join(windows_font_dir, filename[:-3])  # remove .gz

#         # Extract .ttf.gz to .ttf
#         with gzip.open(gz_path, 'rb') as f_in, open(ttf_path, 'wb') as f_out:
#             shutil.copyfileobj(f_in, f_out)

#         # Delete the .ttf.gz file
#         os.remove(gz_path)

# print("Extraction complete and original .ttf.gz files deleted.")

## Create the codepoint to font mapping

In [9]:
root_dirs = ["/content/notofonts.github.io/fonts"]
font_paths = []
for root_dir in root_dirs:
  for root, dirs, files in os.walk(root_dir):
      for file in files:
          if file.endswith('.ttf'):
              font_paths.append(os.path.join(root, file))

Only keep the regular or basic fonts without any styles like Bold, Italic, etc.

In [10]:
import os
from collections import defaultdict

# A map from font family name to its candidate font files
font_family_map = defaultdict(list)

def extract_family_name(path):
    filename = os.path.basename(path)
    # Remove style part like -Bold, -Thin, etc.
    name = filename.replace('.ttf', '')
    name = name.split('-')[0]  # e.g., NotoSansOriya-Regular → NotoSansOriya
    return name

# Group fonts by base family name
for path in font_paths:
    family = extract_family_name(path)
    font_family_map[family].append(path)

# Now pick the 'basic' font: prefer Regular.ttf, else first available
pruned_fonts = []

for family, paths in font_family_map.items():
    regular_fonts = [p for p in paths if 'Regular.ttf' in p]
    if regular_fonts:
        pruned_fonts.append(regular_fonts[0])
    else:
        pruned_fonts.append(paths[0])  # fallback

# Optional: sort the result
pruned_fonts.sort()

# Final output
for font in pruned_fonts:
    print(font)


/content/notofonts.github.io/fonts/NotoFangsongKSSRotated/googlefonts/ttf/NotoFangsongKSSRotated-Regular.ttf
/content/notofonts.github.io/fonts/NotoFangsongKSSVertical/googlefonts/ttf/NotoFangsongKSSVertical-Regular.ttf
/content/notofonts.github.io/fonts/NotoKufiArabic/googlefonts/ttf/NotoKufiArabic-Regular.ttf
/content/notofonts.github.io/fonts/NotoKufiArabic/googlefonts/variable/NotoKufiArabic[wght].ttf
/content/notofonts.github.io/fonts/NotoMusic/googlefonts/ttf/NotoMusic-Regular.ttf
/content/notofonts.github.io/fonts/NotoNaskhArabic/googlefonts/ttf/NotoNaskhArabic-Regular.ttf
/content/notofonts.github.io/fonts/NotoNaskhArabic/googlefonts/variable-ttf/NotoNaskhArabic[wght].ttf
/content/notofonts.github.io/fonts/NotoNaskhArabicUI/googlefonts/ttf/NotoNaskhArabicUI-Regular.ttf
/content/notofonts.github.io/fonts/NotoNaskhArabicUI/googlefonts/variable-ttf/NotoNaskhArabicUI[wght].ttf
/content/notofonts.github.io/fonts/NotoNastaliqUrdu/googlefonts/ttf/NotoNastaliqUrdu-Regular.ttf
/content/

Now install the fonts in the system

In [11]:
import os
import subprocess
from glob import glob

FONT_SOURCE_DIR = "/content/notofonts.github.io/fonts"
FONT_INSTALL_DIR = "/usr/share/fonts/truetype/custom/"

os.makedirs(FONT_INSTALL_DIR, exist_ok=True)

# Get all .ttf files recursively
ttf_files = glob(os.path.join(FONT_SOURCE_DIR, "**/*.ttf"), recursive=True)

# Skip fonts with variable axes like [wght] or [wdth,wght]
def is_variable_font(ttf_path):
    basename = os.path.basename(ttf_path)
    return "[" in basename and "]" in basename

for ttf_file in ttf_files:
    if is_variable_font(ttf_file):
        print(f"Skipping variable font: {ttf_file}")
        continue

    filename = os.path.basename(ttf_file)
    installed_path = os.path.join(FONT_INSTALL_DIR, filename)
    subprocess.run(["cp", ttf_file, installed_path])

# Update font cache
subprocess.run(["fc-cache", "-fv"])

Skipping variable font: /content/notofonts.github.io/fonts/NotoSansKawi/googlefonts/variable-ttf/NotoSansKawi[wght].ttf
Skipping variable font: /content/notofonts.github.io/fonts/NotoSansKawi/unhinted/slim-variable-ttf/NotoSansKawi[wght].ttf
Skipping variable font: /content/notofonts.github.io/fonts/NotoSansKawi/unhinted/variable-ttf/NotoSansKawi[wght].ttf
Skipping variable font: /content/notofonts.github.io/fonts/NotoSansKawi/full/slim-variable-ttf/NotoSansKawi[wght].ttf
Skipping variable font: /content/notofonts.github.io/fonts/NotoSansKawi/full/variable-ttf/NotoSansKawi[wght].ttf
Skipping variable font: /content/notofonts.github.io/fonts/NotoSansOriya/googlefonts/variable-ttf/NotoSansOriya[wdth,wght].ttf
Skipping variable font: /content/notofonts.github.io/fonts/NotoSansOriya/googlefonts/variable/NotoSansOriya[wdth,wght].ttf
Skipping variable font: /content/notofonts.github.io/fonts/NotoSansOriya/unhinted/slim-variable-ttf/NotoSansOriya[wght].ttf
Skipping variable font: /content/not

CompletedProcess(args=['fc-cache', '-fv'], returncode=0)

Verify the installation

In [12]:
!fc-list

/usr/share/fonts/truetype/custom/NotoSansCanadianAboriginal-Thin.ttf: Noto Sans Canadian Aboriginal,Noto Sans Canadian Aboriginal Thin:style=Thin,Regular
/usr/share/fonts/truetype/custom/NotoSansKannadaUI-Black.ttf: Noto Sans Kannada UI,Noto Sans Kannada UI Black:style=Black,Regular
/usr/share/fonts/truetype/custom/NotoSansGurmukhi-ExtraCondensedThin.ttf: Noto Sans Gurmukhi ExtraCondensed,Noto Sans Gurmukhi ExtraCondensed Thin:style=Thin,Regular
/usr/share/fonts/truetype/custom/NotoSerif-ExtraCondensedThinItalic.ttf: Noto Serif,Noto Serif ExtraCondensed Thin:style=ExtraCondensed Thin Italic,Italic
/usr/share/fonts/truetype/custom/NotoSerifThai-Light.ttf: Noto Serif Thai,Noto Serif Thai Light:style=Light,Regular
/usr/share/fonts/truetype/custom/NotoSerif-SemiCondensedMedium.ttf: Noto Serif,Noto Serif SemiCondensed Medium:style=SemiCondensed Medium,Regular
/usr/share/fonts/truetype/custom/NotoSansKannada-SemiCondensedSemiBold.ttf: Noto Sans Kannada,Noto Sans Kannada SemiCondensed SemiBol

In [13]:
import subprocess

# Run fc-list and decode the output
output = subprocess.check_output(['fc-list'], encoding='utf-8')

# Split the output into lines (each line = one font)
font_list = output.strip().split('\n')
fc_font_names={}

for font in font_list:
  font_name = font.split(':')[1].strip().split(',')[0]
  font_name_striped = ''.join(font_name.split())
  if font_name_striped not in fc_font_names:
    fc_font_names[font_name_striped]=font_name

fc_font_names

{'NotoSansCanadianAboriginal': 'Noto Sans Canadian Aboriginal',
 'NotoSansKannadaUI': 'Noto Sans Kannada UI',
 'NotoSansGurmukhiExtraCondensed': 'Noto Sans Gurmukhi ExtraCondensed',
 'NotoSerif': 'Noto Serif',
 'NotoSerifThai': 'Noto Serif Thai',
 'NotoSansKannada': 'Noto Sans Kannada',
 'NotoSerifEthiopic': 'Noto Serif Ethiopic',
 'NotoSansDevanagariUI': 'Noto Sans Devanagari UI',
 'NotoSansBalinese': 'Noto Sans Balinese',
 'NotoSansSogdian': 'Noto Sans Sogdian',
 'NotoSansTeluguUI': 'Noto Sans Telugu UI',
 'NotoSans': 'Noto Sans',
 'NotoSansTifinaghAzawagh': 'Noto Sans Tifinagh Azawagh',
 'NotoSansMalayalamSemiCondensed': 'Noto Sans Malayalam SemiCondensed',
 'NotoSansTamilCondensed': 'Noto Sans Tamil Condensed',
 'NotoSerifDisplayCondensed': 'Noto Serif Display Condensed',
 'NotoSerifSinhalaExtraCondensed': 'Noto Serif Sinhala ExtraCondensed',
 'NotoSansHebrew': 'Noto Sans Hebrew',
 'NotoSansArabicUI': 'Noto Sans Arabic UI',
 'NotoSansThaiLooped': 'Noto Sans Thai Looped',
 'NotoSans

Now go through all the scripts and create the mapping of the Unicode codepoints the fonts which can render them

In [14]:
import os
from fontTools.ttLib import TTFont
from pprint import pprint

scripts_dir = '/content/scripts'
scripts = os.listdir(scripts_dir)
scripts.sort()

supported_fonts_mapping={}

def print_coverage_summary(results):
    header = f"{'Block':<45} {'Supported / Total':>17} {'Coverage':>10}"
    sep = "-" * len(header)
    # print(header)
    # print(sep)

    for info in results:
        supported = info['supported_count']
        total = info['total_count']
        coverage = info['coverage_percent']
        # print(f"{info['block']:<45} {supported:>7} / {total:<7} {coverage:9.2f}%")

def save_unsupported_chars_as_report(results, filename="unsupported_chars_report.txt"):
    with open(filename, 'w', encoding='utf-8') as f:
        # Write summary table header
        header = f"{'Block':<45} {'Supported / Total':>17} {'Coverage':>10}"
        sep = "-" * len(header)
        f.write(header + "\n")
        f.write(sep + "\n")

        # Write summary table rows
        for info in results:
            supported = info['supported_count']
            total = info['total_count']
            coverage = info['coverage_percent']
            f.write(f"{info['block']:<45} {supported:>7} / {total:<7} {coverage:9.2f}%\n")

        f.write("\n\nUnsupported characters per block:\n\n")

        # Write unsupported characters block by block
        for info in results:
            if info['unsupported_chars']:
                f.write(f"Block: {info['block']}\n")
                f.write(f"Unsupported characters ({len(info['unsupported_chars'])} total):\n")
                for cp in info['unsupported_chars']:
                    f.write(f"U+{cp:04X} ({cp})\n")
                f.write("\n" + "-"*40 + "\n\n")

def analyze_font_coverage(font_path):
    font = TTFont(font_path)
    cmap_table=font['cmap']

    cmap = None

    for table in cmap_table.tables:
      if cmap is None:
        cmap = table.cmap
      else:
        if len(table.cmap) > len(cmap):
          cmap = table.cmap

    if not cmap:
        raise ValueError("No Unicode cmap found in font")

    results = []

    for script in scripts:
        supported_chars = []
        unsupported_chars = []
        script_name = script.split('.')[0]
        total_chars = 0

        with open(os.path.join(scripts_dir, script), 'r', encoding='utf-8') as infile:
            for line in infile:
                line = line.strip()
                if not line.strip():
                    continue
                total_chars += 1
                codepoint = int(line.split('+')[1], 16)
                font_path = os.path.basename(font_path).split('.ttf')[0]
                font_name=""
                if '-' in font_path:
                  font_name = font_path.split('-')[0]
                elif '[' in font_path:
                  font_name = font_path.split('[')[0]

                font_name = fc_font_names[font_name]

                if codepoint in cmap:
                    if line not in supported_fonts_mapping:
                      supported_fonts_mapping[line]=set()
                    supported_fonts_mapping[line].add(font_name)
                    supported_chars.append(codepoint)
                else:
                    unsupported_chars.append(codepoint)

        coverage_percent = (len(supported_chars) / total_chars) * 100 if total_chars else 0.0

        # if round(coverage_percent,2)==100.0:
        #   if script_name not in supported_fonts_mapping:
        #     supported_fonts_mapping[script_name]=[font_path]
        #   else:
        #     supported_fonts_mapping[script_name].append(font_path)

        results.append({
            'block': script_name,
            'supported_count': len(supported_chars),
            'total_count': total_chars,
            'coverage_percent': round(coverage_percent, 2),
            'unsupported_chars': unsupported_chars
        })

    return results

for font_path in pruned_fonts:
  if os.path.exists(font_path):
    try:
      coverage_info = analyze_font_coverage(font_path)
      # print_coverage_summary(coverage_info)
      report_name = f"{os.path.basename(font_path).split('.')[0]}_report.txt"
      # save_unsupported_chars_as_report(coverage_info, report_name)
      print(f"Report saved to {report_name}")

    except Exception as e:
      print(f"Skipped {font_path} due to error: {e}")
  else:
      print("Font file not found.")

for cp in supported_fonts_mapping:
    supported_fonts_mapping[cp] = list(supported_fonts_mapping[cp])

Report saved to NotoFangsongKSSRotated-Regular_report.txt
Report saved to NotoFangsongKSSVertical-Regular_report.txt
Report saved to NotoKufiArabic-Regular_report.txt
Report saved to NotoKufiArabic[wght]_report.txt
Report saved to NotoMusic-Regular_report.txt
Report saved to NotoNaskhArabic-Regular_report.txt
Report saved to NotoNaskhArabic[wght]_report.txt
Report saved to NotoNaskhArabicUI-Regular_report.txt
Report saved to NotoNaskhArabicUI[wght]_report.txt
Report saved to NotoNastaliqUrdu-Regular_report.txt
Report saved to NotoNastaliqUrdu[wght]_report.txt
Report saved to NotoRashiHebrew[wght]_report.txt
Report saved to NotoRashiHebrew-Regular_report.txt
Report saved to NotoSans-Regular_report.txt
Report saved to NotoSans[wdth,wght]_report.txt
Report saved to NotoSans[wght]_report.txt
Report saved to NotoSansAdlam-Regular_report.txt
Report saved to NotoSansAdlam[wght]_report.txt
Report saved to NotoSansAdlamUnjoined-Regular_report.txt
Report saved to NotoSansAdlamUnjoined[wght]_repo

In [15]:
supported_fonts_mapping["U+0021"]

['Noto Sans Tifinagh SIL',
 'Noto Sans Mahajani',
 'Noto Sans Sundanese',
 'Noto Kufi Arabic',
 'Noto Sans Soyombo',
 'Noto Sans Lepcha',
 'Noto Sans',
 'Noto Sans Devanagari UI',
 'Noto Sans Tifinagh APT',
 'Noto Naskh Arabic',
 'Noto Fangsong KSS Rotated',
 'Noto Sans Georgian',
 'Noto Sans Devanagari',
 'Noto Sans Palmyrene',
 'Noto Sans Bamum',
 'Noto Serif Balinese',
 'Noto Sans Tifinagh Ghat',
 'Noto Sans Vithkuqi',
 'Noto Serif Bengali',
 'Noto Sans Old Turkic',
 'Noto Serif Devanagari',
 'Noto Sans Ugaritic',
 'Noto Sans Sogdian',
 'Noto Sans Cham',
 'Noto Sans Sinhala',
 'Noto Sans Canadian Aboriginal',
 'Noto Sans Osmanya',
 'Noto Serif Khmer',
 'Noto Serif Tangut',
 'Noto Naskh Arabic UI',
 'Noto Sans Linear B',
 'Noto Sans Imperial Aramaic',
 'Noto Serif Ahom',
 'Noto Sans Marchen',
 'Noto Serif NP Hmong',
 'Noto Rashi Hebrew',
 'Noto Sans Mono',
 'Noto Sans Lydian',
 'Noto Sans Multani',
 'Noto Sans Tifinagh Air',
 'Noto Sans Nag Mundari',
 'Noto Serif Thai',
 'Noto Sans K

In [16]:
import json

# Save to JSON file
json_filename = "supported_fonts_mapping_combined.json"

with open(json_filename, 'w', encoding='utf-8') as json_file:
    json.dump(supported_fonts_mapping, json_file, indent=4, ensure_ascii=False)

print(f"Supported fonts mapping saved to {json_filename}")

Supported fonts mapping saved to supported_fonts_mapping_combined.json


## Find the unsupported characters

In [17]:
unsupported_chars = []

for script in scripts:
    script_name = script.split('.')[0]

    with open(os.path.join(scripts_dir, script), 'r', encoding='utf-8') as infile:
        for line in infile:
            line = line.strip()
            if not line:
                continue  # skip blank lines
            if line not in supported_fonts_mapping:
                unsupported_chars.append(line)

In [18]:
len(unsupported_chars)

83

In [19]:
unsupported_chars

['U+1AC1',
 'U+1AC2',
 'U+1AC3',
 'U+1AC4',
 'U+1AC6',
 'U+1DFA',
 'U+1C89',
 'U+1C8A',
 'U+1E030',
 'U+1E031',
 'U+1E032',
 'U+1E033',
 'U+1E034',
 'U+1E035',
 'U+1E036',
 'U+1E037',
 'U+1E038',
 'U+1E039',
 'U+1E03A',
 'U+1E03B',
 'U+1E03C',
 'U+1E03D',
 'U+1E03E',
 'U+1E03F',
 'U+1E040',
 'U+1E041',
 'U+1E042',
 'U+1E043',
 'U+1E044',
 'U+1E045',
 'U+1E046',
 'U+1E047',
 'U+1E048',
 'U+1E049',
 'U+1E04A',
 'U+1E04B',
 'U+1E04C',
 'U+1E04D',
 'U+1E04E',
 'U+1E04F',
 'U+1E050',
 'U+1E051',
 'U+1E052',
 'U+1E053',
 'U+1E054',
 'U+1E055',
 'U+1E056',
 'U+1E057',
 'U+1E058',
 'U+1E059',
 'U+1E05A',
 'U+1E05B',
 'U+1E05C',
 'U+1E05D',
 'U+1E05E',
 'U+1E05F',
 'U+1E060',
 'U+1E061',
 'U+1E062',
 'U+1E063',
 'U+1E064',
 'U+1E065',
 'U+1E066',
 'U+1E067',
 'U+1E068',
 'U+1E069',
 'U+1E06A',
 'U+1E06B',
 'U+1E06C',
 'U+1E06D',
 'U+1E08F',
 'U+A7CB',
 'U+A7CC',
 'U+A7CD',
 'U+A7DA',
 'U+A7DB',
 'U+A7DC',
 'U+1DF25',
 'U+1DF26',
 'U+1DF27',
 'U+1DF28',
 'U+1DF29',
 'U+1DF2A']

In [20]:
with open("unsupported_chars_noto_combined.txt",'w', encoding='utf-8') as f:
  f.write(", ".join(unsupported_chars))

### Install Roboto fonts (if required)

In [None]:
# !git clone https://github.com/googlefonts/roboto-3-classic.git

Cloning into 'roboto-3-classic'...
remote: Enumerating objects: 234786, done.[K
remote: Counting objects: 100% (188/188), done.[K
remote: Compressing objects: 100% (97/97), done.[K
remote: Total 234786 (delta 118), reused 121 (delta 91), pack-reused 234598 (from 2)[K
Receiving objects: 100% (234786/234786), 219.29 MiB | 23.30 MiB/s, done.
Resolving deltas: 100% (214410/214410), done.
Updating files: 100% (44202/44202), done.


In [None]:
# %cd /content/roboto-3-classic

/content/roboto-3-classic


In [None]:
# !pip install .

In [None]:
# !pip install -r requirements.txt

In [None]:
# !sh sources/build.sh

## Take at least 5 fonts for each codepoint

In [21]:
import json

# Set threshold
MAX_FONTS_PER_CODEPOINT = 5

# Trim to at most 5 fonts per codepoint
trimmed_supported_fonts_mapping = {
    cp: fonts[:MAX_FONTS_PER_CODEPOINT]
    for cp, fonts in supported_fonts_mapping.items()
}

# Save to JSON file
json_filename = "supported_fonts_mapping_trimmed.json"

with open(json_filename, 'w', encoding='utf-8') as json_file:
    json.dump(trimmed_supported_fonts_mapping, json_file, indent=4, ensure_ascii=False)

print(f"Supported fonts mapping saved to {json_filename}")


Supported fonts mapping saved to supported_fonts_mapping_trimmed.json


In [24]:
trimmed_supported_fonts_mapping["U+00A1"]

['Noto Sans Tifinagh SIL',
 'Noto Sans Mahajani',
 'Noto Sans Sundanese',
 'Noto Kufi Arabic',
 'Noto Sans Soyombo']

In [None]:
# !wget https://unifoundry.com/pub/unifont/unifont-16.0.04/font-builds/unifont-16.0.04.otf -O /content/unifont-16.0.04.otf

# !mkdir -p /usr/share/fonts/truetype/custom
# !cp /content/unifont-16.0.04.otf /usr/share/fonts/truetype/custom/
# !fc-cache -fv
# !fc-list | grep -i unifont

### Install Qahirah Library for drawing the characters

In [None]:
!apt-get -y update
!apt-get -y install libfreetype6 libcairo2 libsm6 libxext6 libfontconfig1 libxrender1 fontconfig libgl1-mesa-glx unzip
!wget https://gitlab.com/ldo/qahirah/-/archive/master/qahirah-master.tar.gz
!tar -xvzf /content/qahirah-master.tar.gz
!mv /content/qahirah-master /content/qahirah
%cd /content/qahirah
!pip install .
%cd ..

### Install Python Freetype

In [None]:
!wget https://gitlab.com/ldo/python_freetype/-/archive/master/python_freetype-master.tar.gz
!tar -xvzf /content/python_freetype-master.tar.gz
!mv /content/python_freetype-master /content/python_freetype
%cd /content/python_freetype
!pip install .
%cd ..

### Custom VisualGenerator Class inherited from the VisualGenerator class.

In [8]:
from vis_gen import VisualGenerator
import qahirah as qah
from qahirah import CAIRO, Colour, Vector
from tqdm import tqdm

class CustomVisualGenerator(VisualGenerator):
  def generate_dataset_from_json_file(self, file_path, font_styles, antialiases):
    """
      Generates a dataset of rendered images from a JSON file mapping Unicode codepoints to font names.

      Args:
          file_path (str): Path to the JSON file containing a dictionary where keys are codepoints (e.g., "U+0041")
                          and values are lists of font names that support rendering that codepoint.
          font_styles (List[str]): A list of font style names (e.g., ["Regular", "Bold", "Italic"]) to use for rendering.
          antialiases (List[str]): A list of antialiasing options (e.g., ["Default", "None", "Grayscale"]) to apply
                                  during rendering.

      This method processes each codepoint and renders it using each combination of font name, style, and antialiasing
      setting provided. The rendered images are saved to the output directory specified in the class.

      Note:
          - Codepoints that cause errors during rendering are skipped with a warning.
          - This function assumes the fonts are already installed and accessible by name.
          - Output directory will be created if it doesn't exist.
    """
    out_dir_abs = self._get_out_dir_abs_and_check()

    with open(file_path, 'r') as f:
      trimmed_supported_fonts_mapping = json.load(f)

    print(f"Processing {len(trimmed_supported_fonts_mapping)} codepoints...")
    for codepoint, font_names in tqdm(trimmed_supported_fonts_mapping.items(), desc="Codepoints"):
      try:
        code_point = chr(int('0x' + codepoint[2:], 16))
        self.generate_dataset_from_list([code_point], font_names, font_styles, antialiases)
      except Exception as e:
        print(f"Error processing codepoint {codepoint}: {e}")
        continue

    self._check_out_dir = True

  def generate_dataset_from_list(self, code_points, font_names, font_styles, antialiases):
    # Check if out_dir exists and create if not
    out_dir_abs = self._get_out_dir_abs_and_check()

    for font_name in font_names:
      self.font_name = font_name
      for font_style in font_styles:
        self.font_style = font_style
        for antialias in antialiases:
          self.antialias = antialias
          self.visualize_list(code_points)

    # Flag flipped to False in self._get_out_dir_abs_and_check
    self._check_out_dir = True

  def visualize_list(self, code_points, x=None, y=None):
    # Check if out_dir exists and create if not
    out_dir_abs = self._get_out_dir_abs_and_check()

    # Visualize list of code points
    for idx, code_point in enumerate(code_points):
        self.visualize_single(code_point, False, x=x, y=y)

    # Flag flipped to False in self._get_out_dir_abs_and_check
    self._check_out_dir = True

In [9]:
# Remove the old datasets to make space
!rm -rf /content/data_* data_*.zip

In [None]:
from vis_gen import VisualGenerator
import os
import json

vg = CustomVisualGenerator(font_name='Noto Sans')

vg.image_size = 42
vg.font_size = 40
vg.out_dir = 'data_custom'
mapping_file='/content/supported_fonts_mapping_trimmed.json'

vg.generate_dataset_from_json_file(mapping_file, ['Bold','Medium','Regular','DemiLight','Light','Thin'],['Default','None'])

In [None]:
!zip -r data_custom.zip /content/data_custom

## Transfer the data to Google Drive

In [33]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [34]:
!cp /content/supported_fonts_mapping_combined.json /content/drive/MyDrive/Unicode_GSoC_Colab

In [None]:
# !cp /content/supported_fonts_mapping_trimmed.json /content/drive/MyDrive/Unicode_GSoC_Colab