Skip to content

Commit

Permalink
Refactor unicode.py
Browse files Browse the repository at this point in the history
- Align tables
- Use helper function to parse properties
  • Loading branch information
Jules-Bertholet committed May 22, 2024
1 parent da626ef commit a2db56b
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 102 deletions.
172 changes: 81 additions & 91 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,14 @@

import enum
import math
import operator
import os
import re
import sys
import urllib.request
from collections import defaultdict
from itertools import batched
from typing import Callable

UNICODE_VERSION = "15.1.0"
"""The version of the Unicode data files to download."""
Expand Down Expand Up @@ -90,13 +92,32 @@ def fetch_open(filename: str, local_prefix: str = ""):
sys.exit(1)


def load_unicode_version() -> tuple[int, int, int]:
    """Return the current Unicode version by fetching and parsing `ReadMe.txt`.

    Returns:
        A `(major, minor, micro)` triple of ints.

    Raises:
        ValueError: if the version line cannot be found in `ReadMe.txt`
            (previously this surfaced as an opaque `AttributeError` from
            `re.search(...)` returning `None`).
    """
    with fetch_open("ReadMe.txt") as readme:
        pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
        match = re.search(pattern, readme.read())
        if match is None:
            raise ValueError("Could not find Unicode version in ReadMe.txt")
        # Unpack explicitly so the result is guaranteed to be a 3-tuple,
        # matching the annotated return type.
        major, minor, micro = map(int, match.groups())
        return (major, minor, micro)


def load_property(filename: str, pattern: str, action: Callable[[int], None]):
    """Invoke `action` once per codepoint listed in the UCD file `filename`
    whose property field matches the regex fragment `pattern`.

    Handles both entry shapes of the UCD file format:
    a single codepoint (`ABCD;Prop ...`) and an inclusive range
    (`ABCD..ABFF;Prop ...`).

    Args:
        filename: UCD data file name, passed to `fetch_open`.
        pattern: regex fragment matched against the property field;
            callers should pre-group alternations, e.g. `r"(?:V|T)"`.
        action: callback applied to each matching codepoint (an int).
    """
    with fetch_open(filename) as properties:
        # One regex covers both entry shapes: group 2 (the range end) is
        # absent for single-codepoint lines.
        entry = re.compile(rf"^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*{pattern}\s+")

        # Iterate the file lazily instead of materializing it with readlines().
        for line in properties:
            if match := entry.match(line):
                low = int(match.group(1), 16)
                high = int(match.group(2), 16) if match.group(2) else low
                for cp in range(low, high + 1):
                    action(cp)


class EffectiveWidth(enum.IntEnum):
"""Represents the width of a Unicode character. All East Asian Width classes resolve into
either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.
Expand All @@ -112,15 +133,15 @@ class EffectiveWidth(enum.IntEnum):
""" Two columns wide in a CJK context. One column wide in all other contexts. """


def load_east_asian_widths() -> "list[EffectiveWidth]":
def load_east_asian_widths() -> list[EffectiveWidth]:
"""Return a list of effective widths, indexed by codepoint.
Widths are determined by fetching and parsing `EastAsianWidth.txt`.
`Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`.
`Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.
`Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`."""
`Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
with fetch_open("EastAsianWidth.txt") as eaw:
# matches a width assignment for a single codepoint, i.e. "1F336;N # ..."
single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
Expand Down Expand Up @@ -161,7 +182,7 @@ def load_east_asian_widths() -> "list[EffectiveWidth]":
return width_map


def load_zero_widths() -> "list[bool]":
def load_zero_widths() -> list[bool]:
"""Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
character. `c` is considered a zero-width character if
Expand All @@ -180,26 +201,11 @@ def load_zero_widths() -> "list[bool]":
# `Grapheme_Extend` includes characters with general category `Mn` or `Me`,
# as well as a few `Mc` characters that need to be included so that
# canonically equivalent sequences have the same width.
with fetch_open("DerivedCoreProperties.txt") as properties:
single = re.compile(
r"^([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
)
multiple = re.compile(
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(?:Default_Ignorable_Code_Point|Grapheme_Extend)\s+"
)

for line in properties.readlines():
raw_data = None # (low, high)
if match := single.match(line):
raw_data = (match.group(1), match.group(1))
elif match := multiple.match(line):
raw_data = (match.group(1), match.group(2))
else:
continue
low = int(raw_data[0], 16)
high = int(raw_data[1], 16)
for cp in range(low, high + 1):
zw_map[cp] = True
load_property(
"DerivedCoreProperties.txt",
r"(?:Default_Ignorable_Code_Point|Grapheme_Extend)",
lambda cp: operator.setitem(zw_map, cp, True),
)

# Unicode spec bug: these should be `Grapheme_Cluster_Break=Extend`,
# as they canonically decompose to two characters with this property,
Expand All @@ -217,29 +223,11 @@ def load_zero_widths() -> "list[bool]":
# and the resulting grapheme has width 2.
#
# (See the Unicode Standard sections 3.12 and 18.6 for more on Hangul)
with fetch_open("HangulSyllableType.txt") as categories:
single = re.compile(r"^([0-9A-F]+)\s*;\s*(V|T)\s+")
multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(V|T)\s+")

for line in categories.readlines():
raw_data = None # (low, high)
if match := single.match(line):
raw_data = (match.group(1), match.group(1))
elif match := multiple.match(line):
raw_data = (match.group(1), match.group(2))
else:
continue
low = int(raw_data[0], 16)
high = int(raw_data[1], 16)
for cp in range(low, high + 1):
zw_map[cp] = True

# Special case: U+115F HANGUL CHOSEONG FILLER.
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
# zero width. However, the expected usage is to combine it with vowel or trailing jamo
# (which are considered 0-width on their own) to form a composed Hangul syllable with
# width 2. Therefore, we treat it as having width 2.
zw_map[0x115F] = False
load_property(
"HangulSyllableType.txt",
r"(?:V|T)",
lambda cp: operator.setitem(zw_map, cp, True),
)

# Syriac abbreviation mark:
# Zero-width `Prepended_Concatenation_Mark`
Expand All @@ -252,7 +240,14 @@ def load_zero_widths() -> "list[bool]":
zw_map[0x0891] = True
zw_map[0x08E2] = True

# U+A8FA DEVANAGARI CARET
# HANGUL CHOSEONG FILLER
# U+115F is a `Default_Ignorable_Code_Point`, and therefore would normally have
# zero width. However, the expected usage is to combine it with vowel or trailing jamo
# (which are considered 0-width on their own) to form a composed Hangul syllable with
# width 2. Therefore, we treat it as having width 2.
zw_map[0x115F] = False

# DEVANAGARI CARET
# https://www.unicode.org/versions/Unicode15.0.0/ch12.pdf#G667447
zw_map[0xA8FA] = True

Expand Down Expand Up @@ -287,13 +282,13 @@ def try_extend(self, attempt: "Bucket") -> bool:
self.widths = more
return True

def entries(self) -> list[tuple[Codepoint, EffectiveWidth]]:
    """Return the codepoint/width pairs collected in this bucket,
    in ascending codepoint order."""
    return sorted(self.entry_set)

def width(self) -> "EffectiveWidth | None":
def width(self) -> EffectiveWidth | None:
"""If all codepoints in this bucket have the same width, return that width; otherwise,
return `None`."""
if len(self.widths) == 0:
Expand All @@ -305,7 +300,7 @@ def width(self) -> "EffectiveWidth | None":
return potential_width


def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> list[Bucket]:
"""Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All
codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the
same bucket. Returns a list of the buckets in increasing order of those bits."""
Expand Down Expand Up @@ -373,7 +368,7 @@ def buckets(self):
"""Returns an iterator over this table's buckets."""
return self.indexed

def to_bytes(self) -> "list[int]":
def to_bytes(self) -> list[int]:
"""Returns this table's entries as a list of bytes. The bytes are formatted according to
the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
Expand All @@ -389,8 +384,8 @@ def to_bytes(self) -> "list[int]":


def make_tables(
table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries
) -> "list[Table]":
table_cfgs: list[tuple[BitPos, BitPos, OffsetType]], entries
) -> list[Table]:
"""Creates a table for each configuration in `table_cfgs`, with the first config corresponding
to the top-level lookup table, the second config corresponding to the second-level lookup
table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs
Expand All @@ -404,7 +399,7 @@ def make_tables(
return tables


def load_emoji_presentation_sequences() -> "list[int]":
def load_emoji_presentation_sequences() -> list[int]:
"""Outputs a list of character ranges, corresponding to all the valid characters for starting
an emoji presentation sequence."""

Expand All @@ -420,7 +415,7 @@ def load_emoji_presentation_sequences() -> "list[int]":
return codepoints


def load_text_presentation_sequences() -> "list[int]":
def load_text_presentation_sequences() -> list[int]:
"""Outputs a list of character ranges, corresponding to all the valid characters
whose widths change with a text presentation sequence."""

Expand All @@ -435,24 +430,12 @@ def load_text_presentation_sequences() -> "list[int]":
text_presentation_seq_codepoints.add(cp)

default_emoji_codepoints = set()
with fetch_open("emoji/emoji-data.txt") as emoji_data:
single = re.compile(r"^([0-9A-F]+)\s*;\s*Emoji_Presentation\s+")
multiple = re.compile(
r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*Emoji_Presentation\s+"
)

for line in emoji_data.readlines():
raw_data = None # (low, high)
if match := single.match(line):
raw_data = (match.group(1), match.group(1))
elif match := multiple.match(line):
raw_data = (match.group(1), match.group(2))
else:
continue
low = int(raw_data[0], 16)
high = int(raw_data[1], 16)
for cp in range(low, high + 1):
default_emoji_codepoints.add(cp)
load_property(
"emoji/emoji-data.txt",
"Emoji_Presentation",
lambda cp: default_emoji_codepoints.add(cp),
)

codepoints = []
for cp in text_presentation_seq_codepoints.intersection(default_emoji_codepoints):
Expand All @@ -466,11 +449,11 @@ def load_text_presentation_sequences() -> "list[int]":


def make_presentation_sequence_table(
seqs: "list[int]",
width_map: "list[EffectiveWidth]",
spurious_false: "set[EffectiveWidth]",
spurious_true: "set[EffectiveWidth]",
) -> "tuple[list[tuple[int, int]], list[list[int]]]":
seqs: list[Codepoint],
width_map: list[EffectiveWidth],
spurious_false: set[EffectiveWidth],
spurious_true: set[EffectiveWidth],
) -> tuple[list[tuple[int, int]], list[list[int]]]:
"""Generates 2-level lookup table for whether a codepoint might start an emoji variation sequence.
The first level is a match on all but the 10 LSB, the second level is a 1024-bit bitmap for those 10 LSB.
"""
Expand All @@ -488,13 +471,13 @@ def make_presentation_sequence_table(
):
del prefixes_dict[k]

msbs: "list[int]" = list(prefixes_dict.keys())
msbs: list[int] = list(prefixes_dict.keys())

for cp, width in enumerate(width_map):
if width in spurious_true and (cp >> 10) in msbs:
prefixes_dict[cp >> 10].add(cp & 0x3FF)

leaves: "list[list[int]]" = []
leaves: list[list[int]] = []
for cps in prefixes_dict.values():
leaf = [0] * 128
for cp in cps:
Expand Down Expand Up @@ -524,10 +507,10 @@ def make_presentation_sequence_table(

def emit_module(
out_name: str,
unicode_version: "tuple[int, int, int]",
tables: "list[Table]",
emoji_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
text_presentation_table: "tuple[list[tuple[int, int]], list[list[int]]]",
unicode_version: tuple[int, int, int],
tables: list[Table],
emoji_presentation_table: tuple[list[tuple[int, int]], list[list[int]]],
text_presentation_table: tuple[list[tuple[int, int]], list[list[int]]],
):
"""Outputs a Rust module to `out_name` using table data from `tables`.
If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.
Expand Down Expand Up @@ -574,18 +557,18 @@ def emit_module(
pub fn lookup_width(c: char, is_cjk: bool) -> usize {
let cp = c as usize;
let t1_offset = TABLES_0[cp >> 13 & 0xFF];
let t1_offset = TABLES_0.0[cp >> 13 & 0xFF];
// Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte,
// so each sub-table is 128 bytes in size.
// (Sub-tables are selected using the computed offset from the previous table.)
let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
let t2_offset = TABLES_1.0[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];
// Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits.
// This is accomplished by packing four stored entries into one byte.
// So each sub-table is 2**(6-2) == 16 bytes in size.
// Since this is the last table, each entry represents an encoded width.
let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
let packed_widths = TABLES_2.0[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];
// Extract the packed width
let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;
Expand Down Expand Up @@ -669,6 +652,12 @@ def emit_module(
// Use the 3 LSB of `cp` to index into `leaf_byte`.
((leaf_byte >> (cp & 7)) & 1) == 1
}
#[repr(align(128))]
struct Align128<T>(T);
#[repr(align(16))]
struct Align16<T>(T);
"""
)

Expand All @@ -677,26 +666,27 @@ def emit_module(
new_subtable_count = len(table.buckets())
if i == len(tables) - 1:
table.indices_to_widths() # for the last table, indices == widths
align = 16
else:
align = 128
byte_array = table.to_bytes()
module.write(
f"""
/// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
static TABLES_{i}: [u8; {len(byte_array)}] = ["""
static TABLES_{i}: Align{align}<[u8; {len(byte_array)}]> = Align{align}(["""
)
for j, byte in enumerate(byte_array):
# Add line breaks for every 15th entry (chosen to match what rustfmt does)
if j % 15 == 0:
module.write("\n ")
module.write(f" 0x{byte:02X},")
module.write("\n ];\n")
module.write("\n ]);\n")
subtable_count = new_subtable_count

# emoji table

module.write(
f"""
#[repr(align(128))]
struct Align128<T>(T);
/// Array of 1024-bit bitmaps. Index into the correct bitmap with the 10 LSB of your codepoint
/// to get whether it can start an emoji presentation sequence.
static EMOJI_PRESENTATION_LEAVES: Align128<[[u8; 128]; {len(emoji_presentation_leaves)}]> = Align128([
Expand Down
Loading

0 comments on commit a2db56b

Please sign in to comment.