From 7ef6fe73d886fabd8f0057fd5ac52b0e21bd8bcd Mon Sep 17 00:00:00 2001 From: Garret Rieger Date: Fri, 31 Oct 2025 21:41:31 +0000 Subject: [PATCH] Add built-in support for loading data files from ift-encoder-data. --- Note: built_in_freq_data_name uses the next free field number (9) so that the existing CostConfiguration field numbers (1-8) stay unchanged and previously serialized configs remain readable. MODULE.bazel | 4 ++++ util/BUILD | 10 +++++++--- util/load_codepoints.cc | 5 +++++ util/load_codepoints.h | 6 ++++++ util/load_codepoints_test.cc | 10 ++++++++++ util/segmenter_config.proto | 14 ++++++++++---- util/segmenter_config_util.cc | 16 ++++++++++++---- util/segmenter_config_util.h | 4 +++- 8 files changed, 57 insertions(+), 12 deletions(-) diff --git a/MODULE.bazel b/MODULE.bazel index 659fc4c..3af012e 100644 --- a/MODULE.bazel +++ b/MODULE.bazel @@ -15,6 +15,10 @@ bazel_dep(name = "rules_rust", version = "0.64.0") bazel_dep(name = "glib", version = "2.82.2.bcr.5") bazel_dep(name = "brotli", version = "1.1.0") +# Frequency Data +bazel_dep(name = "ift_encoder_data", version = "git") +git_override(module_name = "ift_encoder_data", remote = "https://github.com/w3c/ift-encoder-data.git", commit = "317ea02ac68d45004aa842f831c65ed33c891701") + # Non Bazel Modules http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") diff --git a/util/BUILD b/util/BUILD index 1c245de..c3b3e60 100644 --- a/util/BUILD +++ b/util/BUILD @@ -98,6 +98,9 @@ cc_binary( srcs = [ "closure_glyph_keyed_segmenter_util.cc", ], + data = [ + "@ift_encoder_data//:freq_data", + ], deps = [ "//ift", "//ift/encoder", @@ -214,9 +217,10 @@ cc_test( "testdata/codepoints_invalid_1.txt", "testdata/codepoints_invalid_2.txt", "testdata/codepoints_with_freq.txt", - "testdata/codepoints_with_freq_invalid.txt", - "testdata/test_freq_data.riegeli", - "testdata/invalid_test_freq_data.riegeli", + "testdata/codepoints_with_freq_invalid.txt", + "testdata/test_freq_data.riegeli", + "testdata/invalid_test_freq_data.riegeli", + "@ift_encoder_data//:freq_data", ] + glob([ "testdata/sharded/*", ]), diff --git a/util/load_codepoints.cc 
b/util/load_codepoints.cc index 8e1f826..2125ea1 100644 --- a/util/load_codepoints.cc +++ b/util/load_codepoints.cc @@ -205,4 +205,9 @@ StatusOr LoadFrequenciesFromRiegeli(const char* path) { return frequencies; } +StatusOr LoadBuiltInFrequencies(const char* name) { + std::string path = StrCat("../ift_encoder_data+/data/", name); + return LoadFrequenciesFromRiegeli(path.c_str()); +} + } // namespace util \ No newline at end of file diff --git a/util/load_codepoints.h b/util/load_codepoints.h index 21521d7..ee00601 100644 --- a/util/load_codepoints.h +++ b/util/load_codepoints.h @@ -43,6 +43,12 @@ absl::StatusOr LoadFile(const char* path); absl::StatusOr LoadFrequenciesFromRiegeli( const char* path); +// Loads frequency data from https://github.com/w3c/ift-encoder-data +// +// name is the file name to load. +// Append "@*" to the name to load all sharded files for a name. +absl::StatusOr LoadBuiltInFrequencies(const char* name); + // Given a filepath if it ends with @* this will expand the path into // the list of paths matching the pattern: -?????-of-????? // Otherwise returns just the input path. 
diff --git a/util/load_codepoints_test.cc b/util/load_codepoints_test.cc index dab594e..f2eeeec 100644 --- a/util/load_codepoints_test.cc +++ b/util/load_codepoints_test.cc @@ -155,4 +155,14 @@ TEST_F(LoadCodepointsTest, ExpandShardedPath) { ASSERT_TRUE(absl::IsNotFound(result.status())) << result.status(); } +TEST_F(LoadCodepointsTest, LoadBuiltInFrequencies) { + auto result = + util::LoadBuiltInFrequencies("Script_latin.riegeli"); + ASSERT_TRUE(result.ok()) << result.status(); + + EXPECT_EQ(result->ProbabilityFor(0x20, 0x20), 1.0); + EXPECT_LT(result->ProbabilityFor(0x20, 'Z'), 1.0); + EXPECT_EQ(result->CoveredCodepoints().size(), 1363); +} + } // namespace util diff --git a/util/segmenter_config.proto b/util/segmenter_config.proto index 5dfb51e..61481f4 100644 --- a/util/segmenter_config.proto +++ b/util/segmenter_config.proto @@ -144,10 +144,16 @@ message MergeGroup { // Codepoint frequency data is utilized to evaluate the cost function. message CostConfiguration { - // Path (relative to this configuration file) that points to Riegeli encoded frequency data - // (using unicode_count.proto) that will be used during the evaluation of the cost function - // for merging within this group. Must be provided. - string path_to_frequency_data = 1; + oneof freq_data { + // Path (relative to this configuration file) that points to Riegeli encoded frequency data + // (using unicode_count.proto) that will be used during the evaluation of the cost function + // for merging within this group. Either this or built_in_freq_data_name must be provided. + string path_to_frequency_data = 1; + + // Name (eg. "Script_latin.riegeli") of one of the data files from https://github.com/w3c/ift-encoder-data + // For sharded files append a "@*" (eg. "Script_japanese.riegeli@*"). + string built_in_freq_data_name = 9; + } // By default the cost function used for merging will utilize only individual codepoint probabilities // (ie. 
P(A)), when pair probabilities are needed it's assumed that individual probabilities are 
diff --git a/util/segmenter_config_util.cc b/util/segmenter_config_util.cc index eb57dda..d61fc6d 100644 --- a/util/segmenter_config_util.cc +++ b/util/segmenter_config_util.cc @@ -23,7 +23,13 @@ namespace util { // Loads unicode frequency data from either a dedicated frequency data file or // from the codepoint and frequency entries if no data file is given. 
StatusOr SegmenterConfigUtil::GetFrequencyData( - const std::string& frequency_data_file_path) { + const std::string& frequency_data_file_path, + bool built_in + ) { + if (built_in) { + return util::LoadBuiltInFrequencies(frequency_data_file_path.c_str()); + } + std::filesystem::path freq_path = frequency_data_file_path; std::filesystem::path resolved_path = freq_path; if (freq_path.is_relative()) { @@ -91,13 +97,15 @@ StatusOr SegmenterConfigUtil::ProtoToStrategy( CostConfiguration merged = base; merged.MergeFrom(config); - if (merged.path_to_frequency_data().empty()) { + if (merged.path_to_frequency_data().empty() && merged.built_in_freq_data_name().empty()) { return absl::InvalidArgumentError( "Path to frequency data must be provided."); } - UnicodeFrequencies freq = - TRY(GetFrequencyData(merged.path_to_frequency_data())); + UnicodeFrequencies freq = merged.has_built_in_freq_data_name() ? + TRY(GetFrequencyData(merged.built_in_freq_data_name(), true)) : + TRY(GetFrequencyData(merged.path_to_frequency_data(), false)); + covered_codepoints = freq.CoveredCodepoints(); MergeStrategy strategy = MergeStrategy::None(); diff --git a/util/segmenter_config_util.h b/util/segmenter_config_util.h index c75325b..2b3395a 100644 --- a/util/segmenter_config_util.h +++ b/util/segmenter_config_util.h @@ -46,7 +46,9 @@ class SegmenterConfigUtil { absl::flat_hash_map& segment_id_to_index); absl::StatusOr GetFrequencyData( - const std::string& frequency_data_file_path); + const std::string& frequency_data_file_path, + bool built_in + ); absl::StatusOr ProtoToStrategy( const CostConfiguration& base, const CostConfiguration& config,