Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,10 @@ bazel_dep(name = "rules_rust", version = "0.64.0")
bazel_dep(name = "glib", version = "2.82.2.bcr.5")
bazel_dep(name = "brotli", version = "1.1.0")

# Frequency Data
bazel_dep(name = "ift_encoder_data", version = "git")
git_override(module_name = "ift_encoder_data", remote = "https://github.com/w3c/ift-encoder-data.git", commit = "317ea02ac68d45004aa842f831c65ed33c891701")

# Non Bazel Modules
http_archive = use_repo_rule("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

Expand Down
10 changes: 7 additions & 3 deletions util/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ cc_binary(
srcs = [
"closure_glyph_keyed_segmenter_util.cc",
],
data = [
"@ift_encoder_data//:freq_data",
],
deps = [
"//ift",
"//ift/encoder",
Expand Down Expand Up @@ -214,9 +217,10 @@ cc_test(
"testdata/codepoints_invalid_1.txt",
"testdata/codepoints_invalid_2.txt",
"testdata/codepoints_with_freq.txt",
"testdata/codepoints_with_freq_invalid.txt",
"testdata/test_freq_data.riegeli",
"testdata/invalid_test_freq_data.riegeli",
"testdata/codepoints_with_freq_invalid.txt",
"testdata/test_freq_data.riegeli",
"testdata/invalid_test_freq_data.riegeli",
"@ift_encoder_data//:freq_data",
] + glob([
"testdata/sharded/*",
]),
Expand Down
5 changes: 5 additions & 0 deletions util/load_codepoints.cc
Original file line number Diff line number Diff line change
Expand Up @@ -205,4 +205,9 @@ StatusOr<UnicodeFrequencies> LoadFrequenciesFromRiegeli(const char* path) {
return frequencies;
}

// Loads one of the frequency data files provided by the ift_encoder_data
// module. name is appended to the relative data directory
// "../ift_encoder_data+/data/" and the resulting path is handed to
// LoadFrequenciesFromRiegeli(), whose result is returned unchanged.
StatusOr<UnicodeFrequencies> LoadBuiltInFrequencies(const char* name) {
  // The temporary string lives until the end of the full expression, so the
  // c_str() pointer stays valid for the duration of the call.
  return LoadFrequenciesFromRiegeli(
      StrCat("../ift_encoder_data+/data/", name).c_str());
}

} // namespace util
6 changes: 6 additions & 0 deletions util/load_codepoints.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ absl::StatusOr<common::FontData> LoadFile(const char* path);
absl::StatusOr<ift::freq::UnicodeFrequencies> LoadFrequenciesFromRiegeli(
const char* path);

// Loads frequency data from https://github.com/w3c/ift-encoder-data
//
// name is the file name to load (eg. "Script_latin.riegeli"). It is resolved
// against the data directory of the ift_encoder_data repository.
// Append "@*" to the name to load all sharded files for a name.
//
// Returns the loaded frequencies, or an error status if loading fails.
absl::StatusOr<ift::freq::UnicodeFrequencies> LoadBuiltInFrequencies(const char* name);

// Given a filepath if it ends with @* this will expand the path into
// the list of paths matching the pattern: <path>-?????-of-?????
// Otherwise returns just the input path.
Expand Down
10 changes: 10 additions & 0 deletions util/load_codepoints_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -155,4 +155,14 @@ TEST_F(LoadCodepointsTest, ExpandShardedPath) {
ASSERT_TRUE(absl::IsNotFound(result.status())) << result.status();
}

// Verifies that a built-in data file can be located and loaded by name.
TEST_F(LoadCodepointsTest, LoadBuiltInFrequencies) {
  auto loaded = util::LoadBuiltInFrequencies("Script_latin.riegeli");
  ASSERT_TRUE(loaded.ok()) << loaded.status();

  // Spot check a couple of probabilities and the overall coverage of the
  // Latin data set.
  auto& latin = *loaded;
  EXPECT_EQ(latin.ProbabilityFor(0x20, 0x20), 1.0);
  EXPECT_LT(latin.ProbabilityFor(0x20, 'Z'), 1.0);
  EXPECT_EQ(latin.CoveredCodepoints().size(), 1363);
}

} // namespace util
28 changes: 17 additions & 11 deletions util/segmenter_config.proto
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,16 @@ message MergeGroup {
// Codepoint frequency data is utilized to evaluate the cost function.
message CostConfiguration {

// Path (relative to this configuration file) that points to Riegeli encoded frequency data
// (using unicode_count.proto) that will be used during the evaluation of the cost function
// for merging within this group. Must be provided.
string path_to_frequency_data = 1;
oneof freq_data {
// Path (relative to this configuration file) that points to Riegeli encoded frequency data
// (using unicode_count.proto) that will be used during the evaluation of the cost function
// for merging within this group. Either this or built_in_freq_data_name must be provided.
string path_to_frequency_data = 1;

// Name (eg. "Script_latin.riegeli") of one of the data files from https://github.com/w3c/ift-encoder-data
// For sharded files append a "@*" (eg. "Script_japanese.riegeli@*").
string built_in_freq_data_name = 2;
}

// By default the cost function used for merging will utilize only individual codepoint probabilities
// (ie. P(A)), when pair probabilities are needed it's assumed that individual probabilities are
Expand All @@ -157,32 +163,32 @@ message CostConfiguration {
// the cost function will not assume independence and utilize the supplied pair frequencies.
// This is much more accurate, but increases the computational complexity of the cost analysis
// significantly.
bool use_bigrams = 2 [default = false];
bool use_bigrams = 3 [default = false];

// When evaluating the cost of each patch this is the estimated number of bytes of
// network overhead for making the network request for the patch. Higher values encourage
// more aggressive merging.
uint32 network_overhead_cost = 3 [default = 75];
uint32 network_overhead_cost = 4 [default = 75];

// The minimum number of codepoints in each activation condition in the final segmentation.
uint32 min_group_size = 4 [default = 1];
uint32 min_group_size = 5 [default = 1];

// The set of segments whose total contribution to the total cost is less than this fraction
// will have optimization skipped. This saves spending computation on segments that have very
// little impact on the total cost of the segmentation. Higher values will cause more segments
// to be cutoff.
double optimization_cutoff_fraction = 5 [default = 0.001];
double optimization_cutoff_fraction = 6 [default = 0.001];

// If the cost delta (bytes) to move a segment into the initial font is less than this value then,
// it will be moved into the initial font. If this is left unset then nothing will be moved to the
// init font for this group.
double initial_font_merge_threshold = 6;
double initial_font_merge_threshold = 7;

// Segments below this probability will not be considered for moving into the initial font. This is
// useful to prune segments from the analysis that are unlikely to be beneficial for moving.
//
// Value is from [0, 1].
double initial_font_merge_probability_threshold = 7;
double initial_font_merge_probability_threshold = 8;

// By default merges under cost strategy are made by joining segments together, if this setting is
// enabled then an alternate merge type, patch merge, will be considered by the merger. In a patch
Expand All @@ -203,7 +209,7 @@ message CostConfiguration {
// Work is planned to fix this issue (either in the harfbuzz closure or with a workaround in the segmenter),
// until then it's recommended to not used this except for testing (or if appropriate care has been taken
// in producing the input segments to avoid this issue).
bool experimental_use_patch_merges = 8 [default = false];
bool experimental_use_patch_merges = 9 [default = false];
}

// The merger will choose segments to merge based on a heuristic which primarily utilizes
Expand Down
16 changes: 12 additions & 4 deletions util/segmenter_config_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@ namespace util {
// Loads unicode frequency data from either a dedicated frequency data file or
// from the codepoint and frequency entries if no data file is given.
StatusOr<UnicodeFrequencies> SegmenterConfigUtil::GetFrequencyData(
const std::string& frequency_data_file_path) {
const std::string& frequency_data_file_path,
bool built_in
) {
if (built_in) {
return util::LoadBuiltInFrequencies(frequency_data_file_path.c_str());
}

std::filesystem::path freq_path = frequency_data_file_path;
std::filesystem::path resolved_path = freq_path;
if (freq_path.is_relative()) {
Expand Down Expand Up @@ -91,13 +97,15 @@ StatusOr<MergeStrategy> SegmenterConfigUtil::ProtoToStrategy(
CostConfiguration merged = base;
merged.MergeFrom(config);

if (merged.path_to_frequency_data().empty()) {
if (merged.path_to_frequency_data().empty() && merged.built_in_freq_data_name().empty()) {
return absl::InvalidArgumentError(
"Path to frequency data must be provided.");
}

UnicodeFrequencies freq =
TRY(GetFrequencyData(merged.path_to_frequency_data()));
// Pick the frequency data source from the merged configuration, not from
// config alone: the built-in name may be supplied only by base, in which case
// config has no frequency source set and the file path would be empty.
UnicodeFrequencies freq = merged.has_built_in_freq_data_name() ?
TRY(GetFrequencyData(merged.built_in_freq_data_name(), true)) :
TRY(GetFrequencyData(merged.path_to_frequency_data(), false));

covered_codepoints = freq.CoveredCodepoints();

MergeStrategy strategy = MergeStrategy::None();
Expand Down
4 changes: 3 additions & 1 deletion util/segmenter_config_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ class SegmenterConfigUtil {
absl::flat_hash_map<SegmentId, uint32_t>& segment_id_to_index);

absl::StatusOr<ift::freq::UnicodeFrequencies> GetFrequencyData(
const std::string& frequency_data_file_path);
const std::string& frequency_data_file_path,
bool built_in
);

absl::StatusOr<ift::encoder::MergeStrategy> ProtoToStrategy(
const CostConfiguration& base, const CostConfiguration& config,
Expand Down
Loading