diff --git a/ift/encoder/closure_glyph_segmenter.cc b/ift/encoder/closure_glyph_segmenter.cc index f5eaca7..fad3b60 100644 --- a/ift/encoder/closure_glyph_segmenter.cc +++ b/ift/encoder/closure_glyph_segmenter.cc @@ -21,11 +21,10 @@ #include "common/int_set.h" #include "common/try.h" #include "common/woff2.h" -#include "ift/encoder/activation_condition.h" #include "ift/encoder/glyph_segmentation.h" #include "ift/encoder/merge_strategy.h" #include "ift/encoder/merger.h" -#include "ift/encoder/patch_size_cache.h" +#include "ift/encoder/segment.h" #include "ift/encoder/segmentation_context.h" #include "ift/encoder/subset_definition.h" #include "ift/encoder/types.h" @@ -193,6 +192,66 @@ struct SegmentOrdering { } }; +static std::vector PreGroupSegments( + const btree_map& merge_groups, + const std::vector& ordering, + const std::vector& subset_definitions, + std::vector& segment_index_map +) { + segment_index_map.resize(subset_definitions.size()); + std::vector segments; + + unsigned i = 0; + unsigned last_group_index = 0; + auto merge_group_it = merge_groups.begin(); + auto ordering_it = ordering.begin(); + + while (ordering_it != ordering.end()) { + const auto& o = *ordering_it; + if (o.group_index != last_group_index && merge_group_it != merge_groups.end()) { + merge_group_it++; + } + + const MergeStrategy* strategy = nullptr; + if (merge_group_it != merge_groups.end()) { + strategy = &(merge_group_it->second); + } + + Segment segment = Segment{subset_definitions[o.original_index], o.probability}; + ordering_it++; + + if (strategy == nullptr || + strategy->PreClosureGroupSize() <= 1 || + o.probability.Max() > strategy->PreClosureProbabilityThreshold()) { + segment_index_map[o.original_index] = i; + } else { + uint32_t remaining = strategy->PreClosureGroupSize() - 1; + while (remaining > 0) { + if (ordering_it == ordering.end() || + ordering_it->group_index != o.group_index) { + break; + } + + 
segment.Definition().Union(subset_definitions[ordering_it->original_index]); + segment_index_map[ordering_it->original_index] = i; + + ordering_it++; + remaining--; + } + + if (strategy->UseCosts()) { + segment.SetProbability(strategy->ProbabilityCalculator()->ComputeProbability(segment.Definition())); + } + } + + last_group_index = o.group_index; + segments.push_back(segment); + i++; + } + + return segments; +} + // Converts the input subset definitions to a sorted list of segments, remaps // the merge_groups segment set keys to reflect the ordering changes. static StatusOr> ToOrderedSegments( @@ -268,16 +327,12 @@ static StatusOr> ToOrderedSegments( std::sort(ordering.begin(), ordering.end()); // maps from index in subset_definitions to the new ordering. - std::vector segment_index_map(subset_definitions.size()); - std::vector segments; - unsigned i = 0; - for (const auto& ordering : ordering) { - segments.push_back(Segment{subset_definitions[ordering.original_index], - ordering.probability}); - segment_index_map[ordering.original_index] = i++; - } + std::vector segment_index_map; + std::vector segments = PreGroupSegments(merge_groups, ordering, subset_definitions, segment_index_map); + VLOG(0) << segments.size() << " segments after pregrouping."; btree_map new_merge_groups; + group_index = 0; for (auto& [segments, strategy] : merge_groups) { SegmentSet remapped; SegmentSet remapped_full; @@ -289,6 +344,10 @@ static StatusOr> ToOrderedSegments( remapped_full.insert(s_prime); } + + VLOG(0) << " Merge group " << group_index << " has " << remapped.size() << " segments."; + group_index++; + if (!new_merge_groups.insert(std::make_pair(remapped, std::move(strategy))) .second) { return absl::InvalidArgumentError( diff --git a/ift/encoder/closure_glyph_segmenter_test.cc b/ift/encoder/closure_glyph_segmenter_test.cc index 8e742e6..41b7d49 100644 --- a/ift/encoder/closure_glyph_segmenter_test.cc +++ b/ift/encoder/closure_glyph_segmenter_test.cc @@ -1427,6 +1427,78 @@ if 
(s0 AND s2) then p2 )"); } + +TEST_F(ClosureGlyphSegmenterTest, MultipleMergeGroups_PreGrouping) { + UnicodeFrequencies freq{ + {{' ', ' '}, 100}, + {{'d', 'd'}, 100}, + {{'a', 'a'}, 60}, + {{'e', 'e'}, 30}, + {{'b', 'b'}, 29}, + {{'f', 'f'}, 28}, + {{'c', 'c'}, 10}, + {{'g', 'g'}, 9}, + {{'h', 'h'}, 5}, + {{'i', 'i'}, 1}, // 8 + }; + + MergeStrategy costs = *MergeStrategy::CostBased(std::move(freq), 0, 1); + costs.SetPreClosureProbabilityThreshold(0.55); + costs.SetPreClosureGroupSize(3); + + btree_map merge_groups{ + {{0, 1, 2, 3, 4, 5, 6, 7, 8}, costs}, + {{7, 8}, MergeStrategy::Heuristic(1)}, + }; + + auto segmentation = segmenter.CodepointToGlyphSegments(roboto.get(), {}, + { + {'a'}, + {'b'}, + {'c'}, + {'d'}, + {'e'}, + {'f'}, + {'g'}, + {'h'}, + {'i'}, + }, + merge_groups, false); + ASSERT_TRUE(segmentation.ok()) << segmentation.status(); + + // d, a are above the pregrouping threshold so aren't grouped. + // e, b, f, c, and g are below so are grouped into sets of 3. + // h, i are shared between merge groups so don't participate in pregrouping. + std::vector expected_segments = { + // Group 1 + {'d'}, + {'a'}, + {'e', 'b', 'f'}, // pre merge + {'c', 'g'}, // pre merge + // Shared + {'h'}, + {'i'}, + }; + ASSERT_EQ(segmentation->Segments(), expected_segments); + ASSERT_EQ(segmentation->ToString(), + R"(initial font: { gid0 } +p0: { gid72 } +p1: { gid69 } +p2: { gid70, gid73, gid74 } +p3: { gid71, gid75 } +p4: { gid76 } +p5: { gid77 } +p6: { gid444, gid446 } +if (s0) then p0 +if (s1) then p1 +if (s2) then p2 +if (s3) then p3 +if (s4) then p4 +if (s5) then p5 +if (s2 AND s5) then p6 +)"); +} + // TODO(garretrieger): test that segments are excluded by init font segment. ie. // if a segment is present in the init font then it should be cleared out in the // segmentation. 
diff --git a/ift/encoder/glyph_closure_cache.cc b/ift/encoder/glyph_closure_cache.cc index 48a94c8..13db048 100644 --- a/ift/encoder/glyph_closure_cache.cc +++ b/ift/encoder/glyph_closure_cache.cc @@ -4,6 +4,7 @@ #include "common/int_set.h" #include "common/try.h" #include "ift/encoder/requested_segmentation_information.h" +#include "ift/encoder/subset_definition.h" #include "ift/encoder/types.h" using absl::Status; @@ -63,6 +64,37 @@ StatusOr GlyphClosureCache::CodepointsToOrGids( return or_gids; } +// This generates the subset definition that contains all segments except for +// those listed in segment_ids. +SubsetDefinition ComputExceptSegment( + const RequestedSegmentationInformation& segmentation_info, + const SegmentSet& segment_ids, const SubsetDefinition& combined) { + if (segmentation_info.SegmentsAreDisjoint() && + (segment_ids.size() == 1 || + segment_ids.size() < (segmentation_info.Segments().size() / 2))) { + // Approach that is optimized for the case where input segments are disjoint + // and the number of segment ids is smallish. 
+ SubsetDefinition except_segment = segmentation_info.FullDefinition(); + except_segment.Subtract(combined); + return except_segment; + } + + // Otherwise this approach will always work even with non-disjoint segments + SegmentSet except_segment_ids = segment_ids; + except_segment_ids.invert(); + + uint32_t num_segments = segmentation_info.Segments().size(); + SubsetDefinition except_segment = segmentation_info.InitFontSegment(); + for (segment_index_t s : except_segment_ids) { + if (s >= num_segments) { + break; + } + except_segment.Union(segmentation_info.Segments()[s].Definition()); + } + + return except_segment; +} + Status GlyphClosureCache::AnalyzeSegment( const RequestedSegmentationInformation& segmentation_info, const SegmentSet& segment_ids, GlyphSet& and_gids, GlyphSet& or_gids, @@ -95,20 +127,19 @@ Status GlyphClosureCache::AnalyzeSegment( // * I - D: the activation conditions for these glyphs is s_i OR … // Where … is one or more additional segments. // * D intersection I: the activation conditions for these glyphs is only s_i - SubsetDefinition except_segment = segmentation_info.InitFontSegment(); - for (uint32_t s = 0; s < segmentation_info.Segments().size(); s++) { - if (segment_ids.contains(s)) { - continue; - } - except_segment.Union(segmentation_info.Segments()[s].Definition()); + + SubsetDefinition + combined; // This is the subset definition of the unions of segment_ids. 
+ for (segment_index_t s_id : segment_ids) { + combined.Union(segmentation_info.Segments()[s_id].Definition()); } + SubsetDefinition except_segment = + ComputExceptSegment(segmentation_info, segment_ids, combined); auto B_except_segment_closure = TRY(GlyphClosure(except_segment)); - SubsetDefinition only_segment = segmentation_info.InitFontSegment(); - for (segment_index_t s_id : segment_ids) { - only_segment.Union(segmentation_info.Segments()[s_id].Definition()); - } + SubsetDefinition only_segment = combined; + only_segment.Union(segmentation_info.InitFontSegment()); auto I_only_segment_closure = TRY(GlyphClosure(only_segment)); I_only_segment_closure.subtract(segmentation_info.InitFontGlyphs()); diff --git a/ift/encoder/merge_strategy.h b/ift/encoder/merge_strategy.h index d132909..c532057 100644 --- a/ift/encoder/merge_strategy.h +++ b/ift/encoder/merge_strategy.h @@ -145,6 +145,22 @@ class MergeStrategy { return init_font_merge_probability_threshold_; } + uint32_t PreClosureGroupSize() const { + return pre_closure_group_size_; + } + + double PreClosureProbabilityThreshold() const { + return pre_closure_probability_threshold_; + } + + void SetPreClosureGroupSize(uint32_t value) { + pre_closure_group_size_ = value; + } + + void SetPreClosureProbabilityThreshold(double value) { + pre_closure_probability_threshold_ = value; + } + void SetInitFontMergeThreshold(std::optional value) { init_font_merge_threshold_ = value; } @@ -187,6 +203,9 @@ class MergeStrategy { std::optional init_font_merge_probability_threshold_ = std::nullopt; bool use_patch_merges_ = false; + uint32_t pre_closure_group_size_ = 1; + double pre_closure_probability_threshold_ = 0.0; + std::shared_ptr probability_calculator_; }; diff --git a/ift/encoder/requested_segmentation_information.cc b/ift/encoder/requested_segmentation_information.cc index 9ed3263..bd810ac 100644 --- a/ift/encoder/requested_segmentation_information.cc +++ b/ift/encoder/requested_segmentation_information.cc @@ -11,6 +11,24 
@@ RequestedSegmentationInformation::RequestedSegmentationInformation( GlyphClosureCache& closure_cache) : segments_(std::move(segments)), init_font_segment_() { ReassignInitSubset(closure_cache, std::move(init_font_segment)); + + segments_disjoint_ = true; + + full_definition_ = init_font_segment_; + for (const auto& s : segments_) { + const auto& def = s.Definition(); + if (segments_disjoint_) { + for (hb_tag_t tag : def.feature_tags) { + if (full_definition_.feature_tags.contains(tag)) { + segments_disjoint_ = false; + } + } + segments_disjoint_ = + segments_disjoint_ && + !full_definition_.codepoints.intersects(def.codepoints); + } + full_definition_.Union(s.Definition()); + } } } // namespace ift::encoder \ No newline at end of file diff --git a/ift/encoder/requested_segmentation_information.h b/ift/encoder/requested_segmentation_information.h index b39baeb..afe452f 100644 --- a/ift/encoder/requested_segmentation_information.h +++ b/ift/encoder/requested_segmentation_information.h @@ -83,6 +83,10 @@ class RequestedSegmentationInformation { const common::GlyphSet& FullClosure() const { return full_closure_; } + const SubsetDefinition& FullDefinition() const { return full_definition_; } + + bool SegmentsAreDisjoint() const { return segments_disjoint_; } + const std::vector& Segments() const { return segments_; } const std::vector SegmentSubsetDefinitions() const { @@ -125,8 +129,10 @@ class RequestedSegmentationInformation { std::vector segments_; SubsetDefinition init_font_segment_; + SubsetDefinition full_definition_; common::GlyphSet init_font_glyphs_; common::GlyphSet full_closure_; + bool segments_disjoint_; }; } // namespace ift::encoder diff --git a/ift/encoder/segmentation_context.h b/ift/encoder/segmentation_context.h index 2608428..be496cb 100644 --- a/ift/encoder/segmentation_context.h +++ b/ift/encoder/segmentation_context.h @@ -176,14 +176,16 @@ class SegmentationContext { // too small to be worthwhile. 
absl::StatusOr ComputeSegmentCutoff() const; - static std::unique_ptr NewPatchSizeCache(hb_face_t* face, uint32_t brotli_quality) { + static std::unique_ptr NewPatchSizeCache( + hb_face_t* face, uint32_t brotli_quality) { if (brotli_quality == 0) { auto cache = EstimatedPatchSizeCache::New(face); if (cache.ok()) { return std::move(*cache); } } - return std::unique_ptr(new PatchSizeCacheImpl(face, brotli_quality)); + return std::unique_ptr( + new PatchSizeCacheImpl(face, brotli_quality)); } public: diff --git a/ift/encoder/subset_definition.cc b/ift/encoder/subset_definition.cc index 3aa00a2..41ec792 100644 --- a/ift/encoder/subset_definition.cc +++ b/ift/encoder/subset_definition.cc @@ -79,14 +79,23 @@ void PrintTo(const SubsetDefinition& def, std::ostream* os) { } template -S subtract(const S& a, const S& b) { - S c; - for (uint32_t v : a) { - if (!b.contains(v)) { - c.insert(v); +void subtract_sets(S& a, const S& b) { + // Depending on which set is bigger use the implementation + // that iterates the fewest elements. 
+ if (a.size() < b.size()) { + for (auto it = a.begin(); it != a.end();) { + if (b.contains(*it)) { + it = a.erase(it); + } else { + ++it; + } } + return; + } + + for (uint32_t v : b) { + a.erase(v); } - return c; } std::optional subtract(const AxisRange& a, const AxisRange& b) { @@ -143,7 +152,7 @@ design_space_t subtract(const design_space_t& a, const design_space_t& b) { void SubsetDefinition::Subtract(const SubsetDefinition& other) { codepoints.subtract(other.codepoints); gids.subtract(other.gids); - feature_tags = subtract(feature_tags, other.feature_tags); + subtract_sets(feature_tags, other.feature_tags); design_space = subtract(design_space, other.design_space); } diff --git a/ift/encoder/subset_definition_test.cc b/ift/encoder/subset_definition_test.cc index fd9e94a..8866e4e 100644 --- a/ift/encoder/subset_definition_test.cc +++ b/ift/encoder/subset_definition_test.cc @@ -141,18 +141,28 @@ TEST_F(SubsetDefinitionTest, Subtraction) { { SubsetDefinition a{1, 2, 3, 4}; a.gids = {7, 8, 9}; - a.feature_tags = {HB_TAG('f', 'o', 'o', ' '), HB_TAG('b', 'a', 'r', ' ')}; + a.feature_tags = {HB_TAG('f', 'o', 'o', ' '), HB_TAG('b', 'a', 'r', ' '), + HB_TAG('b', 'a', 'z', ' ')}; SubsetDefinition b{3, 5, 6}; b.gids = {8, 10}; - b.feature_tags = {HB_TAG('f', 'o', 'o', ' ')}; + b.feature_tags = {HB_TAG('f', 'o', 'o', ' '), HB_TAG('a', 'b', 'c', 'd')}; SubsetDefinition c{1, 2, 4}; c.gids = {7, 9}; - c.feature_tags = {HB_TAG('b', 'a', 'r', ' ')}; + c.feature_tags = {HB_TAG('b', 'a', 'r', ' '), HB_TAG('b', 'a', 'z', ' ')}; - a.Subtract(b); - ASSERT_EQ(a, c); + SubsetDefinition def = a; + def.Subtract(b); + ASSERT_EQ(def, c); + + SubsetDefinition d{5, 6}; + d.gids = {10}; + d.feature_tags = {HB_TAG('a', 'b', 'c', 'd')}; + + def = b; + def.Subtract(a); + ASSERT_EQ(def, d); } } diff --git a/util/closure_glyph_keyed_segmenter_util.cc b/util/closure_glyph_keyed_segmenter_util.cc index ee573f5..7c6dc63 100644 --- a/util/closure_glyph_keyed_segmenter_util.cc +++ 
b/util/closure_glyph_keyed_segmenter_util.cc @@ -142,7 +142,7 @@ static Status Main(const std::vector args) { TRY(config_util.ConfigToMergeGroups(config, font_codepoints, segments)); ClosureGlyphSegmenter segmenter(config.brotli_quality(), - config.brotli_quality_for_init_font_merge()); + config.brotli_quality_for_initial_font_merging()); GlyphSegmentation segmentation = TRY(segmenter.CodepointToGlyphSegments( font.get(), init_segment, segments, merge_groups, config.move_fallback_glyphs_into_initial_font())); diff --git a/util/generate_table_keyed_config.cc b/util/generate_table_keyed_config.cc index 1b2c779..95c84af 100644 --- a/util/generate_table_keyed_config.cc +++ b/util/generate_table_keyed_config.cc @@ -81,22 +81,6 @@ static StatusOr LoadSegmentationPlan(const char* path) { int main(int argc, char** argv) { auto args = absl::ParseCommandLine(argc, argv); - if (args.size() <= 1) { - std::cerr << "Usage:" << std::endl - << "generate_table_keyed_config " - " [...
]" - << std::endl - << std::endl - << "Where a subset file lists one codepoint per line in " - "hexadecimal format: 0xXXXX" - << std::endl - << std::endl - << "If you don't want the config to contain an initial codepoint " - "set, pass an empty file as the first argument." - << std::endl; - return -1; - } - SegmentationPlan config; CodepointSet init_codepoints; @@ -115,6 +99,20 @@ int main(int argc, char** argv) { CodepointSet empty; sets.push_back(empty); + } else if (args.size() <= 1) { + std::cerr << "Usage:" << std::endl + << "generate_table_keyed_config " + "
[...
]" + << std::endl + << std::endl + << "Where a subset file lists one codepoint per line in " + "hexadecimal format: 0xXXXX" + << std::endl + << std::endl + << "If you don't want the config to contain an initial codepoint " + "set, pass an empty file as the first argument." + << std::endl; + return -1; } for (size_t i = 1; i < args.size(); i++) { diff --git a/util/segmenter_config.proto b/util/segmenter_config.proto index 937b756..81d734d 100644 --- a/util/segmenter_config.proto +++ b/util/segmenter_config.proto @@ -3,32 +3,64 @@ edition = "2023"; import "util/common.proto"; // This messages provides the configuration details for closure_glyph_keyed_segmenter_util +// +// The closure_glyph_keyed_segmenter_util is used to turn a series of codepoint/feature based +// segments into a set of glyph based patches and activation conditions which respect a font's +// glyph substitution rules. This means that no matter what codepoints are being rendered as long +// as the provided activation conditions are followed all needed glyphs will always be present. +// +// In addition to organizing the glyphs to respect substitution behaviour, the segmenter can +// also optionally try to optimize a glyph segmentation by selectively merging glyph patches +// together to reduce the total number of patches and in turn the network overhead for loading +// them. In this configuration merging behaviour is primarily configured via the merge groups. +// +// Merging (if enabled) is done in three distinct phases (each is configured separately): +// 1. Preprocess merging: prior to glyph segmentation adjacent segments are merged together. +// Configuration specifies how many input segments are merged to form each merged segment. +// This is the cheapest type of merging, and since it happens prior to glyph segmentation +// it reduces the computation cost of all subsequent operations. +// +// 2. 
Initial font merging: when enabled, after glyph segmentation all of the patches are +// evaluated to see if it is beneficial to move their glyphs into the initial font and +// remove the patch. For example consider a patch that is expected to be needed 100% of +// the time. The overhead associated with the patch can be eliminated by moving it into +// the initial font. +// +// 3. Patch merging: lastly, any remaining patches are evaluated to find cases where segments +// can be merged together to improve the performance of the overall glyph segmentation. +// Two different merge selection algorithms are currently supported: cost based and heuristic. +// The cost based approach evaluates potential merges against a cost function which utilizes +// codepoint frequency data. Heuristic is a much faster approach, but does not make use of +// frequency data. As a result it typically produces less optimal results. Primarily useful +// when frequency data is not available. message SegmenterConfig { // Specifies the set of codepoints and layout feature tags that should be placed into the // initial font, and hence be always available. SegmentProto initial_segment = 1; - // When set any glyphs that would be in the fallback segment (ie. glyphs that are always loaded) - // are moved into the initial font. + // When set any glyphs that would be in the fallback patch (ie. glyphs that are always loaded) + // are instead moved into the initial font. bool move_fallback_glyphs_into_initial_font = 2 [default = true]; // The set of segments the font is initially broken up into. The key in the map is an ID // used to refer to the segment in other parts of the config. Segments must be disjoint. // - // If this is not specified then an initial segmentation where each codepoint in the font - // is placed into it's own segment will be used. 
In this case each segment id is the value of + // If this is not specified then a segmentation where each codepoint in the font is placed + // into its own segment will be used. In this case each segment id is the value of // the codepoint in that segment map segments = 3; // When utilizing automated segment generation ('segments' is unspecified), this provides // a list of feature groups that should be added to the automated segment list. Each group - // will be one segment. These use their own id space from segments. + // will be one segment. These use their own id space separate from segments. map feature_segments = 4; // When generating compressed patches (to evaluate their size) this is the brotli quality - // level used. Segmentation is typically bottle necked on brotli compression so higher values - // increase segmentation times, but yield more accurate results. + // level used. Merge selection is typically bottle necked on brotli compression so higher + // values increase processing times, but yield more accurate results. + // + // Value can range from 0 to 11. Higher numbers represent higher quality. // // If quality is set to '0' this disables brotli compression and instead estimates the // effect of compression using a fixed compression ratio calculated based on how well @@ -39,10 +71,10 @@ message SegmenterConfig { // the brotli quality used for generating compressed patches (to evaluate their size). // The init font merge is more sensitive to lower brotli qualities, so this allows a higher // brotli quality to be used in this phase if desired. - uint32 brotli_quality_for_init_font_merge = 6 [default = 11]; + uint32 brotli_quality_for_initial_font_merging = 6 [default = 11]; // These base configs define the common config setings used by the merge groups. 
- // Each individual merge groups config is created by starting with the base config of the + // Each individual merge group's config is created by starting with the base config of the // matching type and then overiding any fields specified in the merge group config. HeuristicConfiguration base_heuristic_config = 7; CostConfiguration base_cost_config = 8; @@ -54,10 +86,15 @@ message SegmenterConfig { // Like the merge groups this config will be combined with base_heuristic_config. HeuristicConfiguration ungrouped_config = 9; + // This is the group size (number of segments merged together) used in preprocess merging + // of any segments not covered by a merge group. Setting to 1 disables preprocess merging + // of ungrouped segments. + uint32 preprocess_merging_group_size_for_ungrouped = 10 [default = 1]; + // Merge groups specify how merging will be performed for groups of segments. // Any segments that are not covered by any merge group will not be merged. // Merge groups are not required to be disjoint and may have overlapping segments. - repeated MergeGroup merge_groups = 10; + repeated MergeGroup merge_groups = 11; } // For a given set of segments this configures how merging will be performed. Each merge group @@ -73,9 +110,20 @@ message MergeGroup { // Adds in segments provided in the 'feature_segments' mapping. SegmentsProto feature_segment_ids = 2; + // This is the group size (number of segments merged together) used in preprocess merging + // of any segments covered by this merge group. Setting to 1 disables preprocess merging + // for this merge group. + uint32 preprocess_merging_group_size = 3 [default = 1]; + + // If frequency data is available only segments with probability less than or equal to this will + // be included in the preprocess merging phase. Setting this to 1.0 will make preprocess + // merging apply to all segments. Has no effect if this merge group is using heuristic + // merging. 
+ double preprocess_merging_probability_threshold = 4 [default = 1.0]; + oneof config { - HeuristicConfiguration heuristic_config = 3; - CostConfiguration cost_config = 4; + HeuristicConfiguration heuristic_config = 5; + CostConfiguration cost_config = 6; } } @@ -116,13 +164,13 @@ message CostConfiguration { // If the cost delta (bytes) to move a segment into the initial font is less than this value then, // it will be moved into the initial font. If this is left unset then nothing will be moved to the // init font for this group. - double init_font_merge_threshold = 6; + double initial_font_merge_threshold = 6; // Segments below this probability will no be considered for moving into the initial font. This is // useful to prune segments from the analysis that are unlikely to be beneficial for moving. // // Value is from [0, 1]. - double init_font_merge_probability_threshold = 7; + double initial_font_merge_probability_threshold = 7; // By default merges under cost strategy are made by joining segments together, if this setting is // enabled than an alternate merge type, patch merge, will be considered by the merger. 
In a patch diff --git a/util/segmenter_config_util.cc b/util/segmenter_config_util.cc index 75edb6a..4ce7903 100644 --- a/util/segmenter_config_util.cc +++ b/util/segmenter_config_util.cc @@ -115,13 +115,13 @@ StatusOr SegmenterConfigUtil::ProtoToStrategy( strategy.SetOptimizationCutoffFraction(merged.optimization_cutoff_fraction()); - if (merged.has_init_font_merge_threshold()) { - strategy.SetInitFontMergeThreshold(merged.init_font_merge_threshold()); + if (merged.has_initial_font_merge_threshold()) { + strategy.SetInitFontMergeThreshold(merged.initial_font_merge_threshold()); } - if (merged.has_init_font_merge_probability_threshold()) { + if (merged.has_initial_font_merge_probability_threshold()) { strategy.SetInitFontMergeProbabilityThreshold( - merged.init_font_merge_probability_threshold()); + merged.initial_font_merge_probability_threshold()); } return strategy; @@ -172,6 +172,9 @@ SegmenterConfigUtil::ProtoToMergeGroup( } } + strategy.SetPreClosureGroupSize(group.preprocess_merging_group_size()); + strategy.SetPreClosureProbabilityThreshold(group.preprocess_merging_probability_threshold()); + return std::make_pair(segment_indices, strategy); } else { if (group.has_segment_ids()) { @@ -183,6 +186,10 @@ SegmenterConfigUtil::ProtoToMergeGroup( MergeStrategy strategy = ::util::ProtoToStrategy(base_heuristic, group.heuristic_config()); + + strategy.SetPreClosureGroupSize(group.preprocess_merging_group_size()); + strategy.SetPreClosureProbabilityThreshold(1.0); + return std::make_pair(segment_indices, strategy); } } @@ -226,6 +233,9 @@ SegmenterConfigUtil::ConfigToMergeGroups( MergeStrategy strategy = util::ProtoToStrategy(config.base_heuristic_config(), config.ungrouped_config()); + strategy.SetPreClosureGroupSize(config.preprocess_merging_group_size_for_ungrouped()); + strategy.SetPreClosureProbabilityThreshold(1.0); + merge_groups.insert(std::make_pair(uncovered_segments, strategy)); return merge_groups; diff --git a/util/segmenter_config_util_test.cc 
b/util/segmenter_config_util_test.cc index be9d107..bd7828d 100644 --- a/util/segmenter_config_util_test.cc +++ b/util/segmenter_config_util_test.cc @@ -249,7 +249,7 @@ TEST_F(SegmenterConfigUtilTest, group->mutable_cost_config()->set_path_to_frequency_data( "test_freq_data.riegeli"); group->mutable_cost_config()->set_network_overhead_cost(85); - group->mutable_cost_config()->set_init_font_merge_threshold(-70); + group->mutable_cost_config()->set_initial_font_merge_threshold(-70); CodepointSet font_codepoints{0x40, 0x42, 0x43, 0x45, 0x47}; @@ -274,7 +274,7 @@ TEST_F(SegmenterConfigUtilTest, TEST_F(SegmenterConfigUtilTest, ConfigToMergeGroups_SegmentsInferred_MergeGroupsSpecified_Cost) { SegmenterConfig config; - config.mutable_base_cost_config()->set_init_font_merge_threshold(-90); + config.mutable_base_cost_config()->set_initial_font_merge_threshold(-90); auto* group = config.add_merge_groups(); group->mutable_cost_config()->set_path_to_frequency_data(