diff --git a/ift/encoder/BUILD b/ift/encoder/BUILD index a398024..6886f74 100644 --- a/ift/encoder/BUILD +++ b/ift/encoder/BUILD @@ -84,6 +84,7 @@ cc_library( "//ift", "//ift/freq", "@abseil-cpp//absl/status", + "@abseil-cpp//absl/flags:flag", "@abseil-cpp//absl/status:statusor", "@abseil-cpp//absl/strings", "@abseil-cpp//absl/container:flat_hash_map", diff --git a/ift/encoder/candidate_merge.cc b/ift/encoder/candidate_merge.cc index 0b7efb6..37f9892 100644 --- a/ift/encoder/candidate_merge.cc +++ b/ift/encoder/candidate_merge.cc @@ -5,6 +5,7 @@ #include #include +#include "absl/flags/flag.h" #include "absl/container/btree_map.h" #include "absl/log/log.h" #include "absl/status/status.h" @@ -191,15 +192,10 @@ static void MergeSegments(const Merger& merger, const SegmentSet& segments, base.SetProbability(bound); } -static Status AddConditionAndPatchSize( - const Merger& merger, const ActivationCondition& condition, - btree_map& conditions) { - auto existing = conditions.find(condition); - if (existing != conditions.end()) { - // already exists. - return absl::OkStatus(); - } - +static StatusOr ConditionToPatchSize( + const Merger& merger, + const ActivationCondition& condition +) { const auto& conditions_and_glyphs = merger.Context().glyph_groupings.ConditionsAndGlyphs(); auto it = conditions_and_glyphs.find(condition); @@ -209,9 +205,21 @@ static Status AddConditionAndPatchSize( } const GlyphSet& glyphs = it->second; - uint32_t patch_size = + return TRY(merger.Context().patch_size_cache->GetPatchSize(glyphs)); - conditions.insert(std::pair(condition, patch_size)); +} + +static Status AddConditionAndPatchSize( + const Merger& merger, const ActivationCondition& condition, + btree_map& conditions) { + + auto existing = conditions.lower_bound(condition); + if (existing != conditions.end() && existing->first == condition) { + // already exists. 
+ return absl::OkStatus(); + } + + conditions.emplace_hint(existing, condition, TRY(ConditionToPatchSize(merger, condition))); return absl::OkStatus(); } @@ -413,8 +421,27 @@ StatusOr> CandidateMerge::ComputeInitFontCostDelta( return std::make_pair(total_delta, glyph_closure_delta); } +static std::optional ComputeMergedSizeReduction( + uint32_t new_patch_size, + const btree_map& removed_conditions +) { + int32_t total_removed_size = 0; + int32_t largest_size = 0; + for (const auto& [_, removed_size] : removed_conditions) { + total_removed_size += removed_size; + largest_size = std::max((int32_t) removed_size, largest_size); + } + + int32_t extra_raw = total_removed_size - largest_size; + int32_t extra_actual = ((int32_t) new_patch_size) - largest_size; + if (extra_raw == 0) { + return std::nullopt; + } + return (double) extra_actual / (double) extra_raw; +} + StatusOr CandidateMerge::ComputeCostDelta( - const Merger& merger, const SegmentSet& merged_segments, + Merger& merger, const SegmentSet& merged_segments, const Segment& merged_segment, std::optional maybe_new_patch_size) { // TODO(garretrieger): the accuracy of this can be improved by factoring @@ -443,14 +470,24 @@ StatusOr CandidateMerge::ComputeCostDelta( uint32_t new_patch_size = 0; if (maybe_new_patch_size.has_value()) { new_patch_size = *maybe_new_patch_size; + if (merger.ShouldRecordMergedSizeReductions()) { + std::optional reduction = ComputeMergedSizeReduction(new_patch_size, removed_conditions); + if (reduction.has_value()) { + merger.RecordMergedSizeReduction(*reduction); + } + } } else { - // In the best case the merged patch size is equal to that of the largest removed patch. - // All removed patches will be joined into the new merged segment, and the best case is - // that all of their data is completely redundant as much as possible. 
+ // In the best case the merged patch size is equal to that of the largest removed patch, + // plus the data of all other removed patches reduced by a configured fraction. + uint32_t total_removed_size = 0; + uint32_t largest_size = 0; for (const auto& [_, removed_size] : removed_conditions) { - new_patch_size = std::max(removed_size, new_patch_size); + total_removed_size += removed_size; + largest_size = std::max(removed_size, largest_size); } - new_patch_size += Merger::BEST_CASE_MERGE_SIZE_DELTA; + uint32_t extra = total_removed_size - largest_size; + extra = std::max((uint32_t) (extra * merger.Strategy().BestCaseSizeReductionFraction()), Merger::BEST_CASE_MERGE_SIZE_DELTA); + new_patch_size = largest_size + extra; } double cost_delta = 0.0; diff --git a/ift/encoder/candidate_merge.h b/ift/encoder/candidate_merge.h index ca48844..1e164ff 100644 --- a/ift/encoder/candidate_merge.h +++ b/ift/encoder/candidate_merge.h @@ -143,7 +143,7 @@ struct CandidateMerge { // If new_patch_size is not provided then this computes a "best case" delta // where the new patch size is choosen to produce the best achievable delta. static absl::StatusOr ComputeCostDelta( - const Merger& merger, const common::SegmentSet& merged_segments, + Merger& merger, const common::SegmentSet& merged_segments, const Segment& merged_segment, std::optional new_patch_size); // Computes the predicted change to the toal cost if moved_glyphs are diff --git a/ift/encoder/closure_glyph_segmenter.cc b/ift/encoder/closure_glyph_segmenter.cc index 37991a6..b5ee711 100644 --- a/ift/encoder/closure_glyph_segmenter.cc +++ b/ift/encoder/closure_glyph_segmenter.cc @@ -445,10 +445,6 @@ StatusOr ClosureGlyphSegmenter::CodepointToGlyphSegments( TRYV(context.ReassignInitSubset(new_def)); } - // Before we start merging, make sure the state after init font processing is - // correct. - TRYV(ValidateIncrementalGroupings(face, context)); - if (merge_groups.empty()) { // No merging will be needed so we're done. 
return context.ToGlyphSegmentation(); @@ -494,6 +490,10 @@ StatusOr ClosureGlyphSegmenter::CodepointToGlyphSegments( // Nothing was merged so we're done. TRYV(ValidateIncrementalGroupings(face, context)); context.patch_size_cache->LogBrotliCallCount(); + for (const auto& merger : mergers) { + merger.LogMergedSizeHistogram(); + } + return context.ToGlyphSegmentation(); } diff --git a/ift/encoder/merge_strategy.cc b/ift/encoder/merge_strategy.cc index 951ab29..0f32665 100644 --- a/ift/encoder/merge_strategy.cc +++ b/ift/encoder/merge_strategy.cc @@ -10,8 +10,19 @@ void PrintTo(const MergeStrategy& strategy, std::ostream* os) { << " network_overhead = " << strategy.NetworkOverheadCost() << std::endl << " min_group_size = " << strategy.MinimumGroupSize() << std::endl - << " optimization_cutoff = " << strategy.OptimizationCutoffFraction() - << std::endl; + << " optimization_cutoff = " << strategy.OptimizationCutoffFraction() << std::endl + << " best_case_size_reduction_fraction = " << strategy.BestCaseSizeReductionFraction() << std::endl; + + if (strategy.InitFontMergeThreshold().has_value()) { + *os << " init_font_merge_threshold = " << *strategy.InitFontMergeThreshold() << std::endl; + } + if (strategy.InitFontMergeProbabilityThreshold().has_value()) { + *os << " init_font_merge_probability_threshold = " << *strategy.InitFontMergeProbabilityThreshold() << std::endl; + } + *os << " use_patch_merges = " << strategy.UsePatchMerges() << std::endl + << " pre_closure_group_size = " << strategy.PreClosureGroupSize() << std::endl + << " pre_closure_probability_threshold = " << strategy.PreClosureProbabilityThreshold() << std::endl; + *os << std::endl; if (strategy.init_font_merge_threshold_.has_value()) { *os << " init_font_merge_threshold = " diff --git a/ift/encoder/merge_strategy.h b/ift/encoder/merge_strategy.h index 08fa9a8..481cfcd 100644 --- a/ift/encoder/merge_strategy.h +++ b/ift/encoder/merge_strategy.h @@ -142,10 +142,23 @@ class MergeStrategy { double 
OptimizationCutoffFraction() const { return optimization_cutoff_fraction_; } + void SetOptimizationCutoffFraction(double value) { optimization_cutoff_fraction_ = value; } + // For best case size reduction computations this sets the assumed smallest + // possible reduction in data (post compression) added to a base patch. + // + // See the comment in segmenter_config.proto for more details. + double BestCaseSizeReductionFraction() const { + return best_case_size_reduction_fraction_; + } + + void SetBestCaseSizeReductionFraction(double value) { + best_case_size_reduction_fraction_ = std::max(0.0, std::min(1.0, value)); + } + // Configures the threshold (cost delta) for when to merge a segment into // the init font. If not set then no segments will be merged into the init // font. @@ -189,7 +202,12 @@ class MergeStrategy { patch_size_max_bytes_ == other.patch_size_max_bytes_ && optimization_cutoff_fraction_ == other.optimization_cutoff_fraction_ && - init_font_merge_threshold_ == other.init_font_merge_threshold_; + best_case_size_reduction_fraction_ == other.best_case_size_reduction_fraction_ && + init_font_merge_threshold_ == other.init_font_merge_threshold_ && + init_font_merge_probability_threshold_ == other.init_font_merge_probability_threshold_ && + use_patch_merges_ == other.use_patch_merges_ && + pre_closure_group_size_ == other.pre_closure_group_size_ && + pre_closure_probability_threshold_ == other.pre_closure_probability_threshold_; } private: @@ -210,12 +228,13 @@ class MergeStrategy { uint32_t patch_size_min_bytes_; uint32_t patch_size_max_bytes_; double optimization_cutoff_fraction_ = 0.001; + double best_case_size_reduction_fraction_ = 0.5; std::optional init_font_merge_threshold_ = std::nullopt; std::optional init_font_merge_probability_threshold_ = std::nullopt; bool use_patch_merges_ = false; uint32_t pre_closure_group_size_ = 1; - double pre_closure_probability_threshold_ = 0.0; + double pre_closure_probability_threshold_ = 1.0; std::shared_ptr 
probability_calculator_; }; diff --git a/ift/encoder/merger.cc b/ift/encoder/merger.cc index b3a94b3..d821b37 100644 --- a/ift/encoder/merger.cc +++ b/ift/encoder/merger.cc @@ -2,6 +2,7 @@ #include +#include "absl/flags/flag.h" #include "absl/status/status.h" #include "absl/status/statusor.h" #include "common/int_set.h" @@ -16,8 +17,15 @@ using absl::StatusOr; using common::GlyphSet; using common::SegmentSet; +ABSL_FLAG(bool, record_merged_size_reductions, false, + "When enabled the merger will record the percent size reductions of each assessed merge."); + namespace ift::encoder { +bool Merger::ShouldRecordMergedSizeReductions() const { + return absl::GetFlag(FLAGS_record_merged_size_reductions); +} + StatusOr>> Merger::TryNextMerge() { if (strategy_.IsNone()) { @@ -704,4 +712,18 @@ Status Merger::ApplyInitFontMove(const GlyphSet& glyphs_to_move, double delta) { return absl::OkStatus(); } +void Merger::LogMergedSizeHistogram() const { + if (!ShouldRecordMergedSizeReductions()) { + return; + } + + std::stringstream histogram_string; + histogram_string << "reduction_percent, count" << std::endl; + for (const auto [percent, count] : merged_size_reduction_histogram_) { + histogram_string << percent << ", " << count << std::endl; + } + VLOG(0) << "Merged Size Reduction Histogram for " << strategy_.Name().value_or("unamed") << std::endl + << histogram_string.str(); +} + } // namespace ift::encoder \ No newline at end of file diff --git a/ift/encoder/merger.h b/ift/encoder/merger.h index d37e155..f0e3a76 100644 --- a/ift/encoder/merger.h +++ b/ift/encoder/merger.h @@ -2,6 +2,7 @@ #define IFT_ENCODER_MERGER_ #include +#include #include "common/int_set.h" #include "ift/encoder/candidate_merge.h" @@ -85,6 +86,15 @@ class Merger { uint32_t NumInscopeSegments() const { return inscope_segments_.size(); } + void RecordMergedSizeReduction(double size_reduction) { + int32_t reduction_percent = 100.0 * size_reduction; + merged_size_reduction_histogram_[reduction_percent]++; + } + 
+ bool ShouldRecordMergedSizeReductions() const; + + void LogMergedSizeHistogram() const; + private: Merger(SegmentationContext& context, MergeStrategy strategy, common::SegmentSet inscope_segments, @@ -180,6 +190,9 @@ class Merger { // selecting merges. Merging is done via simple selection until minimum group // sizes are met. segment_index_t optimization_cutoff_segment_; + + // Percent reduction of data beyond the single largest input patch. + absl::btree_map merged_size_reduction_histogram_; }; } // namespace ift::encoder diff --git a/util/segmenter_config.proto b/util/segmenter_config.proto index 61481f4..b1f9006 100644 --- a/util/segmenter_config.proto +++ b/util/segmenter_config.proto @@ -190,6 +190,30 @@ message CostConfiguration { // Value is from [0, 1]. double initial_font_merge_probability_threshold = 8; + // The merger prunes patch pairs from evaluation by looking at a "best case" cost reduction. During + // this calculation the size of the merged patch is estimated using the size of the individual patches + // being merged. The following formula is used: + // + // merged_size = + // largest_individual_patch_size + + // best_case_size_reduction_fraction * sum of remaining patch sizes + // + // Where the individual patch sizes used are post compression. In effect this sets the lower bound for + // how much merged data can shrink beyond its initial compressed size (for the best case computation). + // + // The value is from [0, 1.0]. Lower values make best case pruning less aggressive, which means slower + // run time but may lead to lower final segmentation costs. Higher values make best case pruning more + // aggressive, which means faster run time but if it's set too high the merger may miss good merges. + // + // The current default value of 0.5 was selected by looking at the distribution of size reductions + // in some example fonts. It was found (using brotli 9) that only 0.2% of pairs have a reduction + // less than 0.5. 
So 0.5 should typically have minimal impact on the final cost, while offering + // a significant run time speedup. + // + // TODO(garretrieger): evaluate this across a larger sampling of fonts and brotli qualities to find a + // more general default value. + double best_case_size_reduction_fraction = 9 [default = 0.5]; + // By default merges under cost strategy are made by joining segments together, if this setting is // enabled than an alternate merge type, patch merge, will be considered by the merger. In a patch // merge glyphs from two patches are merged together along with the conditions for those patches. @@ -209,7 +233,7 @@ message CostConfiguration { // Work is planned to fix this issue (either in the harfbuzz closure or with a workaround in the segmenter), // until then it's recommended to not used this except for testing (or if appropriate care has been taken // in producing the input segments to avoid this issue). - bool experimental_use_patch_merges = 9 [default = false]; + bool experimental_use_patch_merges = 10 [default = false]; } // The merger will choose segments to merge based on a heuristic which primarily utilizes diff --git a/util/segmenter_config_util.cc b/util/segmenter_config_util.cc index d61fc6d..be0755b 100644 --- a/util/segmenter_config_util.cc +++ b/util/segmenter_config_util.cc @@ -122,6 +122,7 @@ StatusOr SegmenterConfigUtil::ProtoToStrategy( strategy.SetUsePatchMerges(merged.experimental_use_patch_merges()); strategy.SetOptimizationCutoffFraction(merged.optimization_cutoff_fraction()); + strategy.SetBestCaseSizeReductionFraction(merged.best_case_size_reduction_fraction()); if (merged.has_initial_font_merge_threshold()) { strategy.SetInitFontMergeThreshold(merged.initial_font_merge_threshold()); diff --git a/util/segmenter_config_util_test.cc b/util/segmenter_config_util_test.cc index bd7828d..2ae20a2 100644 --- a/util/segmenter_config_util_test.cc +++ b/util/segmenter_config_util_test.cc @@ -446,4 +446,31 @@ 
TEST_F(SegmenterConfigUtilTest, {{0}, ExpectedCostStrategy(75)}, {{1}, ExpectedCostStrategy(75)}, })); +} + +TEST_F(SegmenterConfigUtilTest, ConfigToMergeGroups_OptimizationSettings) { + SegmenterConfig config; + auto* group = config.add_merge_groups(); + group->mutable_cost_config()->set_path_to_frequency_data( + "test_freq_data.riegeli"); + + group->mutable_cost_config()->set_optimization_cutoff_fraction(0.12); + group->mutable_cost_config()->set_best_case_size_reduction_fraction(0.34); + + CodepointSet font_codepoints{0x40, 0x42, 0x43, 0x45, 0x47}; + + SegmenterConfigUtil util("util/testdata/config.txtpb"); + + std::vector segments_out; + auto groups = util.ConfigToMergeGroups(config, font_codepoints, segments_out); + ASSERT_TRUE(groups.ok()) << groups.status(); + + + MergeStrategy expected = ExpectedCostStrategy(75); + expected.SetOptimizationCutoffFraction(0.12); + expected.SetBestCaseSizeReductionFraction(0.34); + + ASSERT_EQ( + *groups, + (btree_map{{{2}, expected}})); } \ No newline at end of file