Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ift/encoder/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ cc_library(
"//ift",
"//ift/freq",
"@abseil-cpp//absl/status",
"@abseil-cpp//absl/flags:flag",
"@abseil-cpp//absl/status:statusor",
"@abseil-cpp//absl/strings",
"@abseil-cpp//absl/container:flat_hash_map",
Expand Down
71 changes: 54 additions & 17 deletions ift/encoder/candidate_merge.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include <utility>
#include <vector>

#include "absl/flags/flag.h"
#include "absl/container/btree_map.h"
#include "absl/log/log.h"
#include "absl/status/status.h"
Expand Down Expand Up @@ -191,15 +192,10 @@ static void MergeSegments(const Merger& merger, const SegmentSet& segments,
base.SetProbability(bound);
}

static Status AddConditionAndPatchSize(
const Merger& merger, const ActivationCondition& condition,
btree_map<ActivationCondition, uint32_t>& conditions) {
auto existing = conditions.find(condition);
if (existing != conditions.end()) {
// already exists.
return absl::OkStatus();
}

static StatusOr<uint32_t> ConditionToPatchSize(
const Merger& merger,
const ActivationCondition& condition
) {
const auto& conditions_and_glyphs =
merger.Context().glyph_groupings.ConditionsAndGlyphs();
auto it = conditions_and_glyphs.find(condition);
Expand All @@ -209,9 +205,21 @@ static Status AddConditionAndPatchSize(
}

const GlyphSet& glyphs = it->second;
uint32_t patch_size =
return
TRY(merger.Context().patch_size_cache->GetPatchSize(glyphs));
conditions.insert(std::pair(condition, patch_size));
}

// Ensures `conditions` holds an entry for `condition`, mapping it to the patch
// size reported by ConditionToPatchSize(). An entry that is already present is
// left untouched and the patch size is not recomputed.
//
// Returns OkStatus on success. A failure from ConditionToPatchSize() is
// handled by the TRY macro (presumably an early return of the error status;
// the macro is defined outside this file).
static Status AddConditionAndPatchSize(
    const Merger& merger, const ActivationCondition& condition,
    btree_map<ActivationCondition, uint32_t>& conditions) {
  // lower_bound() serves both as the existence check and as the insertion
  // hint, so the map is only searched once.
  auto insert_hint = conditions.lower_bound(condition);
  const bool already_present =
      insert_hint != conditions.end() && insert_hint->first == condition;

  if (!already_present) {
    uint32_t patch_size = TRY(ConditionToPatchSize(merger, condition));
    conditions.emplace_hint(insert_hint, condition, patch_size);
  }
  return absl::OkStatus();
}

Expand Down Expand Up @@ -413,8 +421,27 @@ StatusOr<std::pair<double, GlyphSet>> CandidateMerge::ComputeInitFontCostDelta(
return std::make_pair(total_delta, glyph_closure_delta);
}

// Computes the ratio between the data actually added on top of the largest
// removed patch and the data that would have been added with no savings:
//
//   (new_patch_size - largest_removed) / (sum_of_removed - largest_removed)
//
// Arguments:
//   new_patch_size: size of the merged patch.
//   removed_conditions: the removed conditions mapped to their patch sizes.
//
// Returns std::nullopt when the denominator is zero, i.e. when there is no
// removed data beyond the single largest patch. The result is negative when
// new_patch_size is smaller than the largest removed patch.
static std::optional<double> ComputeMergedSizeReduction(
    uint32_t new_patch_size,
    const btree_map<ActivationCondition, uint32_t>& removed_conditions) {
  // Accumulate in 64 bit signed integers: the inputs are uint32_t, so neither
  // a single value nor their sum is guaranteed to fit in an int32_t.
  int64_t total_removed_size = 0;
  int64_t largest_size = 0;
  for (const auto& [_, removed_size] : removed_conditions) {
    const int64_t size = static_cast<int64_t>(removed_size);
    total_removed_size += size;
    largest_size = std::max(size, largest_size);
  }

  const int64_t extra_raw = total_removed_size - largest_size;
  if (extra_raw == 0) {
    // Nothing beyond the largest patch, the ratio is undefined.
    return std::nullopt;
  }

  const int64_t extra_actual =
      static_cast<int64_t>(new_patch_size) - largest_size;
  return static_cast<double>(extra_actual) / static_cast<double>(extra_raw);
}

StatusOr<double> CandidateMerge::ComputeCostDelta(
const Merger& merger, const SegmentSet& merged_segments,
Merger& merger, const SegmentSet& merged_segments,
const Segment& merged_segment, std::optional<uint32_t> maybe_new_patch_size) {

// TODO(garretrieger): the accuracy of this can be improved by factoring
Expand Down Expand Up @@ -443,14 +470,24 @@ StatusOr<double> CandidateMerge::ComputeCostDelta(
uint32_t new_patch_size = 0;
if (maybe_new_patch_size.has_value()) {
new_patch_size = *maybe_new_patch_size;
if (merger.ShouldRecordMergedSizeReductions()) {
std::optional<double> reduction = ComputeMergedSizeReduction(new_patch_size, removed_conditions);
if (reduction.has_value()) {
merger.RecordMergedSizeReduction(*reduction);
}
}
} else {
// In the best case the merged patch size is equal to that of the largest removed patch.
// All removed patches will be joined into the new merged segment, and the best case is
// that all of their data is completely redundant as much as possible.
// In the best case the merged patch size is equal to that of the largest removed patch,
// plus the data of all other removed patches reduced by a configured fraction.
uint32_t total_removed_size = 0;
uint32_t largest_size = 0;
for (const auto& [_, removed_size] : removed_conditions) {
new_patch_size = std::max(removed_size, new_patch_size);
total_removed_size += removed_size;
largest_size = std::max(removed_size, largest_size);
}
new_patch_size += Merger::BEST_CASE_MERGE_SIZE_DELTA;
uint32_t extra = total_removed_size - largest_size;
extra = std::max((uint32_t) (extra * merger.Strategy().BestCaseSizeReductionFraction()), Merger::BEST_CASE_MERGE_SIZE_DELTA);
new_patch_size = largest_size + extra;
}

double cost_delta = 0.0;
Expand Down
2 changes: 1 addition & 1 deletion ift/encoder/candidate_merge.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ struct CandidateMerge {
// If new_patch_size is not provided then this computes a "best case" delta
// where the new patch size is chosen to produce the best achievable delta.
static absl::StatusOr<double> ComputeCostDelta(
const Merger& merger, const common::SegmentSet& merged_segments,
Merger& merger, const common::SegmentSet& merged_segments,
const Segment& merged_segment, std::optional<uint32_t> new_patch_size);

// Computes the predicted change to the total cost if moved_glyphs are
Expand Down
8 changes: 4 additions & 4 deletions ift/encoder/closure_glyph_segmenter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -445,10 +445,6 @@ StatusOr<GlyphSegmentation> ClosureGlyphSegmenter::CodepointToGlyphSegments(
TRYV(context.ReassignInitSubset(new_def));
}

// Before we start merging, make sure the state after init font processing is
// correct.
TRYV(ValidateIncrementalGroupings(face, context));

if (merge_groups.empty()) {
// No merging will be needed so we're done.
return context.ToGlyphSegmentation();
Expand Down Expand Up @@ -494,6 +490,10 @@ StatusOr<GlyphSegmentation> ClosureGlyphSegmenter::CodepointToGlyphSegments(
// Nothing was merged so we're done.
TRYV(ValidateIncrementalGroupings(face, context));
context.patch_size_cache->LogBrotliCallCount();
for (const auto& merger : mergers) {
merger.LogMergedSizeHistogram();
}

return context.ToGlyphSegmentation();
}

Expand Down
15 changes: 13 additions & 2 deletions ift/encoder/merge_strategy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,19 @@ void PrintTo(const MergeStrategy& strategy, std::ostream* os) {
<< " network_overhead = " << strategy.NetworkOverheadCost()
<< std::endl
<< " min_group_size = " << strategy.MinimumGroupSize() << std::endl
<< " optimization_cutoff = " << strategy.OptimizationCutoffFraction()
<< std::endl;
<< " optimization_cutoff = " << strategy.OptimizationCutoffFraction() << std::endl
<< " best_case_size_reduction_fraction = " << strategy.BestCaseSizeReductionFraction() << std::endl;

if (strategy.InitFontMergeThreshold().has_value()) {
*os << " init_font_merge_threshold = " << *strategy.InitFontMergeThreshold() << std::endl;
}
if (strategy.InitFontMergeProbabilityThreshold().has_value()) {
*os << " init_font_merge_probability_threshold = " << *strategy.InitFontMergeProbabilityThreshold() << std::endl;
}
*os << " use_patch_merges = " << strategy.UsePatchMerges() << std::endl
<< " pre_closure_group_size = " << strategy.PreClosureGroupSize() << std::endl
<< " pre_closure_probability_threshold = " << strategy.PreClosureProbabilityThreshold() << std::endl;
*os << std::endl;

if (strategy.init_font_merge_threshold_.has_value()) {
*os << " init_font_merge_threshold = "
Expand Down
23 changes: 21 additions & 2 deletions ift/encoder/merge_strategy.h
Original file line number Diff line number Diff line change
Expand Up @@ -142,10 +142,23 @@ class MergeStrategy {
double OptimizationCutoffFraction() const {
return optimization_cutoff_fraction_;
}

// Sets the optimization cutoff fraction. The value is stored exactly as given;
// no range clamping is applied here.
void SetOptimizationCutoffFraction(double value) {
optimization_cutoff_fraction_ = value;
}

// For best case size reduction computations this sets the assumed smallest
// possible reduction in data (post compression) added to a base patch.
//
// Returns the currently configured fraction. It defaults to 0.5 and
// SetBestCaseSizeReductionFraction() keeps it within [0, 1].
//
// See the comment in segmenter_config.proto for more details.
double BestCaseSizeReductionFraction() const {
return best_case_size_reduction_fraction_;
}

// Sets the best case size reduction fraction, limiting the stored value to
// the range [0, 1].
void SetBestCaseSizeReductionFraction(double value) {
  // Apply the upper bound first, then the lower bound.
  const double capped_above = std::min(1.0, value);
  best_case_size_reduction_fraction_ = std::max(0.0, capped_above);
}

// Configures the threshold (cost delta) for when to merge a segment into
// the init font. If not set then no segments will be merged into the init
// font.
Expand Down Expand Up @@ -189,7 +202,12 @@ class MergeStrategy {
patch_size_max_bytes_ == other.patch_size_max_bytes_ &&
optimization_cutoff_fraction_ ==
other.optimization_cutoff_fraction_ &&
init_font_merge_threshold_ == other.init_font_merge_threshold_;
best_case_size_reduction_fraction_ == other.best_case_size_reduction_fraction_ &&
init_font_merge_threshold_ == other.init_font_merge_threshold_ &&
init_font_merge_probability_threshold_ == other.init_font_merge_probability_threshold_ &&
use_patch_merges_ == other.use_patch_merges_ &&
pre_closure_group_size_ == other.pre_closure_group_size_ &&
pre_closure_probability_threshold_ == other.pre_closure_probability_threshold_;
}

private:
Expand All @@ -210,12 +228,13 @@ class MergeStrategy {
uint32_t patch_size_min_bytes_;
uint32_t patch_size_max_bytes_;
double optimization_cutoff_fraction_ = 0.001;
double best_case_size_reduction_fraction_ = 0.5;
std::optional<double> init_font_merge_threshold_ = std::nullopt;
std::optional<double> init_font_merge_probability_threshold_ = std::nullopt;
bool use_patch_merges_ = false;

uint32_t pre_closure_group_size_ = 1;
double pre_closure_probability_threshold_ = 0.0;
double pre_closure_probability_threshold_ = 1.0;

std::shared_ptr<freq::ProbabilityCalculator> probability_calculator_;
};
Expand Down
22 changes: 22 additions & 0 deletions ift/encoder/merger.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include <optional>

#include "absl/flags/flag.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "common/int_set.h"
Expand All @@ -16,8 +17,15 @@ using absl::StatusOr;
using common::GlyphSet;
using common::SegmentSet;

ABSL_FLAG(bool, record_merged_size_reductions, false,
"When enabled the merger will record the percent size reductions of each assessed merge.");

namespace ift::encoder {

// Reports whether merged size reductions should be recorded, as controlled by
// the --record_merged_size_reductions command line flag.
bool Merger::ShouldRecordMergedSizeReductions() const {
  const bool recording_enabled =
      absl::GetFlag(FLAGS_record_merged_size_reductions);
  return recording_enabled;
}

StatusOr<std::optional<std::pair<segment_index_t, GlyphSet>>>
Merger::TryNextMerge() {
if (strategy_.IsNone()) {
Expand Down Expand Up @@ -704,4 +712,18 @@ Status Merger::ApplyInitFontMove(const GlyphSet& glyphs_to_move, double delta) {
return absl::OkStatus();
}

// Logs the recorded merged size reduction histogram as
// "reduction_percent, count" rows, one row per histogram bucket, preceded by a
// header naming this merger's strategy.
//
// Does nothing unless ShouldRecordMergedSizeReductions() is true.
void Merger::LogMergedSizeHistogram() const {
  if (!ShouldRecordMergedSizeReductions()) {
    return;
  }

  // Build the whole table first so that it is emitted as one log message.
  // '\n' is used instead of std::endl since flushing the in-memory stream
  // after every row serves no purpose.
  std::stringstream histogram_string;
  histogram_string << "reduction_percent, count" << '\n';
  for (const auto& [percent, count] : merged_size_reduction_histogram_) {
    histogram_string << percent << ", " << count << '\n';
  }
  VLOG(0) << "Merged Size Reduction Histogram for "
          << strategy_.Name().value_or("unnamed") << '\n'
          << histogram_string.str();
}

} // namespace ift::encoder
13 changes: 13 additions & 0 deletions ift/encoder/merger.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#define IFT_ENCODER_MERGER_

#include <cstdint>
#include <sstream>

#include "common/int_set.h"
#include "ift/encoder/candidate_merge.h"
Expand Down Expand Up @@ -85,6 +86,15 @@ class Merger {

uint32_t NumInscopeSegments() const { return inscope_segments_.size(); }

// Adds one observation to the merged size reduction histogram.
//
// size_reduction is a fraction (1.0 == 100%). It is converted to a whole
// percent by truncating toward zero, and that percent selects the bucket whose
// count is incremented.
void RecordMergedSizeReduction(double size_reduction) {
  const auto percent_bucket = static_cast<int32_t>(100.0 * size_reduction);
  ++merged_size_reduction_histogram_[percent_bucket];
}

bool ShouldRecordMergedSizeReductions() const;

void LogMergedSizeHistogram() const;

private:
Merger(SegmentationContext& context, MergeStrategy strategy,
common::SegmentSet inscope_segments,
Expand Down Expand Up @@ -180,6 +190,9 @@ class Merger {
// selecting merges. Merging is done via simple selection until minimum group
// sizes are met.
segment_index_t optimization_cutoff_segment_;

// Percent reduction of data beyond the single largest input patch.
absl::btree_map<int32_t, uint32_t> merged_size_reduction_histogram_;
};

} // namespace ift::encoder
Expand Down
26 changes: 25 additions & 1 deletion util/segmenter_config.proto
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,30 @@ message CostConfiguration {
// Value is from [0, 1].
double initial_font_merge_probability_threshold = 8;

// The merger prunes patch pairs from evaluation by looking at a "best case" cost reduction. During
// this calculation the size of the merged patch is estimated using the size of the individual patches
// being merged. The following formula is used:
//
// merged_size =
// largest_individual_patch_size +
// best_case_size_reduction_fraction * sum of remaining patch sizes
//
// Where the individual patch sizes used are post compression. In effect this sets the lower bound for
// how much merged data can shrink beyond its initial compressed size (for the best case computation).
//
// The value is from [0, 1.0]. Lower values make best case pruning less aggressive, which means slower
// run time but may lead to lower final segmentation costs. Higher values make best case pruning more
// aggressive, which means faster run time but if it's set too high the merger may miss good merges.
//
// The current default value of 0.5 was selected by looking at the distribution of size reductions
// in some example fonts. It was found (using brotli 9) that only 0.2% of pairs have a reduction
// less than 0.5. So 0.5 should typically have minimal impact on the final cost, while offering
// a significant run time speedup.
//
// TODO(garretrieger): evaluate this across a larger sampling of fonts and brotli qualities to find a
// more general default value.
double best_case_size_reduction_fraction = 9 [default = 0.5];

// By default merges under cost strategy are made by joining segments together, if this setting is
// enabled then an alternate merge type, patch merge, will be considered by the merger. In a patch
// merge glyphs from two patches are merged together along with the conditions for those patches.
Expand All @@ -209,7 +233,7 @@ message CostConfiguration {
// Work is planned to fix this issue (either in the harfbuzz closure or with a workaround in the segmenter),
// until then it's recommended to not use this except for testing (or if appropriate care has been taken
// in producing the input segments to avoid this issue).
bool experimental_use_patch_merges = 9 [default = false];
bool experimental_use_patch_merges = 10 [default = false];
}

// The merger will choose segments to merge based on a heuristic which primarily utilizes
Expand Down
1 change: 1 addition & 0 deletions util/segmenter_config_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ StatusOr<MergeStrategy> SegmenterConfigUtil::ProtoToStrategy(
strategy.SetUsePatchMerges(merged.experimental_use_patch_merges());

strategy.SetOptimizationCutoffFraction(merged.optimization_cutoff_fraction());
strategy.SetBestCaseSizeReductionFraction(merged.best_case_size_reduction_fraction());

if (merged.has_initial_font_merge_threshold()) {
strategy.SetInitFontMergeThreshold(merged.initial_font_merge_threshold());
Expand Down
27 changes: 27 additions & 0 deletions util/segmenter_config_util_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -446,4 +446,31 @@ TEST_F(SegmenterConfigUtilTest,
{{0}, ExpectedCostStrategy(75)},
{{1}, ExpectedCostStrategy(75)},
}));
}

// Checks that the optimization cutoff fraction and the best case size
// reduction fraction set on a cost config are carried over into the
// MergeStrategy produced by ConfigToMergeGroups().
TEST_F(SegmenterConfigUtilTest, ConfigToMergeGroups_OptimizationSettings) {
  SegmenterConfig config;
  auto* cost_config = config.add_merge_groups()->mutable_cost_config();
  cost_config->set_path_to_frequency_data("test_freq_data.riegeli");
  cost_config->set_optimization_cutoff_fraction(0.12);
  cost_config->set_best_case_size_reduction_fraction(0.34);

  CodepointSet font_codepoints{0x40, 0x42, 0x43, 0x45, 0x47};

  SegmenterConfigUtil util("util/testdata/config.txtpb");

  std::vector<SubsetDefinition> segments_out;
  auto groups = util.ConfigToMergeGroups(config, font_codepoints, segments_out);
  ASSERT_TRUE(groups.ok()) << groups.status();

  // The expected strategy is the default cost strategy with only the two
  // optimization settings overridden.
  MergeStrategy expected = ExpectedCostStrategy(75);
  expected.SetOptimizationCutoffFraction(0.12);
  expected.SetBestCaseSizeReductionFraction(0.34);

  const btree_map<SegmentSet, MergeStrategy> expected_groups{{{2}, expected}};
  ASSERT_EQ(*groups, expected_groups);
}
Loading