Description
The SSA optimizer currently makes multiple independent passes over the CFG, each reading and modifying instructions. Combining related optimizations into single traversals would improve cache locality and is estimated to reduce compilation time by 25-30%.
Current Implementation
// src/ssa.c - current multiple passes
void ssa_optimize(func_t *func) {
    sccp(func);            // Pass 1: full CFG traversal
    cse(func);             // Pass 2: full CFG traversal
    dce(func);             // Pass 3: full CFG traversal
    strength_reduce(func); // Pass 4: full CFG traversal
    // Each pass separately iterates over all blocks and instructions
}
Performance profile for a 10,000-instruction function:
- 4 separate passes × 10,000 instructions = 40,000 instruction visits
- Poor cache locality between passes
- Redundant liveness/dominance recalculation
Proposed Combined Pass Architecture
1. Single-Pass Local Optimizations
// Combine all local (within-basic-block) optimizations
typedef struct {
    bool changed;
    int constants_propagated;
    int expressions_eliminated;
    int strength_reductions;
    int algebraic_simplifications;
} local_opt_stats_t;
local_opt_stats_t optimize_basic_block(basic_block_t *bb) {
    local_opt_stats_t stats = {0};
    // Single pass through the block's instructions
    for (insn_t *insn = bb->first; insn; insn = insn->next) {
        // Try all applicable optimizations on this instruction
        // 1. Constant propagation
        if (try_constant_propagation(insn)) {
            stats.constants_propagated++;
            stats.changed = true;
        }
        // 2. Algebraic simplification (may enable more constant propagation)
        if (try_algebraic_simplification(insn)) {
            stats.algebraic_simplifications++;
            stats.changed = true;
            // Retry constant propagation after simplification
            if (try_constant_propagation(insn)) {
                stats.constants_propagated++;
            }
        }
        // 3. Strength reduction
        if (try_strength_reduction(insn)) {
            stats.strength_reductions++;
            stats.changed = true;
        }
        // 4. Common subexpression elimination
        if (try_eliminate_common_subexpr(insn, bb)) {
            stats.expressions_eliminated++;
            stats.changed = true;
        }
        // 5. Peephole optimization
        if (try_peephole_optimization(insn)) {
            stats.changed = true;
        }
    }
    return stats;
}
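The try_* helpers above are assumed rather than existing API. As one concrete illustration, a minimal try_strength_reduction could rewrite a multiply by a power of two into a shift; the names used here (insn->op, insn->src2, operand_is_const, const_value, set_src2_const, OP_LSHIFT) are hypothetical stand-ins for the project's actual IR accessors:

// Minimal sketch only; the IR field and helper names are assumptions
bool try_strength_reduction(insn_t *insn) {
    // Rewrite "x * 2^k" as "x << k"
    if (insn->op == OP_MUL && operand_is_const(insn->src2)) {
        int c = const_value(insn->src2);
        if (c > 0 && (c & (c - 1)) == 0) { // power of two?
            insn->op = OP_LSHIFT;
            set_src2_const(insn, __builtin_ctz(c)); // shift amount = log2(c)
            return true;
        }
    }
    return false;
}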
2. Worklist-Driven Global Optimization
// Process blocks in an intelligent order
typedef struct {
    basic_block_t **blocks;
    int count;
    int capacity;
    bool *in_worklist; // Bitmap for O(1) membership test
} worklist_t;
void optimize_function_worklist(func_t *func) {
    worklist_t *worklist = create_worklist(func->bb_count);
    // Add all blocks initially
    for (int i = 0; i < func->bb_count; i++) {
        worklist_add(worklist, func->bbs[i]);
    }
    // Process until convergence
    while (!worklist_empty(worklist)) {
        basic_block_t *bb = worklist_remove(worklist);
        // Apply all local optimizations
        local_opt_stats_t stats = optimize_basic_block(bb);
        if (stats.changed) {
            // Add successors to the worklist if this block changed
            for (int i = 0; i < bb->succ_count; i++) {
                if (!worklist->in_worklist[bb->successors[i]->id]) {
                    worklist_add(worklist, bb->successors[i]);
                }
            }
            // Add users of defined variables (for CSE)
            add_users_to_worklist(bb, worklist);
        }
    }
    // One final pass for dead code elimination
    eliminate_dead_code_global(func);
}
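The worklist helpers used above are not spelled out in this proposal; a minimal sketch follows, assuming a LIFO stack and block ids that are dense in [0, bb_count). Because the bitmap suppresses duplicates, a block appears at most once, so bb_count slots always suffice:

#include <stdbool.h>
#include <stdlib.h>

worklist_t *create_worklist(int bb_count) {
    worklist_t *wl = calloc(1, sizeof(worklist_t));
    wl->blocks = malloc(bb_count * sizeof(basic_block_t *));
    wl->capacity = bb_count;
    wl->in_worklist = calloc(bb_count, sizeof(bool));
    return wl;
}

void worklist_add(worklist_t *wl, basic_block_t *bb) {
    if (wl->in_worklist[bb->id])
        return; // O(1) duplicate suppression via the bitmap
    wl->blocks[wl->count++] = bb;
    wl->in_worklist[bb->id] = true;
}

basic_block_t *worklist_remove(worklist_t *wl) {
    basic_block_t *bb = wl->blocks[--wl->count]; // LIFO order
    wl->in_worklist[bb->id] = false;
    return bb;
}

bool worklist_empty(const worklist_t *wl) {
    return wl->count == 0;
}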
3. Optimization Context Sharing
// Share analysis results between optimizations
typedef struct {
    // Constant propagation state
    hashmap_t *known_constants;
    // CSE state
    hashmap_t *available_expressions;
    // Liveness information (computed once)
    bitset_t **live_in;
    bitset_t **live_out;
    // Dominance information (computed once)
    int *idom; // Immediate dominators
    bitset_t **dominance_frontier;
    // Statistics
    opt_statistics_t stats;
} optimization_context_t;
void optimize_with_context(func_t *func) {
    optimization_context_t *ctx = create_opt_context(func);
    // Compute analyses once, up front
    compute_liveness(func, ctx->live_in, ctx->live_out);
    compute_dominance(func, ctx->idom, ctx->dominance_frontier);
    // Run optimizations that share the context
    bool changed;
    int iteration = 0;
    do {
        changed = false;
        for (int i = 0; i < func->bb_count; i++) {
            basic_block_t *bb = func->bbs[i];
            // All optimizations use the shared context
            changed |= propagate_constants_bb(bb, ctx);
            changed |= eliminate_expressions_bb(bb, ctx);
            changed |= simplify_algebraic_bb(bb, ctx);
        }
        iteration++;
    } while (changed && iteration < MAX_ITERATIONS);
    // Final cleanup
    remove_dead_code(func, ctx);
    print_optimization_stats(&ctx->stats);
}
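create_opt_context is referenced but not defined above; a minimal allocation sketch, assuming hashmap_create is the hashmap constructor (a hypothetical name) and that the per-block sets are populated later by compute_liveness/compute_dominance:

optimization_context_t *create_opt_context(func_t *func) {
    optimization_context_t *ctx = calloc(1, sizeof(*ctx));
    ctx->known_constants = hashmap_create();
    ctx->available_expressions = hashmap_create();
    // One slot per basic block; the analysis passes fill these in
    ctx->live_in = calloc(func->bb_count, sizeof(bitset_t *));
    ctx->live_out = calloc(func->bb_count, sizeof(bitset_t *));
    ctx->dominance_frontier = calloc(func->bb_count, sizeof(bitset_t *));
    ctx->idom = calloc(func->bb_count, sizeof(int));
    return ctx;
}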
4. Cascading Optimizations
// Enable cascading optimizations in a single pass
bool optimize_instruction_cascade(insn_t *insn, optimization_context_t *ctx) {
    bool changed = false;
    bool local_changed;
    // Keep applying optimizations until a fixed point is reached
    do {
        local_changed = false;
        // Constant folding might enable simplification
        if (fold_constants(insn, ctx)) {
            local_changed = true;
            ctx->stats.constants_folded++;
        }
        // Simplification might enable more folding
        if (simplify_algebraic(insn, ctx)) {
            local_changed = true;
            ctx->stats.algebraic_simplified++;
            // Try constant folding again
            if (fold_constants(insn, ctx)) {
                ctx->stats.constants_folded++;
            }
        }
        // Strength reduction on the simplified expression
        if (reduce_strength(insn, ctx)) {
            local_changed = true;
            ctx->stats.strength_reduced++;
        }
        changed |= local_changed;
    } while (local_changed);
    return changed;
}
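For illustration, here is how a cascade plays out on a short SSA fragment (the values are invented for the example):

// Illustrative cascade:
//   t1 = 3 * 4     -- fold_constants:     t1 = 12
//   t2 = t1 * 1    -- simplify_algebraic: t2 = t1; folding then gives t2 = 12
//   t3 = t2 * 8    -- reduce_strength:    t3 = t2 << 3; folding gives t3 = 96
// The do/while loop repeats until an iteration makes no change.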
5. Fast Pattern Matching
// Optimize pattern matching for common cases
typedef struct {
    opcode_t op;
    bool (*matcher)(insn_t *);
    void (*optimizer)(insn_t *, optimization_context_t *);
} optimization_pattern_t;
// Pre-sorted by frequency for better branch prediction
optimization_pattern_t patterns[] = {
    {OP_ADD, match_add_zero, optimize_identity},
    {OP_MUL, match_mul_power2, optimize_mul_to_shift},
    {OP_SUB, match_sub_self, optimize_to_zero},
    {OP_DIV, match_div_power2, optimize_div_to_shift},
    // ... more patterns
};
void apply_patterns(insn_t *insn, optimization_context_t *ctx) {
    // Fast path for common opcodes
    if (insn->op >= OP_ADD && insn->op <= OP_XOR) {
        int pattern_idx = pattern_lookup_table[insn->op];
        if (pattern_idx >= 0) {
            optimization_pattern_t *p = &patterns[pattern_idx];
            if (p->matcher(insn)) {
                p->optimizer(insn, ctx);
                ctx->stats.patterns_matched++;
            }
        }
    }
}
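pattern_lookup_table is used above but never built; a minimal initialization sketch follows (OP_MAX is a hypothetical sentinel for the number of opcodes). Note that this keeps one pattern per opcode; opcodes with several candidate patterns would need a small per-opcode list instead:

int pattern_lookup_table[OP_MAX];

void init_pattern_lookup_table(void) {
    // -1 marks opcodes with no registered pattern
    for (int i = 0; i < OP_MAX; i++)
        pattern_lookup_table[i] = -1;
    // Map each opcode to its entry in patterns[]
    int n = sizeof(patterns) / sizeof(patterns[0]);
    for (int i = 0; i < n; i++)
        pattern_lookup_table[patterns[i].op] = i;
}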
6. Batch Processing for Cache Efficiency
// Process multiple instructions together for better cache usage
void optimize_instruction_batch(insn_t **batch, int count,
                                optimization_context_t *ctx) {
    for (int i = 0; i < count; i++) {
        // Prefetch a few entries ahead while processing the current one,
        // without reading past the filled portion of the batch
        if (i + 4 < count)
            __builtin_prefetch(batch[i + 4], 0, 1);
        insn_t *insn = batch[i];
        // All optimizations run on the same instruction while it is cache-hot
        optimize_instruction_cascade(insn, ctx);
    }
}
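The prefetch distance of four and the batch size of eight below are starting points rather than measured optima; both should be tuned against the target machine during the Phase 4 benchmarking step.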
void optimize_with_batching(func_t *func) {
#define BATCH_SIZE 8
    insn_t *batch[BATCH_SIZE];
    int batch_count = 0;
    optimization_context_t *ctx = create_opt_context(func);
    for (int i = 0; i < func->bb_count; i++) {
        for (insn_t *insn = func->bbs[i]->first; insn; insn = insn->next) {
            batch[batch_count++] = insn;
            if (batch_count == BATCH_SIZE) {
                optimize_instruction_batch(batch, batch_count, ctx);
                batch_count = 0;
            }
        }
    }
    // Process the remaining partial batch
    if (batch_count > 0) {
        optimize_instruction_batch(batch, batch_count, ctx);
    }
}
Performance Comparison
Benchmark Results
// Test with a 10,000-instruction function
void benchmark_optimization_passes() {
    func_t *func = generate_large_function(10000);
    // Old approach
    clock_t start = clock();
    ssa_optimize_old(func);
    clock_t old_time = clock() - start;
    // New combined approach
    func = generate_large_function(10000); // Fresh, unoptimized input
    start = clock();
    optimize_with_context(func);
    clock_t new_time = clock() - start;
    printf("Old approach: %.3f ms\n", old_time * 1000.0 / CLOCKS_PER_SEC);
    printf("New approach: %.3f ms\n", new_time * 1000.0 / CLOCKS_PER_SEC);
    printf("Speedup: %.2fx\n", (float)old_time / new_time);
}
Integration Plan
- Phase 1: Implement combined local optimizations
- Phase 2: Add worklist infrastructure
- Phase 3: Share optimization context
- Phase 4: Benchmark and tune
- Phase 5: Remove old separate passes
Testing Strategy
# Correctness: ensure same optimization results
make test-opt-equivalence
# Performance: measure improvement
make benchmark-optimization
# Quality: ensure no regression in generated code
make test-code-quality
Success Criteria
- 25%+ reduction in optimization time
- Same or better code quality
- Reduced memory usage
- All tests pass
- Maintainable code structure