Combine SSA optimization passes to reduce CFG traversals #295

@jserv

Description

The SSA optimizer currently makes multiple independent passes over the CFG, each reading and modifying instructions. Combining related optimizations into single traversals should improve cache locality and is expected to reduce optimization time by roughly 25-30%.

Current Implementation

// src/ssa.c - Current multiple passes
void ssa_optimize(func_t *func) {
    sccp(func);           // Pass 1: Full CFG traversal
    cse(func);            // Pass 2: Full CFG traversal
    dce(func);            // Pass 3: Full CFG traversal  
    strength_reduce(func); // Pass 4: Full CFG traversal
    // Each pass separately iterates all blocks and instructions
}

Performance profile for a 10,000-instruction function:

  • 4 separate passes × 10,000 instructions = 40,000 instruction visits
  • Poor cache locality between passes
  • Redundant liveness/dominance recalculation

Proposed Combined Pass Architecture

1. Single-Pass Local Optimizations

// Combine all local (within basic block) optimizations
typedef struct {
    bool changed;
    int constants_propagated;
    int expressions_eliminated;
    int strength_reductions;
    int algebraic_simplifications;
} local_opt_stats_t;

local_opt_stats_t optimize_basic_block(basic_block_t *bb) {
    local_opt_stats_t stats = {0};
    
    // Single pass through instructions
    for (insn_t *insn = bb->first; insn; insn = insn->next) {
        // Try all applicable optimizations on this instruction
        
        // 1. Constant propagation
        if (try_constant_propagation(insn)) {
            stats.constants_propagated++;
            stats.changed = true;
        }
        
        // 2. Algebraic simplification (may enable more constant prop)
        if (try_algebraic_simplification(insn)) {
            stats.algebraic_simplifications++;
            stats.changed = true;
            
            // Retry constant propagation after simplification
            if (try_constant_propagation(insn)) {
                stats.constants_propagated++;
            }
        }
        
        // 3. Strength reduction
        if (try_strength_reduction(insn)) {
            stats.strength_reductions++;
            stats.changed = true;
        }
        
        // 4. Common subexpression elimination
        if (try_eliminate_common_subexpr(insn, bb)) {
            stats.expressions_eliminated++;
            stats.changed = true;
        }
        
        // 5. Peephole optimization
        if (try_peephole_optimization(insn)) {
            stats.changed = true;
        }
    }
    
    return stats;
}
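
As a rough illustration of how the CSE step above could work inside a single block, the following sketch keeps a per-block expression table keyed by (opcode, operands). The expr_key_t type, the expr_table_* helpers, insn_is_pure(), replace_all_uses(), mark_as_dead(), and the insn/bb field names are illustrative assumptions, not existing APIs.

// Hypothetical sketch: value-number style CSE within one basic block.
typedef struct {
    opcode_t op;
    int src1, src2;   // SSA value numbers of the operands
} expr_key_t;

bool try_eliminate_common_subexpr(insn_t *insn, basic_block_t *bb) {
    if (!insn_is_pure(insn))          // skip loads, calls, side effects
        return false;

    expr_key_t key = {insn->op, insn->src1, insn->src2};
    insn_t *prev = expr_table_lookup(bb->expr_table, &key);
    if (prev) {
        // Same expression already computed in this block: reuse its result
        replace_all_uses(insn->dest, prev->dest);
        mark_as_dead(insn);           // leave the node linked so iteration stays valid
        return true;
    }
    expr_table_insert(bb->expr_table, &key, insn);
    return false;
}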

2. Worklist-Driven Global Optimization

// Process blocks in intelligent order
typedef struct {
    basic_block_t **blocks;
    int count;
    int capacity;
    bool *in_worklist;  // Bitmap for O(1) membership test
} worklist_t;

void optimize_function_worklist(func_t *func) {
    worklist_t *worklist = create_worklist(func->bb_count);
    
    // Add all blocks initially
    for (int i = 0; i < func->bb_count; i++) {
        worklist_add(worklist, func->bbs[i]);
    }
    
    // Process until convergence
    while (!worklist_empty(worklist)) {
        basic_block_t *bb = worklist_remove(worklist);
        
        // Apply all local optimizations
        local_opt_stats_t stats = optimize_basic_block(bb);
        
        if (stats.changed) {
            // Add successors to worklist if this block changed
            for (int i = 0; i < bb->succ_count; i++) {
                if (!worklist->in_worklist[bb->successors[i]->id]) {
                    worklist_add(worklist, bb->successors[i]);
                }
            }
            
            // Add users of defined variables (for CSE)
            add_users_to_worklist(bb, worklist);
        }
    }
    
    // One final pass for dead code elimination
    eliminate_dead_code_global(func);
}
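
One possible shape for the worklist helpers used above (create_worklist, worklist_add, worklist_remove, worklist_empty); a minimal sketch, assuming <stdlib.h> and that each basic block carries a dense integer id.

worklist_t *create_worklist(int bb_count) {
    worklist_t *wl = calloc(1, sizeof(worklist_t));
    wl->blocks = malloc(bb_count * sizeof(basic_block_t *));
    wl->capacity = bb_count;
    wl->in_worklist = calloc(bb_count, sizeof(bool));
    return wl;
}

bool worklist_empty(worklist_t *wl) { return wl->count == 0; }

void worklist_add(worklist_t *wl, basic_block_t *bb) {
    if (wl->in_worklist[bb->id])
        return;                       // O(1) duplicate check via the bitmap
    wl->blocks[wl->count++] = bb;
    wl->in_worklist[bb->id] = true;
}

basic_block_t *worklist_remove(worklist_t *wl) {
    basic_block_t *bb = wl->blocks[--wl->count];   // LIFO order
    wl->in_worklist[bb->id] = false;
    return bb;
}

LIFO order is the simplest choice; seeding and reprocessing blocks in reverse post-order instead would typically reach a fixed point with fewer re-visits.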

3. Optimization Context Sharing

// Share analysis results between optimizations
typedef struct {
    // Constant propagation state
    hashmap_t *known_constants;
    
    // CSE state
    hashmap_t *available_expressions;
    
    // Liveness information (computed once)
    bitset_t **live_in;
    bitset_t **live_out;
    
    // Dominance information (computed once)
    int *idom;  // Immediate dominators
    bitset_t **dominance_frontier;
    
    // Statistics
    opt_statistics_t stats;
} optimization_context_t;

void optimize_with_context(func_t *func) {
    optimization_context_t *ctx = create_opt_context(func);
    
    // Compute analyses once
    compute_liveness(func, ctx->live_in, ctx->live_out);
    compute_dominance(func, ctx->idom, ctx->dominance_frontier);
    
    // Run optimizations sharing context
    bool changed;
    int iteration = 0;
    
    do {
        changed = false;
        
        for (int i = 0; i < func->bb_count; i++) {
            basic_block_t *bb = func->bbs[i];
            // All optimizations use shared context
            changed |= propagate_constants_bb(bb, ctx);
            changed |= eliminate_expressions_bb(bb, ctx);
            changed |= simplify_algebraic_bb(bb, ctx);
        }
        
        iteration++;
    } while (changed && iteration < MAX_ITERATIONS);
    
    // Final cleanup
    remove_dead_code(func, ctx);
    
    print_optimization_stats(&ctx->stats);
}
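
A sketch of how create_opt_context() might allocate the shared state once per function. hashmap_create(), bitset_create(), and func->var_count are assumed names, not confirmed APIs.

optimization_context_t *create_opt_context(func_t *func) {
    optimization_context_t *ctx = calloc(1, sizeof(optimization_context_t));

    ctx->known_constants = hashmap_create();
    ctx->available_expressions = hashmap_create();

    ctx->live_in = calloc(func->bb_count, sizeof(bitset_t *));
    ctx->live_out = calloc(func->bb_count, sizeof(bitset_t *));
    ctx->dominance_frontier = calloc(func->bb_count, sizeof(bitset_t *));
    ctx->idom = malloc(func->bb_count * sizeof(int));

    for (int i = 0; i < func->bb_count; i++) {
        ctx->live_in[i] = bitset_create(func->var_count);
        ctx->live_out[i] = bitset_create(func->var_count);
        ctx->dominance_frontier[i] = bitset_create(func->bb_count);
    }
    return ctx;
}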

4. Cascading Optimizations

// Enable cascading optimizations in single pass
bool optimize_instruction_cascade(insn_t *insn, optimization_context_t *ctx) {
    bool changed = false;
    bool local_changed;
    
    // Keep applying optimizations until fixed point
    do {
        local_changed = false;
        
        // Constant folding might enable simplification
        if (fold_constants(insn, ctx)) {
            local_changed = true;
            ctx->stats.constants_folded++;
        }
        
        // Simplification might enable more folding
        if (simplify_algebraic(insn, ctx)) {
            local_changed = true;
            ctx->stats.algebraic_simplified++;
            
            // Try constant folding again
            if (fold_constants(insn, ctx)) {
                ctx->stats.constants_folded++;
            }
        }
        
        // Strength reduction on simplified expression
        if (reduce_strength(insn, ctx)) {
            local_changed = true;
            ctx->stats.strength_reduced++;
        }
        
        changed |= local_changed;
    } while (local_changed);
    
    return changed;
}

5. Fast Pattern Matching

// Optimize pattern matching for common cases
typedef struct {
    opcode_t op;
    bool (*matcher)(insn_t *);
    void (*optimizer)(insn_t *, optimization_context_t *);
} optimization_pattern_t;

// Pre-sorted by frequency for better branch prediction
optimization_pattern_t patterns[] = {
    {OP_ADD, match_add_zero, optimize_identity},
    {OP_MUL, match_mul_power2, optimize_mul_to_shift},
    {OP_SUB, match_sub_self, optimize_to_zero},
    {OP_DIV, match_div_power2, optimize_div_to_shift},
    // ... more patterns
};

void apply_patterns(insn_t *insn, optimization_context_t *ctx) {
    // Fast path for common opcodes
    if (insn->op >= OP_ADD && insn->op <= OP_XOR) {
        int pattern_idx = pattern_lookup_table[insn->op];
        if (pattern_idx >= 0) {
            optimization_pattern_t *p = &patterns[pattern_idx];
            if (p->matcher(insn)) {
                p->optimizer(insn, ctx);
                ctx->stats.patterns_matched++;
            }
        }
    }
}
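
The fast path above indexes pattern_lookup_table, which is not defined in the snippet. One way to build it at startup is sketched below; OP_MAX is an assumed enumerator marking the number of opcodes.

// Sketch: map each opcode to the first matching entry in patterns[], or -1.
int pattern_lookup_table[OP_MAX];

void init_pattern_lookup_table(void) {
    for (int op = 0; op < OP_MAX; op++)
        pattern_lookup_table[op] = -1;

    int n = sizeof(patterns) / sizeof(patterns[0]);
    for (int i = 0; i < n; i++) {
        if (pattern_lookup_table[patterns[i].op] < 0)
            pattern_lookup_table[patterns[i].op] = i;
    }
}

This keeps one pattern per opcode; if several patterns share an opcode, the table would need to point at a per-opcode list instead.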

6. Batch Processing for Cache Efficiency

// Process multiple instructions together for better cache usage
void optimize_instruction_batch(insn_t **batch, int count, 
                               optimization_context_t *ctx) {
    // Prefetch an instruction further down the batch while the first
    // entries are processed
    if (count > 4) {
        __builtin_prefetch(batch[4], 0, 1);
    }
    
    // Process batch
    for (int i = 0; i < count; i++) {
        insn_t *insn = batch[i];
        
        // All optimizations on same instruction (cache hot)
        optimize_instruction_cascade(insn, ctx);
    }
}

void optimize_with_batching(func_t *func) {
    #define BATCH_SIZE 8
    insn_t *batch[BATCH_SIZE];
    int batch_count = 0;
    optimization_context_t *ctx = create_opt_context(func);

    for (int i = 0; i < func->bb_count; i++) {
        basic_block_t *bb = func->bbs[i];
        for (insn_t *insn = bb->first; insn; insn = insn->next) {
            batch[batch_count++] = insn;

            if (batch_count == BATCH_SIZE) {
                optimize_instruction_batch(batch, batch_count, ctx);
                batch_count = 0;
            }
        }
    }

    // Process remaining instructions
    if (batch_count > 0) {
        optimize_instruction_batch(batch, batch_count, ctx);
    }
}

Performance Comparison

Benchmark Results

// Test with 10,000 instruction function
void benchmark_optimization_passes() {
    func_t *func = generate_large_function(10000);
    
    // Old approach
    clock_t start = clock();
    ssa_optimize_old(func);
    clock_t old_time = clock() - start;
    
    // New combined approach
    func = generate_large_function(10000);  // Reset
    start = clock();
    optimize_with_context(func);
    clock_t new_time = clock() - start;
    
    printf("Old approach: %.3f ms\n", old_time * 1000.0 / CLOCKS_PER_SEC);
    printf("New approach: %.3f ms\n", new_time * 1000.0 / CLOCKS_PER_SEC);
    printf("Speedup: %.2fx\n", (float)old_time / new_time);
}

Integration Plan

  1. Phase 1: Implement combined local optimizations
  2. Phase 2: Add worklist infrastructure
  3. Phase 3: Share optimization context
  4. Phase 4: Benchmark and tune
  5. Phase 5: Remove old separate passes

Testing Strategy

# Correctness: ensure same optimization results
make test-opt-equivalence

# Performance: measure improvement
make benchmark-optimization

# Quality: ensure no regression in generated code
make test-code-quality
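
A rough idea of what test-opt-equivalence could check: run the existing and combined pipelines on the same input and compare the resulting instruction streams. clone_function() and compare_ir() are hypothetical helpers.

// Sketch of an equivalence check between the old and combined pipelines.
bool check_opt_equivalence(func_t *func) {
    func_t *a = clone_function(func);
    func_t *b = clone_function(func);

    ssa_optimize_old(a);        // existing multi-pass pipeline
    optimize_with_context(b);   // proposed combined pipeline

    // "Equivalent" here means the same instruction count and the same
    // opcodes/operands after canonicalization; a stricter check could
    // diff execution results of compiled test programs instead.
    return compare_ir(a, b);
}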

Success Criteria

  • 25%+ reduction in optimization time
  • Same or better code quality
  • Reduced memory usage
  • All tests pass
  • Maintainable code structure
