Skip to content

Replace linear symbol lookup with hash table for O(1) performance #294

@jserv

Description

@jserv

Description

Symbol lookup currently uses linear search through linked lists, consuming 35% of compilation time for large programs. Implementing a hash table would reduce lookup complexity from O(n) to O(1) average case.

Current Implementation

// src/globals.c - Current linear search
symbol_t *find_symbol(char *name) {
    for (symbol_t *sym = symbols; sym; sym = sym->next) {
        if (strcmp(sym->name, name) == 0)
            return sym;
    }
    return NULL;
}

Performance profile shows:

  • 10,000 symbols: ~350ms lookup time (35% of total)
  • 1,000 symbols: ~35ms lookup time (20% of total)
  • 100 symbols: ~3ms lookup time (5% of total)

Proposed Hash Table Implementation

1. Data Structures

// Add to src/defs.h
#define SYMBOL_HASH_SIZE 1021  // Prime number for better distribution

typedef struct symbol_entry {
    symbol_t *symbol;
    struct symbol_entry *next;  // Chain for collision resolution
} symbol_entry_t;

typedef struct {
    symbol_entry_t *buckets[SYMBOL_HASH_SIZE];
    int total_symbols;
    int max_chain_length;  // For statistics
    
    // Statistics
    int lookups;
    int collisions;
    int hits;
    int misses;
} symbol_table_t;

// Global symbol tables
typedef struct {
    symbol_table_t *global_symbols;
    symbol_table_t *local_symbols;  // Current function scope
    symbol_table_t *types;          // Type definitions
} symbol_context_t;

2. Hash Function

// src/symbol_table.c - New file
// DJB2 hash function - simple and effective
uint32_t hash_string(const char *str) {
    uint32_t hash = 5381;
    int c;
    
    while ((c = *str++)) {
        hash = ((hash << 5) + hash) + c;  // hash * 33 + c
    }
    
    return hash;
}

int get_bucket_index(const char *name) {
    return hash_string(name) % SYMBOL_HASH_SIZE;
}

3. Symbol Table Operations

symbol_table_t *create_symbol_table(void) {
    symbol_table_t *table = calloc(1, sizeof(symbol_table_t));
    return table;
}

void insert_symbol(symbol_table_t *table, symbol_t *symbol) {
    int index = get_bucket_index(symbol->name);
    
    // Create new entry
    symbol_entry_t *entry = malloc(sizeof(symbol_entry_t));
    entry->symbol = symbol;
    
    // Insert at head of chain (O(1) insertion)
    entry->next = table->buckets[index];
    table->buckets[index] = entry;
    
    // Update statistics
    table->total_symbols++;
    if (entry->next) {
        table->collisions++;
    }
    
    // Track max chain length
    int chain_len = 0;
    for (symbol_entry_t *e = entry; e; e = e->next) {
        chain_len++;
    }
    if (chain_len > table->max_chain_length) {
        table->max_chain_length = chain_len;
    }
}

symbol_t *lookup_symbol(symbol_table_t *table, const char *name) {
    table->lookups++;
    
    int index = get_bucket_index(name);
    symbol_entry_t *entry = table->buckets[index];
    
    // Search chain
    while (entry) {
        if (strcmp(entry->symbol->name, name) == 0) {
            table->hits++;
            return entry->symbol;
        }
        entry = entry->next;
    }
    
    table->misses++;
    return NULL;
}

// Replace global find_symbol
symbol_t *find_symbol_fast(const char *name) {
    symbol_t *sym = NULL;
    
    // Check local scope first
    if (current_context->local_symbols) {
        sym = lookup_symbol(current_context->local_symbols, name);
        if (sym) return sym;
    }
    
    // Check global scope
    return lookup_symbol(current_context->global_symbols, name);
}

4. Dynamic Resizing (Optional Enhancement)

void resize_symbol_table(symbol_table_t *table) {
    // Only resize if load factor > 0.75
    float load_factor = (float)table->total_symbols / SYMBOL_HASH_SIZE;
    if (load_factor < 0.75) return;
    
    // Save old buckets
    symbol_entry_t *old_buckets[SYMBOL_HASH_SIZE];
    memcpy(old_buckets, table->buckets, sizeof(old_buckets));
    
    // Clear table
    memset(table->buckets, 0, sizeof(table->buckets));
    table->total_symbols = 0;
    table->collisions = 0;
    
    // Rehash all symbols with new size
    for (int i = 0; i < SYMBOL_HASH_SIZE; i++) {
        symbol_entry_t *entry = old_buckets[i];
        while (entry) {
            symbol_entry_t *next = entry->next;
            entry->next = NULL;
            
            // Reinsert with new hash
            int new_index = get_bucket_index(entry->symbol->name);
            entry->next = table->buckets[new_index];
            table->buckets[new_index] = entry;
            table->total_symbols++;
            
            entry = next;
        }
    }
}

5. Integration Points

// Update src/parser.c
void enter_scope(void) {
    // Create new local symbol table
    current_context->local_symbols = create_symbol_table();
}

void exit_scope(void) {
    // Print statistics if verbose
    if (verbose_mode) {
        print_symbol_table_stats(current_context->local_symbols);
    }
    
    // Free local symbol table
    free_symbol_table(current_context->local_symbols);
    current_context->local_symbols = NULL;
}

// Update symbol creation
symbol_t *create_symbol(const char *name, type_t *type) {
    symbol_t *sym = malloc(sizeof(symbol_t));
    sym->name = strdup(name);
    sym->type = type;
    
    // Insert into appropriate table
    if (current_scope == SCOPE_GLOBAL) {
        insert_symbol(current_context->global_symbols, sym);
    } else {
        insert_symbol(current_context->local_symbols, sym);
    }
    
    return sym;
}

6. Performance Monitoring

void print_symbol_table_stats(symbol_table_t *table) {
    printf("=== Symbol Table Statistics ===\n");
    printf("Total symbols:     %d\n", table->total_symbols);
    printf("Hash buckets:      %d\n", SYMBOL_HASH_SIZE);
    printf("Load factor:       %.2f\n", 
           (float)table->total_symbols / SYMBOL_HASH_SIZE);
    printf("Collisions:        %d (%.1f%%)\n", 
           table->collisions,
           table->collisions * 100.0 / table->total_symbols);
    printf("Max chain length:  %d\n", table->max_chain_length);
    printf("Total lookups:     %d\n", table->lookups);
    printf("Hit rate:          %.1f%%\n", 
           table->hits * 100.0 / table->lookups);
    
    // Distribution analysis
    int empty = 0, chains = 0;
    for (int i = 0; i < SYMBOL_HASH_SIZE; i++) {
        if (!table->buckets[i]) {
            empty++;
        } else if (table->buckets[i]->next) {
            chains++;
        }
    }
    printf("Empty buckets:     %d (%.1f%%)\n", 
           empty, empty * 100.0 / SYMBOL_HASH_SIZE);
    printf("Chains:            %d\n", chains);
}

Testing

Performance Benchmark

// tests/perf/symbol_lookup.c
#define NUM_SYMBOLS 10000

void benchmark_symbol_lookup() {
    clock_t start, end;
    
    // Create many symbols
    for (int i = 0; i < NUM_SYMBOLS; i++) {
        char name[32];
        sprintf(name, "symbol_%d", i);
        create_symbol(name, &type_int);
    }
    
    // Benchmark lookups
    start = clock();
    for (int i = 0; i < NUM_SYMBOLS * 10; i++) {
        char name[32];
        sprintf(name, "symbol_%d", rand() % NUM_SYMBOLS);
        find_symbol_fast(name);
    }
    end = clock();
    
    double time_spent = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("100,000 lookups in %.3f seconds\n", time_spent);
    printf("Average: %.3f microseconds per lookup\n", 
           time_spent * 1000000 / (NUM_SYMBOLS * 10));
}

Correctness Tests

// Verify hash table behaves identically to linear search
void test_symbol_table_correctness() {
    // Test insertion and lookup
    symbol_t *s1 = create_symbol("test1", &type_int);
    symbol_t *s2 = create_symbol("test2", &type_char);
    
    assert(find_symbol_fast("test1") == s1);
    assert(find_symbol_fast("test2") == s2);
    assert(find_symbol_fast("nonexistent") == NULL);
    
    // Test collision handling
    // Force collisions by creating symbols with same hash
    for (int i = 0; i < 100; i++) {
        char name[32];
        sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE);
        create_symbol(name, &type_int);
    }
    
    // Verify all can be found
    for (int i = 0; i < 100; i++) {
        char name[32];
        sprintf(name, "collision_%d", i * SYMBOL_HASH_SIZE);
        assert(find_symbol_fast(name) != NULL);
    }
}

Migration Plan

  1. Phase 1: Implement hash table alongside existing code
  2. Phase 2: Add compatibility layer with fallback
  3. Phase 3: Migrate all symbol lookups
  4. Phase 4: Remove old linear search code

Success Criteria

  • O(1) average lookup time
  • No functionality regression
  • 30%+ speedup for large programs
  • Hash collision rate < 10%
  • All existing tests pass

Metadata

Metadata

Assignees

No one assigned

    Labels

    enhancementNew feature or request

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions