In [12]:
import os
import numpy as np
import struct
import re
from enum import Enum
from typing import List, Dict, Tuple, Any, Optional
from dataclasses import dataclass

# Default geometry of the PIM memory array. These four values are used as the
# default constructor arguments of the PIMMemoryMapper defined in this cell.
# 512 rows per bank is exactly what a 9-bit row address field can index.
BANK_COUNT = 8          # Number of memory banks
ROWS_PER_BANK = 512     # Number of rows per bank
COLUMNS_PER_ROW = 64    # Number of columns per row
CLUSTERS_PER_BANK = 8   # Number of clusters per bank (for processing)

# Instruction Format (24-bit)
# [2-bit: op][6-bit: arg1][1-bit: RD][1-bit: WR][9-bit: addr][5-bit: reserved]

class OpCode(Enum):
    """Values of the 2-bit operation field of a 24-bit PIM instruction.

    The instruction generator shifts these values left by 22 bits to place
    them in the two most significant bits of the instruction word.
    """
    NOOP = 0    # No operation
    PROG = 1    # Program LUT cores
    EXE = 2     # Execute operation
    END = 3     # End execution

class FunctionCode(Enum):
    """Function selectors carried in the 6-bit argument of an EXE instruction.

    The instruction formatter maps these values to the mnemonics MAC, RELU
    and MAX_INDEX.
    """
    MAC_OP = 0       # Multiply-Accumulate
    RELU_OP = 1      # ReLU activation
    MAX_INDEX_OP = 2 # Max index finder

# PART 2: Memory Mapper
# ---------------------

@dataclass
class MemoryAddress:
    """A (bank, row, column) location in PIM memory.

    Instances order lexicographically: first by bank, then by row, then by
    column. The column defaults to 0.
    """
    bank: int
    row: int
    col: int = 0

    def __lt__(self, other):
        """Return True when this address sorts strictly before ``other``."""
        own_key = (self.bank, self.row, self.col)
        other_key = (other.bank, other.row, other.col)
        return own_key < other_key

    def __str__(self):
        """Return a human-readable description of the address."""
        return f"Bank {self.bank}, Row {self.row}, Col {self.col}"

@dataclass
class MatrixAllocation:
    """Bookkeeping record for one matrix placed in PIM memory.

    Instances are created by the memory mapper's ``allocate_matrix`` and
    stored in its ``allocations`` dictionary keyed by ``name``.
    """
    name: str               # Identifier the matrix was allocated under
    address: MemoryAddress  # Location of the matrix's first row
    rows: int               # Number of matrix rows
    cols: int               # Number of matrix columns

class PIMMemoryMapper:
    """Maps matrices to PIM memory architecture.

    Each matrix row occupies one memory row. Matrices are placed one after
    another, bank by bank, and a matrix is always kept inside a single bank
    so that ``address.row + i`` is a valid row of that bank for every matrix
    row ``i``.
    """

    def __init__(self, bank_count=BANK_COUNT, rows_per_bank=ROWS_PER_BANK,
                 cols_per_row=COLUMNS_PER_ROW, clusters_per_bank=CLUSTERS_PER_BANK):
        """Create an empty mapper for the given memory geometry.

        Args:
            bank_count: Number of memory banks available for allocation.
            rows_per_bank: Number of rows in each bank.
            cols_per_row: Number of columns per row (stored, not used for
                placement decisions).
            clusters_per_bank: Number of processing clusters per bank
                (stored, not used for placement decisions).
        """
        self.bank_count = bank_count
        self.rows_per_bank = rows_per_bank
        self.cols_per_row = cols_per_row
        self.clusters_per_bank = clusters_per_bank
        # Next free location. next_free_bank == bank_count means "memory full".
        self.next_free_bank = 0
        self.next_free_row = 0
        self.allocations: Dict[str, MatrixAllocation] = {}

    def allocate_matrix(self, name: str, rows: int, cols: int) -> MemoryAddress:
        """Allocate memory for a matrix.

        Args:
            name: Key under which the allocation is recorded.
            rows: Number of matrix rows (one memory row each).
            cols: Number of matrix columns (recorded only).

        Returns:
            The address of the matrix's first row.

        Raises:
            ValueError: If the matrix has more rows than a bank, or if every
                bank is already full.
        """
        if rows > self.rows_per_bank:
            raise ValueError(f"Matrix {name} with {rows} rows exceeds bank capacity of {self.rows_per_bank}")

        bank, row = self.next_free_bank, self.next_free_row

        # A matrix must not run past the end of its bank: the row address is
        # taken modulo rows_per_bank, so a spill would silently alias rows
        # that belong to earlier allocations. Start a fresh bank instead.
        if row + rows > self.rows_per_bank:
            bank, row = bank + 1, 0

        # Refuse to wrap around to bank 0 and overwrite existing matrices.
        if bank >= self.bank_count:
            raise ValueError(
                f"Out of PIM memory while allocating matrix {name}: "
                f"all {self.bank_count} banks are full"
            )

        addr = MemoryAddress(bank=bank, row=row)
        self.allocations[name] = MatrixAllocation(
            name=name,
            address=addr,
            rows=rows,
            cols=cols
        )

        # Update next free location
        row += rows
        if row >= self.rows_per_bank:
            bank, row = bank + 1, 0
        self.next_free_bank, self.next_free_row = bank, row

        print(f"Allocated matrix {name} ({rows}x{cols}) at {addr}")
        return addr

    def get_row_address(self, matrix_name: str, i: int, j: int) -> int:
        """Get physical row address for matrix element (i,j).

        Only the row index ``i`` influences the result; ``j`` is accepted for
        interface symmetry and ignored.

        Raises:
            ValueError: If ``matrix_name`` has not been allocated.
        """
        if matrix_name not in self.allocations:
            raise ValueError(f"Matrix not allocated: {matrix_name}")

        alloc = self.allocations[matrix_name]
        return (alloc.address.row + i) % self.rows_per_bank

    def get_cluster_mapping(self, matrix_a: str, matrix_b: str, matrix_c: str) -> List[Tuple[int, int]]:
        """Return the order in which result elements are computed.

        The current mapping is simply every ``(i, j)`` index of the result
        matrix in row-major order; ``matrix_a`` and ``matrix_b`` are not
        consulted.

        Raises:
            ValueError: If ``matrix_c`` has not been allocated.
        """
        if matrix_c not in self.allocations:
            raise ValueError(f"Result matrix not allocated: {matrix_c}")

        alloc_c = self.allocations[matrix_c]
        return [(i, j) for i in range(alloc_c.rows) for j in range(alloc_c.cols)]

# PART 3: Instruction Generator
# -----------------------------

class PIMInstructionGenerator:
    """Generates PIM-specific instructions based on 24-bit format.

    Field layout, most significant bit first::

        [23:22] op | [21:16] arg | [15] RD | [14] WR | [13:5] row addr | [4:0] reserved

    Memory access instructions leave the op field at 0 and are recognised by
    their RD/WR flags. The flags sit above the address field so that no row
    address can be mistaken for a flag.
    """

    _OP_SHIFT = 22
    _ARG_SHIFT = 16
    _ARG_MASK = 0x3F
    _RD_FLAG = 1 << 15
    _WR_FLAG = 1 << 14
    _ADDR_SHIFT = 5
    _ADDR_MASK = 0x1FF
    _LOW_MASK = 0x1F

    def _memory_instruction(self, row_addr: int, flags: int) -> int:
        """Combine RD/WR ``flags`` with a 9-bit row address."""
        return flags | ((row_addr & self._ADDR_MASK) << self._ADDR_SHIFT)

    def generate_program_instruction(self, core_ptr: int, function_word: int = 0) -> int:
        """Generate a PROG instruction to program LUT cores.

        Args:
            core_ptr: Core index; only the low 6 bits are encoded.
            function_word: Value stored in the low 5 bits of the word.
        """
        # PROG instruction: [2-bit:01][6-bit:corePtr][RD:0][WR:0][9-bit:0][5-bit:function]
        return ((OpCode.PROG.value << self._OP_SHIFT)
                | ((core_ptr & self._ARG_MASK) << self._ARG_SHIFT)
                | (function_word & self._LOW_MASK))

    def generate_execute_instruction(self, op_code: int) -> int:
        """Generate an EXE instruction to start an operation.

        Args:
            op_code: Function selector; only the low 6 bits are encoded.
        """
        # EXE instruction: [2-bit:10][6-bit:opCode][RD:0][WR:0][9-bit:0][5-bit:0]
        return (OpCode.EXE.value << self._OP_SHIFT) | ((op_code & self._ARG_MASK) << self._ARG_SHIFT)

    def generate_read_instruction(self, row_addr: int) -> int:
        """Generate a memory read instruction for ``row_addr`` (low 9 bits used)."""
        # READ: [8-bit:0][RD:1][WR:0][9-bit:rowAddr][5-bit:0]
        return self._memory_instruction(row_addr, self._RD_FLAG)

    def generate_write_instruction(self, row_addr: int) -> int:
        """Generate a memory write instruction for ``row_addr`` (low 9 bits used)."""
        # WRITE: [8-bit:0][RD:0][WR:1][9-bit:rowAddr][5-bit:0]
        return self._memory_instruction(row_addr, self._WR_FLAG)

    def generate_read_write_instruction(self, row_addr: int) -> int:
        """Generate a memory read-write instruction for ``row_addr`` (low 9 bits used)."""
        # READ+WRITE: [8-bit:0][RD:1][WR:1][9-bit:rowAddr][5-bit:0]
        return self._memory_instruction(row_addr, self._RD_FLAG | self._WR_FLAG)

    def generate_end_instruction(self) -> int:
        """Generate an END instruction"""
        # END instruction: [2-bit:11][6-bit:0][RD:0][WR:0][9-bit:0][5-bit:0]
        return OpCode.END.value << self._OP_SHIFT

    def generate_noop_instruction(self) -> int:
        """Generate a NOOP instruction"""
        # NOOP instruction: [2-bit:00][6-bit:0][RD:0][WR:0][9-bit:0][5-bit:0]
        return 0

    def format_instruction(self, instr: int) -> str:
        """Format an instruction as a human-readable string.

        The result is the 6-digit hex word followed by a mnemonic: ``PROG``,
        ``EXE``, ``END``, ``RD``/``WR``/``RDWR`` (op field 0 with a flag set)
        or ``NOOP`` (op field 0 with no flag set).
        """
        op_type = (instr >> self._OP_SHIFT) & 0x3
        op_code = (instr >> self._ARG_SHIFT) & self._ARG_MASK
        read_bit = bool(instr & self._RD_FLAG)
        write_bit = bool(instr & self._WR_FLAG)
        row_addr = (instr >> self._ADDR_SHIFT) & self._ADDR_MASK

        # Convert to hex for display
        hex_instr = f"0x{instr:06x}"

        if op_type == OpCode.PROG.value:
            return f"{hex_instr} PROG CORE={op_code}"
        if op_type == OpCode.EXE.value:
            mnemonics = {
                FunctionCode.MAC_OP.value: "MAC",
                FunctionCode.RELU_OP.value: "RELU",
                FunctionCode.MAX_INDEX_OP.value: "MAX_INDEX",
            }
            op_name = mnemonics.get(op_code, f"OP={op_code}")
            return f"{hex_instr} EXE {op_name}"
        if op_type == OpCode.END.value:
            return f"{hex_instr} END"

        # Op field 0 is shared by NOOP and the memory accesses; the RD/WR
        # flags tell them apart.
        if read_bit and write_bit:
            return f"{hex_instr} RDWR ROW={row_addr}"
        if read_bit:
            return f"{hex_instr} RD ROW={row_addr}"
        if write_bit:
            return f"{hex_instr} WR ROW={row_addr}"
        return f"{hex_instr} NOOP"

# PART 4: Matrix Multiplication Implementation

class MatrixMultiplication:
    """Emits the PIM instruction stream for a dense matrix product."""

    def __init__(self, memory_mapper: PIMMemoryMapper, instruction_generator: PIMInstructionGenerator):
        """Remember the mapper (placement) and generator (encoding) to use."""
        self.memory_mapper = memory_mapper
        self.instruction_generator = instruction_generator

    def generate_matrix_mul_instructions(self,
                                       matrix_a: str, rows_a: int, cols_a: int,
                                       matrix_b: str, rows_b: int, cols_b: int,
                                       matrix_c: str) -> List[int]:
        """Build the instruction list computing C = A * B.

        The three matrices are allocated through the memory mapper, cores
        0-3 are programmed with function word 0 and cores 4-7 with function
        word 1, and then, for every result element, operand rows are read in
        chunks of at most 8, each chunk followed by one MAC execute, before
        the result row is written. The list ends with an END instruction.

        Args:
            matrix_a, matrix_b, matrix_c: Names used for allocation.
            rows_a, cols_a: Shape of A.
            rows_b, cols_b: Shape of B.

        Returns:
            The encoded instructions in execution order.

        Raises:
            ValueError: If ``cols_a`` differs from ``rows_b``.
        """
        if cols_a != rows_b:
            raise ValueError(f"Matrix dimensions incompatible: {cols_a} != {rows_b}")

        mapper = self.memory_mapper
        emit = self.instruction_generator

        # Place A, B and the result C (rows_a x cols_b), in that order.
        for label, n_rows, n_cols in ((matrix_a, rows_a, cols_a),
                                      (matrix_b, rows_b, cols_b),
                                      (matrix_c, rows_a, cols_b)):
            mapper.allocate_matrix(label, n_rows, n_cols)

        # Cores 0-3 multiply (function word 0), cores 4-7 add (function word 1).
        program = [
            emit.generate_program_instruction(core, 0 if core < 4 else 1)
            for core in range(8)
        ]

        for i, j in mapper.get_cluster_mapping(matrix_a, matrix_b, matrix_c):
            # A NOOP stands in for clearing the accumulator.
            program.append(emit.generate_noop_instruction())

            for start in range(0, cols_a, 8):
                inner = range(start, start + min(8, cols_a - start))

                # Operands from row i of A, then from column j of B.
                program.extend(
                    emit.generate_read_instruction(mapper.get_row_address(matrix_a, i, col))
                    for col in inner
                )
                program.extend(
                    emit.generate_read_instruction(mapper.get_row_address(matrix_b, row, j))
                    for row in inner
                )

                program.append(emit.generate_execute_instruction(FunctionCode.MAC_OP.value))

            # Store C[i][j].
            target_row = mapper.get_row_address(matrix_c, i, j)
            program.append(emit.generate_write_instruction(target_row))

        program.append(emit.generate_end_instruction())
        return program

# PART 5: C++ Code Parser
# -----------------------

class CppParser:
    """Enhanced parser for C++ matrix multiplication code"""

    def parse_matrix_mul_code(self, code: str) -> Tuple[int, int, int, int]:
        """Parse C++ code to extract matrix dimensions.

        Args:
            code: C++ source text defining the dimensions as macros or
                integer variables.

        Returns:
            ``(rows_a, cols_a, rows_b, cols_b)``. When the source does not
            define the row count of B, it defaults to ``cols_a``.

        Raises:
            ValueError: If a required dimension cannot be found.
        """
        rows_a = self._extract_dimension(code, ["SIZE_A_ROWS", "rows_a"])
        cols_a = self._extract_dimension(code, ["SIZE_A_COLS", "cols_a"])
        rows_b = self._extract_dimension(code, ["SIZE_B_ROWS", "rows_b"], cols_a)
        cols_b = self._extract_dimension(code, ["SIZE_B_COLS", "cols_b"])

        return rows_a, cols_a, rows_b, cols_b

    def _extract_dimension(self, code: str, dim_names: List[str], default_value: Optional[int] = None) -> int:
        """Extract dimension value from code with support for multiple naming patterns.

        Names are tried in the order given. For each name an assignment
        (``NAME = 5``, which also covers ``int NAME = 5`` and
        ``const int NAME = 5``) is looked for first, then ``#define NAME 5``.

        Args:
            code: C++ source text to search.
            dim_names: Candidate identifiers for the dimension.
            default_value: Returned when none of the names is defined.

        Returns:
            The dimension as an int.

        Raises:
            ValueError: If no name matches, no default is given and the
                source contains no sized ``int`` array declaration.
        """
        for name in dim_names:
            # Escape the identifier and anchor it on a word boundary so that
            # e.g. "rows_a" does not match inside "num_rows_a".
            ident = re.escape(name)
            patterns = (
                rf"\b{ident}\s*=\s*(\d+)",          # [const] [int] NAME = VALUE
                rf"#\s*define\s+{ident}\s+(\d+)",   # #define NAME VALUE
            )
            for pattern in patterns:
                match = re.search(pattern, code)
                if match:
                    return int(match.group(1))

        if default_value is not None:
            return default_value

        # Last resort: the first dimension of the first array declaration
        # such as "int A[64][32]". This cannot tell dimensions apart, so it
        # is only a rough fallback.
        array_match = re.search(r"int\s+\w+\s*\[(\d+)\]", code)
        if array_match:
            return int(array_match.group(1))

        raise ValueError(f"Could not find dimension {dim_names} in code")

# PART 6: Output Generator
class OutputGenerator:
    """Writes an instruction list to disk as raw bytes or as a text listing."""

    def __init__(self, instruction_generator: PIMInstructionGenerator):
        """Keep the generator whose ``format_instruction`` renders listings."""
        self.instruction_generator = instruction_generator

    def generate_binary_output(self, instructions: List[int], filename: str) -> None:
        """Write each instruction as three bytes, most significant byte first.

        Only the low 24 bits of every instruction are stored.
        """
        with open(filename, 'wb') as out:
            for word in instructions:
                out.write(bytes((word >> shift) & 0xFF for shift in (16, 8, 0)))
        print(f"Binary output written to {filename}")

    def generate_assembly_listing(self, instructions: List[int], filename: str) -> None:
        """Write one line per instruction: 4-digit index, colon, formatted text."""
        with open(filename, 'w') as out:
            out.writelines(
                f"{i:04d}: {self.instruction_generator.format_instruction(instr)}\n"
                for i, instr in enumerate(instructions)
            )
        print(f"Assembly listing written to {filename}")

# PART 7: Complete PIM Compiler

class PIMCompiler:
    """Driver that parses C++ source, generates PIM code and writes the outputs."""

    def __init__(self):
        """Wire up the mapper, generator, code emitter, parser and writer."""
        mapper = PIMMemoryMapper()
        generator = PIMInstructionGenerator()
        self.memory_mapper = mapper
        self.instruction_generator = generator
        self.matrix_mul = MatrixMultiplication(mapper, generator)
        self.cpp_parser = CppParser()
        self.output_generator = OutputGenerator(generator)

    def compile_matrix_mul(self, source_code: str, output_prefix: str) -> None:
        """Compile a C++ matrix multiplication into PIM output files.

        Dimensions are extracted from ``source_code``, instructions are
        generated for matrices named "A", "B" and "C", and three files are
        written: ``<output_prefix>.bin``, ``<output_prefix>.asm`` and a copy
        of the source as ``<output_prefix>.cpp``. Progress is printed.

        Args:
            source_code: C++ source text to compile.
            output_prefix: Path prefix shared by the three output files.
        """
        dims = self.cpp_parser.parse_matrix_mul_code(source_code)
        rows_a, cols_a, rows_b, cols_b = dims

        print(f"Compiling matrix multiplication: {rows_a}x{cols_a} * {rows_b}x{cols_b}")

        instructions = self.matrix_mul.generate_matrix_mul_instructions(
            matrix_a="A", rows_a=rows_a, cols_a=cols_a,
            matrix_b="B", rows_b=rows_b, cols_b=cols_b,
            matrix_c="C",
        )

        print(f"Generated {len(instructions)} PIM instructions")

        writer = self.output_generator
        writer.generate_binary_output(instructions, f"{output_prefix}.bin")
        writer.generate_assembly_listing(instructions, f"{output_prefix}.asm")

        # Keep a copy of the input next to the generated artefacts.
        with open(f"{output_prefix}.cpp", "w") as source_copy:
            source_copy.write(source_code)

        print(f"Compilation complete. Output files: {output_prefix}.bin, {output_prefix}.asm, {output_prefix}.cpp")

# PART 8: Demo Usage

def _print_listing_sample(asm_path: str, limit: int = 10) -> None:
    """Print the first ``limit`` lines of a listing, then "..." if more remain."""
    with open(asm_path, "r") as f:
        print("\nSample of generated instructions:")
        for index, line in enumerate(f):
            if index >= limit:
                print("...")
                break
            print(line.strip())


def main():
    """Demo: compile an embedded C++ matrix multiplication and show the result.

    The embedded source is compiled with ``PIMCompiler``. If that raises
    ``ValueError``, dimensions are taken from numeric loop bounds in the
    source (falling back to 64x32 * 32x16) and the ``.bin`` and ``.asm``
    files are generated directly. In both cases the first 10 lines of
    ``matrix_mul_output.asm`` are printed afterwards.
    """
    matrix_mul_code = """
    #define SIZE_A_ROWS 64
    #define SIZE_A_COLS 32
    #define SIZE_B_COLS 16

    void matrix_multiply(int A[][SIZE_A_COLS], int B[][SIZE_B_COLS], int C[][SIZE_B_COLS]) {
        for (int i = 0; i < SIZE_A_ROWS; i++) {
            for (int j = 0; j < SIZE_B_COLS; j++) {
                C[i][j] = 0;
                for (int k = 0; k < SIZE_A_COLS; k++) {
                    C[i][j] += A[i][k] * B[k][j];
                }
            }
        }
    }
    """

    try:
        compiler = PIMCompiler()
        compiler.compile_matrix_mul(matrix_mul_code, "matrix_mul_output")
    except ValueError as e:
        print(f"Error during compilation: {str(e)}")
        print("Attempting to extract dimensions directly from code...")

        # Defaults used when the loop bounds are not numeric literals.
        rows_a = 64
        cols_a = 32
        cols_b = 16

        # Loop nest order is i (rows of A), j (cols of B), k (cols of A).
        loop_pattern = r"for\s*\([^;]*;\s*\w+\s*<\s*(\d+)"
        matches = re.findall(loop_pattern, matrix_mul_code)
        if len(matches) >= 3:
            rows_a, cols_b, cols_a = (int(bound) for bound in matches[:3])

        print(f"Using dimensions: {rows_a}x{cols_a} * {cols_a}x{cols_b}")

        # Fresh components: the failed compiler may have partially allocated.
        memory_mapper = PIMMemoryMapper()
        instruction_generator = PIMInstructionGenerator()
        matrix_mul = MatrixMultiplication(memory_mapper, instruction_generator)
        output_generator = OutputGenerator(instruction_generator)

        # Generate instructions
        instructions = matrix_mul.generate_matrix_mul_instructions(
            "A", rows_a, cols_a,
            "B", cols_a, cols_b,
            "C"
        )

        print(f"Generated {len(instructions)} PIM instructions")

        output_generator.generate_binary_output(instructions, "matrix_mul_output.bin")
        output_generator.generate_assembly_listing(instructions, "matrix_mul_output.asm")

    _print_listing_sample("matrix_mul_output.asm")
def visualize_instruction_distribution(asm_file):
    """Plot how often each instruction type occurs in an assembly listing.

    Every line of ``asm_file`` is classified by substring match, taking the
    first rule that applies in the order PROG, EXE, RDWR, RD, WR, END, NOOP;
    lines matching no rule are ignored. A bar chart of the counts is saved
    to ``instruction_distribution.png`` and shown.

    Args:
        asm_file: Path of the listing to analyse.

    Returns:
        A ``collections.Counter`` mapping instruction type to its count.
    """
    import matplotlib.pyplot as plt
    from collections import Counter

    # (label, substrings that must all be present); first match wins.
    rules = (
        ('PROG', ('PROG',)),
        ('EXE', ('EXE',)),
        ('RDWR', ('RD', 'WR')),
        ('RD', ('RD',)),
        ('WR', ('WR',)),
        ('END', ('END',)),
        ('NOOP', ('NOOP',)),
    )

    with open(asm_file, 'r') as f:
        lines = f.readlines()

    counter = Counter()
    for text in lines:
        for label, needles in rules:
            if all(needle in text for needle in needles):
                counter[label] += 1
                break

    plt.figure(figsize=(10, 6))
    plt.bar(counter.keys(), counter.values())
    plt.title('Distribution of Instructions')
    plt.xlabel('Instruction Type')
    plt.ylabel('Count')
    plt.grid(axis='y', linestyle='--', alpha=0.7)

    # Label each bar with its count, slightly above the bar top.
    for position, count in enumerate(counter.values()):
        plt.text(position, count + 0.5, str(count), ha='center')

    plt.tight_layout()
    plt.savefig('instruction_distribution.png')
    plt.show()

    return counter


In [13]:

!pip install llvmlite

import llvmlite.binding as llvm
import llvmlite.ir as ir
from ctypes import CFUNCTYPE, c_int
import sys
import numpy as np
from typing import List, Dict, Tuple, Any, Optional

# Initialise LLVM once when this cell/module is executed: the core library
# first, then the native (host) target and its assembly printer.
# NOTE(review): newer llvmlite releases reportedly deprecate the explicit
# llvm.initialize() call — confirm against the installed version.
llvm.initialize()
llvm.initialize_native_target()
llvm.initialize_native_asmprinter()

# PART 1: LLVM Frontend for Matrix Multiplication

class LLVMMatrixMultiplication:
    """Generates LLVM IR for matrix multiplication"""

    def __init__(self, rows_a: int, cols_a: int, cols_b: int):
        """Store the operand dimensions and create an empty LLVM module.

        Args:
            rows_a: Number of rows of A (and of the result C).
            cols_a: Number of columns of A; also used as the row count of B.
            cols_b: Number of columns of B (and of the result C).
        """
        self.rows_a = rows_a
        self.cols_a = cols_a  # Also equals rows_b
        self.cols_b = cols_b

        # Create module and initialize
        self.module = ir.Module(name="pim_matrix_mul")
        self.module.triple = llvm.get_default_triple()

    def generate_ir(self) -> str:
      """Build the ``matrix_multiply`` function and return the module as IR text.

      The generated function takes pointers to A, B and C (fixed-size 2-D
      arrays of i32) and computes C = A * B with three nested counted loops
      over i, j and k. The loop counters live in stack slots (``alloca``)
      that are loaded and stored on every iteration.

      Returns:
          The textual LLVM IR of the whole module.

      NOTE(review): every call adds a function named "matrix_multiply" to
      ``self.module``; calling this twice on one instance presumably fails
      on the duplicate name — confirm.
      """
      int_type = ir.IntType(32)
      # A is rows_a x cols_a, B is cols_a x cols_b, C is rows_a x cols_b.
      matrix_a_type = ir.ArrayType(ir.ArrayType(int_type, self.cols_a), self.rows_a)
      matrix_b_type = ir.ArrayType(ir.ArrayType(int_type, self.cols_b), self.cols_a)
      matrix_c_type = ir.ArrayType(ir.ArrayType(int_type, self.cols_b), self.rows_a)

      # void matrix_multiply(A*, B*, C*)
      func_type = ir.FunctionType(
          ir.VoidType(),
          [
              ir.PointerType(matrix_a_type),
              ir.PointerType(matrix_b_type),
              ir.PointerType(matrix_c_type)
          ]
      )

      func = ir.Function(self.module, func_type, name="matrix_multiply")

      matrix_a_arg = func.args[0]
      matrix_b_arg = func.args[1]
      matrix_c_arg = func.args[2]

      entry_block = func.append_basic_block(name="entry")
      builder = ir.IRBuilder(entry_block)

      # Stack slots holding the three loop counters.
      i_ptr = builder.alloca(int_type, name="i_ptr")
      j_ptr = builder.alloca(int_type, name="j_ptr")
      k_ptr = builder.alloca(int_type, name="k_ptr")

      # Initialize i = 0
      builder.store(ir.Constant(int_type, 0), i_ptr)

      # Branch to i_loop_start
      i_loop_start = builder.append_basic_block("i_loop_start")
      builder.branch(i_loop_start)

      # i_loop_start block: continue while i < rows_a
      builder.position_at_end(i_loop_start)
      i_current = builder.load(i_ptr, name="i_current")
      i_cond = builder.icmp_signed('<', i_current, ir.Constant(int_type, self.rows_a))
      # Both successor blocks are appended here, so each loop's "end" block
      # precedes the next inner loop's blocks in the emitted IR.
      i_loop_body = builder.append_basic_block("i_loop_body")
      i_loop_end = builder.append_basic_block("i_loop_end")
      builder.cbranch(i_cond, i_loop_body, i_loop_end)

      # i_loop_body block
      builder.position_at_end(i_loop_body)
      # Initialize j = 0
      builder.store(ir.Constant(int_type, 0), j_ptr)
      j_loop_start = builder.append_basic_block("j_loop_start")
      builder.branch(j_loop_start)

      # j_loop_start block: continue while j < cols_b
      builder.position_at_end(j_loop_start)
      j_current = builder.load(j_ptr, name="j_current")
      j_cond = builder.icmp_signed('<', j_current, ir.Constant(int_type, self.cols_b))
      j_loop_body = builder.append_basic_block("j_loop_body")
      j_loop_end = builder.append_basic_block("j_loop_end")
      builder.cbranch(j_cond, j_loop_body, j_loop_end)

      # j_loop_body block
      builder.position_at_end(j_loop_body)
      # Initialize C[i][j] = 0
      i_idx = builder.load(i_ptr, name="i_idx_init")
      j_idx = builder.load(j_ptr, name="j_idx_init")
      # Leading 0 index dereferences the pointer-to-array argument.
      indices = [ir.Constant(int_type, 0), i_idx, j_idx]
      c_element_ptr = builder.gep(matrix_c_arg, indices)
      builder.store(ir.Constant(int_type, 0), c_element_ptr)

      # Initialize k = 0
      builder.store(ir.Constant(int_type, 0), k_ptr)
      k_loop_start = builder.append_basic_block("k_loop_start")
      builder.branch(k_loop_start)

      # k_loop_start block: continue while k < cols_a
      builder.position_at_end(k_loop_start)
      k_current = builder.load(k_ptr, name="k_current")
      k_cond = builder.icmp_signed('<', k_current, ir.Constant(int_type, self.cols_a))
      k_loop_body = builder.append_basic_block("k_loop_body")
      k_loop_end = builder.append_basic_block("k_loop_end")
      builder.cbranch(k_cond, k_loop_body, k_loop_end)

      # k_loop_body block: C[i][j] += A[i][k] * B[k][j]
      builder.position_at_end(k_loop_body)
      # Load indices for this iteration
      i_idx = builder.load(i_ptr, name="i_idx_k")
      j_idx = builder.load(j_ptr, name="j_idx_k")
      k_idx = builder.load(k_ptr, name="k_idx")

      # Get A[i][k]
      indices_a = [ir.Constant(int_type, 0), i_idx, k_idx]
      a_element_ptr = builder.gep(matrix_a_arg, indices_a)
      a_element = builder.load(a_element_ptr)

      # Get B[k][j]
      indices_b = [ir.Constant(int_type, 0), k_idx, j_idx]
      b_element_ptr = builder.gep(matrix_b_arg, indices_b)
      b_element = builder.load(b_element_ptr)

      # Multiply A[i][k] * B[k][j]
      mul_result = builder.mul(a_element, b_element)

      # Get current C[i][j]
      indices_c = [ir.Constant(int_type, 0), i_idx, j_idx]
      c_element_ptr = builder.gep(matrix_c_arg, indices_c)
      c_element = builder.load(c_element_ptr)

      # Add mul_result to C[i][j]
      add_result = builder.add(c_element, mul_result)
      builder.store(add_result, c_element_ptr)

      # Increment k and jump back to the loop test
      k_idx = builder.load(k_ptr)
      k_next = builder.add(k_idx, ir.Constant(int_type, 1))
      builder.store(k_next, k_ptr)
      builder.branch(k_loop_start)

      # k_loop_end block
      builder.position_at_end(k_loop_end)
      # Increment j and jump back to the loop test
      j_idx = builder.load(j_ptr)
      j_next = builder.add(j_idx, ir.Constant(int_type, 1))
      builder.store(j_next, j_ptr)
      builder.branch(j_loop_start)

      # j_loop_end block
      builder.position_at_end(j_loop_end)
      # Increment i and jump back to the loop test
      i_idx = builder.load(i_ptr)
      i_next = builder.add(i_idx, ir.Constant(int_type, 1))
      builder.store(i_next, i_ptr)
      builder.branch(i_loop_start)

      # i_loop_end block
      builder.position_at_end(i_loop_end)
      builder.ret_void()

      # Sanity check: round-trip the textual IR through LLVM's parser. The
      # parsed module is discarded and its verify() method is not called here.
      llvm.parse_assembly(str(self.module))

      return str(self.module)

# PART 2: LLVM IR to PIM Custom Backend
# -------------------------------------

class LLVMToPIMTranslator:
    """Translates LLVM IR to PIM instructions"""

    def __init__(self, memory_mapper, instruction_generator):
        """Store the collaborators used during translation.

        Args:
            memory_mapper: Object providing ``allocate_matrix`` and
                ``get_row_address``.
            instruction_generator: Object providing the
                ``generate_*_instruction`` methods called by ``translate``.
        """
        self.memory_mapper = memory_mapper
        self.instruction_generator = instruction_generator

    def translate(self, llvm_ir: str, rows_a: int, cols_a: int, rows_b: int, cols_b: int) -> List[int]:
        """Translate LLVM IR to PIM instructions.

        The IR is parsed and verified so that malformed input is rejected,
        but the instruction stream itself is derived from the matrix
        dimensions: for every result element, operands are read in chunks
        of at most 8, multiplied on cores 0-3, accumulated on cores 4-7 and
        written back after each chunk.

        Args:
            llvm_ir: Textual LLVM IR of the matrix multiplication.
            rows_a, cols_a: Shape of A.
            rows_b, cols_b: Shape of B.

        Returns:
            The encoded PIM instructions in execution order.

        Raises:
            ValueError: If ``cols_a`` differs from ``rows_b``.
            RuntimeError: Raised by llvmlite when the IR cannot be parsed or
                fails verification.
        """
        # Parse the LLVM IR; this only validates the input.
        module = llvm.parse_assembly(llvm_ir)
        module.verify()

        # Reject shapes that cannot be multiplied before touching the mapper.
        if cols_a != rows_b:
            raise ValueError(f"Matrix dimensions incompatible: {cols_a} != {rows_b}")

        mapper = self.memory_mapper
        gen = self.instruction_generator

        mapper.allocate_matrix("A", rows_a, cols_a)
        mapper.allocate_matrix("B", rows_b, cols_b)
        mapper.allocate_matrix("C", rows_a, cols_b)

        instructions = []

        # 1. Program the LUT cores
        for core in range(4):
            # Cores 0-3 for multiplication
            instructions.append(gen.generate_program_instruction(core, 0))

        for core in range(4, 8):
            # Cores 4-7 for addition
            instructions.append(gen.generate_program_instruction(core, 1))

        # 2. One pass per result element C[i][j]
        for i in range(rows_a):
            for j in range(cols_b):
                # Initialize C[i][j] to 0
                c_addr = mapper.get_row_address("C", i, j)
                # Operand index used for C in compute instructions (8 bits).
                c_slot = c_addr % 256
                instructions.append(gen.generate_write_instruction(c_addr))

                for k in range(0, cols_a, 8):
                    chunk_size = min(8, cols_a - k)

                    for chunk in range(chunk_size):
                        a_addr = mapper.get_row_address("A", i, k + chunk)
                        instructions.append(gen.generate_read_instruction(a_addr))

                    for chunk in range(chunk_size):
                        b_addr = mapper.get_row_address("B", k + chunk, j)
                        instructions.append(gen.generate_read_instruction(b_addr))

                    # Operand slots: A values at [0, n), B values at [n, 2n),
                    # products at [2n, 3n) where n is the chunk size.
                    for chunk in range(chunk_size):
                        mul_instr = gen.generate_compute_instruction(
                            core=chunk % 4,  # Use multiplication cores (0-3)
                            op_code=0,       # Multiplication
                            src1_idx=chunk,
                            src2_idx=chunk + chunk_size,
                            dst_idx=chunk + 2 * chunk_size
                        )
                        instructions.append(mul_instr)

                    # Then, accumulate results into C[i][j]
                    for chunk in range(chunk_size):
                        add_instr = gen.generate_compute_instruction(
                            core=4 + (chunk % 4),  # Use addition cores (4-7)
                            op_code=1,           # Addition
                            src1_idx=c_slot,  # Current C[i][j] value
                            src2_idx=chunk + 2 * chunk_size,  # Multiplication result
                            dst_idx=c_slot  # Store back to C[i][j]
                        )
                        instructions.append(add_instr)

                    # Write the running value of C[i][j] back after each chunk
                    instructions.append(gen.generate_write_instruction(c_addr))

        return instructions

# PART 3: Memory Mapper and Instruction Generator
# ----------------------------------------------

class PIMMemoryMapper:
    """Maps matrices to PIM memory addresses.

    Matrices are laid out back to back, row-major, starting at
    ``base_address``; every element occupies ``ELEMENT_SIZE`` bytes.
    """

    ELEMENT_SIZE = 4  # bytes per matrix element

    def __init__(self, base_address: int = 0x1000):
        """Create an empty mapper whose first matrix starts at ``base_address``."""
        self.base_address = base_address
        self.matrix_info = {}  # Stores matrix metadata
        self.current_address = base_address

    def allocate_matrix(self, name: str, rows: int, cols: int) -> int:
        """Allocate memory for a matrix and return its base address.

        Args:
            name: Key under which the matrix metadata is stored.
            rows: Number of matrix rows.
            cols: Number of matrix columns.

        Returns:
            The byte address of element (0, 0).
        """
        base_addr = self.current_address

        self.matrix_info[name] = {
            "rows": rows,
            "cols": cols,
            "base_address": base_addr
        }

        self.current_address += rows * cols * self.ELEMENT_SIZE
        return base_addr

    def get_row_address(self, matrix_name: str, row: int, col: int) -> int:
        """Get the address for a specific matrix element.

        Args:
            matrix_name: Name given to ``allocate_matrix``.
            row: Zero-based row index.
            col: Zero-based column index.

        Returns:
            The byte address of element ``(row, col)``.

        Raises:
            ValueError: If the matrix has not been allocated.
            IndexError: If ``row`` or ``col`` lies outside the matrix,
                including negative indices.
        """
        if matrix_name not in self.matrix_info:
            raise ValueError(f"Matrix {matrix_name} not allocated")

        matrix = self.matrix_info[matrix_name]
        # Negative indices must be rejected too: they would produce an
        # address in front of the matrix instead of wrapping around.
        if not (0 <= row < matrix["rows"] and 0 <= col < matrix["cols"]):
            raise IndexError(f"Index ({row}, {col}) out of bounds for matrix {matrix_name}")

        # Calculate address
        offset = (row * matrix["cols"] + col) * self.ELEMENT_SIZE
        return matrix["base_address"] + offset

class PIMInstructionGenerator:
    """Encodes read, write, compute and program operations as 32-bit words."""

    def __init__(self):
        """Nothing to set up; the generator keeps no state."""

    def generate_read_instruction(self, address: int) -> int:
        """Return a read instruction: 0x01 in bits [31:24], address in [23:0]."""
        payload = address & 0xFFFFFF
        return payload | (0x01 << 24)

    def generate_write_instruction(self, address: int) -> int:
        """Return a write instruction: 0x02 in bits [31:24], address in [23:0]."""
        payload = address & 0xFFFFFF
        return payload | (0x02 << 24)

    def generate_compute_instruction(self, core: int, op_code: int, src1_idx: int, src2_idx: int, dst_idx: int) -> int:
        """Return a compute instruction (0x3 in bits [31:28]).

        Args:
            core: Core ID, masked to 4 bits and placed at [27:24].
            op_code: Operation (0=multiply, 1=add), masked to 4 bits and
                shifted to bit 20.
            src1_idx: First source index, masked to 8 bits, placed at [23:16].
            src2_idx: Second source index, masked to 8 bits, placed at [15:8].
            dst_idx: Destination index, masked to 8 bits, placed at [7:0].

        NOTE(review): ``op_code`` lands in bits [23:20], which are also the
        upper four bits of ``src1_idx``; the two fields are OR-ed together.
        Confirm the intended layout.
        """
        word = 0x03 << 28
        word |= (core & 0xF) << 24
        word |= (op_code & 0xF) << 20
        word |= (src1_idx & 0xFF) << 16
        word |= (src2_idx & 0xFF) << 8
        word |= dst_idx & 0xFF
        return word

    def generate_program_instruction(self, core: int, function: int) -> int:
        """Return a program instruction (0x4 in bits [31:28]) configuring a core.

        Args:
            core: Core ID, masked to 4 bits and placed at [27:24].
            function: Function to program (0=multiply, 1=add), masked to
                8 bits and placed at [23:16].
        """
        word = 0x04 << 28
        word |= (core & 0xF) << 24
        word |= (function & 0xFF) << 16
        return word


# PART 4: Main function to execute the system
# ------------------------------------------

def main():
    """Demo: build IR for a 4x4 * 4x4 product and translate it to PIM code.

    Prints the generated LLVM IR, the total number of PIM instructions and
    the first ten instructions as 8-digit hex words.
    """
    # Example matrix dimensions
    rows_a = cols_a = rows_b = cols_b = 4

    llvm_ir = LLVMMatrixMultiplication(rows_a, cols_a, cols_b).generate_ir()
    print("Generated LLVM IR:")
    print(llvm_ir)

    translator = LLVMToPIMTranslator(PIMMemoryMapper(), PIMInstructionGenerator())
    instructions = translator.translate(llvm_ir, rows_a, cols_a, rows_b, cols_b)

    print(f"\nGenerated {len(instructions)} PIM instructions")
    preview = instructions[:10]
    for i, instr in enumerate(preview):
        print(f"Instruction {i}: 0x{instr:08x}")

    # Summarise whatever did not fit in the preview.
    if len(instructions) > len(preview):
        print(f"... {len(instructions) - 10} more instructions")

# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Generated LLVM IR:
; ModuleID = "pim_matrix_mul"
target triple = "x86_64-unknown-linux-gnu"
target datalayout = ""

define void @"matrix_multiply"([4 x [4 x i32]]* %".1", [4 x [4 x i32]]* %".2", [4 x [4 x i32]]* %".3")
{
entry:
  %"i_ptr" = alloca i32
  %"j_ptr" = alloca i32
  %"k_ptr" = alloca i32
  store i32 0, i32* %"i_ptr"
  br label %"i_loop_start"
i_loop_start:
  %"i_current" = load i32, i32* %"i_ptr"
  %".7" = icmp slt i32 %"i_current", 4
  br i1 %".7", label %"i_loop_body", label %"i_loop_end"
i_loop_body:
  store i32 0, i32* %"j_ptr"
  br label %"j_loop_start"
i_loop_end:
  ret void
j_loop_start:
  %"j_current" = load i32, i32* %"j_ptr"
  %".11" = icmp slt i32 %"j_current", 4
  br i1 %".11", label %"j_loop_body", label %"j_loop_end"
j_loop_body:
  %"i_idx_init" = load i32, i32* %"i_ptr"
  %"j_idx_init" = load i32, i32* %"j_ptr"
  %".13" = getelementptr [4 x [4 x i32]], [4 x [4 x i32]]* %".3", i32 0, i32 %"i_idx_init", i32 %"j_idx_init"
  store i32 0, i32* %".13"
  store i32 0,