In [1]:

def generate_pe_groups():
    """Generate PE_groups.sv with all PE modules and exact multipliers"""
    
    with open('PE_groups.sv', 'w') as f:
        f.write("// Auto-generated PE groups for 64x64 systolic array\n")
        f.write("// Each group of 3 PEs shares one exact multiplier\n")
        f.write("// Last column PEs get individual exact multipliers\n\n")
        
        exact_mult_index = 0
        
        # Generate PE groups for columns 0-62 (groups of 3)
        for x in range(0, 64):  # rows
            for y in range(0, 63//3):  # column groups (0 to 20, since 63//3 = 21 groups)
                f.write(f"""
// =============================x:={x}==y:={y}========================================
    module exact_mult_{exact_mult_index} ( input logic [3:0]a,  input logic [3:0]b,  output logic [7:0]p );
        assign p = a*b; 
    endmodule

    //  FSM is the solution to ANY hardware modeling problem (most at least)
    // Check if you need to implement 2's compliment or something since we are doing manual signed multiplication
    // Cuz remember in vivado negative numbers were wierd (probably in 2s complement ), might need to rewrite later sigh

    // PE_1_{x}__{3*y}_{exact_mult_index} - Uses exact multiplier when counter_for_exact_mult_usage == 2'b01
    module PE_1_{x}__{3*y}_{exact_mult_index} #(
        parameter DATA_WIDTH = 8
    )(
        input logic signed [DATA_WIDTH-1:0] a, b,
        input logic fast_clk,    // 4ns time period
        input logic clk,       // 12ns time period
        input logic rst,
        input logic [1:0] counter_for_exact_mult_usage,
        output logic signed [DATA_WIDTH-1:0] c, d,
        output logic signed [2*DATA_WIDTH-1:0] C_out
    );

        // Unsigned absolute values and sign handling
        logic [DATA_WIDTH-1:0] a_abs, b_abs; logic a_sign, b_sign, result_sign;
        // Split into high and low parts (unsigned)
        logic [3:0] aL, bL;
        logic [3:0] aH, bH;
        logic [7:0] exact_mult_result;

        // Convert signed inputs to unsigned absolute values
        always_comb begin
            a_sign = a[DATA_WIDTH-1];  // Extract sign bit
            b_sign = b[DATA_WIDTH-1];  // Extract sign bit
            result_sign = a_sign ^ b_sign;  // XOR for final sign
            
            // Get absolute values
            if (a_sign) begin
                a_abs = ~a + 1'b1;  // Two's complement negation
            end else begin
                a_abs = a;
            end
            
            if (b_sign) begin
                b_abs = ~b + 1'b1;  // Two's complement negation
            end else begin
                b_abs = b;
            end
        end

        // Split absolute values into high and low parts
        assign aL = a_abs[3:0];
        assign aH = a_abs[7:4];
        assign bL = b_abs[3:0];
        assign bH = b_abs[7:4];

        logic [2*DATA_WIDTH-1:0] partial_sum;
        logic [3:0] input_to_a, input_to_b;
        
        // Exact multiplier instance
        exact_mult_{exact_mult_index} m1(
            .a(input_to_a),
            .b(input_to_b),
            .p(exact_mult_result)
        );
        
        // Combinational logic for exact multiplier inputs
        always_comb begin
            if (counter_for_exact_mult_usage == 2'b01) begin
                input_to_a = aH;
                input_to_b = bH;
            end 
        end
        
        // Sequential logic for partial product accumulation
        always_ff @(posedge fast_clk or posedge rst) begin
            if (rst) begin
                partial_sum <= 0;
                C_out <= 0;
            end else 
            begin
                case(counter_for_exact_mult_usage)
                    2'd1: begin
                        partial_sum <= approx_5(aL, bL) + (exact_mult_result << 8);  // L*L + H*H
                    end
                    2'd2: begin
                        partial_sum <= partial_sum + (approx_5(aH, bL) << 4)  ;  // + H*L + H*H
                    end
                    2'd3: begin // 3rd clock we will COMMIT our output 
                        partial_sum <= partial_sum + (approx_5(aL, bH) << 4);  // + L*H
                        
                        // Apply sign to final result
                        if (result_sign) begin
                            C_out <= C_out + ~(partial_sum + (approx_5(aL, bH) << 4)) + 1'b1;  // Two's complement negation
                        end else begin
                            C_out <= C_out + partial_sum + (approx_5(aL, bH) << 4);
                        end
                    end
                endcase
            end
        end

        always_ff @(posedge clk or posedge rst) begin
            if (rst) begin
                c <= 0;
                d <= 0;
            end else begin
                c <= a;
                d <= b;
            end
        end
    endmodule


    // PE_2_{x}__{3*y+1}_{exact_mult_index} - Uses exact multiplier when counter_for_exact_mult_usage == 2'b10
    module PE_2_{x}__{3*y+1}_{exact_mult_index} #(
        parameter DATA_WIDTH = 8
    )(
        input logic signed [DATA_WIDTH-1:0] a, b,
        input logic fast_clk,    // 4ns time period
        input logic clk,       // 12ns time period
        input logic rst,
        input logic [1:0] counter_for_exact_mult_usage,
        output logic signed [DATA_WIDTH-1:0] c, d,
        output logic signed [2*DATA_WIDTH-1:0] C_out
    );

        // Unsigned absolute values and sign handling
        logic [DATA_WIDTH-1:0] a_abs, b_abs;
        logic a_sign, b_sign, result_sign;
        
        // Split into high and low parts (unsigned)
        logic [3:0] aL, bL;
        logic [3:0] aH, bH;
        logic [7:0] exact_mult_result;

        // Convert signed inputs to unsigned absolute values
        always_comb begin
            a_sign = a[DATA_WIDTH-1];  // Extract sign bit
            b_sign = b[DATA_WIDTH-1];  // Extract sign bit
            result_sign = a_sign ^ b_sign;  // XOR for final sign
            
            // Get absolute values
            if (a_sign) begin
                a_abs = ~a + 1'b1;  // Two's complement negation
            end else begin
                a_abs = a;
            end
            
            if (b_sign) begin
                b_abs = ~b + 1'b1;  // Two's complement negation
            end else begin
                b_abs = b;
            end
        end

        // Split absolute values into high and low parts
        assign aL = a_abs[3:0];
        assign aH = a_abs[7:4];
        assign bL = b_abs[3:0];
        assign bH = b_abs[7:4];

        logic [2*DATA_WIDTH-1:0] partial_sum;
        logic [3:0] input_to_a, input_to_b;
        
        // Exact multiplier instance
        exact_mult_{exact_mult_index} m1(
            .a(input_to_a),
            .b(input_to_b),
            .p(exact_mult_result)
        );
        
        // Combinational logic for exact multiplier inputs
        always_comb begin
            if (counter_for_exact_mult_usage == 2'b10) begin
                input_to_a = aH;
                input_to_b = bH;
            end 
        end
        
        // Sequential logic for partial product accumulation
        always_ff @(posedge fast_clk or posedge rst) begin
            if (rst) begin
                partial_sum <= 0;
                C_out <= 0;
            end else
            begin
                case(counter_for_exact_mult_usage)
                    2'd1: begin
                        partial_sum <= approx_4(aL, bL) ;  // L*L 
                    end
                    2'd2: begin
                        partial_sum <= partial_sum + (approx_4(aH, bL) << 4) + (exact_mult_result << 8)  ;  // + H*L
                    end
                    2'd3: begin // 3rd clock we will COMMIT our output 
                        partial_sum <= partial_sum + (approx_4(aL, bH) << 4)  ;  // + L*H + H*H
                        
                        // Apply sign to final result
                        if (result_sign) begin
                            C_out <= C_out + ~(partial_sum + (approx_4(aL, bH) << 4)) + 1'b1;  // Two's complement negation
                        end else begin
                            C_out <= C_out + partial_sum + (approx_4(aL, bH) << 4);
                        end
                    end
                endcase
            end
        end

        always_ff @(posedge clk or posedge rst) begin
            if (rst) begin
                c <= 0;
                d <= 0;
            end else begin
                c <= a;
                d <= b;
            end
        end
    endmodule

    // PE_3_{x}__{3*y+2}_{exact_mult_index} - Uses exact multiplier when counter_for_exact_mult_usage == 2'b11
    module PE_3_{x}__{3*y+2}_{exact_mult_index} #(
        parameter DATA_WIDTH = 8
    )(
        input logic signed [DATA_WIDTH-1:0] a, b,
        input logic fast_clk,    // 2ns time period
        input logic clk,       // 12ns time period
        input logic rst,
        input logic [1:0] counter_for_exact_mult_usage,
        output logic signed [DATA_WIDTH-1:0] c, d,
        output logic signed [2*DATA_WIDTH-1:0] C_out
    );

        // Unsigned absolute values and sign handling
        logic [DATA_WIDTH-1:0] a_abs, b_abs;
        logic a_sign, b_sign, result_sign;
        
        // Split into high and low parts (unsigned)
        logic [3:0] aL, bL;
        logic [3:0] aH, bH;
        logic [7:0] exact_mult_result;

        // Convert signed inputs to unsigned absolute values
        always_comb begin
            a_sign = a[DATA_WIDTH-1];  // Extract sign bit
            b_sign = b[DATA_WIDTH-1];  // Extract sign bit
            result_sign = a_sign ^ b_sign;  // XOR for final sign
            
            // Get absolute values
            if (a_sign) begin
                a_abs = ~a + 1'b1;  // Two's complement negation
            end else begin
                a_abs = a;
            end
            
            if (b_sign) begin
                b_abs = ~b + 1'b1;  // Two's complement negation
            end else begin
                b_abs = b;
            end
        end

        // Split absolute values into high and low parts
        assign aL = a_abs[3:0];
        assign aH = a_abs[7:4];
        assign bL = b_abs[3:0];
        assign bH = b_abs[7:4];

        logic [2*DATA_WIDTH-1:0] partial_sum;
        logic [3:0] input_to_a, input_to_b;
        
        // Exact multiplier instance
        exact_mult_{exact_mult_index} m1(
            .a(input_to_a),
            .b(input_to_b),
            .p(exact_mult_result)
        );
        
        // Combinational logic for exact multiplier inputs
        always_comb begin
            if (counter_for_exact_mult_usage == 2'b11) begin
                input_to_a = aH;
                input_to_b = bH;
            end 
        end
        
        // Sequential logic for partial product accumulation
        always_ff @(posedge fast_clk or posedge rst) begin
            if (rst) begin
                partial_sum <= 0;
                C_out <= 0;
            end else begin
                case(counter_for_exact_mult_usage)
                    2'd1: begin
                        partial_sum <= approx_3(aL, bL) ;  // L*L + H*H
                    end
                    2'd2: begin
                        partial_sum <= partial_sum + (approx_3(aH, bL) << 4);  // + H*L
                    end
                    2'd3: begin // 3rd clock we will COMMIT our output 
                        partial_sum <= partial_sum + (approx_3(aL, bH) << 4) + (exact_mult_result << 8) ;  // + L*H
                        
                        // Apply sign to final result
                        if (result_sign) begin
                            C_out <= C_out + ~(partial_sum + (approx_3(aL, bH) << 4)  + (exact_mult_result << 8)  ) + 1'b1;  // Two's complement negation
                        end else begin
                            C_out <= C_out + partial_sum + (approx_3(aL, bH) << 4) + (exact_mult_result << 8)  ;
                        end
                    end
                endcase
            end
        end

        always_ff @(posedge clk or posedge rst) begin
            if (rst) begin
                c <= 0;
                d <= 0;
            end else begin
                c <= a;
                d <= b;
            end
        end
    endmodule
// ====================================================================
        """)
                exact_mult_index += 1
        
        # Generate PE groups for rows 0-62 (groups of 3)
        y = 63
        for x in range(0, 63//3):  # row groups (0 to 20, since 63//3 = 21 groups)
            f.write(f"""
// =============================x:={x}==y:={y}========================================
    module exact_mult_{exact_mult_index} ( input logic [3:0]a,  input logic [3:0]b,  output logic [7:0]p );
        assign p = a*b; 
    endmodule

    //  FSM is the solution to ANY hardware modeling problem (most at least)
    // Check if you need to implement 2's compliment or something since we are doing manual signed multiplication
    // Cuz remember in vivado negative numbers were wierd (probably in 2s complement ), might need to rewrite later sigh

    // PE_1_{3*x}__{y}_{exact_mult_index} - Uses exact multiplier when counter_for_exact_mult_usage == 2'b01
    module PE_1_{3*x}__{y}_{exact_mult_index} #(
        parameter DATA_WIDTH = 8
    )(
        input logic signed [DATA_WIDTH-1:0] a, b,
        input logic fast_clk,    // 4ns time period
        input logic clk,       // 12ns time period
        input logic rst,
        input logic [1:0] counter_for_exact_mult_usage,
        output logic signed [DATA_WIDTH-1:0] c, d,
        output logic signed [2*DATA_WIDTH-1:0] C_out
    );

        // Unsigned absolute values and sign handling
        logic [DATA_WIDTH-1:0] a_abs, b_abs; logic a_sign, b_sign, result_sign;
        // Split into high and low parts (unsigned)
        logic [3:0] aL, bL;
        logic [3:0] aH, bH;
        logic [7:0] exact_mult_result;

        // Convert signed inputs to unsigned absolute values
        always_comb begin
            a_sign = a[DATA_WIDTH-1];  // Extract sign bit
            b_sign = b[DATA_WIDTH-1];  // Extract sign bit
            result_sign = a_sign ^ b_sign;  // XOR for final sign
            
            // Get absolute values
            if (a_sign) begin
                a_abs = ~a + 1'b1;  // Two's complement negation
            end else begin
                a_abs = a;
            end
            
            if (b_sign) begin
                b_abs = ~b + 1'b1;  // Two's complement negation
            end else begin
                b_abs = b;
            end
        end

        // Split absolute values into high and low parts
        assign aL = a_abs[3:0];
        assign aH = a_abs[7:4];
        assign bL = b_abs[3:0];
        assign bH = b_abs[7:4];

        logic [2*DATA_WIDTH-1:0] partial_sum;
        logic [3:0] input_to_a, input_to_b;
        
        // Exact multiplier instance
        exact_mult_{exact_mult_index} m1(
            .a(input_to_a),
            .b(input_to_b),
            .p(exact_mult_result)
        );
        
        // Combinational logic for exact multiplier inputs
        always_comb begin
            if (counter_for_exact_mult_usage == 2'b01) begin
                input_to_a = aH;
                input_to_b = bH;
            end 
        end
        
        // Sequential logic for partial product accumulation
        always_ff @(posedge fast_clk or posedge rst) begin
            if (rst) begin
                partial_sum <= 0;
                C_out <= 0;
            end else 
            begin
                case(counter_for_exact_mult_usage)
                    2'd1: begin
                        partial_sum <= approx_5(aL, bL) + (exact_mult_result << 8);  // L*L + H*H
                    end
                    2'd2: begin
                        partial_sum <= partial_sum + (approx_5(aH, bL) << 4)  ;  // + H*L + H*H
                    end
                    2'd3: begin // 3rd clock we will COMMIT our output 
                        partial_sum <= partial_sum + (approx_5(aL, bH) << 4);  // + L*H
                        
                        // Apply sign to final result
                        if (result_sign) begin
                            C_out <= C_out + ~(partial_sum + (approx_5(aL, bH) << 4)) + 1'b1;  // Two's complement negation
                        end else begin
                            C_out <= C_out + partial_sum + (approx_5(aL, bH) << 4);
                        end
                    end
                endcase
            end
        end

        always_ff @(posedge clk or posedge rst) begin
            if (rst) begin
                c <= 0;
                d <= 0;
            end else begin
                c <= a;
                d <= b;
            end
        end
    endmodule


    // PE_2_{3*x+1}__{y}_{exact_mult_index} - Uses exact multiplier when counter_for_exact_mult_usage == 2'b10
    module PE_2_{3*x+1}__{y}_{exact_mult_index} #(
        parameter DATA_WIDTH = 8
    )(
        input logic signed [DATA_WIDTH-1:0] a, b,
        input logic fast_clk,    // 4ns time period
        input logic clk,       // 12ns time period
        input logic rst,
        input logic [1:0] counter_for_exact_mult_usage,
        output logic signed [DATA_WIDTH-1:0] c, d,
        output logic signed [2*DATA_WIDTH-1:0] C_out
    );

        // Unsigned absolute values and sign handling
        logic [DATA_WIDTH-1:0] a_abs, b_abs;
        logic a_sign, b_sign, result_sign;
        
        // Split into high and low parts (unsigned)
        logic [3:0] aL, bL;
        logic [3:0] aH, bH;
        logic [7:0] exact_mult_result;

        // Convert signed inputs to unsigned absolute values
        always_comb begin
            a_sign = a[DATA_WIDTH-1];  // Extract sign bit
            b_sign = b[DATA_WIDTH-1];  // Extract sign bit
            result_sign = a_sign ^ b_sign;  // XOR for final sign
            
            // Get absolute values
            if (a_sign) begin
                a_abs = ~a + 1'b1;  // Two's complement negation
            end else begin
                a_abs = a;
            end
            
            if (b_sign) begin
                b_abs = ~b + 1'b1;  // Two's complement negation
            end else begin
                b_abs = b;
            end
        end

        // Split absolute values into high and low parts
        assign aL = a_abs[3:0];
        assign aH = a_abs[7:4];
        assign bL = b_abs[3:0];
        assign bH = b_abs[7:4];

        logic [2*DATA_WIDTH-1:0] partial_sum;
        logic [3:0] input_to_a, input_to_b;
        
        // Exact multiplier instance
        exact_mult_{exact_mult_index} m1(
            .a(input_to_a),
            .b(input_to_b),
            .p(exact_mult_result)
        );
        
        // Combinational logic for exact multiplier inputs
        always_comb begin
            if (counter_for_exact_mult_usage == 2'b10) begin
                input_to_a = aH;
                input_to_b = bH;
            end 
        end
        
        // Sequential logic for partial product accumulation
        always_ff @(posedge fast_clk or posedge rst) begin
            if (rst) begin
                partial_sum <= 0;
                C_out <= 0;
            end else
            begin
                case(counter_for_exact_mult_usage)
                    2'd1: begin
                        partial_sum <= approx_4(aL, bL) ;  // L*L 
                    end
                    2'd2: begin
                        partial_sum <= partial_sum + (approx_4(aH, bL) << 4) + (exact_mult_result << 8)  ;  // + H*L
                    end
                    2'd3: begin // 3rd clock we will COMMIT our output 
                        partial_sum <= partial_sum + (approx_4(aL, bH) << 4)  ;  // + L*H + H*H
                        
                        // Apply sign to final result
                        if (result_sign) begin
                            C_out <= C_out + ~(partial_sum + (approx_4(aL, bH) << 4)) + 1'b1;  // Two's complement negation
                        end else begin
                            C_out <= C_out + partial_sum + (approx_4(aL, bH) << 4);
                        end
                    end
                endcase
            end
        end

        always_ff @(posedge clk or posedge rst) begin
            if (rst) begin
                c <= 0;
                d <= 0;
            end else begin
                c <= a;
                d <= b;
            end
        end
    endmodule

    // PE_3_{3*x+2}__{y}_{exact_mult_index} - Uses exact multiplier when counter_for_exact_mult_usage == 2'b11
    module PE_3_{3*x+2}__{y}_{exact_mult_index} #(
        parameter DATA_WIDTH = 8
    )(
        input logic signed [DATA_WIDTH-1:0] a, b,
        input logic fast_clk,    // 2ns time period
        input logic clk,       // 12ns time period
        input logic rst,
        input logic [1:0] counter_for_exact_mult_usage,
        output logic signed [DATA_WIDTH-1:0] c, d,
        output logic signed [2*DATA_WIDTH-1:0] C_out
    );

        // Unsigned absolute values and sign handling
        logic [DATA_WIDTH-1:0] a_abs, b_abs;
        logic a_sign, b_sign, result_sign;
        
        // Split into high and low parts (unsigned)
        logic [3:0] aL, bL;
        logic [3:0] aH, bH;
        logic [7:0] exact_mult_result;

        // Convert signed inputs to unsigned absolute values
        always_comb begin
            a_sign = a[DATA_WIDTH-1];  // Extract sign bit
            b_sign = b[DATA_WIDTH-1];  // Extract sign bit
            result_sign = a_sign ^ b_sign;  // XOR for final sign
            
            // Get absolute values
            if (a_sign) begin
                a_abs = ~a + 1'b1;  // Two's complement negation
            end else begin
                a_abs = a;
            end
            
            if (b_sign) begin
                b_abs = ~b + 1'b1;  // Two's complement negation
            end else begin
                b_abs = b;
            end
        end

        // Split absolute values into high and low parts
        assign aL = a_abs[3:0];
        assign aH = a_abs[7:4];
        assign bL = b_abs[3:0];
        assign bH = b_abs[7:4];

        logic [2*DATA_WIDTH-1:0] partial_sum;
        logic [3:0] input_to_a, input_to_b;
        
        // Exact multiplier instance
        exact_mult_{exact_mult_index} m1(
            .a(input_to_a),
            .b(input_to_b),
            .p(exact_mult_result)
        );
        
        // Combinational logic for exact multiplier inputs
        always_comb begin
            if (counter_for_exact_mult_usage == 2'b11) begin
                input_to_a = aH;
                input_to_b = bH;
            end 
        end
        
        // Sequential logic for partial product accumulation
        always_ff @(posedge fast_clk or posedge rst) begin
            if (rst) begin
                partial_sum <= 0;
                C_out <= 0;
            end else begin
                case(counter_for_exact_mult_usage)
                    2'd1: begin
                        partial_sum <= approx_3(aL, bL) ;  // L*L + H*H
                    end
                    2'd2: begin
                        partial_sum <= partial_sum + (approx_3(aH, bL) << 4);  // + H*L
                    end
                    2'd3: begin // 3rd clock we will COMMIT our output 
                        partial_sum <= partial_sum + (approx_3(aL, bH) << 4) + (exact_mult_result << 8) ;  // + L*H
                        
                        // Apply sign to final result
                        if (result_sign) begin
                            C_out <= C_out + ~(partial_sum + (approx_3(aL, bH) << 4)  + (exact_mult_result << 8)  ) + 1'b1;  // Two's complement negation
                        end else begin
                            C_out <= C_out + partial_sum + (approx_3(aL, bH) << 4) + (exact_mult_result << 8)  ;
                        end
                    end
                endcase
            end
        end

        always_ff @(posedge clk or posedge rst) begin
            if (rst) begin
                c <= 0;
                d <= 0;
            end else begin
                c <= a;
                d <= b;
            end
        end
    endmodule
// ====================================================================
""")
            exact_mult_index += 1

        x,y = 63,63
        f.write(f"""

module exact_mult_{exact_mult_index} ( input logic [3:0]a,  input logic [3:0]b,  output logic [7:0]p );
    assign p = a*b; 
endmodule
    
module PE_1_63__63_1365 #(
    parameter DATA_WIDTH = 8
)(
    input logic signed [DATA_WIDTH-1:0] a, b,
    input logic fast_clk,    // 4ns time period
    input logic clk,       // 12ns time period
    input logic rst,
    input logic [1:0] counter_for_exact_mult_usage,
    output logic signed [DATA_WIDTH-1:0] c, d,
    output logic signed [2*DATA_WIDTH-1:0] C_out
);

    // Unsigned absolute values and sign handling
    logic [DATA_WIDTH-1:0] a_abs, b_abs;
    logic a_sign, b_sign, result_sign;
    
    // Split into high and low parts (unsigned)
    logic [3:0] aL, bL;
    logic [3:0] aH, bH;
    logic [7:0] exact_mult_result;

    // Convert signed inputs to unsigned absolute values
    always_comb begin
        a_sign = a[DATA_WIDTH-1];  // Extract sign bit
        b_sign = b[DATA_WIDTH-1];  // Extract sign bit
        result_sign = a_sign ^ b_sign;  // XOR for final sign
        
        // Get absolute values
        if (a_sign) begin
            a_abs = ~a + 1'b1;  // Two's complement negation
        end else begin
            a_abs = a;
        end
        
        if (b_sign) begin
            b_abs = ~b + 1'b1;  // Two's complement negation
        end else begin
            b_abs = b;
        end
    end

    // Split absolute values into high and low parts
    assign aL = a_abs[3:0];
    assign aH = a_abs[7:4];
    assign bL = b_abs[3:0];
    assign bH = b_abs[7:4];

    logic [2*DATA_WIDTH-1:0] partial_sum;
    logic [3:0] input_to_a, input_to_b;
    
    // Exact multiplier instance
    exact_mult_1 m1(
        .a(input_to_a),
        .b(input_to_b),
        .p(exact_mult_result)
    );
    
    // Combinational logic for exact multiplier inputs
    always_comb begin
        if (counter_for_exact_mult_usage == 2'b01) begin
            input_to_a = aH;
            input_to_b = bH;
        end 
    end
    
    // Sequential logic for partial product accumulation
    always_ff @(posedge fast_clk or posedge rst) begin
        if (rst) begin
            partial_sum <= 0;
            C_out <= 0;
        end else 
        begin
            case(counter_for_exact_mult_usage)
                2'd1: begin
                    partial_sum <= approx_5(aL, bL) + (exact_mult_result << 8);  // L*L + H*H
                end
                2'd2: begin
                    partial_sum <= partial_sum + (approx_5(aH, bL) << 4)  ;  // + H*L + H*H
                end
                2'd3: begin // 3rd clock we will COMMIT our output 
                    partial_sum <= partial_sum + (approx_5(aL, bH) << 4);  // + L*H
                    
                    // Apply sign to final result
                    if (result_sign) begin
                        C_out <= C_out + ~(partial_sum + (approx_5(aL, bH) << 4)) + 1'b1;  // Two's complement negation
                    end else begin
                        C_out <= C_out + partial_sum + (approx_5(aL, bH) << 4);
                    end
                end
            endcase
        end
    end

    always_ff @(posedge clk or posedge rst) begin
        if (rst) begin
            c <= 0;
            d <= 0;
        end else begin
            c <= a;
            d <= b;
        end
    end
endmodule
""") 
    print(f"Generated PE_groups.sv with {exact_mult_index} exact multipliers")

In [2]:
def get_pe_module_name(i, j):
    """
    Get the PE module name based on position (i, j)
    Returns the appropriate PE module name with correct indexing
    """
    # Special case for the very last PE (63, 63)
    if i == 63 and j == 63:
        return "PE_1_63__63_1365"
    
    # For columns 0-62 (grouped by columns)
    if j < 63:
        group_index = (i * 21) + (j // 3)  # Each row has 21 groups, j//3 gives group within row
        pe_index_in_group = (j % 3) + 1    # 1, 2, or 3
        return f"PE_{pe_index_in_group}_{i}__{j}_{group_index}"
    
    # For column 63 (last column, grouped by rows)
    else:  # j == 63
        if i < 63:
            group_index = 1344 + (i // 3)  # Start after column groups (64*21=1344)
            pe_index_in_group = (i % 3) + 1  # 1, 2, or 3
            return f"PE_{pe_index_in_group}_{i}__{j}_{group_index}"


def generate_systolic_array():
    """Generate systolic_array.sv with manual connections for all 4096 PEs"""
    
    with open('systolic_array.sv', 'w') as f:
        f.write("""// Auto-generated 64x64 systolic array with manual PE connections
// Each PE is uniquely instantiated with proper connections

module systolic_array #(
    parameter DATA_WIDTH = 8,
    parameter SIZE = 64
)(
    input logic signed [DATA_WIDTH-1:0] A [0:SIZE-1],
    input logic signed [DATA_WIDTH-1:0] B [0:SIZE-1],
    input logic fast_clk,    // 4ns time period
    input logic clk,         // 12ns time period
    input logic rst,
    input logic [1:0] counter_for_exact_mult_usage,
    output logic signed [2*DATA_WIDTH-1:0] C [0:SIZE-1][0:SIZE-1]
);

    // Internal wires as 2D arrays
    logic signed [DATA_WIDTH-1:0] c [0:SIZE-1][0:SIZE-1];
    logic signed [DATA_WIDTH-1:0] d [0:SIZE-1][0:SIZE-1];
    logic signed [2*DATA_WIDTH-1:0] C_out [0:SIZE-1][0:SIZE-1];

    // Manual instantiation of all 4096 PEs
""")
        
        # Generate all PE instances manually
        for i in range(64):
            for j in range(64):
                pe_name = get_pe_module_name(i, j)
                
                f.write(f"\n    // PE at position ({i}, {j})\n")
                f.write(f"    {pe_name} #(\n")
                f.write(f"        .DATA_WIDTH(DATA_WIDTH)\n")
                f.write(f"    ) pe_{i}_{j} (\n")
                
                # Determine input connections based on PE type
                if i == 0 and j == 0:
                    # Type 1: Very first PE (0,0)
                    f.write(f"        .a(A[0]),\n")
                    f.write(f"        .b(B[0]),\n")
                elif i == 0:
                    # Type 2: First row (i=0, j>0)
                    f.write(f"        .a(c[{i}][{j-1}]),\n")
                    f.write(f"        .b(B[{j}]),\n")
                elif j == 0:
                    # Type 3: First column (i>0, j=0)
                    f.write(f"        .a(A[{i}]),\n")
                    f.write(f"        .b(d[{i-1}][{j}]),\n")
                else:
                    # Type 4: Internal PEs (i>0, j>0)
                    f.write(f"        .a(c[{i}][{j-1}]),\n")
                    f.write(f"        .b(d[{i-1}][{j}]),\n")
                
                # Common connections for all PEs
                f.write(f"        .fast_clk(fast_clk),\n")
                f.write(f"        .clk(clk),\n")
                f.write(f"        .rst(rst),\n")
                f.write(f"        .counter_for_exact_mult_usage(counter_for_exact_mult_usage),\n")
                f.write(f"        .c(c[{i}][{j}]),\n")
                f.write(f"        .d(d[{i}][{j}]),\n")
                f.write(f"        .C_out(C_out[{i}][{j}])\n")
                f.write(f"    );\n")
        
        # Output assignment
        f.write(f"""
    // Assign output C from C_out
    always_comb begin
        for (int i = 0; i < SIZE; i++) begin
            for (int j = 0; j < SIZE; j++) begin
                C[i][j] = C_out[i][j];
            end
        end
    end

endmodule
""")
    
    print(f"Generated systolic_array.sv with manual connections for all 4096 PEs")


def main():
    """Main function to generate both files"""
    print("Starting RTL generation...")
    
    # Generate PE groups file
    print("Generating PE_groups.sv...")
    generate_pe_groups()
    
    # Generate systolic array file
    print("Generating systolic_array.sv...")
    generate_systolic_array()
    
    print("RTL generation completed!")
    print("Files generated:")
    print("- PE_groups.sv")
    print("- systolic_array.sv")


if __name__ == "__main__":
    main()

Starting RTL generation...
Generating PE_groups.sv...
Generated PE_groups.sv with 1365 exact multipliers
Generating systolic_array.sv...
Generated systolic_array.sv with manual connections for all 4096 PEs
RTL generation completed!
Files generated:
- PE_groups.sv
- systolic_array.sv
