<a href="https://colab.research.google.com/github/vbonato/cnnTestBench/blob/main/cnnTestBench.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only cell: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
import ipywidgets as widgets
from IPython.display import display, clear_output, Markdown
import os
import math

# --- 1. Core Code Generation Functions (HLS INFERENCE - SRC Folder) ---

def generate_c_style_loader_function():
    """Return the C source of ``load_and_quantize_weights`` as one string.

    The emitted C function opens ``file_path`` in binary mode, reads
    ``num_elements`` floats into a temporary heap buffer, multiplies each one
    by ``scale_factor`` and stores the product, cast to ``type_t``, in
    ``dest_buffer``. It returns 1 on success and 0 when the file cannot be
    opened, the temporary buffer cannot be allocated, or fewer elements than
    requested were read.

    Returns:
        str: The function source, terminated by a single newline.
    """
    # Raw literal: each backslash-n inside the printf format strings must
    # reach the C compiler as an escape sequence, not as a Python newline.
    return r"""// --- WEIGHT LOADING & QUANTIZATION FUNCTION ---
// Loads 32-bit floats from a binary file and converts them to type_t
int load_and_quantize_weights(const char* file_path, type_t* dest_buffer, size_t num_elements, float scale_factor) {
    FILE* fp = fopen(file_path, "rb");
    if (!fp) {
        printf("ERROR: Could not open file for reading: %s\n", file_path);
        return 0; // Failure
    }

    // Create a temporary buffer to hold the float data from the file
    float* temp_buffer = (float*) malloc(num_elements * sizeof(float));
    if (!temp_buffer) {
        printf("ERROR: Could not allocate memory for temporary float buffer.\n");
        fclose(fp);
        return 0; // Failure
    }

    // Read the entire block of floats
    size_t elements_read = fread(temp_buffer, sizeof(float), num_elements, fp);
    fclose(fp);

    if (elements_read != num_elements) {
        printf("ERROR: Expected to read %zu elements from %s, but got %zu.\n", num_elements, file_path, elements_read);
        free(temp_buffer);
        return 0; // Failure
    }

    // Quantize: convert floats to type_t using the scale factor
    for (size_t i = 0; i < num_elements; ++i) {
        dest_buffer[i] = (type_t)(temp_buffer[i] * scale_factor);
    }

    free(temp_buffer);
    printf("Successfully loaded and quantized %zu elements from %s\n", num_elements, file_path);
    return 1; // Success
}
"""

def generate_makefile_code(num_layers):
    """Build the Makefile text for compiling the inference sources with clang++.

    Args:
        num_layers: Number used as the suffix of every file name
            (``conv<N>.cpp``, ``conv_tb<N>.cpp`` and so on).

    Returns:
        str: Makefile content with a link rule, two compile rules, a
        ``check_dirs`` rule that creates ``../bin`` and a ``clean`` rule.
        Recipe lines start with a hard tab; the text ends with a newline.
    """
    cnn = f'conv{num_layers}'
    tb = f'conv_tb{num_layers}'

    # One inner list per stanza; stanzas are separated by a blank line.
    stanzas = [
        [
            f'{cnn}: check_dirs {cnn}.o {tb}.o',
            f'\tclang++ {cnn}.o {tb}.o -o ../bin/{cnn} -lm',
        ],
        [
            f'{tb}.o: {tb}.cpp {tb}.h {cnn}.h',
            f'\tclang++ -c {tb}.cpp -o {tb}.o',
        ],
        [
            f'{cnn}.o: {cnn}.cpp {cnn}.h',
            f'\tclang++ -c {cnn}.cpp -o {cnn}.o',
        ],
        [
            'check_dirs:',
            '\t@mkdir -p ../bin',
        ],
        [
            '.PHONY: clean',
            'clean:',
            f'\trm -f {cnn}.o {tb}.o',
            f'\trm -f ../bin/{cnn}',
            '\trm -f ../bin/output.bin',
        ],
    ]
    return '\n\n'.join('\n'.join(stanza) for stanza in stanzas) + '\n'

def generate_testbench_header_code(num_layers):
    """Return the text of the testbench header (conv_tbX.h).

    Args:
        num_layers: Accepted for parity with the other generators; the
            emitted text does not depend on it.

    Returns:
        str: An include-guarded header that defines the ``RANDROOF``
        constant. There is no trailing newline after ``#endif``.
    """
    header_rows = (
        '#ifndef CONV_TB_H',
        '#define CONV_TB_H',
        '',
        '// ** GLOBAL TESTBENCH CONSTANTS **',
        'const int RANDROOF = 256; // Max value for randomized input/weights/bias',
        '',
        '#endif',
    )
    return '\n'.join(header_rows)

# def generate_testbench_code(num_layers, params):
#     """Generates the testbench C++ code, now using GLOBAL_DIM."""
#     N_CLASSES = params['N_CLASSES']
#     GLOBAL_DIM = params['GLOBAL_DIM']

#     code_lines = [
#         '#include <cstdio>',
#         '#include <cstdlib>',
#         '#include <iostream>',
#         '#include <cmath>',
#         f'#include "conv_tb{num_layers}.h"',
#         f'#include "conv{num_layers}.h"\n',
#         'int main(void) {'
#     ]

#     # --- Memory Allocation ---
#     H1_size = f"H1 * H1" if GLOBAL_DIM == 2 else f"H1"

#     code_lines.append(f' \t// Input Buffer (C1 x H1 x (H1 if 2D) | {GLOBAL_DIM}D)')
#     code_lines.append(f' \ttype_t *I1 = (type_t *) malloc(C1 * {H1_size} * sizeof(type_t));')

#     code_lines.append(f' \t// Weight Buffers for {num_layers} Conv Layers')
#     for i in range(1, num_layers + 1):
#         R_size = f" * R{i}" if GLOBAL_DIM == 2 else ""
#         code_lines.append(f' \ttype_t *W{i} = (type_t *) malloc(M{i} * C{i} * R{i}{R_size} * sizeof(type_t));')

#     code_lines.append(f' \t// Bias Buffers for {num_layers} Conv Layers')
#     for i in range(1, num_layers + 1):
#         code_lines.append(f' \ttype_t *B{i} = (type_t *) malloc(M{i} * sizeof(type_t));')

#     code_size_n = f"M{num_layers} * E{num_layers}"
#     if GLOBAL_DIM == 2:
#         code_size_n += f" * E{num_layers}"

#     code_lines.append(f' \t// Weight Buffer for Final FC Layer (Input size: {code_size_n})')
#     code_lines.append(f' \ttype_t *W_fc = (type_t *) malloc({code_size_n} * N_CLASSES * sizeof(type_t));')

#     code_lines.append(f' \t// Bias Buffer for Final FC Layer')
#     code_lines.append(f' \ttype_t *B_fc = (type_t *) malloc(N_CLASSES * sizeof(type_t));')
#     code_lines.append(f' \t// Final Output Buffer (Softmax probabilities)')
#     code_lines.append(f' \tfloat *O_final = (float *) calloc(N_CLASSES, sizeof(float));')

#     # --- Initialization ---
#     code_lines.append('\n \tsrand(1);')
#     code_lines.append(f' \t// Initialize Input I1')
#     code_lines.append(f' \tfor(unsigned j = 0; j < C1 * {H1_size}; j++)')
#     code_lines.append(' \t\tI1[j] = rand() % RANDROOF;')

#     code_lines.append(f'\n \t// Initialize Conv Weights and Biases')
#     for i in range(1, num_layers + 1):
#         R_size = f" * R{i}" if GLOBAL_DIM == 2 else ""
#         code_lines.append(f' \tfor(unsigned j = 0; j < M{i} * C{i} * R{i}{R_size}; j++)')
#         code_lines.append(f' \t\tW{i}[j] = rand() % RANDROOF;')
#         code_lines.append(f' \tfor(unsigned j = 0; j < M{i}; j++)')
#         code_lines.append(f' \t\tB{i}[j] = rand() % RANDROOF;')

#     code_lines.append(f'\n \t// Initialize FC Weights and Biases')
#     code_lines.append(f' \tfor(unsigned j = 0; j < {code_size_n} * N_CLASSES; j++)')
#     code_lines.append(f' \t\tW_fc[j] = rand() % RANDROOF;')
#     code_lines.append(f' \tfor(unsigned j = 0; j < N_CLASSES; j++)')
#     code_lines.append(f' \t\tB_fc[j] = rand() % RANDROOF;')

#     # CNN Call, Prediction, and Freeing
#     code_lines.append('\n \t// Perform CNN inference')
#     cnn_call = ' \tcnn(I1'
#     for i in range(1, num_layers + 1):
#         cnn_call += f', W{i}, B{i}'
#     cnn_call += f', W_fc, B_fc, O_final);'
#     code_lines.append(cnn_call)

#     # --- Prediction and Print Logic ---
#     code_lines.append('\n \t// Find the maximum probability (the prediction)')
#     code_lines.append(' \tfloat max_val = O_final[0];')
#     code_lines.append(' \tint predicted_class = 0;')
#     code_lines.append(' \tfor (int k = 1; k < N_CLASSES; k++) {')
#     code_lines.append(' \t\tif (O_final[k] > max_val) {')
#     code_lines.append(' \t\t\tmax_val = O_final[k];')
#     code_lines.append(' \t\t\tpredicted_class = k;')
#     code_lines.append(' \t\t}')
#     code_lines.append(' \t}')
#     code_lines.append('\n \tprintf("--- Classification Result (Softmax Probabilities) ---\\n");')
#     code_lines.append(' \tprintf("Predicted Class Index: %d (with probability %.4f)\\n", predicted_class, max_val);')
#     code_lines.append(' \tprintf("All Probabilities:\\n");')
#     code_lines.append(' \tfor (int k = 0; k < N_CLASSES; k++) {')
#     code_lines.append(' \t\tprintf(" \tClass %d: %.4f\\n", k, O_final[k]);')
#     code_lines.append(' \t}')
#     code_lines.append(' \tprintf("---------------------------------------------------\\n");')

#     # Output to file
#     code_lines.append(f'\n \tFILE *opf = fopen("../bin/output.bin", "wb");')
#     code_lines.append(f' \tif (opf == NULL) {{')
#     code_lines.append(f' \t\tperror("Error opening output.bin");')
#     code_lines.append(f' \t\treturn EXIT_FAILURE;')
#     code_lines.append(f' \t}}')
#     code_lines.append(f' \tfwrite(O_final, sizeof(float), N_CLASSES, opf);')
#     code_lines.append(' \tfclose(opf);')

#     # Free memory
#     code_lines.append('\n \t// Free allocated memory')
#     code_lines.append(' \tif(I1) free(I1);')
#     for i in range(1, num_layers + 1):
#         code_lines.append(f' \tif(W{i}) free(W{i});')
#         code_lines.append(f' \tif(B{i}) free(B{i});')

#     code_lines.append(' \tif(W_fc) free(W_fc);')
#     code_lines.append(' \tif(B_fc) free(B_fc);')
#     code_lines.append(' \tif(O_final) free(O_final);')

#     code_lines.append('\n \treturn EXIT_SUCCESS;\n}')
#     return '\n'.join(code_lines)

def generate_testbench_code(num_layers, params):
    """Return the C++ testbench source that loads weights and runs ``cnn``.

    The emitted ``main`` allocates the input, per-layer weight/bias, FC and
    output buffers, loads ``W<i>.bin`` / ``B<i>.bin`` / ``W_fc.bin`` /
    ``B_fc.bin`` through ``load_and_quantize_weights`` (whose source is
    inlined ahead of ``main``), fills the input with a constant dummy value,
    calls ``cnn``, prints the class probabilities, writes them to
    ``../bin/output.bin`` and frees every buffer.

    Args:
        num_layers: Number of convolution layers; also the file-name suffix
            used in the emitted ``#include`` lines.
        params: Mapping that must contain ``'N_CLASSES'`` and
            ``'GLOBAL_DIM'``. Buffers are sized for 2D when ``GLOBAL_DIM``
            equals 2 and for 1D otherwise.

    Returns:
        str: The complete testbench source.
    """
    # Not interpolated (the C code uses the N_CLASSES constant from the
    # header); the lookup is kept so a missing key still fails here.
    _n_classes = params['N_CLASSES']
    dim = params['GLOBAL_DIM']
    is_2d = dim == 2
    layers = range(1, num_layers + 1)

    def weight_count(i):
        # Element count of conv layer i's kernel tensor, as a C expression.
        return f"M{i} * C{i} * R{i}" + (f" * R{i}" if is_2d else "")

    input_count = "C1 * H1 * H1" if is_2d else "C1 * H1"
    fc_inputs = f"M{num_layers} * E{num_layers}" + (f" * E{num_layers}" if is_2d else "")

    out = [
        '#include <cstdio>',
        '#include <cstdlib>',
        '#include <iostream>',
        '#include <cmath>',
        f'#include "conv_tb{num_layers}.h"',
        f'#include "conv{num_layers}.h"\n',
        generate_c_style_loader_function(),
        'int main(void) {',
    ]

    # --- Buffer allocation ---
    out += [
        f'    // Input Buffer (C1 x H1 x (H1 if 2D) | {dim}D)',
        f'    type_t *I1 = (type_t *) malloc({input_count} * sizeof(type_t));',
        f'    // Weight Buffers for {num_layers} Conv Layers',
    ]
    out += [
        f'    type_t *W{i} = (type_t *) malloc({weight_count(i)} * sizeof(type_t));'
        for i in layers
    ]
    out.append(f'    // Bias Buffers for {num_layers} Conv Layers')
    out += [
        f'    type_t *B{i} = (type_t *) malloc(M{i} * sizeof(type_t));'
        for i in layers
    ]
    out += [
        f'    // Weight Buffer for Final FC Layer (Input size: {fc_inputs})',
        f'    type_t *W_fc = (type_t *) malloc({fc_inputs} * N_CLASSES * sizeof(type_t));',
        '    // Bias Buffer for Final FC Layer',
        '    type_t *B_fc = (type_t *) malloc(N_CLASSES * sizeof(type_t));',
        '    // Final Output Buffer (Softmax probabilities)',
        '    float *O_final = (float *) calloc(N_CLASSES, sizeof(float));',
    ]

    # --- Weight loading ---
    load = '    all_weights_loaded &= load_and_quantize_weights'
    out += [
        '\n    // --- Load and Quantize Trained Weights ---',
        '    const float SCALE_FACTOR = 256.0f; // Represents 8 fractional bits',
        '    int all_weights_loaded = 1;',
    ]
    for i in layers:
        out.append(f'{load}("W{i}.bin", W{i}, {weight_count(i)}, SCALE_FACTOR);')
        out.append(f'{load}("B{i}.bin", B{i}, M{i}, SCALE_FACTOR);')
    out += [
        f'{load}("W_fc.bin", W_fc, {fc_inputs} * N_CLASSES, SCALE_FACTOR);',
        f'{load}("B_fc.bin", B_fc, N_CLASSES, SCALE_FACTOR);',
        '\n    if (!all_weights_loaded) {',
        '        printf("FATAL: Failed to load one or more weight files. Exiting.\\n");',
        '        return EXIT_FAILURE;',
        '    }',
    ]

    # --- Dummy input ---
    out += [
        '\n    // Initialize Input I1 with dummy data (e.g., all 1s after quantization)',
        '    // Note: The input data must also be quantized with the same scale factor!',
        f'    for(unsigned j = 0; j < {input_count}; j++) {{',
        '        float dummy_input_val = 1.0f / 255.0f;',
        '        I1[j] = (type_t)(dummy_input_val * SCALE_FACTOR);',
        '    }',
    ]

    # --- Inference call ---
    conv_args = ''.join(f', W{i}, B{i}' for i in layers)
    out += [
        '\n    // Perform CNN inference',
        f'    cnn(I1{conv_args}, W_fc, B_fc, O_final);',
    ]

    # --- Arg-max and report ---
    out += [
        '\n    // Find the maximum probability (the prediction)',
        '    float max_val = O_final[0];',
        '    int predicted_class = 0;',
        '    for (int k = 1; k < N_CLASSES; k++) {',
        '        if (O_final[k] > max_val) {',
        '            max_val = O_final[k];',
        '            predicted_class = k;',
        '        }',
        '    }',
        '\n    printf("--- Classification Result (Softmax Probabilities) ---\\n");',
        '    printf("Predicted Class Index: %d (with probability %.4f)\\n", predicted_class, max_val);',
        '    printf("All Probabilities:\\n");',
        '    for (int k = 0; k < N_CLASSES; k++) {',
        '        printf("     Class %d: %.4f\\n", k, O_final[k]);',
        '    }',
        '    printf("---------------------------------------------------\\n");',
    ]

    # --- Binary dump of the probabilities ---
    out += [
        '\n    FILE *opf = fopen("../bin/output.bin", "wb");',
        '    if (opf == NULL) {',
        '        perror("Error opening output.bin");',
        '        return EXIT_FAILURE;',
        '    }',
        '    fwrite(O_final, sizeof(float), N_CLASSES, opf);',
        '    fclose(opf);',
    ]

    # --- Cleanup ---
    out += ['\n    // Free allocated memory', '    if(I1) free(I1);']
    for i in layers:
        out += [f'    if(W{i}) free(W{i});', f'    if(B{i}) free(B{i});']
    out += [
        '    if(W_fc) free(W_fc);',
        '    if(B_fc) free(B_fc);',
        '    if(O_final) free(O_final);',
        '\n    return EXIT_SUCCESS;\n}',
    ]
    return '\n'.join(out)

def generate_conv_code(num_layers, params):
    """Return the C++ implementation file (convN.cpp) for HLS inference.

    The emitted source contains ``relu``, ``softmax``, one ``conv_1d_<i>``
    and one ``conv_2d_<i>`` function per layer, ``fc_layer`` and the ``cnn``
    wrapper that chains conv -> ReLU for every layer and finishes with the
    fully connected layer and softmax.

    The wrapper calls the 1D or 2D convolution chosen here, at generation
    time. Earlier versions emitted ``#if GLOBAL_DIM == 1 ... #elif
    GLOBAL_DIM == 2`` instead. ``GLOBAL_DIM`` is a ``const size_t`` in the
    generated header, not a macro, so the preprocessor evaluated it as 0 and
    dropped both branches: no convolution was ever called.

    Args:
        num_layers: Number of convolution layers (also the header suffix).
        params: Mapping that must contain ``'GLOBAL_DIM'``. A value of 2
            selects the 2D path; any other value is treated as 1D, matching
            how the buffer sizes are chosen.

    Returns:
        str: The complete source text, terminated by a newline.
    """
    dim = params['GLOBAL_DIM']
    is_2d = dim == 2
    layers = range(1, num_layers + 1)

    def fmap_count(i):
        # Element count of layer i's output feature map, as a C expression.
        return f"M{i} * E{i}" + (f" * E{i}" if is_2d else "")

    lines = [
        f'#include "conv{num_layers}.h"',
        '#include <math.h>',
        '',
        '#include <stdio.h>',
        '',
        # ReLU function
        'type_t relu(type_t x) {',
        ' \treturn (x > 0) ? x : 0;',
        '}',
        '',
        # Softmax function (max-subtracted before exponentiation)
        'void softmax(type_t input[N_CLASSES], float output[N_CLASSES]) {',
        ' \tfloat sum = 0.0f;',
        ' \tfloat max_val = (float)input[0];',
        ' \tfor (int k = 1; k < N_CLASSES; k++) {',
        ' \t\tif ((float)input[k] > max_val) max_val = (float)input[k];',
        ' \t}',
        ' \tfor (int k = 0; k < N_CLASSES; k++) {',
        ' \t\toutput[k] = expf((float)input[k] - max_val);',
        ' \t\tsum += output[k];',
        ' \t}',
        ' \tfor (int k = 0; k < N_CLASSES; k++) {',
        ' \t\toutput[k] /= sum;',
        ' \t}',
        '}',
        '',
    ]

    # --- Specialized 1D convolution functions ---
    for i in layers:
        lines += [
            f'// ** Specialized 1D Convolution Layer {i} **',
            f'void conv_1d_{i}(type_t I{i}[C{i} * H{i}], type_t W{i}[M{i} * C{i} * R{i}], type_t B{i}[M{i}], type_t O{i}[M{i} * E{i}]) {{',
            f' \tfor(int m = 0; m < M{i}; m++) {{',
            f' \t\tfor(int x = 0; x < E{i}; x++) {{',
            f' \t\t\tO{i}[x + m * E{i}] = B{i}[m];',
            ' \t\t\t#pragma HLS PIPELINE II=1',
            f' \t\t\tfor(int c = 0; c < C{i}; c++) {{',
            f' \t\t\t\tfor(int l = 0; l < R{i}; l++) {{',
            f' \t\t\t\t\tint h2 = x * S{i} - PAD{i} + l;',
            # Taps that fall outside the input are treated as zero padding.
            f' \t\t\t\t\ttype_t val = (h2 < 0 || h2 >= H{i}) ? 0 : I{i}[h2 + c * H{i}];',
            f' \t\t\t\t\tO{i}[x + m * E{i}] += val * W{i}[l + c * R{i} + m * C{i} * R{i}];',
            ' \t\t\t\t}',
            ' \t\t\t}',
            ' \t\t}',
            ' \t}',
            '}',
            '',
        ]

    # --- Specialized 2D convolution functions ---
    for i in layers:
        lines += [
            f'// ** Specialized 2D Convolution Layer {i} **',
            f'void conv_2d_{i}(type_t I{i}[C{i} * H{i} * H{i}], type_t W{i}[M{i} * C{i} * R{i} * R{i}], type_t B{i}[M{i}], type_t O{i}[M{i} * E{i} * E{i}]) {{',
            f' \tfor(int m = 0; m < M{i}; m++) {{',
            f' \t\tfor(int y = 0; y < E{i}; y++) {{',
            f' \t\t\tfor(int x = 0; x < E{i}; x++) {{',
            f' \t\t\t\tO{i}[x + (y + (m * E{i})) * E{i}] = B{i}[m];',
            ' \t\t\t\t#pragma HLS PIPELINE II=1',
            f' \t\t\t\tfor(int c = 0; c < C{i}; c++) {{',
            f' \t\t\t\t\tfor(int k = 0; k < R{i}; k++) {{',
            f' \t\t\t\t\t\tfor(int l = 0; l < R{i}; l++) {{',
            f' \t\t\t\t\t\t\tint h1 = y * S{i} - PAD{i} + k;',
            f' \t\t\t\t\t\t\tint h2 = x * S{i} - PAD{i} + l;',
            f' \t\t\t\t\t\t\ttype_t val = (h1 < 0 || h1 >= H{i} || h2 < 0 || h2 >= H{i}) ? 0 : I{i}[h2 + (h1 + (c * H{i})) * H{i}];',
            f' \t\t\t\t\t\t\tO{i}[x + (y + (m * E{i})) * E{i}] += val * W{i}[l + (k + (c + (m * C{i})) * R{i}) * R{i}];',
            ' \t\t\t\t\t\t}',
            ' \t\t\t\t\t}',
            ' \t\t\t\t}',
            ' \t\t\t}',
            ' \t\t}',
            ' \t}',
            '}',
            '',
        ]

    # --- Fully connected layer ---
    fc_inputs = fmap_count(num_layers)
    lines += [
        '// ** Fully Connected Layer **',
        f'void fc_layer(type_t input[{fc_inputs}], type_t W_fc[{fc_inputs} * N_CLASSES], type_t B_fc[N_CLASSES], type_t output[N_CLASSES]) {{',
        ' \tfor (int k = 0; k < N_CLASSES; k++) {',
        ' \t\toutput[k] = B_fc[k];',
        f' \t\tfor (int i = 0; i < {fc_inputs}; i++) {{',
        f' \t\t\toutput[k] += input[i] * W_fc[i + k * {fc_inputs}];',
        ' \t\t}',
        ' \t}',
        '}',
        '',
    ]

    # --- Wrapper CNN function ---
    conv_params = ''.join(f', type_t *W{i}, type_t *B{i}' for i in layers)
    lines += [
        '// ** Wrapper CNN Function (HLS Inference) **',
        f'void cnn(type_t *input{conv_params}, type_t *W_fc, type_t *B_fc, float *output) {{',
        '#pragma HLS DATAFLOW',
    ]
    # Intermediate feature-map buffers, sized for the selected dimensionality.
    lines += [f' \tstatic type_t O{i}[{fmap_count(i)}];' for i in layers]
    lines += [' \tstatic type_t O_raw[N_CLASSES];', '']

    # The dimensionality is known now, so emit the matching call directly
    # instead of a preprocessor conditional the compiler cannot evaluate.
    conv_fn = 'conv_2d' if is_2d else 'conv_1d'
    for i in layers:
        source = 'input' if i == 1 else f'O{i - 1}'
        lines += [
            '',
            f' \t// --- Layer {i} ({dim}D) ---',
            f' \t{conv_fn}_{i}({source}, W{i}, B{i}, O{i});',
            ' \t// Apply ReLU',
            f' \tfor (int j = 0; j < {fmap_count(i)}; j++) O{i}[j] = relu(O{i}[j]);',
        ]

    lines += [
        '',
        ' \t// Final Layer: Fully Connected (FC) Classification Head',
        f' \tfc_layer(O{num_layers}, W_fc, B_fc, O_raw);',
        '',
        ' \t// Softmax Layer: Convert raw scores to probability distribution',
        ' \tsoftmax(O_raw, output);',
        '}',
    ]
    return '\n'.join(lines) + '\n'

def generate_convh_code(num_layers, params):
    """Return the C++ header (convN.h) with constants and prototypes.

    ``type_t`` is emitted as a signed ``int``. The testbench quantizes
    trained float weights, which may be negative, into ``type_t``. With the
    previous ``unsigned`` typedef that conversion was undefined for negative
    values, and ``relu()`` could never clamp anything.

    Args:
        num_layers: Number of convolution layers to describe.
        params: Mapping with ``'N_CLASSES'``, ``'GLOBAL_DIM'``, ``'C1'``,
            ``'H1'``, and for every layer ``i`` the keys ``M<i>``, ``R<i>``,
            ``S<i>``, ``E<i>``, ``F<i>`` and ``PAD<i>`` (plus ``C<i>`` and
            ``H<i>`` for ``i > 1``). The FC prototype is sized for 2D when
            ``GLOBAL_DIM`` equals 2 and for 1D otherwise.

    Returns:
        str: The include-guarded header text, with no newline after
        ``#endif``.

    Raises:
        KeyError: If a required key is missing from ``params``.
    """
    dim = params['GLOBAL_DIM']
    layers = range(1, num_layers + 1)

    lines = [
        '#ifndef CONV_H',
        '#define CONV_H',
        '',
        '#include <cstddef>',
        '#include <stdio.h>',
        '',
        '// Signed, so negative quantized weights are representable and relu() can clamp.',
        'typedef int type_t;',
        '',
        '// ** GLOBAL PARAMETERS **',
        f"const size_t N_CLASSES = {params['N_CLASSES']};",
        f'const size_t GLOBAL_DIM = {dim}; // All layers are {dim}D',
        'const size_t BIAS_ON = 1;',
        '',
        # H1 is the 1D length, or one side of a square 2D input.
        '// ** INPUT PARAMETERS **',
        f"const size_t C1 = {params['C1']}; // Input channels",
        f"const size_t H1 = {params['H1']}; // Input size (seq length or side length H)",
        '',
    ]

    for i in layers:
        lines.append(f'// ** CONV LAYER {i} **')
        # C1/H1 were already emitted in the input-parameter section above.
        prefixes = ('M', 'R', 'S', 'E', 'F', 'PAD')
        if i > 1:
            prefixes = ('C', 'H') + prefixes
        lines += [f"const size_t {p}{i} = {params[f'{p}{i}']};" for p in prefixes]
        lines.append('')

    # Both the 1D and 2D prototypes are declared for every layer.
    for i in layers:
        lines += [
            f'void conv_1d_{i}(type_t I{i}[C{i} * H{i}], type_t W{i}[M{i} * C{i} * R{i}], type_t B{i}[M{i}], type_t O{i}[M{i} * E{i}]);',
            f'void conv_2d_{i}(type_t I{i}[C{i} * H{i} * H{i}], type_t W{i}[M{i} * C{i} * R{i} * R{i}], type_t B{i}[M{i}], type_t O{i}[M{i} * E{i} * E{i}]);',
        ]

    fc_inputs = f"M{num_layers} * E{num_layers}"
    if dim == 2:
        fc_inputs += f" * E{num_layers}"
    conv_params = ''.join(f', type_t *W{i}, type_t *B{i}' for i in layers)

    lines += [
        f'void fc_layer(type_t input[{fc_inputs}], type_t W_fc[{fc_inputs} * N_CLASSES], type_t B_fc[N_CLASSES], type_t output[N_CLASSES]);',
        'void softmax(type_t input[N_CLASSES], float output[N_CLASSES]);',
        '',
        f'void cnn(type_t *input{conv_params}, type_t *W_fc, type_t *B_fc, float *output);',
        '',
        '#endif',
    ]
    return '\n'.join(lines)


# --- 2. BACKPROPAGATION TRAINING CODE (X86 CPU - BACKPROP Folder) ---

def generate_save_weights_function():
    """Return the C++ source of ``save_weights`` as one string.

    The emitted function opens ``file_path`` as a binary output stream and
    writes ``num_elements * sizeof(dtype_t)`` raw bytes from ``data``. It
    reports to ``std::cerr`` and returns early when the file cannot be
    opened, and prints a confirmation to ``std::cout`` otherwise.

    Returns:
        str: The function source, terminated by a single newline.
    """
    return (
        '// --- WEIGHT SAVING FUNCTION ---\n'
        '#include <fstream>\n'
        '\n'
        'void save_weights(const std::string& file_path, const dtype_t* data, size_t num_elements) {\n'
        '    std::ofstream out_file(file_path, std::ios::binary);\n'
        '    if (!out_file.is_open()) {\n'
        '        std::cerr << "ERROR: Could not open file for writing: " << file_path << std::endl;\n'
        '        return;\n'
        '    }\n'
        '    // Write the raw bytes of the array to the file\n'
        '    out_file.write(reinterpret_cast<const char*>(data), num_elements * sizeof(dtype_t));\n'
        '    out_file.close();\n'
        '    std::cout << "Saved " << num_elements << " elements to " << file_path << std::endl;\n'
        '}\n'
    )

def generate_backprop_makefile_code():
    """Return the Makefile text for building the x86 training binary.

    The ``all`` rule compiles ``backprop_main.cpp`` and ``backprop.cpp`` with
    clang++ into ``../bin/cnn_trainer``; ``clean`` removes that binary.
    Recipe lines start with a hard tab, as make requires.

    Returns:
        str: The Makefile content, terminated by a newline.
    """
    variables = [
        'TARGET = cnn_trainer',
        'SRC = backprop_main.cpp backprop.cpp',
    ]
    build_rule = [
        '.PHONY: all $(TARGET) clean',
        'all:',
        '\tclang++ -O3 -Wall -std=c++11 $(SRC) -o ../bin/$(TARGET) -lm',
    ]
    clean_rule = [
        'clean:',
        '\trm -f ../bin/$(TARGET)',
    ]
    # Groups are separated by one blank line.
    groups = (variables, build_rule, clean_rule)
    return '\n\n'.join('\n'.join(group) for group in groups) + '\n'

def generate_backprop_header_code(num_layers, params):
    """Generates backprop header with fixed-size array definitions (float)."""
    GLOBAL_DIM = params['GLOBAL_DIM']

    code = "#ifndef BACKPROP_H\n#define BACKPROP_H\n\n#include <cstddef>\n#include <cmath>\n#include <iostream>\n\ntypedef float dtype_t;\n\n"

    # ** GENERATED CONSTANTS ** (Used for array sizing)
    code += f"const size_t N_CLASSES = {params['N_CLASSES']};\n"
    code += f"const size_t GLOBAL_DIM = {GLOBAL_DIM};\n\n"

    # Layer Constants
    for i in range(1, num_layers + 1):
        # I/O Dimensions
        code += f"const size_t C{i} = {params[f'C{i}']}; // Input Channels\n"
        code += f"const size_t H{i} = {params[f'H{i}']}; // Input Size (H)\n"
        code += f"const size_t M{i} = {params[f'M{i}']}; // Output Channels\n"
        code += f"const size_t E{i} = {params[f'E{i}']}; // Output Size (E)\n"

        # Kernel/Stride/Pad
        code += f"const size_t R{i} = {params[f'R{i}']};\n"
        code += f"const size_t S{i} = {params[f'S{i}']};\n"
        code += f"const size_t PAD{i} = {params[f'PAD{i}']};\n\n"

    # ** CALCULATED BUFFER SIZES (Crucial for fixed allocation) **

    # Define sizes for all intermediate layers
    for i in range(1, num_layers + 1):
        # E_size is E{i} if 1D, or E{i} * E{i} if 2D
        E_size = f" * E{i}" if GLOBAL_DIM == 2 else ""
        code += f"const size_t L{i}_FMAP_SIZE = M{i} * E{i}{E_size};\n"

    # Size of the final Conv output / FC input
    fc_input_size = f"L{num_layers}_FMAP_SIZE"

    code += f"const size_t FC_INPUT_SIZE = {fc_input_size};\n"
    code += f"const size_t FC_WEIGHT_SIZE = FC_INPUT_SIZE * N_CLASSES;\n\n"

    # --- Function Prototypes (using fixed array sizes) ---

    # Forward Pass prototype
    # Dynamically build the argument list for all W_i, B_i, O_i
    fwd_proto = "void cnn_fwd(const dtype_t I[C1 * H1 * (GLOBAL_DIM == 2 ? H1 : 1)]"
    for i in range(1, num_layers + 1):
        W_size = f" * R{i}" if GLOBAL_DIM == 2 else ""
        fwd_proto += f",\n             dtype_t W{i}[M{i} * C{i} * R{i}{W_size}], dtype_t B{i}[M{i}]"
        fwd_proto += f", dtype_t O{i}[L{i}_FMAP_SIZE]"
    fwd_proto += f",\n             dtype_t W_fc[FC_WEIGHT_SIZE], dtype_t B_fc[N_CLASSES], dtype_t O_RAW[N_CLASSES]);\n\n"
    code += fwd_proto

    # Backward Pass prototype
    bwd_proto = "void cnn_bwd(const dtype_t I[C1 * H1 * (GLOBAL_DIM == 2 ? H1 : 1)], const size_t LABEL[1],\n"
    bwd_proto += "             const dtype_t O_RAW[N_CLASSES], dtype_t dO_raw[N_CLASSES]"
    for i in range(1, num_layers + 1):
        W_size = f" * R{i}" if GLOBAL_DIM == 2 else ""
        bwd_proto += f",\n             const dtype_t W{i}[M{i} * C{i} * R{i}{W_size}]"
        bwd_proto += f", const dtype_t O{i}[L{i}_FMAP_SIZE]"
        bwd_proto += f", dtype_t dW{i}[M{i} * C{i} * R{i}{W_size}], dtype_t dB{i}[M{i}]"
    bwd_proto += f",\n             const dtype_t W_fc[FC_WEIGHT_SIZE], dtype_t dW_fc[FC_WEIGHT_SIZE], dtype_t dB_fc[N_CLASSES]);\n\n"
    code += bwd_proto

    # Utils
    code += "void update_weights(dtype_t* W, const dtype_t* dW, size_t size, dtype_t lr);\n"
    code += "void cross_entropy_softmax_bwd(size_t num_classes, const dtype_t* scores, const size_t* labels, dtype_t* dO_raw);\n"
    code += "void relu_bwd(size_t size, const dtype_t* O_store, const dtype_t* dO, dtype_t* dI);\n"

    code += "\n#endif"
    return code

def generate_load_data_function(N_CLASSES, INPUT_W):
    """Return the C++ ``load_data`` function as a list of source lines.

    The emitted function reads a whole text file, scans it with a regex for
    records of the form ``{{f1,f2,...,fN},label}``, appends the N floats to
    ``X_out``, the raw label to ``Y_out_raw`` and a one-hot row to
    ``Y_out_one_hot``. Records whose label is outside ``[0, N_CLASSES)`` are
    rolled back and skipped. It returns the number of accepted samples, or 0
    when the file cannot be opened.

    The regex is built with exactly ``INPUT_W`` float capture groups, so the
    capture indices the emitted loop reads (``match[1..INPUT_W]`` and
    ``match[INPUT_W + 1]``) always exist. The pattern used to be hard-coded
    for three features, which mis-parsed any other width. For
    ``INPUT_W == 3`` the output is unchanged.

    Args:
        N_CLASSES: Unused here; the emitted C++ takes the class count as a
            run-time argument. Kept for interface compatibility.
        INPUT_W: Number of float features per record in the data file.

    Returns:
        list[str]: Source lines; the caller joins them with newlines.
    """
    # One capture group per feature: optional sign, digits, optional
    # fraction and optional exponent.
    float_group = r'([-+]?\d+\.?\d*e?[-+]?\d*)'
    feature_groups = ','.join([float_group] * int(INPUT_W))
    # R"( ... )" is a C++ raw string literal, so the backslashes reach
    # std::regex unchanged.
    regex_line = r'    std::regex pattern(R"(\{\{' + feature_groups + r'\},(\d+)\})");'

    load_data_code = [
        '// --- DATA LOADING FUNCTION ---',
        '#include <fstream>',
        '#include <sstream>',
        '#include <vector>',
        '#include <regex>',
        '#include <cmath>',
        '#include <algorithm>\n',

        'long load_data(const std::string& file_path, std::vector<dtype_t>& X_out, std::vector<dtype_t>& Y_out_one_hot, std::vector<size_t>& Y_out_raw, int N_CLASSES, int INPUT_W) {',
        '    std::ifstream file(file_path);',
        '    if (!file.is_open()) {',
        '        // Return 0 on error, main() will handle the fatal error gracefully.',
        '        std::cerr << "ERROR: Could not open file: " << file_path << std::endl; return 0;',
        '    }',
        '    std::stringstream buffer;',
        '    buffer << file.rdbuf();',
        '    std::string content = buffer.str(); // Buffer contents for regex search',

        '    // Regex to match the pattern: {{x, y, z}, label}',
        regex_line,

        '    auto matches_begin = std::sregex_iterator(content.begin(), content.end(), pattern);',
        '    auto matches_end = std::sregex_iterator();',
        '    long num_samples = 0;',

        '    for (std::sregex_iterator i = matches_begin; i != matches_end; ++i) {',
        '        std::smatch match = *i;',
        '        // 1. Add Features (X). Start from match[1] (first float capture group).',
        '        for(int j = 1; j <= INPUT_W; ++j) {',
        '            X_out.push_back(std::stof(match[j].str()));',
        '        }',
        '        // 2. Add Label (Y_out_raw and Y_out_one_hot)',
        '        // The label (int) is captured by match[INPUT_W + 1].',
        '        int label = std::stoi(match[INPUT_W + 1].str());',
        '        Y_out_raw.push_back(label); // Store raw label',
        '        std::vector<dtype_t> one_hot(N_CLASSES, 0.0f);',
        '        if (label >= 0 && label < N_CLASSES) {',
        '            one_hot[label] = 1.0f;',
        '        } else {',
        '            std::cerr << "WARNING: Invalid Label " << label << " in file " << file_path << ". Skipping sample." << std::endl;',
        '            for(int j = 0; j < INPUT_W; ++j) { X_out.pop_back(); }',
        '            Y_out_raw.pop_back(); // Rollback raw label too',
        '            continue;',
        '        }',
        '        Y_out_one_hot.insert(Y_out_one_hot.end(), one_hot.begin(), one_hot.end());',
        '        num_samples++;',
        '    }',
        '    std::cout << "Successfully loaded " << num_samples << " samples from " << file_path << std::endl;',
        '    return num_samples;',
        '}\n'
    ]
    return load_data_code

def generate_backprop_main_code(num_layers, params):
    """Build the text of backprop_main.cpp: data loading, training loop, weight saving.

    Args:
        num_layers: Number of convolution layers; layers are numbered 1..num_layers
            in every emitted identifier (W1, B1, O1, ...).
        params: Parameter dictionary. Reads 'N_CLASSES' and 'C1' up front and
            'GLOBAL_DIM' whenever a per-layer weight size is emitted.

    Returns:
        The complete C++ source as one newline-joined string.
    """
    n_classes = params['N_CLASSES']
    # The emitted code advances the sample pointer by this many values per sample.
    # NOTE(review): cnn_fwd is declared over C1 * H1 inputs elsewhere in this file;
    # confirm that a stride of C1 matches the intended data layout.
    input_width = params['C1']

    # C++ helper sources produced by sibling generators (loader first, then saver).
    loader_lines = generate_load_data_function(n_classes, input_width)
    saver_source = generate_save_weights_function()

    layer_ids = range(1, num_layers + 1)

    def weight_count(idx):
        """Return the C expression for the number of weights of conv layer idx."""
        extra = f" * R{idx}" if params['GLOBAL_DIM'] == 2 else ""
        return f"M{idx} * C{idx} * R{idx}{extra}"

    # --- File preamble ---
    lines = [
        '#include "backprop.h"',
        '#include <stdlib.h>',
        '#include <time.h>',
        '#include <stdio.h>',
        '#include <string.h> // For memset',
        '#include <iostream> // For cout/cerr/endl',
        'using namespace std;\n',
        '// ** NOTE: Training with actual data loading from train.dat/test.dat **\n',
    ]
    lines += loader_lines
    lines.append(saver_source)

    # --- main(): hyperparameters and data loading ---
    lines += [
        'int main() {',
        '    srand(time(0));',
        '    const int RANDROOF = 100;',
        '    // 1. Training Hyperparameters',
        '    const size_t NUM_EPOCHS = 10;',
        '    const dtype_t LEARNING_RATE = 0.001f;',
        '    const size_t BATCH_SIZE = 32;',
        '    // 2. Data Loading and Setup',
        '    const std::string TRAIN_DATA_FILE = "train.dat";',
        '    const std::string TEST_DATA_FILE = "test.dat";',
        '    // Use vectors to hold data dynamically loaded from files',
        '    std::vector<dtype_t> X_train_vec, Y_train_vec_one_hot, X_test_vec, Y_test_vec_one_hot;',
        '    std::vector<size_t> Y_train_vec_raw, Y_test_vec_raw; // Raw integer labels',
        f'    long N_TRAIN_SAMPLES = load_data(TRAIN_DATA_FILE, X_train_vec, Y_train_vec_one_hot, Y_train_vec_raw, {n_classes}, {input_width});',
        f'    long N_TEST_SAMPLES = load_data(TEST_DATA_FILE, X_test_vec, Y_test_vec_one_hot, Y_test_vec_raw, {n_classes}, {input_width});',
        '    if (N_TRAIN_SAMPLES == 0) {',
        '        cerr << "FATAL ERROR: Training data could not be loaded." << endl; return 1;',
        '    }',
        '    // Pointers to the data.',
        '    dtype_t* in_data_train = X_train_vec.data();',
        '    size_t* out_data_train_raw = Y_train_vec_raw.data(); // Raw integer labels (used in cnn_bwd)',
        '    // 3. Weight and Gradient Arrays (Fixed size)',
    ]

    # --- Weight / bias storage ---
    for k in layer_ids:
        lines += [
            f'    dtype_t W{k}[{weight_count(k)}];',
            f'    dtype_t B{k}[M{k}];',
        ]
    lines += [
        '    dtype_t W_fc[FC_WEIGHT_SIZE];',
        '    dtype_t B_fc[N_CLASSES];\n',
    ]

    # --- Gradient storage ---
    lines.append('    // 4. Gradient Arrays (Must be cleared per batch)')
    for k in layer_ids:
        lines += [
            f'    dtype_t dW{k}[{weight_count(k)}];',
            f'    dtype_t dB{k}[M{k}];',
        ]
    lines += [
        '    dtype_t dW_fc[FC_WEIGHT_SIZE];',
        '    dtype_t dB_fc[N_CLASSES];',
        '    dtype_t dO_raw[N_CLASSES];\n',
    ]

    # --- Activation buffers shared by the forward and backward passes ---
    lines.append('    // 5. Intermediate Buffers (Activation stores)')
    lines += [f'    dtype_t O{k}[L{k}_FMAP_SIZE];' for k in layer_ids]
    lines.append('    dtype_t O_RAW[N_CLASSES];\n')

    # --- Random initialisation of every weight and bias ---
    lines += [
        '    // Initialize weights only',
        '    for (size_t i = 0; i < FC_WEIGHT_SIZE; ++i) W_fc[i] = (dtype_t)(rand() % RANDROOF) / RANDROOF;',
        '    for (size_t i = 0; i < N_CLASSES; ++i) B_fc[i] = (dtype_t)(rand() % RANDROOF) / RANDROOF;',
    ]
    for k in layer_ids:
        lines += [
            f"    for (size_t i = 0; i < ({weight_count(k)}); ++i) W{k}[i] = (dtype_t)(rand() % RANDROOF) / RANDROOF;",
            f"    for (size_t i = 0; i < M{k}; ++i) B{k}[i] = (dtype_t)(rand() % RANDROOF) / RANDROOF;",
        ]

    lines.append('\n    cout << "Starting training with " << N_TRAIN_SAMPLES << " samples for " << NUM_EPOCHS << " epochs..." << endl;')

    # --- Epoch loop and batch loop headers ---
    lines += [
        '    for (size_t epoch = 0; epoch < NUM_EPOCHS; ++epoch) {',
        '        // Loop over training samples in batches',
        '        for (long i = 0; i < N_TRAIN_SAMPLES; i += BATCH_SIZE) {',
        '            // Determine the actual size of the current batch (for the last one)',
        '            size_t current_batch_size = (i + BATCH_SIZE <= N_TRAIN_SAMPLES) ? BATCH_SIZE : (N_TRAIN_SAMPLES - i);',
        '            // Clear gradients for the batch',
    ]
    for k in layer_ids:
        lines += [
            f'            memset(dW{k}, 0, sizeof(dtype_t) * ({weight_count(k)}));',
            f'            memset(dB{k}, 0, sizeof(dB{k}));',
        ]
    lines += [
        '            memset(dW_fc, 0, sizeof(dW_fc));',
        '            memset(dB_fc, 0, sizeof(dB_fc));\n',
    ]

    # --- Per-sample forward and backward passes ---
    lines += [
        '            // The following loop processes each sample in the batch:',
        '            for (size_t sample = 0; sample < current_batch_size; ++sample) {',
        '                // Calculate pointer to the current sample and its RAW integer label',
        f'                dtype_t* current_sample_I1 = in_data_train + (i + sample) * {input_width};',
        '                size_t* current_sample_LABEL = out_data_train_raw + (i + sample);',
    ]

    fwd_layer_args = ''.join(f', W{k}, B{k}, O{k}' for k in layer_ids)
    lines.append(
        '                // 1. Forward Pass\n'
        f'                cnn_fwd(current_sample_I1{fwd_layer_args}, W_fc, B_fc, O_RAW);'
    )

    bwd_layer_args = ''.join(f', W{k}, O{k}, dW{k}, dB{k}' for k in layer_ids)
    lines.append(
        '                // 2. Backward Pass (Accumulates gradients, expects size_t* label)\n'
        f'                cnn_bwd(current_sample_I1, current_sample_LABEL, O_RAW, dO_raw{bwd_layer_args}, W_fc, dW_fc, dB_fc);'
    )
    lines.append('            }\n')  # closes the per-sample loop

    # --- One weight update per batch, FC first and then conv layers last-to-first ---
    lines += [
        '            // 3. Update Weights (Simple SGD on accumulated gradients)',
        '            update_weights(W_fc, dW_fc, FC_WEIGHT_SIZE, LEARNING_RATE);',
        '            update_weights(B_fc, dB_fc, N_CLASSES, LEARNING_RATE);',
    ]
    for k in reversed(layer_ids):
        lines += [
            f'            update_weights(W{k}, dW{k}, ({weight_count(k)}), LEARNING_RATE);',
            f'            update_weights(B{k}, dB{k}, M{k}, LEARNING_RATE);',
        ]
    lines += [
        '        }\n',  # closes the batch loop
        '        printf("Epoch %zu complete. (Testing/Evaluation not yet implemented)\\n", epoch + 1);',
        '    }',  # closes the epoch loop
        '\n    cout << "Training finished. Weights updated." << endl;',
    ]

    # --- Persist every trained tensor to its own binary file ---
    lines += [
        '\n    // --- SAVE FINAL WEIGHTS ---',
        '    cout << "Saving final model weights to binary files..." << endl;',
    ]
    for k in layer_ids:
        lines += [
            f'    save_weights("W{k}.bin", W{k}, {weight_count(k)});',
            f'    save_weights("B{k}.bin", B{k}, M{k});',
        ]
    lines += [
        '    save_weights("W_fc.bin", W_fc, FC_WEIGHT_SIZE);',
        '    save_weights("B_fc.bin", B_fc, N_CLASSES);',
        '\n    return 0;\n}',
    ]
    return '\n'.join(lines)

def generate_backprop_cpp_code(num_layers, params):
    """Generate the text of backprop.cpp (float training kernels) for an N-layer CNN.

    The emitted file contains the softmax/cross-entropy gradient, ReLU forward
    and backward, an SGD update, the fully connected and 1D convolution
    forward/backward kernels, and the cnn_fwd / cnn_bwd wrappers that chain
    layers 1..num_layers. 2D convolution is emitted as a placeholder only.

    Fixes relative to the previous generator output:
      * fc_layer_bwd accumulates dB_fc (+=) like dW_fc, so every sample of a
        batch contributes to the FC bias gradient instead of only the last one.
      * cnn_bwd zeroes its local dI buffers before conv_1d_bwd accumulates
        into them (they were previously read uninitialised).
      * The softmax gradient subtracts the maximum score before expf, which is
        mathematically equivalent but avoids overflow for large scores.

    Args:
        num_layers: Number of convolution layers (numbered from 1).
        params: Parameter dictionary; only 'GLOBAL_DIM' is read here, to decide
            whether weight array sizes carry an extra "* R" factor.

    Returns:
        The complete C++ source as one newline-joined string.
    """
    is_2d = params['GLOBAL_DIM'] == 2
    layer_ids = range(1, num_layers + 1)

    def weight_count(idx):
        """Return the C expression for the number of weights of conv layer idx."""
        return f"M{idx} * C{idx} * R{idx}" + (f" * R{idx}" if is_2d else "")

    code = ['#include "backprop.h"\n']

    # --- Utility Functions ---
    code.extend([
        '// Softmax Cross-Entropy Loss Derivative (dL/dO_raw)',
        'void cross_entropy_softmax_bwd(size_t num_classes, const dtype_t* scores, const size_t* labels, dtype_t* dO_raw) {',
        '    // Shift by the largest score so expf cannot overflow; softmax is unchanged by the shift.',
        '    dtype_t max_score = scores[0];',
        '    for (size_t k = 1; k < num_classes; ++k) { if (scores[k] > max_score) { max_score = scores[k]; } }',
        '    dtype_t exp_sum = 0.0f;',
        '    for (size_t k = 0; k < num_classes; ++k) { exp_sum += expf(scores[k] - max_score); }',
        '    ',
        '    for (size_t k = 0; k < num_classes; ++k) {',
        '        dtype_t softmax_k = expf(scores[k] - max_score) / exp_sum;',
        '        dO_raw[k] = softmax_k;',
        '        // Subtract 1 from the correct class (the derivative of the loss)',
        '        if (k == labels[0]) { dO_raw[k] -= 1.0f; }',
        '    }',
        '}',

        '\n// Simple ReLU Forward (modifies array in place)',
        'void relu_fwd(size_t size, dtype_t* x) {',
        '    for (size_t i = 0; i < size; ++i) { x[i] = (x[i] > 0.0f) ? x[i] : 0.0f; }',
        '}',

        '\n// ReLU Backward (dL/dI = dL/dO * mask)',
        'void relu_bwd(size_t size, const dtype_t* O_store, const dtype_t* dO, dtype_t* dI) {',
        '    for (size_t i = 0; i < size; ++i) {',
        '        // O_store is the stored layer output; it is > 0 exactly where ReLU passed its input through.',
        '        dI[i] = (O_store[i] > 0.0f) ? dO[i] : 0.0f;',
        '    }',
        '}',

        '\n// Weight Update (Simple SGD)',
        'void update_weights(dtype_t* W, const dtype_t* dW, size_t size, dtype_t lr) {',
        '    for (size_t i = 0; i < size; ++i) { W[i] -= lr * dW[i]; }',
        '}',
    ])

    # --- Fully connected layer ---
    code.extend([
        '\n// ** Fully Connected Layer Forward **',
        "void fc_layer_fwd(const dtype_t input[FC_INPUT_SIZE], const dtype_t W_fc[FC_WEIGHT_SIZE], const dtype_t B_fc[N_CLASSES], dtype_t output[N_CLASSES]) {",
        " \tfor (size_t k = 0; k < N_CLASSES; k++) {",
        " \t\toutput[k] = B_fc[k];",
        " \t\tfor (size_t i = 0; i < FC_INPUT_SIZE; i++) {",
        " \t\t\toutput[k] += input[i] * W_fc[i + k * FC_INPUT_SIZE];",
        " \t\t}",
        " \t}",
        "}",

        '\n// ** Fully Connected Layer Backward **',
        "void fc_layer_bwd(const dtype_t input[FC_INPUT_SIZE], const dtype_t W_fc[FC_WEIGHT_SIZE], const dtype_t dO_raw[N_CLASSES],\n",
        "                  dtype_t dI[FC_INPUT_SIZE], dtype_t dW_fc[FC_WEIGHT_SIZE], dtype_t dB_fc[N_CLASSES]) {",
        # Accumulate (+=): the caller clears dB_fc once per batch and then calls
        # this once per sample, exactly as it does for dW_fc below.
        "    // dB: accumulates dO_raw (dL/dO of FC layer) over the samples of a batch",
        "    for (size_t k = 0; k < N_CLASSES; ++k) { dB_fc[k] += dO_raw[k]; }",
        "    // dW: input * dO_raw (dL/dW = I * dL/dO)",
        "    for (size_t k = 0; k < N_CLASSES; ++k) {",
        "        for (size_t i = 0; i < FC_INPUT_SIZE; ++i) {",
        "            dW_fc[i + k * FC_INPUT_SIZE] += input[i] * dO_raw[k];",
        "        }",
        "    }",
        "    // dI: W * dO_raw (dL/dI = W * dL/dO), clears dI first",
        "    for (size_t i = 0; i < FC_INPUT_SIZE; ++i) { dI[i] = 0.0f; }",
        "    for (size_t k = 0; k < N_CLASSES; ++k) {",
        "        for (size_t i = 0; i < FC_INPUT_SIZE; ++i) {",
        "            dI[i] += W_fc[i + k * FC_INPUT_SIZE] * dO_raw[k];",
        "        }",
        "    }",
        "}",
    ])

    # --- 1D convolution ---
    # In the emitted kernels h2 is a size_t: when a tap lands in the left padding
    # the subtraction wraps to a huge value, so the single (h2 < H) test rejects
    # out-of-range taps on both sides.
    code.extend([
        '\n// ** CONV 1D Forward **',
        "void conv_1d_fwd(size_t H, size_t C, size_t R, size_t M, size_t S, size_t PAD,\n",
        "                 const dtype_t* I, const dtype_t* W, const dtype_t* B, dtype_t* O) {",
        "    size_t E = (H - R + 2 * PAD) / S + 1;",
        " \tfor(size_t m = 0; m < M; m++) {",
        " \t\tfor(size_t x = 0; x < E; x++) {",
        " \t\t\tO[x + m * E] = B[m];",
        " \t\t\tfor(size_t c = 0; c < C; c++) {",
        " \t\t\t\tfor(size_t l = 0; l < R; l++) {",
        " \t\t\t\t\tsize_t h2 = x * S - PAD + l;",
        " \t\t\t\t\tif (h2 < H) {",
        " \t\t\t\t\t\tO[x + m * E] += I[h2 + c * H] * W[l + c * R + m * C * R];",
        " \t\t\t\t\t}",
        " \t\t\t\t}",
        " \t\t\t}",
        " \t\t}",
        " \t}",
        "}",

        '\n// ** CONV 1D Backward (Simplified) **',
        "void conv_1d_bwd(size_t H, size_t C, size_t R, size_t M, size_t S, size_t PAD,\n",
        "                 const dtype_t* I, const dtype_t* dO, const dtype_t* W,\n",
        "                 dtype_t* dI, dtype_t* dW, dtype_t* dB) {",
        "    size_t E = (H - R + 2 * PAD) / S + 1;",
        "    // dW (Gradient w.r.t Weights) - Accumulates",
        "    for (size_t m = 0; m < M; ++m) {",
        "        for (size_t c = 0; c < C; ++c) {",
        "            for (size_t l = 0; l < R; ++l) {",
        "                dtype_t grad_w = 0.0f;",
        "                for (size_t x = 0; x < E; ++x) {",
        "                    size_t h2 = x * S - PAD + l;",
        "                    if (h2 < H) {",
        "                        grad_w += I[h2 + c * H] * dO[x + m * E];",
        "                    }",
        "                }",
        "                dW[l + c * R + m * C * R] += grad_w;",
        "            }",
        "        }",
        "    }",
        "    // dB (Gradient w.r.t Biases) - Accumulates",
        "    for (size_t m = 0; m < M; ++m) {",
        "        dtype_t grad_b = 0.0f;",
        "        for (size_t x = 0; x < E; ++x) {",
        "            grad_b += dO[x + m * E];",
        "        }",
        "        dB[m] += grad_b;",
        "    }",
        "    // dI (Gradient w.r.t Input) - Accumulates on existing dI (the caller must zero dI first)",
        "    for (size_t m = 0; m < M; ++m) {",
        "        for (size_t c = 0; c < C; ++c) {",
        "            for (size_t l = 0; l < R; ++l) {",
        "                for (size_t x = 0; x < E; ++x) {",
        "                    size_t h2 = x * S - PAD + l;",
        "                    if (h2 < H) {",
        "                        dI[h2 + c * H] += W[l + c * R + m * C * R] * dO[x + m * E];",
        "                    }",
        "                }",
        "            }",
        "        }",
        "    }",
        "}",
    ])

    # Conv 2D Forward/Backward (placeholders so the headers stay consistent)
    code.append('\n// ** CONV 2D Forward/Backward (Placeholders) **')
    code.append('// Full 2D logic must be implemented here if GLOBAL_DIM == 2 is used.')

    # --- WRAPPER FWD (N-Layer Support) ---
    code.append('\n// ** WRAPPER FWD (For x86 Training) **')

    fwd_signature = "void cnn_fwd(const dtype_t I[C1 * H1 * (GLOBAL_DIM == 2 ? H1 : 1)]"
    for i in layer_ids:
        fwd_signature += f",\n             dtype_t W{i}[{weight_count(i)}], dtype_t B{i}[M{i}]"
        fwd_signature += f", dtype_t O{i}[L{i}_FMAP_SIZE]"
    fwd_signature += ",\n             dtype_t W_fc[FC_WEIGHT_SIZE], dtype_t B_fc[N_CLASSES], dtype_t O_RAW[N_CLASSES])\n{"
    code.append(fwd_signature)

    # Chain the layers: layer i consumes the previous layer's output buffer.
    for i in layer_ids:
        I_name = "I" if i == 1 else f"O{i-1}"
        H_name = "H1" if i == 1 else f"E{i-1}"

        code.extend([
            f'    // --- LAYER {i}: CONV + ReLU ---',
            ' #if GLOBAL_DIM == 1',
            f' \tconv_1d_fwd({H_name}, C{i}, R{i}, M{i}, S{i}, PAD{i}, {I_name}, W{i}, B{i}, O{i});',
            f' \trelu_fwd(L{i}_FMAP_SIZE, O{i}); // O{i} now holds ReLU output',
            ' #else',
            ' \t// conv_2d_fwd(...)',
            ' #endif\n',
        ])

    code.append('    // --- FINAL LAYER: FC ---')
    code.append(f' \tfc_layer_fwd(O{num_layers}, W_fc, B_fc, O_RAW);')
    code.append('}')

    # --- WRAPPER BWD (N-Layer Support) ---
    code.append('\n// ** WRAPPER BWD (For x86 Training) **')

    bwd_signature = "void cnn_bwd(const dtype_t I[C1 * H1 * (GLOBAL_DIM == 2 ? H1 : 1)], const size_t LABEL[1],\n"
    bwd_signature += "             const dtype_t O_RAW[N_CLASSES], dtype_t dO_raw[N_CLASSES]"
    for i in layer_ids:
        bwd_signature += f",\n             const dtype_t W{i}[{weight_count(i)}]"
        bwd_signature += f", const dtype_t O{i}[L{i}_FMAP_SIZE]"
        bwd_signature += f", dtype_t dW{i}[{weight_count(i)}], dtype_t dB{i}[M{i}]"
    bwd_signature += ",\n             const dtype_t W_fc[FC_WEIGHT_SIZE], dtype_t dW_fc[FC_WEIGHT_SIZE], dtype_t dB_fc[N_CLASSES])\n{"
    code.append(bwd_signature)

    code.append('    // --- 1. Softmax/Loss Backward (Calculates dL/dO_raw) ---')
    code.append('    cross_entropy_softmax_bwd(N_CLASSES, O_RAW, LABEL, dO_raw);\n')

    # Gradient buffers: dI_fc is cleared inside fc_layer_bwd, while the conv
    # buffers are accumulated into by conv_1d_bwd and must start at zero.
    input_extent = "C1 * H1 * (GLOBAL_DIM == 2 ? H1 : 1)"
    code.append('    // Gradient buffers for chaining BWD passes')
    code.append('    dtype_t dI_fc[FC_INPUT_SIZE];')  # gradient from FC to layer N
    for i in range(2, num_layers + 1):
        code.append(f'    dtype_t dI{i}[L{i-1}_FMAP_SIZE];')  # gradient from layer i to layer i-1
    code.append(f'    dtype_t dI1[{input_extent}]; // Gradient from L1 to Input')
    code.append('    // conv_1d_bwd accumulates into dI, so every conv gradient buffer starts at zero')
    for i in range(2, num_layers + 1):
        code.append(f'    for (size_t z = 0; z < L{i-1}_FMAP_SIZE; ++z) {{ dI{i}[z] = 0.0f; }}')
    code.append(f'    for (size_t z = 0; z < {input_extent}; ++z) {{ dI1[z] = 0.0f; }}\n')

    # Start of the backward pass: fully connected layer.
    code.append('    // --- 2. FC Backward (Calculates dW_fc, dB_fc, and dI_fc) ---')
    code.append(f'    fc_layer_bwd(O{num_layers}, W_fc, dO_raw, dI_fc, dW_fc, dB_fc); // O{num_layers} is the input to FC\n')

    # Convolution layers in reverse order.
    for i in range(num_layers, 0, -1):
        I_name = "I" if i == 1 else f"O{i-1}"
        H_name = "H1" if i == 1 else f"E{i-1}"

        # Gradient arriving at layer i, and gradient handed on to layer i-1 (or the input).
        dO_name = "dI_fc" if i == num_layers else f"dI{i+1}"
        dI_name = "dI1" if i == 1 else f"dI{i}"

        code.extend([
            f'    // --- ReLU Backward (Layer {i}) ---',
            f'    relu_bwd(L{i}_FMAP_SIZE, O{i}, {dO_name}, {dO_name}); // {dO_name} now holds the dL/dO of Conv {i}',
            f'    // --- Conv Backward (Layer {i}) ---',
            ' #if GLOBAL_DIM == 1',
            f' \tconv_1d_bwd({H_name}, C{i}, R{i}, M{i}, S{i}, PAD{i}, {I_name}, {dO_name}, W{i}, {dI_name}, dW{i}, dB{i});',
            ' #else',
            ' \t// conv_2d_bwd(...)',
            ' #endif\n',
        ])

    code.append('}')

    return '\n'.join(code)


# --- 3. Parameter Calculation and GUI Setup (Unchanged) ---

def calculate_output_size(H_in, R, S, P):
    """Return the layer output size E, defined as ceil(H_in / S).

    Only the input size H_in and the stride S take part in the result; the
    kernel size R and padding P are accepted but not used.
    """
    ratio = H_in / S
    return math.ceil(ratio)

def generate_parameter_widgets(num_layers):
    """Create the parameter input form for a network with num_layers conv layers.

    Returns:
        A 6-tuple (params_vbox, layer_widgets, c1_widget, h1_widget,
        n_classes_widget, global_dim_widget), where params_vbox is the VBox
        holding the whole form and layer_widgets maps each layer index
        (1..num_layers) to its (M, R, S) widget triple. The disabled
        "Bias ON" widget is displayed but not returned.
    """

    def wide_label():
        # A fresh style dict per widget, as in the original one-literal-per-call form.
        return {'description_width': 'initial'}

    # NOTE(review): min/max are passed to IntText exactly as before; confirm the
    # widget class actually enforces them.
    # --- Global configuration ---
    n_classes_widget = widgets.IntText(value=6, description='N_CLASSES:', min=1, style=wide_label())
    global_dim_widget = widgets.Dropdown(
        options=[(1, 1), (2, 2)],
        value=1,
        description='GLOBAL DIMENSION:',
        style=wide_label(),
    )
    bias_on_widget = widgets.IntText(
        value=1,
        description='Bias ON (1=Yes):',
        min=1,
        max=1,
        style=wide_label(),
        disabled=True,
    )

    # --- Layer 1 input feature map (H1 defaults to an example length of 100) ---
    c1_widget = widgets.IntText(value=3, description='Input C1 (Channels):', min=1, style=wide_label())
    h1_widget = widgets.IntText(value=100, description='Input H1 (Size: Seq Length):', min=1, style=wide_label())

    sections = [
        widgets.HTML(value="<h3>Global Configuration:</h3>"),
        global_dim_widget,
        widgets.HBox([n_classes_widget, bias_on_widget]),
        widgets.HTML(value="<h3>Input Feature Map (Layer 1 Input):</h3>"),
        c1_widget,
        h1_widget,
        widgets.HTML(value=f"<h3>Convolution Layers (1 to {num_layers}):</h3>"),
    ]

    layer_widgets = {}
    for idx in range(1, num_layers + 1):
        # Defaults: layer 1 is M=16, R=3, S=2; later layers are M=32, R=3, S=1.
        # Per the original author's note, the stride of 2 stands in for a
        # MaxPool(k=2, s=2) when computing the output size.
        if idx == 1:
            out_channels, kernel, stride = 16, 3, 2
        else:
            out_channels, kernel, stride = 32, 3, 1

        m_field = widgets.IntText(value=out_channels, description=f'L{idx} M (Ch Out):', min=1, style=wide_label())
        r_field = widgets.IntText(value=kernel, description=f'L{idx} R (Kernel):', min=1, style=wide_label())
        s_field = widgets.IntText(value=stride, description=f'L{idx} S (Stride):', min=1, style=wide_label())
        layer_widgets[idx] = (m_field, r_field, s_field)

        heading = widgets.HTML(value=f"<h4>Conv Layer {idx}</h4>")
        field_row = widgets.HBox([m_field, r_field, s_field])
        sections.append(widgets.VBox([heading, field_row]))

    params_vbox = widgets.VBox(sections)
    return params_vbox, layer_widgets, c1_widget, h1_widget, n_classes_widget, global_dim_widget

def collect_and_calculate_params(num_layers, layer_widgets, c1_widget, h1_widget, n_classes_widget, global_dim_widget):
    """Read the widget values and derive every per-layer parameter.

    For each layer i (1..num_layers) the result holds the user-entered M{i},
    R{i}, S{i} plus the derived C{i}/H{i} (input channels/size, chained from
    the previous layer), E{i} (output size), F{i} (padded size) and PAD{i}.

    All sizes are validated to be >= 1 before they are used, so a zero or
    negative entry is reported as a ValueError instead of a ZeroDivisionError
    or silently producing negative sizes.

    Args:
        num_layers: Number of convolution layers.
        layer_widgets: Mapping of layer index to its (M, R, S) widgets; each
            widget only needs a ``value`` attribute.
        c1_widget, h1_widget: Widgets holding the input channels and input size.
        n_classes_widget: Widget holding the number of output classes.
        global_dim_widget: Widget holding the global dimension (1 or 2).

    Returns:
        dict mapping parameter names to integer values.

    Raises:
        ValueError: If any size is below 1, a kernel is larger than its input
            (when that input is larger than 1), or a 2D kernel size is even.
    """

    params = {}

    # Global Parameters
    params['N_CLASSES'] = n_classes_widget.value
    params['GLOBAL_DIM'] = global_dim_widget.value
    GLOBAL_DIM = params['GLOBAL_DIM']

    # Initial Input (Layer 1 Input)
    params['C1'] = c1_widget.value
    params['H1'] = h1_widget.value

    # Layer-independent validation, done once instead of on every iteration.
    if params['N_CLASSES'] < 1:
        raise ValueError("N_CLASSES must be 1 or greater.")
    for key in ('C1', 'H1'):
        if params[key] < 1:
            raise ValueError(f"Input {key} must be 1 or greater.")

    H_prev = params['H1']
    M_prev = params['C1']

    for i in range(1, num_layers + 1):
        # M_i, R_i, S_i are retrieved from the widgets
        m_i, r_i, s_i = layer_widgets[i]

        params[f'M{i}'] = m_i.value
        params[f'R{i}'] = r_i.value
        params[f'S{i}'] = s_i.value

        # Reject non-positive sizes before any arithmetic divides by the stride.
        for prefix in ('M', 'R', 'S'):
            if params[f'{prefix}{i}'] < 1:
                raise ValueError(f"Layer {i}: {prefix}{i} must be 1 or greater.")

        # --- Calculate Input C_i and H_i for Layer i ---
        params[f'C{i}'] = M_prev
        params[f'H{i}'] = H_prev

        H_curr = params[f'H{i}']

        # Calculated Output Size (E_i)
        params[f'E{i}'] = calculate_output_size(H_curr, params[f'R{i}'], params[f'S{i}'], 0)

        # Calculated Padding and Padded Size
        E_curr = params[f'E{i}']
        F_i = (E_curr * params[f'S{i}'] + params[f'R{i}'] - 1)
        params[f'F{i}'] = F_i

        # Padding is calculated assuming symmetric padding to enforce output size E_i
        params[f'PAD{i}'] = (F_i - H_curr) // 2

        # --- Setup for next layer (i+1) ---
        H_prev = E_curr
        M_prev = params[f'M{i}']

        # Validation
        if params[f'R{i}'] > H_curr and H_curr > 1:
            raise ValueError(f"Layer {i}: Kernel size R{i} must be less than or equal to input size H{i}.")

        # 2D Specific Validation
        if GLOBAL_DIM == 2 and params[f'R{i}'] % 2 == 0:
            raise ValueError(f"Layer {i}: 2D kernel R{i} must be odd to ensure symmetric padding in HLS style (R=3, 5, 7...).")

    return params


# --- 4. GUI and Execution ---

# Top-level controls: layer-count slider, generate button, output pane and the
# container whose children are swapped whenever the layer count changes.
layer_slider = widgets.IntSlider(
    value=1,
    min=1,
    max=5,
    step=1,
    description='CNN Layers:',
    continuous_update=False,
    style={'description_width': 'initial'},
)
generate_button = widgets.Button(
    description='Generate Code (HLS INF + x86 BKP)',
    button_style='success',
)
output_area = widgets.Output()
code_controls_vbox = widgets.VBox()

# Build the parameter form for the slider's starting value and show it.
(
    param_vbox,
    layer_widgets_map,
    c1_input,
    h1_input,
    n_classes_input,
    global_dim_input,
) = generate_parameter_widgets(layer_slider.value)
code_controls_vbox.children = (param_vbox,)

def update_widgets(change):
    """Rebuild the parameter form for the slider's current layer count.

    The ``change`` argument supplied by the observer is not inspected; the
    layer count is read from ``layer_slider`` directly. The module-level
    widget references are rebound to the freshly created widgets.
    """
    global layer_widgets_map, c1_input, h1_input, n_classes_input, global_dim_input

    layer_count = layer_slider.value
    rebuilt = generate_parameter_widgets(layer_count)
    fresh_form = rebuilt[0]
    layer_widgets_map, c1_input, h1_input, n_classes_input, global_dim_input = rebuilt[1:]

    # Swap the displayed form for the new one.
    code_controls_vbox.children = (fresh_form,)

    with output_area:
        clear_output()
        notice = f"Parameters updated for **{layer_count} layers**. Click 'Generate Code'."
        display(Markdown(notice))

# Rebuild the parameter form whenever the slider's 'value' trait changes.
layer_slider.observe(update_widgets, names='value')

def on_button_click(b):
    """Button handler: collect parameters, write all generated files, show a summary.

    Everything is rendered inside ``output_area``. A ValueError raised during
    parameter collection or generation is shown as a plain error; any other
    exception is shown as an unexpected error. The button argument is unused.
    """
    with output_area:
        clear_output()

        depth = layer_slider.value

        try:
            # 1. Collect and calculate parameters
            params = collect_and_calculate_params(
                depth, layer_widgets_map, c1_input, h1_input, n_classes_input, global_dim_input
            )
            dim = params['GLOBAL_DIM']

            # 2. Output folder layout: <root>/src, <root>/bin, <root>/backprop
            root_folder = f'./repo/generatedCNN_{depth}Layers_{dim}D'
            source_folder = os.path.join(root_folder, 'src')
            bin_folder = os.path.join(root_folder, 'bin')
            backprop_folder = os.path.join(root_folder, 'backprop')
            for folder in (source_folder, bin_folder, backprop_folder):
                os.makedirs(folder, exist_ok=True)

            # 3./4. (target path, content builder) pairs: HLS files first, then
            # the backprop files. Each builder runs only after its file is open,
            # matching the original open-then-generate order.
            outputs = [
                (f'{source_folder}/Makefile{depth}', lambda: generate_makefile_code(depth)),
                (f'{source_folder}/conv_tb{depth}.cpp', lambda: generate_testbench_code(depth, params)),
                (f'{source_folder}/conv_tb{depth}.h', lambda: generate_testbench_header_code(depth)),
                (f'{source_folder}/conv{depth}.h', lambda: generate_convh_code(depth, params)),
                (f'{source_folder}/conv{depth}.cpp', lambda: generate_conv_code(depth, params)),
                (f'{backprop_folder}/Makefile', lambda: generate_backprop_makefile_code()),
                (f'{backprop_folder}/backprop.h', lambda: generate_backprop_header_code(depth, params)),
                (f'{backprop_folder}/backprop_main.cpp', lambda: generate_backprop_main_code(depth, params)),
                (f'{backprop_folder}/backprop.cpp', lambda: generate_backprop_cpp_code(depth, params)),
            ]
            for target, build in outputs:
                with open(target, 'w') as handle:
                    handle.write(build())

            display(Markdown(f'## ✅ Success! All files for **{depth} {dim}D layers** generated.'))
            display(Markdown(f'HLS Inference code in `{source_folder}`. **x86 Training code in `{backprop_folder}`.**'))

            # Summary table of the calculated parameters
            def extent(size):
                """Format a size as 'NxN' in 2D mode and 'N' otherwise."""
                return f"{size}x{size}" if dim == 2 else f"{size}"

            rows = [
                "| Layer | Type | D (Dim) | C (In) | H (In) | M (Out) | R (K) | S (Str) | E (Out) | Activation |",
                "|---|---|---|---|---|---|---|---|---|---|",
                f"| Input | Image | **{dim}** | {params['C1']} | {extent(params['H1'])} | N/A | N/A | N/A | N/A | No |",
            ]
            for j in range(1, depth + 1):
                h_cell = extent(params[f'H{j}'])
                e_cell = extent(params[f'E{j}'])
                rows.append(
                    f"| {j} | CONV | **{dim}** | {params[f'C{j}']} | {h_cell} | {params[f'M{j}']} | "
                    f"{params[f'R{j}']} | {params[f'S{j}']} | {e_cell} | **ReLU** |"
                )

            # Flattened input size of the FC layer: M * E, times E again in 2D.
            last_extent = params[f'E{depth}']
            input_size = params[f'M{depth}'] * last_extent
            if dim == 2:
                input_size *= last_extent

            rows.append(
                f"| N+1 | **FC** | N/A | {input_size} (flat) | N/A | **{params['N_CLASSES']}** | N/A | N/A | N/A | **Softmax** |"
            )

            display(Markdown('### Calculated CNN Parameters\n' + '\n'.join(rows)))

            display(Markdown('### Execution Instructions (SRC/BIN/BACKPROP Structure)'))
            steps = [
                "1. **HLS Inference Target (SRC Folder):**",
                f"   * Navigate to the Source: `cd {source_folder}`",
                f"   * Compile: `make -f Makefile{depth} conv{depth}`",
                f"   * Run: Go to the parent directory: `cd ../` and execute: `./bin/conv{depth}`",
                "",
                "2. **x86 Training Target (BACKPROP Folder):**",
                f"   * Navigate to Backprop: `cd {backprop_folder}`",
                "   * Compile (Use the correct target!): `make all`",
                "   * Run: Go to the parent directory: `cd ../` and execute: `./bin/cnn_trainer`",
                "",
            ]
            display(Markdown('\n'.join(steps)))

        except ValueError as err:
            display(Markdown(f'## ❌ Error: {err}'))

        except Exception as err:
            display(Markdown(f'## ❌ An unexpected error occurred: {err}'))


# Run the code generator each time the button is clicked.
generate_button.on_click(on_button_click)

# FINAL DISPLAY: render the slider, the parameter form container, the button
# and the output pane once; later updates happen inside these widgets.
display(layer_slider, code_controls_vbox, generate_button, output_area)



IntSlider(value=1, continuous_update=False, description='CNN Layers:', max=5, min=1, style=SliderStyle(descrip…

VBox(children=(VBox(children=(HTML(value='<h3>Global Configuration:</h3>'), Dropdown(description='GLOBAL DIMEN…

Button(button_style='success', description='Generate Code (HLS INF + x86 BKP)', style=ButtonStyle())

Output()