# PURE PYTHON AND PROFILING

In [None]:
import math
import time # For profiling

# --- Toy Model Dimensions (same as before) ---
# Length of the latent vector consumed by the decoder's fully connected layer.
LATENT_DIM = 4
# Number of output channels of the decoder (SimpleDecoder.n_output_channels).
N_MELS = 2
# Frame count from which INITIAL_FRAMES is derived.
FIXED_FRAMES = 4
# Frames per channel right after the FC output is reshaped (4 // 4 == 1).
INITIAL_FRAMES = FIXED_FRAMES // 4
# Channels right after the FC output is reshaped; also Conv1's input channels.
INITIAL_CHANNELS = 4
# Output channel counts of the first and second convolution layers.
CONV1_OUT_CHANNELS = 2
CONV2_OUT_CHANNELS = 1

# --- Helper Functions / Basic Layers (structure same as before) ---
# SimpleLinear, SimpleReLU, SimpleUpsampleNearest1D, SimpleConv1d.
# Their internal logic is unchanged and they contain no timing code, which
# keeps them clean; timing is measured around their forward() calls instead.

class SimpleLinear:
    """Fully connected layer computing ``W * x + b`` on plain Python lists.

    Args:
        input_dim: Length of the input vector.
        output_dim: Length of the output vector.
        fixed_weights: Optional ``output_dim x input_dim`` nested list. When
            omitted (or empty) a deterministic ramp of weights is generated.
        fixed_bias: Optional list of ``output_dim`` biases. When omitted (or
            empty) a deterministic ramp of biases is generated.

    Raises:
        ValueError: If the supplied weights or bias do not match the
            requested dimensions.
    """

    def __init__(self, input_dim, output_dim, fixed_weights=None, fixed_bias=None):
        self.input_dim = input_dim
        self.output_dim = output_dim
        if fixed_weights:
            # Check every row, not only the first one, so that a ragged
            # matrix is rejected here instead of surfacing later in forward().
            bad_row = next((row for row in fixed_weights if len(row) != input_dim), None)
            if len(fixed_weights) != output_dim or bad_row is not None:
                got_cols = len(fixed_weights[0]) if bad_row is None else len(bad_row)
                raise ValueError(f"Fixed weights shape mismatch for Linear. Expected ({output_dim}x{input_dim}), got ({len(fixed_weights)}x{got_cols})")
            self.weights = fixed_weights
        else:
            self.weights = [[(0.01 * (r*input_dim + c + 1)) for c in range(input_dim)] for r in range(output_dim)]
        if fixed_bias:
            if len(fixed_bias) != output_dim:
                raise ValueError(f"Fixed bias shape mismatch for Linear. Expected ({output_dim}), got ({len(fixed_bias)})")
            self.bias = fixed_bias
        else:
            self.bias = [(0.01 * (i + 1)) for i in range(output_dim)]

    def forward(self, input_vector):
        """Return the layer output for ``input_vector`` as a new list.

        Raises:
            ValueError: If ``input_vector`` does not have ``input_dim`` entries.
        """
        if len(input_vector) != self.input_dim:
            raise ValueError(f"Expected input_vector of dimension {self.input_dim}, got {len(input_vector)}")
        # Start from a copy of the bias so self.bias is never mutated.
        output_vector = list(self.bias)
        for i in range(self.output_dim):
            row = self.weights[i]
            acc = output_vector[i]
            # Accumulate in input order to keep float results reproducible.
            for j, value in enumerate(input_vector):
                acc += row[j] * value
            output_vector[i] = acc
        return output_vector

class SimpleReLU:
    """Element-wise rectifier for numbers and arbitrarily nested lists."""

    def forward(self, input_data):
        """Return ``input_data`` with every non-positive number replaced by 0.

        Raises:
            TypeError: If a value is neither a number nor a list.
        """
        if isinstance(input_data, list):
            # Recurse so nested (channels x frames) lists keep their shape.
            return list(map(self.forward, input_data))
        if not isinstance(input_data, (int, float)):
            raise TypeError(f"Unsupported type for ReLU: {type(input_data)}")
        return input_data if input_data > 0 else 0

class SimpleUpsampleNearest1D:
    """Nearest-neighbour upsampling along the frame axis of a channels x frames list."""

    def __init__(self, scale_factor):
        """Store the repeat count; it must be a positive integer."""
        if not (isinstance(scale_factor, int) and scale_factor > 0):
            raise ValueError("scale_factor must be a positive integer.")
        self.scale_factor = scale_factor

    def forward(self, input_tensor_channels_frames):
        """Return a new tensor in which each frame is repeated ``scale_factor`` times.

        Raises:
            ValueError: If the input is empty or any channel is not a list.
        """
        if not (input_tensor_channels_frames
                and isinstance(input_tensor_channels_frames[0], list)):
            raise ValueError("UpsampleNearest1D expects a list of lists (channels x frames).")

        repeats = self.scale_factor
        stretched = []
        for frames in input_tensor_channels_frames:
            if not isinstance(frames, list):
                raise ValueError("Each channel in UpsampleNearest1D input must be a list of frames.")
            stretched.append([sample for sample in frames for _ in range(repeats)])
        return stretched

class SimpleConv1d:
    """1-D convolution over a ``channels x frames`` nested list.

    Each output frame is the bias plus the sum, over all input channels and
    kernel taps, of ``input[ic][start + k] * kernel[oc][ic][k]``. The input
    is zero-padded on both sides by ``padding`` frames.

    Args:
        in_channels: Number of input channels.
        out_channels: Number of output channels.
        kernel_size: Number of taps per kernel.
        stride: Step between consecutive receptive fields.
        padding: Zero frames added to each side of every channel.
        fixed_kernels: Optional ``out_channels x in_channels x kernel_size``
            nested list. When omitted (or empty) a deterministic ramp is used.
        fixed_biases: Optional list of ``out_channels`` biases. When omitted
            (or empty) a deterministic ramp is used.

    Raises:
        ValueError: If the supplied kernels or biases have the wrong shape.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, fixed_kernels=None, fixed_biases=None):
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding

        if fixed_kernels:
            shape_ok = len(fixed_kernels) == out_channels
            if shape_ok and in_channels > 0:
                # Validate every filter and every tap list, not just the first.
                shape_ok = all(
                    len(filt) == in_channels
                    and all(len(taps) == kernel_size for taps in filt)
                    for filt in fixed_kernels
                )
            elif shape_ok and in_channels == 0 and kernel_size > 0:
                # With no input channels the per-filter list must be empty.
                shape_ok = not fixed_kernels[0]
            if not shape_ok:
                first = fixed_kernels[0]
                inner = len(first[0]) if first else 'N/A'
                expected_shape_str = f"({out_channels}x{in_channels}x{kernel_size})"
                actual_shape_str = f"({len(fixed_kernels)}x{len(first)}x{inner})"
                raise ValueError(f"Fixed kernels shape mismatch for Conv1d. Expected {expected_shape_str}, got {actual_shape_str}")
            self.kernels = fixed_kernels
        else:
            self.kernels = [[[(0.01 * (oc*in_channels*kernel_size + ic*kernel_size + k + 1)) for k in range(kernel_size)]
                             for ic in range(in_channels)]
                            for oc in range(out_channels)]
        if fixed_biases:
            if len(fixed_biases) != out_channels:
                 raise ValueError(f"Fixed biases shape mismatch for Conv1d. Expected ({out_channels}), got ({len(fixed_biases)})")
            self.biases = fixed_biases
        else:
            self.biases = [(0.01 * (i+1)) for i in range(out_channels)]

    def forward(self, input_tensor_channels_frames):
        """Convolve the input and return an ``out_channels x frames_out`` list.

        Returns one empty list per output channel when the kernel does not
        fit into the padded input. With ``in_channels == 0`` the result is a
        single frame per output channel holding just the bias.

        Raises:
            ValueError: If the input is not a list of lists, has the wrong
                number of channels, or its channels differ in length.
        """
        if not isinstance(input_tensor_channels_frames, list) or \
            (self.in_channels > 0 and (not input_tensor_channels_frames or not isinstance(input_tensor_channels_frames[0], list))):
            raise ValueError(f"Conv1d input must be a list of lists (channels x frames). Got: {type(input_tensor_channels_frames)}")

        if len(input_tensor_channels_frames) != self.in_channels:
            raise ValueError(f"Expected {self.in_channels} input channels, got {len(input_tensor_channels_frames)}")
        if self.in_channels == 0:
            return [[self.biases[oc]] for oc in range(self.out_channels)]

        # Reject ragged input up front; otherwise short channels would fail
        # with an IndexError deep in the loop and long ones be silently cut.
        frames_in = len(input_tensor_channels_frames[0])
        if any(len(channel) != frames_in for channel in input_tensor_channels_frames):
            raise ValueError("Conv1d input channels must all have the same number of frames.")

        if self.padding > 0:
            pad = [0.0] * self.padding
            padded_input = [pad + channel + pad for channel in input_tensor_channels_frames]
        else:
            # The input is only read below, so no defensive copy is needed.
            padded_input = input_tensor_channels_frames

        num_frames_padded = len(padded_input[0])
        if self.kernel_size > num_frames_padded:
            num_frames_out = 0
        else:
            # Exact integer floor division instead of float division + floor.
            num_frames_out = (num_frames_padded - self.kernel_size) // self.stride + 1

        if num_frames_out <= 0:
            return [[] for _ in range(self.out_channels)]

        output_tensor_outchannels_frames = [[0.0] * num_frames_out for _ in range(self.out_channels)]
        for oc in range(self.out_channels):
            filters = self.kernels[oc]
            out_row = output_tensor_outchannels_frames[oc]
            for f_out_idx in range(num_frames_out):
                receptive_field_start = f_out_idx * self.stride
                current_sum = self.biases[oc]
                for ic in range(self.in_channels):
                    channel = padded_input[ic]
                    taps = filters[ic]
                    for k_idx in range(self.kernel_size):
                        current_sum += channel[receptive_field_start + k_idx] * taps[k_idx]
                out_row[f_out_idx] = current_sum
        return output_tensor_outchannels_frames

# --- Simplified Decoder with Fixed Weights (and Profiling in forward) ---
class SimpleDecoder:
    """Toy decoder with hard-coded weights and optional per-stage profiling.

    The pipeline is: fully connected layer, reshape to channels x frames,
    then the layer sequence stored in ``decode_layer_configs``.
    """

    def __init__(self):
        self.n_output_channels = N_MELS
        self.initial_frames = INITIAL_FRAMES
        self.initial_channels = INITIAL_CHANNELS

        # Fully connected stage.
        self.weights_fc = [
            [0.1, 0.2, 0.3, 0.4],
            [0.5, 0.6, 0.7, 0.8],
            [0.2, 0.4, 0.6, 0.8],
            [0.1, 0.3, 0.5, 0.7],
        ]
        self.bias_fc = [0.01, 0.02, 0.03, 0.04]
        self.fc = SimpleLinear(
            LATENT_DIM,
            self.initial_channels * self.initial_frames,
            fixed_weights=self.weights_fc,
            fixed_bias=self.bias_fc,
        )

        # Convolution parameters, indexed [out_channel][in_channel][tap].
        self.kernels_conv1 = [
            [[0.1, 0.2, 0.1], [0.3, 0.1, 0.2], [0.2, 0.3, 0.1], [0.1, 0.1, 0.3]],
            [[0.4, 0.2, 0.1], [0.1, 0.3, 0.4], [0.3, 0.2, 0.2], [0.2, 0.4, 0.1]],
        ]
        self.biases_conv1 = [0.05, 0.06]

        self.kernels_conv2 = [
            [[0.5, 0.1, 0.2], [0.2, 0.3, 0.5]],
        ]
        self.biases_conv2 = [0.07]

        self.kernels_conv3 = [[[0.8]], [[0.9]]]
        self.biases_conv3 = [0.08, 0.09]

        conv1 = SimpleConv1d(
            self.initial_channels, CONV1_OUT_CHANNELS,
            kernel_size=3, stride=1, padding=1,
            fixed_kernels=self.kernels_conv1, fixed_biases=self.biases_conv1,
        )
        conv2 = SimpleConv1d(
            CONV1_OUT_CHANNELS, CONV2_OUT_CHANNELS,
            kernel_size=3, stride=1, padding=1,
            fixed_kernels=self.kernels_conv2, fixed_biases=self.biases_conv2,
        )
        conv3 = SimpleConv1d(
            CONV2_OUT_CHANNELS, self.n_output_channels,
            kernel_size=1, stride=1, padding=0,
            fixed_kernels=self.kernels_conv3, fixed_biases=self.biases_conv3,
        )

        # Each entry is (layer class, layer instance[, profiling name]).
        # Only the convolutions carry a name; they are timed individually.
        self.decode_layer_configs = [
            (SimpleReLU, SimpleReLU()),
            (SimpleUpsampleNearest1D, SimpleUpsampleNearest1D(scale_factor=2)),
            (SimpleConv1d, conv1, "Conv1"),
            (SimpleReLU, SimpleReLU()),
            (SimpleUpsampleNearest1D, SimpleUpsampleNearest1D(scale_factor=2)),
            (SimpleConv1d, conv2, "Conv2"),
            (SimpleReLU, SimpleReLU()),
            (SimpleConv1d, conv3, "Conv3_Output"),
            (SimpleReLU, SimpleReLU()),
        ]
        # Plain list of the layer instances, in execution order.
        self.decode_layers = [config[1] for config in self.decode_layer_configs]

    def reshape_to_channels_frames(self, flat_list, channels, frames):
        """Split ``flat_list`` into ``channels`` consecutive rows of ``frames`` items.

        Raises:
            ValueError: If ``len(flat_list) != channels * frames``.
        """
        if len(flat_list) != channels * frames:
            raise ValueError(f"Cannot reshape list of size {len(flat_list)} into ({channels}, {frames})")
        return [flat_list[row * frames:(row + 1) * frames] for row in range(channels)]

    def forward(self, z, verbose=False, collect_timings=False):
        """Run the decoder on latent vector ``z``.

        Args:
            z: Latent input vector passed to the fully connected layer.
            verbose: Print the intermediate result after every stage.
            collect_timings: Measure each stage with ``time.perf_counter``.

        Returns:
            The decoded channels x frames list, or ``(output, timings)`` when
            ``collect_timings`` is true. ``timings`` holds ``'fc'``,
            ``'reshape'``, the accumulated ``'relu_total'`` and
            ``'upsample_total'``, and one entry per named convolution.
        """
        clock = time.perf_counter
        timings = {}

        # --- FC layer ---
        if collect_timings:
            started = clock()
        x = self.fc.forward(z)
        if collect_timings:
            timings['fc'] = clock() - started
        if verbose:
            print(f"After FC: {x}")

        # --- Reshape ---
        if collect_timings:
            started = clock()
        x = self.reshape_to_channels_frames(x, self.initial_channels, self.initial_frames)
        if collect_timings:
            timings['reshape'] = clock() - started
        if verbose:
            print(f"After Reshape: {x}")

        # --- Decode layers ---
        if collect_timings:
            # ReLU and upsample stages are summed per layer type.
            timings['relu_total'] = 0
            timings['upsample_total'] = 0

        for idx, config in enumerate(self.decode_layer_configs):
            kind, layer = config[0], config[1]
            explicit_name = config[2:3]  # empty when the entry has no name
            if explicit_name:
                display_name = explicit_name[0]
            else:
                display_name = f"{type(layer).__name__}_{idx}"

            if collect_timings:
                started = clock()
            x = layer.forward(x)
            if collect_timings:
                elapsed = clock() - started
                if kind is SimpleReLU:
                    timings['relu_total'] += elapsed
                elif kind is SimpleUpsampleNearest1D:
                    timings['upsample_total'] += elapsed
                elif kind is SimpleConv1d and explicit_name:
                    timings[explicit_name[0]] = elapsed

            if verbose:
                n_channels = len(x) if isinstance(x, list) else 0
                n_frames = len(x[0]) if n_channels > 0 and isinstance(x[0], list) else 0
                rounded = [[round(val, 4) for val in ch] for ch in x]
                print(f"After Layer {idx} ({display_name}): Shape=({n_channels}x{n_frames}), Output={rounded}")

        return (x, timings) if collect_timings else x

# --- Benchmarking and Example Usage ---
if __name__ == "__main__":
    print("--- Toy Model Decoder with Fixed Weights & Input ---")

    decoder = SimpleDecoder()
    fixed_z = [0.1, 0.5, 0.2, 0.6]
    print(f"\nFixed Input latent vector z: {fixed_z}")

    # --- One verbose pass so the intermediate values can be inspected ---
    print("\n--- Starting a single verbose forward pass (for verification) ---")
    reconstructed_output_verbose, _ = decoder.forward(fixed_z, verbose=True, collect_timings=True)
    print("\n--- Final Output (from verbose run) ---")
    if reconstructed_output_verbose:
        output_channels = len(reconstructed_output_verbose)
        first_channel = reconstructed_output_verbose[0]
        output_frames = len(first_channel) if isinstance(first_channel, list) else 0
        print(f"Shape: {output_channels} channels x {output_frames} frames")
        for r_idx, r_channel_data in enumerate(reconstructed_output_verbose):
            rounded_channel = [round(val, 4) for val in r_channel_data]
            print(f"  Channel {r_idx}: {rounded_channel}")

    # --- Repeated quiet passes, averaged afterwards ---
    print("\n--- Starting Benchmarking ---")
    num_benchmark_runs = 100

    total_forward_time = 0
    aggregated_component_timings = {}
    # Report progress roughly every 10% of the runs (every run if fewer than 10).
    progress_step = num_benchmark_runs // 10 if num_benchmark_runs >= 10 else 1

    for run_number in range(1, num_benchmark_runs + 1):
        run_start_time = time.perf_counter()
        _, component_timings = decoder.forward(fixed_z, verbose=False, collect_timings=True)
        run_end_time = time.perf_counter()

        total_forward_time += (run_end_time - run_start_time)

        for key, value in component_timings.items():
            aggregated_component_timings[key] = aggregated_component_timings.get(key, 0) + value
        if run_number % progress_step == 0:
            print(f"  Completed run {run_number}/{num_benchmark_runs}")

    # Per-run averages of the whole pass and of every component.
    avg_forward_time = total_forward_time / num_benchmark_runs
    avg_component_timings = {}
    for key, total in aggregated_component_timings.items():
        avg_component_timings[key] = total / num_benchmark_runs

    print("\n--- Benchmarking Results ---")
    print(f"Number of benchmark runs: {num_benchmark_runs}")
    print(f"Average total forward pass time: {avg_forward_time:.8f} seconds")

    print("\nAverage time per component:")
    # Slowest component first.
    sorted_components = sorted(avg_component_timings.items(), key=lambda item: item[1], reverse=True)

    for component, avg_time in sorted_components:
        if avg_forward_time > 0:
            percentage_of_total = (avg_time / avg_forward_time) * 100
        else:
            percentage_of_total = 0
        print(f"  - {component:<15}: {avg_time:.8f} seconds ({percentage_of_total:.2f}%)")

    # --- Sanity checks: layer 5 must be the Conv2 stage ---
    assert decoder.decode_layers[5].in_channels == CONV1_OUT_CHANNELS
    assert decoder.decode_layers[5].out_channels == CONV2_OUT_CHANNELS
    print("\nDeterministic model setup and profiling complete.")

--- Toy Model Decoder with Fixed Weights & Input ---

Fixed Input latent vector z: [0.1, 0.5, 0.2, 0.6]

--- Starting a single verbose forward pass (for verification) ---
After FC: [0.42, 0.99, 0.85, 0.72]
After Reshape: [[0.42], [0.99], [0.85], [0.72]]
After Layer 0 (SimpleReLU_0): Shape=(4x1), Output=[[0.42], [0.99], [0.85], [0.72]]
After Layer 1 (SimpleUpsampleNearest1D_1): Shape=(4x2), Output=[[0.42, 0.42], [0.99, 0.99], [0.85, 0.85], [0.72, 0.72]]
After Layer 2 (Conv1): Shape=(2x2), Output=[[1.101, 1.141], [1.579, 1.565]]
After Layer 3 (SimpleReLU_3): Shape=(2x2), Output=[[1.101, 1.141], [1.579, 1.565]]
After Layer 4 (SimpleUpsampleNearest1D_4): Shape=(2x4), Output=[[1.101, 1.101, 1.141, 1.141], [1.579, 1.579, 1.565, 1.565]]
After Layer 5 (Conv2): Shape=(1x4), Output=[[1.6635, 2.5308, 2.5306, 1.5371]]
After Layer 6 (SimpleReLU_6): Shape=(1x4), Output=[[1.6635, 2.5308, 2.5306, 1.5371]]
After Layer 7 (Conv3_Output): Shape=(2x4), Output=[[1.4108, 2.1046, 2.1045, 1.3097], [1.5872, 2.3

# Conversion To Fixed

In [None]:
import math # For round

# --- Fixed-Point Parameters ---
FRACTIONAL_BITS = 8
TOTAL_BITS = 16
INTEGER_BITS = TOTAL_BITS - FRACTIONAL_BITS - 1 # -1 for the sign bit
SCALE = 1 << FRACTIONAL_BITS  # 2**8 = 256

# Calculate min and max representable scaled integer values for signed 16-bit
MIN_SFIXED_VAL = -(1 << (TOTAL_BITS - 1))  # -32768
MAX_SFIXED_VAL = (1 << (TOTAL_BITS - 1)) - 1 #  32767

def float_to_sfixed(val_float, f_bits=FRACTIONAL_BITS, total_bits=TOTAL_BITS):
    """Convert a float to a scaled signed integer (fixed-point value).

    Args:
        val_float: Value to convert.
        f_bits: Number of fractional bits; the value is scaled by ``2**f_bits``.
        total_bits: Width of the signed word used for saturation. Exposed as
            a parameter so the clamp range follows the requested word width,
            matching the later definitions of this helper in the notebook.

    Returns:
        ``round(val_float * 2**f_bits)`` saturated to the signed
        ``total_bits`` range.
    """
    scale_factor = 1 << f_bits
    min_val = -(1 << (total_bits - 1))
    max_val = (1 << (total_bits - 1)) - 1
    scaled_val = int(round(val_float * scale_factor))
    # Saturate instead of wrapping around.
    return max(min_val, min(max_val, scaled_val))

def sfixed_to_float(val_sfixed, f_bits=FRACTIONAL_BITS):
    """Convert a scaled signed integer back to a float (divide by ``2**f_bits``)."""
    scale_factor = 1 << f_bits
    return float(val_sfixed) / scale_factor

print(f"--- Fixed-Point Configuration ---")
print(f"TOTAL_BITS: {TOTAL_BITS}, FRACTIONAL_BITS: {FRACTIONAL_BITS}, INTEGER_BITS (excluding sign): {INTEGER_BITS}")
print(f"SCALE: {SCALE}")
print(f"MIN_SFIXED_VAL (scaled int): {MIN_SFIXED_VAL}")
print(f"MAX_SFIXED_VAL (scaled int): {MAX_SFIXED_VAL}")
print(f"Representable float range: {sfixed_to_float(MIN_SFIXED_VAL):.4f} to {sfixed_to_float(MAX_SFIXED_VAL):.4f}\n")

--- Fixed-Point Configuration ---
TOTAL_BITS: 16, FRACTIONAL_BITS: 8, INTEGER_BITS (excluding sign): 7
SCALE: 256
MIN_SFIXED_VAL (scaled int): -32768
MAX_SFIXED_VAL (scaled int): 32767
Representable float range: -128.0000 to 127.9961



# FC GOLDEN OUTPUT

In [None]:
import math # For round

# --- Fixed-Point Parameters ---
FRACTIONAL_BITS = 8
TOTAL_BITS = 16
INTEGER_BITS = TOTAL_BITS - FRACTIONAL_BITS - 1 # -1 for the sign bit
SCALE = 1 << FRACTIONAL_BITS  # 2**8 = 256

# Calculate min and max representable scaled integer values for signed 16-bit
MIN_SFIXED_VAL = -(1 << (TOTAL_BITS - 1))  # -32768
MAX_SFIXED_VAL = (1 << (TOTAL_BITS - 1)) - 1 #  32767

def float_to_sfixed(val_float, f_bits=FRACTIONAL_BITS, total_bits=TOTAL_BITS):
    """Convert a float to a scaled signed integer (fixed-point value).

    Args:
        val_float: Value to convert.
        f_bits: Number of fractional bits; the value is scaled by ``2**f_bits``.
        total_bits: Width of the signed word used for saturation. Exposed as
            a parameter so the clamp range follows the requested word width,
            matching the later definitions of this helper in the notebook.

    Returns:
        ``round(val_float * 2**f_bits)`` saturated to the signed
        ``total_bits`` range.
    """
    scale_factor = 1 << f_bits
    min_val = -(1 << (total_bits - 1))
    max_val = (1 << (total_bits - 1)) - 1
    scaled_val = int(round(val_float * scale_factor))
    # Saturate instead of wrapping around.
    return max(min_val, min(max_val, scaled_val))

def sfixed_to_float(val_sfixed, f_bits=FRACTIONAL_BITS):
    """Convert a scaled signed integer back to a float (divide by ``2**f_bits``)."""
    scale_factor = 1 << f_bits
    return float(val_sfixed) / scale_factor

# --- Layer Dimensions relevant for FC Layer ---
LATENT_DIM = 4
# For the FC layer in SimpleDecoder, output_dim = initial_channels * initial_frames
INITIAL_CHANNELS = 4
INITIAL_FRAMES = 1 # Calculated from FIXED_FRAMES // 4 where FIXED_FRAMES was 4
FC_OUTPUT_SIZE = INITIAL_CHANNELS * INITIAL_FRAMES # Should be 4 * 1 = 4

# --- Definition of SimpleLinear (needed to compute FC output) ---
class SimpleLinear:
    """Fully connected layer on plain lists; weights and bias default to zeros."""

    def __init__(self, input_dim, output_dim, fixed_weights=None, fixed_bias=None):
        """Store the dimensions and validate any supplied weights and bias.

        Raises:
            ValueError: If the weight row count, the first weight row's
                length, or the bias length does not match the dimensions.
        """
        self.input_dim = input_dim
        self.output_dim = output_dim

        if not fixed_weights:
            # Zero fallback; unused when explicit weights are supplied.
            self.weights = [[0.0] * input_dim for _ in range(output_dim)]
        else:
            n_rows = len(fixed_weights)
            n_cols = len(fixed_weights[0])
            if n_rows != output_dim or n_cols != input_dim:
                raise ValueError(f"Fixed weights shape mismatch for Linear. Expected ({output_dim}x{input_dim}), got ({n_rows}x{n_cols})")
            self.weights = fixed_weights

        if not fixed_bias:
            self.bias = [0.0] * output_dim
        else:
            if len(fixed_bias) != output_dim:
                raise ValueError(f"Fixed bias shape mismatch for Linear. Expected ({output_dim}), got ({len(fixed_bias)})")
            self.bias = fixed_bias

    def forward(self, input_vector):
        """Return ``bias + weights * input_vector`` as a new list.

        Raises:
            ValueError: If ``input_vector`` does not have ``input_dim`` entries.
        """
        if len(input_vector) != self.input_dim:
            raise ValueError(f"Expected input_vector of dimension {self.input_dim}, got {len(input_vector)}")
        totals = list(self.bias)  # copy, so the stored bias is not modified
        for row_idx in range(self.output_dim):
            row = self.weights[row_idx]
            acc = totals[row_idx]
            for col_idx, value in enumerate(input_vector):
                acc += row[col_idx] * value
            totals[row_idx] = acc
        return totals

# --- Main section to generate and print golden values for FC Layer ---
if __name__ == "__main__":
    print(f"--- Generating Golden Fixed-Point Values for FC Layer ---")
    print(f"Fixed-Point Config: TOTAL_BITS={TOTAL_BITS}, FRACTIONAL_BITS={FRACTIONAL_BITS} (S{INTEGER_BITS+1}.{FRACTIONAL_BITS}), SCALE={SCALE}\n")

    # 1. Float inputs, weights and biases (same values as in SimpleDecoder).
    fixed_z_float = [0.1, 0.5, 0.2, 0.6]

    weights_fc_float = [
        [0.1, 0.2, 0.3, 0.4],
        [0.5, 0.6, 0.7, 0.8],
        [0.2, 0.4, 0.6, 0.8],
        [0.1, 0.3, 0.5, 0.7],
    ]
    bias_fc_float = [0.01, 0.02, 0.03, 0.04]

    # 2. Float reference model of the FC layer.
    fc_layer_python_model = SimpleLinear(
        input_dim=LATENT_DIM,
        output_dim=FC_OUTPUT_SIZE,
        fixed_weights=weights_fc_float,
        fixed_bias=bias_fc_float,
    )

    # 3. Float output of the FC layer.
    fc_output_float = fc_layer_python_model.forward(fixed_z_float)

    # 4. Quantize everything and print it as Verilog snippets.

    # Input z, one assignment per element.
    fixed_z_sfixed = [float_to_sfixed(val) for val in fixed_z_float]
    print("--- Input z (fixed_z_sfixed) for Verilog DUT ---")
    for i, (z_fixed, z_float) in enumerate(zip(fixed_z_sfixed, fixed_z_float)):
        print(f"i_z[{i}] = 16'sd{z_fixed}; \t// Float: {z_float:.2f}")

    # Weights, printed as a nested array literal ('{ ... }) with one row per line.
    weights_fc_sfixed = [[float_to_sfixed(val) for val in row] for row in weights_fc_float]
    print("\n--- Weights_FC (weights_fc_sfixed) for Verilog parameters/locals ---")
    print("localparam signed [15:0] WEIGHTS_FC [0:3][0:3] = '{")
    last_row_index = len(weights_fc_sfixed) - 1
    for r, weight_row in enumerate(weights_fc_sfixed):
        row_sfixed_str = ", ".join(f"16'sd{val}" for val in weight_row)
        # Every row except the last is followed by a comma.
        row_separator = "," if r < last_row_index else ""
        print(f"    '{{{row_sfixed_str}}}{row_separator} // Row {r}")
    print("};")

    # Biases, printed as a single array literal.
    bias_fc_sfixed = [float_to_sfixed(val) for val in bias_fc_float]
    print("\n--- Biases_FC (bias_fc_sfixed) for Verilog parameters/locals ---")
    bias_sfixed_str = ", ".join(f"16'sd{val}" for val in bias_fc_sfixed)
    print(f"localparam signed [15:0] BIAS_FC [0:3] = '{{{bias_sfixed_str}}};")

    # Golden output: the quantized float result, for testbench comparison.
    fc_output_golden_sfixed = [float_to_sfixed(val) for val in fc_output_float]
    print("\n--- Golden FC Output (fc_output_golden_sfixed) for Verilog Testbench Comparison ---")
    print("logic signed [15:0] golden_fc_output [0:3];")
    print("initial begin")
    for i, (golden, reference) in enumerate(zip(fc_output_golden_sfixed, fc_output_float)):
        print(f"    golden_fc_output[{i}] = 16'sd{golden}; \t// Float: {reference:.2f}")
    print("end")

    print("\n--- Sanity check: FC output float values ---")
    print(f"Calculated fc_output_float: {fc_output_float}")
    # Convert the golden integers back to floats to show the quantization effect.
    fc_output_quantized_float = [sfixed_to_float(val) for val in fc_output_golden_sfixed]
    quantized_rounded = [round(value, 4) for value in fc_output_quantized_float]
    print(f"FC output from sfixed (quantized): {quantized_rounded}")

--- Generating Golden Fixed-Point Values for FC Layer ---
Fixed-Point Config: TOTAL_BITS=16, FRACTIONAL_BITS=8 (S8.8), SCALE=256

--- Input z (fixed_z_sfixed) for Verilog DUT ---
i_z[0] = 16'sd26; 	// Float: 0.10
i_z[1] = 16'sd128; 	// Float: 0.50
i_z[2] = 16'sd51; 	// Float: 0.20
i_z[3] = 16'sd154; 	// Float: 0.60

--- Weights_FC (weights_fc_sfixed) for Verilog parameters/locals ---
localparam signed [15:0] WEIGHTS_FC [0:3][0:3] = '{
    '{16'sd26, 16'sd51, 16'sd77, 16'sd102}, // Row 0
    '{16'sd128, 16'sd154, 16'sd179, 16'sd205}, // Row 1
    '{16'sd51, 16'sd102, 16'sd154, 16'sd205}, // Row 2
    '{16'sd26, 16'sd77, 16'sd128, 16'sd179} // Row 3
};

--- Biases_FC (bias_fc_sfixed) for Verilog parameters/locals ---
localparam signed [15:0] BIAS_FC [0:3] = '{16'sd3, 16'sd5, 16'sd8, 16'sd10};

--- Golden FC Output (fc_output_golden_sfixed) for Verilog Testbench Comparison ---
logic signed [15:0] golden_fc_output [0:3];
initial begin
    golden_fc_output[0] = 16'sd108; 	// Float: 0.42
   

# RELU GOLDEN OUTPUT

In [None]:
import math # For round

# --- Fixed-Point Parameters (consistent with previous stage) ---
FRACTIONAL_BITS = 8
TOTAL_BITS = 16
# Integer bits of the S<I>.<F> format: everything that is neither fraction nor sign.
INTEGER_BITS = TOTAL_BITS - FRACTIONAL_BITS - 1
SCALE = 2 ** FRACTIONAL_BITS  # 256

MIN_SFIXED_VAL = -(2 ** (TOTAL_BITS - 1))  # -32768
MAX_SFIXED_VAL = 2 ** (TOTAL_BITS - 1) - 1  # 32767


# --- Conversion helpers between floats and scaled signed integers ---
def float_to_sfixed(val_float, f_bits=FRACTIONAL_BITS, total_bits=TOTAL_BITS):
    """Scale ``val_float`` by ``2**f_bits``, round, and saturate to ``total_bits``."""
    scale_factor = 1 << f_bits
    half_span = 1 << (total_bits - 1)
    quantized = int(round(val_float * scale_factor))
    if quantized < -half_span:
        return -half_span
    if quantized > half_span - 1:
        return half_span - 1
    return quantized


def sfixed_to_float(val_sfixed, f_bits=FRACTIONAL_BITS):
    """Undo the fixed-point scaling: divide by ``2**f_bits`` and return a float."""
    return float(val_sfixed) / (1 << f_bits)

# --- ReLU Golden Value Generation ---
if __name__ == "__main__":
    print(f"--- Generating Golden Fixed-Point Values for ReLU Module ---")
    print(f"Fixed-Point Config: TOTAL_BITS={TOTAL_BITS}, FRACTIONAL_BITS={FRACTIONAL_BITS} (S{INTEGER_BITS+1}.{FRACTIONAL_BITS}), SCALE={SCALE}\n")

    # 1. Inputs of the ReLU stage: fixed-point integers recorded from the
    #    Verilog fc_layer simulation (VCS log: Got_Verilog [108, 254, 218, 186]).
    relu_input_sfixed = [108, 254, 218, 186]

    print("--- Input to ReLU Module (relu_input_sfixed) ---")
    print("--- (These are the actual outputs from the Verilog fc_layer) ---")
    for i, raw_value in enumerate(relu_input_sfixed):
        approx_float = sfixed_to_float(raw_value)
        print(f"i_data[{i}] = 16'sd{raw_value}; \t// Approx Float: {approx_float:.4f}")

    # 2. Golden output: negative values become 0, everything else passes through.
    relu_golden_output_sfixed = [0 if raw_value < 0 else raw_value for raw_value in relu_input_sfixed]

    print("\n--- Golden ReLU Output (relu_golden_output_sfixed) for Verilog Testbench ---")
    print("logic signed [15:0] golden_relu_output [0:3]; // Or NUM_ELEMENTS-1")
    print("initial begin")
    for i, golden_value in enumerate(relu_golden_output_sfixed):
        approx_float = sfixed_to_float(golden_value)
        print(f"    golden_relu_output[{i}] = 16'sd{golden_value}; \t// Approx Float: {approx_float:.4f}")
    print("end")

    # 3. Extra example with positive, zero and negative inputs.
    print("\n--- For a more comprehensive ReLU test, consider these inputs: ---")
    mixed_inputs_float = [0.42, -0.1, 0.0, -0.99, 2.0]
    mixed_inputs_sfixed = [float_to_sfixed(f) for f in mixed_inputs_float]
    mixed_outputs_sfixed = [max(0, s_val) for s_val in mixed_inputs_sfixed]

    print("Example Mixed Inputs (sfixed):", mixed_inputs_sfixed)
    print("Example Mixed Outputs (sfixed):", mixed_outputs_sfixed)



--- Generating Golden Fixed-Point Values for ReLU Module ---
Fixed-Point Config: TOTAL_BITS=16, FRACTIONAL_BITS=8 (S8.8), SCALE=256

--- Input to ReLU Module (relu_input_sfixed) ---
--- (These are the actual outputs from the Verilog fc_layer) ---
i_data[0] = 16'sd108; 	// Approx Float: 0.4219
i_data[1] = 16'sd254; 	// Approx Float: 0.9922
i_data[2] = 16'sd218; 	// Approx Float: 0.8516
i_data[3] = 16'sd186; 	// Approx Float: 0.7266

--- Golden ReLU Output (relu_golden_output_sfixed) for Verilog Testbench ---
logic signed [15:0] golden_relu_output [0:3]; // Or NUM_ELEMENTS-1
initial begin
    golden_relu_output[0] = 16'sd108; 	// Approx Float: 0.4219
    golden_relu_output[1] = 16'sd254; 	// Approx Float: 0.9922
    golden_relu_output[2] = 16'sd218; 	// Approx Float: 0.8516
    golden_relu_output[3] = 16'sd186; 	// Approx Float: 0.7266
end

--- For a more comprehensive ReLU test, consider these inputs: ---
Example Mixed Inputs (sfixed): [108, -26, 0, -253, 512]
Example Mixed Outputs (sfi

# CONV1D GOLDEN OUTPUT

In [None]:
import math

# --- Fixed-Point Parameters (consistent) ---
FRACTIONAL_BITS = 8
TOTAL_BITS = 16
INTEGER_BITS = TOTAL_BITS - 1 - FRACTIONAL_BITS  # sign bit excluded
SCALE = 2 ** FRACTIONAL_BITS

MAX_SFIXED_VAL = 2 ** (TOTAL_BITS - 1) - 1
MIN_SFIXED_VAL = -MAX_SFIXED_VAL - 1


def float_to_sfixed(val_float, f_bits=FRACTIONAL_BITS, total_bits=TOTAL_BITS):
    """Return ``val_float`` as a rounded, saturated signed fixed-point integer."""
    scale_factor = 1 << f_bits
    upper = (1 << (total_bits - 1)) - 1
    lower = -upper - 1
    raw = int(round(val_float * scale_factor))
    return min(upper, max(lower, raw))


def sfixed_to_float(val_sfixed, f_bits=FRACTIONAL_BITS):
    """Return the float represented by the scaled integer ``val_sfixed``."""
    divisor = 1 << f_bits
    return float(val_sfixed) / divisor


# --- Conv1 Parameters ---
NUM_IN_CHANNELS_CONV1 = 4
NUM_OUT_CHANNELS_CONV1 = 2
KERNEL_SIZE_CONV1 = 3
STRIDE_CONV1 = 1
PADDING_CONV1 = 1
NUM_IN_FRAMES_CONV1 = 2  # frames delivered by the first upsampler

# --- Helper function for fixed-point Conv1D (bit-accurate) ---
def conv1d_fixed_point_manual(input_data_sfixed, # Shape: [NUM_IN_CHANNELS][NUM_IN_FRAMES]
                              kernels_sfixed,    # Shape: [NUM_OUT_CHANNELS][NUM_IN_CHANNELS][KERNEL_SIZE]
                              biases_sfixed,     # Shape: [NUM_OUT_CHANNELS]
                              padding, stride, kernel_size,
                              f_bits=FRACTIONAL_BITS, total_bits=TOTAL_BITS):
    """Compute a 1-D convolution in signed fixed-point integer arithmetic.

    Each input*kernel product is rounded back to ``f_bits`` fractional bits
    before it is accumulated. The bias is added to the accumulated sum and
    only that final value is saturated to ``total_bits``; the running sum
    itself is an unbounded Python int.

    Args:
        input_data_sfixed: Fixed-point integers indexed ``[in_channel][frame]``.
            The frame count is taken from the first channel. May be empty.
        kernels_sfixed: Fixed-point taps indexed ``[out_channel][in_channel][tap]``.
        biases_sfixed: One fixed-point bias per output channel.
        padding: Number of zero frames added to both ends of every channel.
        stride: Frame step between consecutive output positions.
        kernel_size: Number of taps read per input channel.
        f_bits: Number of fractional bits in the fixed-point format.
        total_bits: Word width used for the final saturation.

    Returns:
        A list indexed ``[out_channel][out_frame]`` of saturated fixed-point
        integers. Every channel list is empty when the padded input is
        shorter than the kernel.
    """
    num_in_ch = len(input_data_sfixed)
    # An empty input has no first channel to measure; treat it as zero frames.
    num_in_fr = len(input_data_sfixed[0]) if num_in_ch > 0 else 0
    num_out_ch = len(kernels_sfixed)

    scale = 1 << f_bits
    # Half an LSB of the S.F result, added before scaling down (S.2F -> S.F).
    round_const_product = 1 << (f_bits - 1) if f_bits > 0 else 0

    s_max = (1 << (total_bits - 1)) - 1
    s_min = -(1 << (total_bits - 1))

    def _saturate(value):
        return max(s_min, min(s_max, value))

    # Apply zero padding to both ends of every channel.
    zero_pad = [0] * padding
    padded_input_sfixed = [zero_pad + list(ch_data) + zero_pad for ch_data in input_data_sfixed]

    num_in_fr_padded = num_in_fr + 2 * padding
    if num_in_fr_padded < kernel_size:
        # The kernel does not fit even once, so there is nothing to compute.
        num_out_fr = 0
    else:
        # Pure integer arithmetic; no float round trip for a frame count.
        num_out_fr = (num_in_fr_padded - kernel_size) // stride + 1

    output_data_sfixed = [[0] * num_out_fr for _ in range(num_out_ch)]

    for oc in range(num_out_ch): # Output Channel
        for f_out_idx in range(num_out_fr): # Output Frame
            receptive_field_start = f_out_idx * stride

            # Sum of products that were each already scaled back to S.F.
            current_sum_s_dot_f = 0

            for ic in range(num_in_ch): # Input Channel
                for k_idx in range(kernel_size): # Kernel Tap
                    input_val_sfixed = padded_input_sfixed[ic][receptive_field_start + k_idx]
                    kernel_val_sfixed = kernels_sfixed[oc][ic][k_idx]

                    # S.F * S.F = S.2F (product is scaled by SCALE*SCALE)
                    product_s_dot_2f = input_val_sfixed * kernel_val_sfixed

                    # Round and scale back to S.F. Floor division by a power of
                    # two equals an arithmetic right shift for negative values
                    # too, so one expression covers both signs. The original
                    # notes give the Verilog as
                    # (product_wide + ROUND_CONST) >>> FRACTIONAL_BITS.
                    current_sum_s_dot_f += (product_s_dot_2f + round_const_product) // scale

            # Add bias (which is already S.F), then saturate the final value.
            output_data_sfixed[oc][f_out_idx] = _saturate(current_sum_s_dot_f + biases_sfixed[oc])

    return output_data_sfixed


if __name__ == "__main__":
    print(f"--- Generating Bit-Accurate Golden Values for Conv1 Layer ---")

    # Step 1: stimulus for Conv1 -- 4 channels of 2 frames each. The original
    # notes describe it as the output of the preceding Upsample stage.
    conv1_input_sfixed = [
        [108, 108],  # Channel 0
        [254, 254],  # Channel 1
        [218, 218],  # Channel 2
        [186, 186],  # Channel 3
    ]
    print("\n--- Input to Conv1 Module (conv1_input_sfixed) ---")
    print("--- (4 channels, 2 frames each) ---")
    for ch, frame_values in enumerate(conv1_input_sfixed):
        row_sfixed_str = ", ".join(f"16'sd{val}" for val in frame_values)
        approx_floats = [f"{sfixed_to_float(val):.4f}" for val in frame_values]
        print(f"Input Channel {ch}: [{row_sfixed_str}]; \t// Approx Floats: [{', '.join(approx_floats)}]")

    # Step 2: float weights and biases (the original notes attribute them to
    # the SimpleDecoder architecture).
    kernels_conv1_float = [  # indexed [out_ch=2][in_ch=4][k_size=3]
        [  # out_channel 0
            [0.1, 0.2, 0.1],  # in_channel 0 (taps k1, k2, k3)
            [0.3, 0.1, 0.2],  # in_channel 1
            [0.2, 0.3, 0.1],  # in_channel 2
            [0.1, 0.1, 0.3],  # in_channel 3
        ],
        [  # out_channel 1
            [0.4, 0.2, 0.1],  # in_channel 0
            [0.1, 0.3, 0.4],  # in_channel 1
            [0.3, 0.2, 0.2],  # in_channel 2
            [0.2, 0.4, 0.1],  # in_channel 3
        ],
    ]
    biases_conv1_float = [0.05, 0.06]

    # Step 3: quantize every tap and bias to fixed-point.
    kernels_conv1_sfixed = [
        [list(map(float_to_sfixed, in_ch_taps)) for in_ch_taps in out_ch_kernel]
        for out_ch_kernel in kernels_conv1_float
    ]
    biases_conv1_sfixed = list(map(float_to_sfixed, biases_conv1_float))

    print("\n--- Conv1 Kernels (kernels_conv1_sfixed) ---")
    # Printed as `//` comment lines holding Verilog-style sized literals.
    for oc in range(NUM_OUT_CHANNELS_CONV1):
        print(f"// Kernel for Output Channel {oc}")
        for ic in range(NUM_IN_CHANNELS_CONV1):
            tap_words = (f"16'sd{kernels_conv1_sfixed[oc][ic][k]}" for k in range(KERNEL_SIZE_CONV1))
            tap_str = ", ".join(tap_words)
            print(f"  // InCh {ic}: {{{tap_str}}}")

    print("\n--- Conv1 Biases (biases_conv1_sfixed) ---")
    bias_str = ", ".join(f"16'sd{biases_conv1_sfixed[oc]}" for oc in range(NUM_OUT_CHANNELS_CONV1))
    print(f"{{ {bias_str} }}")

    # Step 4: run the fixed-point reference convolution.
    conv1_golden_output_sfixed = conv1d_fixed_point_manual(
        conv1_input_sfixed,
        kernels_conv1_sfixed,
        biases_conv1_sfixed,
        PADDING_CONV1,
        STRIDE_CONV1,
        KERNEL_SIZE_CONV1,
    )

    print("\n--- Golden Conv1 Output (conv1_golden_output_sfixed) for Verilog Testbench ---")
    print(f"--- ({NUM_OUT_CHANNELS_CONV1} channels, {len(conv1_golden_output_sfixed[0])} frames each) ---")
    for oc, frame_values in enumerate(conv1_golden_output_sfixed):
        row_sfixed_str = ", ".join(f"16'sd{val}" for val in frame_values)
        approx_floats = [f"{sfixed_to_float(val):.4f}" for val in frame_values]
        print(f"Output Channel {oc}: [{row_sfixed_str}]; \t// Approx Floats: [{', '.join(approx_floats)}]")



--- Generating Bit-Accurate Golden Values for Conv1 Layer ---

--- Input to Conv1 Module (conv1_input_sfixed) ---
--- (4 channels, 2 frames each) ---
Input Channel 0: [16'sd108, 16'sd108]; 	// Approx Floats: [0.4219, 0.4219]
Input Channel 1: [16'sd254, 16'sd254]; 	// Approx Floats: [0.9922, 0.9922]
Input Channel 2: [16'sd218, 16'sd218]; 	// Approx Floats: [0.8516, 0.8516]
Input Channel 3: [16'sd186, 16'sd186]; 	// Approx Floats: [0.7266, 0.7266]

--- Conv1 Kernels (kernels_conv1_sfixed) ---
// Kernel for Output Channel 0
  // InCh 0: {16'sd26, 16'sd51, 16'sd26}
  // InCh 1: {16'sd77, 16'sd26, 16'sd51}
  // InCh 2: {16'sd51, 16'sd77, 16'sd26}
  // InCh 3: {16'sd26, 16'sd26, 16'sd77}
// Kernel for Output Channel 1
  // InCh 0: {16'sd102, 16'sd51, 16'sd26}
  // InCh 1: {16'sd26, 16'sd77, 16'sd102}
  // InCh 2: {16'sd77, 16'sd51, 16'sd51}
  // InCh 3: {16'sd51, 16'sd102, 16'sd26}

--- Conv1 Biases (biases_conv1_sfixed) ---
{ 16'sd13, 16'sd15 }

--- Golden Conv1 Output (conv1_golden_output_

# RELU 2 GOLDEN OUTPUT

In [None]:
import math # For round

# --- Fixed-point format: 16-bit signed word, 8 fractional bits ---
TOTAL_BITS = 16
FRACTIONAL_BITS = 8
INTEGER_BITS = TOTAL_BITS - 1 - FRACTIONAL_BITS  # bits left after the sign and fraction
SCALE = 1 << FRACTIONAL_BITS  # integer code that sfixed_to_float maps to 1.0

# Limits of a TOTAL_BITS-wide two's-complement word.
MAX_SFIXED_VAL = (1 << (TOTAL_BITS - 1)) - 1
MIN_SFIXED_VAL = -MAX_SFIXED_VAL - 1


def sfixed_to_float(val_sfixed, f_bits=FRACTIONAL_BITS):
    """Return the float represented by a scaled signed integer (value / 2**f_bits)."""
    return float(val_sfixed) / (1 << f_bits)

# --- ReLU (2nd Instance) Golden Value Generation ---
if __name__ == "__main__":
    print(f"--- Generating Golden Fixed-Point Values for 2nd ReLU Module ---")
    print(f"--- Input is the output of Conv1 ---")
    print(f"Fixed-Point Config: TOTAL_BITS={TOTAL_BITS}, FRACTIONAL_BITS={FRACTIONAL_BITS} (S{INTEGER_BITS+1}.{FRACTIONAL_BITS}), SCALE={SCALE}\n")

    # Stage input. Per the original notes these are the fixed-point integers
    # produced by the Verilog conv1_module simulation: [[286, 295], [404, 402]].
    relu2_input_sfixed = [
        [286, 295],  # Channel 0
        [404, 402],  # Channel 1
    ]
    num_channels_relu2 = 2
    num_frames_relu2 = 2

    print("--- Input to 2nd ReLU Module (relu2_input_sfixed) ---")
    print(f"--- ({num_channels_relu2} channels, {num_frames_relu2} frames each) ---")
    for ch, channel_words in enumerate(relu2_input_sfixed):
        row_sfixed_str = ", ".join(f"16'sd{val}" for val in channel_words)
        approx_floats = [f"{sfixed_to_float(val):.4f}" for val in channel_words]
        print(f"Input Channel {ch}: [{row_sfixed_str}]; \t// Approx Floats: [{', '.join(approx_floats)}]")

    # ReLU on raw fixed-point integers: negatives become 0, the rest pass through.
    relu2_golden_output_sfixed = [
        [0 if relu2_input_sfixed[ch][fr] < 0 else relu2_input_sfixed[ch][fr]
         for fr in range(num_frames_relu2)]
        for ch in range(num_channels_relu2)
    ]

    print("\n--- Golden Output for 2nd ReLU (relu2_golden_output_sfixed) for Verilog Testbench ---")
    print(f"--- ({num_channels_relu2} channels, {num_frames_relu2} frames each) ---")
    print(f"logic signed [15:0] golden_relu2_output [0:{num_channels_relu2-1}][0:{num_frames_relu2-1}];")
    print("initial begin")
    # One assignment line per (channel, frame) element.
    for ch, channel_words in enumerate(relu2_golden_output_sfixed):
        for fr, val in enumerate(channel_words):
            approx_float = sfixed_to_float(val)
            print(f"    golden_relu2_output[{ch}][{fr}] = 16'sd{val}; \t// Approx Float: {approx_float:.4f}")
    print("end")



--- Generating Golden Fixed-Point Values for 2nd ReLU Module ---
--- Input is the output of Conv1 ---
Fixed-Point Config: TOTAL_BITS=16, FRACTIONAL_BITS=8 (S8.8), SCALE=256

--- Input to 2nd ReLU Module (relu2_input_sfixed) ---
--- (2 channels, 2 frames each) ---
Input Channel 0: [16'sd286, 16'sd295]; 	// Approx Floats: [1.1172, 1.1523]
Input Channel 1: [16'sd404, 16'sd402]; 	// Approx Floats: [1.5781, 1.5703]

--- Golden Output for 2nd ReLU (relu2_golden_output_sfixed) for Verilog Testbench ---
--- (2 channels, 2 frames each) ---
logic signed [15:0] golden_relu2_output [0:1][0:1];
initial begin
    golden_relu2_output[0][0] = 16'sd286; 	// Approx Float: 1.1172
    golden_relu2_output[0][1] = 16'sd295; 	// Approx Float: 1.1523
    golden_relu2_output[1][0] = 16'sd404; 	// Approx Float: 1.5781
    golden_relu2_output[1][1] = 16'sd402; 	// Approx Float: 1.5703
end


# UPSAMPLE 2 GOLDEN OUTPUT

In [None]:
import math # For round

# --- Fixed-point word layout shared by the golden-value cells ---
TOTAL_BITS = 16       # full signed word width
FRACTIONAL_BITS = 8   # bits to the right of the binary point
INTEGER_BITS = TOTAL_BITS - 1 - FRACTIONAL_BITS
SCALE = 1 << FRACTIONAL_BITS

# Smallest and largest integers a TOTAL_BITS-wide two's-complement word holds.
MAX_SFIXED_VAL = (1 << (TOTAL_BITS - 1)) - 1
MIN_SFIXED_VAL = -MAX_SFIXED_VAL - 1


def sfixed_to_float(val_sfixed, f_bits=FRACTIONAL_BITS):
    """Map a scaled signed integer to its float value by dividing by 2**f_bits."""
    divisor = 1 << f_bits
    return float(val_sfixed) / divisor

# --- Upsample (2nd Instance) Golden Value Generation ---
if __name__ == "__main__":
    print(f"--- Generating Golden Fixed-Point Values for 2nd Upsample Module (scale_factor=2) ---")
    print(f"--- Input is the output of the 2nd ReLU stage (after Conv1) ---")
    print(f"Fixed-Point Config: TOTAL_BITS={TOTAL_BITS}, FRACTIONAL_BITS={FRACTIONAL_BITS} (S{INTEGER_BITS+1}.{FRACTIONAL_BITS}), SCALE={SCALE}\n")

    # Stage input. Per the original notes this is the output of the 2nd ReLU
    # stage: [[286, 295], [404, 402]].
    upsample2_input_sfixed = [
        [286, 295],  # Channel 0 (2 frames)
        [404, 402],  # Channel 1 (2 frames)
    ]
    num_channels_upsample2 = 2
    input_frames_per_channel_upsample2 = 2
    scale_factor = 2
    # 2 input frames * factor 2 -> 4 output frames per channel.
    output_frames_per_channel_upsample2 = input_frames_per_channel_upsample2 * scale_factor

    print("--- Input to 2nd Upsample Module (upsample2_input_sfixed) ---")
    print(f"--- ({num_channels_upsample2} channels, {input_frames_per_channel_upsample2} frames each) ---")
    for ch, channel_words in enumerate(upsample2_input_sfixed):
        row_sfixed_str = ", ".join(f"16'sd{val}" for val in channel_words)
        approx_floats = [f"{sfixed_to_float(val):.4f}" for val in channel_words]
        print(f"Input Channel {ch}: [{row_sfixed_str}]; \t// Approx Floats: [{', '.join(approx_floats)}]")

    # Nearest-neighbour upsampling: output frame i copies input frame
    # i // scale_factor, so each input frame appears scale_factor times in a row.
    upsample2_golden_output_sfixed = [
        [upsample2_input_sfixed[ch][out_fr_idx // scale_factor]
         for out_fr_idx in range(output_frames_per_channel_upsample2)]
        for ch in range(num_channels_upsample2)
    ]

    print("\n--- Golden Output for 2nd Upsample (upsample2_golden_output_sfixed) for Verilog Testbench ---")
    print(f"--- ({num_channels_upsample2} channels, {output_frames_per_channel_upsample2} frames each) ---")
    print(f"logic signed [15:0] golden_upsample2_output [0:{num_channels_upsample2-1}][0:{output_frames_per_channel_upsample2-1}];")
    print("initial begin")
    # One assignment line per (channel, frame) element.
    for ch, channel_words in enumerate(upsample2_golden_output_sfixed):
        for fr, val in enumerate(channel_words):
            approx_float = sfixed_to_float(val)
            print(f"    golden_upsample2_output[{ch}][{fr}] = 16'sd{val}; \t// Approx Float: {approx_float:.4f}")
    print("end")



--- Generating Golden Fixed-Point Values for 2nd Upsample Module (scale_factor=2) ---
--- Input is the output of the 2nd ReLU stage (after Conv1) ---
Fixed-Point Config: TOTAL_BITS=16, FRACTIONAL_BITS=8 (S8.8), SCALE=256

--- Input to 2nd Upsample Module (upsample2_input_sfixed) ---
--- (2 channels, 2 frames each) ---
Input Channel 0: [16'sd286, 16'sd295]; 	// Approx Floats: [1.1172, 1.1523]
Input Channel 1: [16'sd404, 16'sd402]; 	// Approx Floats: [1.5781, 1.5703]

--- Golden Output for 2nd Upsample (upsample2_golden_output_sfixed) for Verilog Testbench ---
--- (2 channels, 4 frames each) ---
logic signed [15:0] golden_upsample2_output [0:1][0:3];
initial begin
    golden_upsample2_output[0][0] = 16'sd286; 	// Approx Float: 1.1172
    golden_upsample2_output[0][1] = 16'sd286; 	// Approx Float: 1.1172
    golden_upsample2_output[0][2] = 16'sd295; 	// Approx Float: 1.1523
    golden_upsample2_output[0][3] = 16'sd295; 	// Approx Float: 1.1523
    golden_upsample2_output[1][0] = 16'sd404;

# CONV1D 2 GOLDEN OUTPUT

In [None]:
import math

# --- Fixed-point format: 16-bit signed word, 8 fractional bits ---
TOTAL_BITS = 16
FRACTIONAL_BITS = 8
INTEGER_BITS = TOTAL_BITS - 1 - FRACTIONAL_BITS  # bits left after the sign and fraction
SCALE = 1 << FRACTIONAL_BITS  # integer code that sfixed_to_float maps to 1.0

# Limits of a TOTAL_BITS-wide two's-complement word.
MAX_SFIXED_VAL = (1 << (TOTAL_BITS - 1)) - 1
MIN_SFIXED_VAL = -MAX_SFIXED_VAL - 1


def float_to_sfixed(val_float, f_bits=FRACTIONAL_BITS, total_bits=TOTAL_BITS):
    """Quantize a float to a saturated signed fixed-point integer.

    Scales by 2**f_bits, rounds with the built-in round() (ties to even) and
    clamps the result into the total_bits two's-complement range.
    """
    one = 1 << f_bits
    half_span = 1 << (total_bits - 1)
    quantized = int(round(val_float * one))
    # Clamp from above first, then from below.
    return max(-half_span, min(half_span - 1, quantized))


def sfixed_to_float(val_sfixed, f_bits=FRACTIONAL_BITS):
    """Convert a scaled signed integer back to the float it represents."""
    return float(val_sfixed) / (1 << f_bits)

# --- Conv1D Fixed-Point Manual Calculation Function (reused) ---
def conv1d_fixed_point_manual(input_data_sfixed, # Shape: [NUM_IN_CHANNELS][NUM_IN_FRAMES]
                              kernels_sfixed,    # Shape: [NUM_OUT_CHANNELS][NUM_IN_CHANNELS][KERNEL_SIZE]
                              biases_sfixed,     # Shape: [NUM_OUT_CHANNELS]
                              padding, stride, kernel_size,
                              f_bits=FRACTIONAL_BITS, total_bits=TOTAL_BITS):
    """Compute a 1-D convolution in signed fixed-point integer arithmetic.

    Each input*kernel product is rounded back to ``f_bits`` fractional bits
    before it is accumulated. The bias is added to the accumulated sum and
    only that final value is saturated to ``total_bits``; the running sum
    itself is an unbounded Python int.

    Args:
        input_data_sfixed: Fixed-point integers indexed ``[in_channel][frame]``.
            The frame count is taken from the first channel. May be empty.
        kernels_sfixed: Fixed-point taps indexed ``[out_channel][in_channel][tap]``.
        biases_sfixed: One fixed-point bias per output channel.
        padding: Number of zero frames added to both ends of every channel.
        stride: Frame step between consecutive output positions.
        kernel_size: Number of taps read per input channel.
        f_bits: Number of fractional bits in the fixed-point format.
        total_bits: Word width used for the final saturation.

    Returns:
        A list indexed ``[out_channel][out_frame]`` of saturated fixed-point
        integers. Every channel list is empty when the padded input is
        shorter than the kernel.
    """
    num_in_ch = len(input_data_sfixed)
    # An empty input has no first channel to measure; treat it as zero frames.
    num_in_fr = len(input_data_sfixed[0]) if num_in_ch > 0 else 0
    num_out_ch = len(kernels_sfixed)

    scale = 1 << f_bits
    # Half an LSB of the S.F result, added before scaling down (S.2F -> S.F).
    round_const_product = 1 << (f_bits - 1) if f_bits > 0 else 0

    s_max = (1 << (total_bits - 1)) - 1
    s_min = -(1 << (total_bits - 1))

    def _saturate(value):
        return max(s_min, min(s_max, value))

    # Zero-pad both ends of every channel.
    zero_pad = [0] * padding
    padded_input_sfixed = [zero_pad + list(ch_data) + zero_pad for ch_data in input_data_sfixed]

    num_in_fr_padded = num_in_fr + 2 * padding
    if num_in_fr_padded < kernel_size:
        # The kernel does not fit even once, so there is nothing to compute.
        num_out_fr = 0
    else:
        # Pure integer arithmetic; no float round trip for a frame count.
        num_out_fr = (num_in_fr_padded - kernel_size) // stride + 1

    output_data_sfixed = [[0] * num_out_fr for _ in range(num_out_ch)]

    for oc in range(num_out_ch):
        for f_out_idx in range(num_out_fr):
            receptive_field_start = f_out_idx * stride
            current_sum_s_dot_f = 0
            for ic in range(num_in_ch):
                for k_idx in range(kernel_size):
                    input_val_sfixed = padded_input_sfixed[ic][receptive_field_start + k_idx]
                    kernel_val_sfixed = kernels_sfixed[oc][ic][k_idx]
                    # S.F * S.F gives S.2F; add half an LSB and floor-divide to
                    # return to S.F. Floor division by a power of two matches an
                    # arithmetic right shift for negative products as well, so
                    # no sign-dependent branch is needed.
                    product_s_dot_2f = input_val_sfixed * kernel_val_sfixed
                    current_sum_s_dot_f += (product_s_dot_2f + round_const_product) // scale
            # The bias is already S.F; saturate only the final value.
            output_data_sfixed[oc][f_out_idx] = _saturate(current_sum_s_dot_f + biases_sfixed[oc])
    return output_data_sfixed

# --- Conv2 layer geometry ---
NUM_IN_CHANNELS_CONV2 = 2
NUM_OUT_CHANNELS_CONV2 = 1
KERNEL_SIZE_CONV2 = 3
STRIDE_CONV2 = 1
PADDING_CONV2 = 1
NUM_IN_FRAMES_CONV2 = 4  # Output frames from Upsampler2

if __name__ == "__main__":
    print(f"--- Generating Bit-Accurate Golden Values for Conv2 Layer ---")

    # Step 1: stimulus for Conv2 -- 2 channels of 4 frames each, labelled in
    # the original notes as the output of the Upsample2 stage.
    conv2_input_sfixed = [
        [286, 286, 295, 295],  # Channel 0 from Upsample2
        [404, 404, 402, 402],  # Channel 1 from Upsample2
    ]
    print("\n--- Input to Conv2 Module (conv2_input_sfixed) ---")
    print("--- (2 channels, 4 frames each) ---")
    for ch, frame_values in enumerate(conv2_input_sfixed):
        row_sfixed_str = ", ".join(f"16'sd{val}" for val in frame_values)
        approx_floats = [f"{sfixed_to_float(val):.4f}" for val in frame_values]
        print(f"Input Channel {ch}: [{row_sfixed_str}]; \t// Approx Floats: [{', '.join(approx_floats)}]")

    # Step 2: float weights and biases (the original notes attribute them to
    # the SimpleDecoder architecture). Indexed [out_ch=1][in_ch=2][k_size=3].
    kernels_conv2_float = [
        [  # out_channel 0
            [0.5, 0.1, 0.2],  # in_channel 0
            [0.2, 0.3, 0.5],  # in_channel 1
        ],
    ]
    biases_conv2_float = [0.07]  # Single bias for the single output channel

    # Step 3: quantize every tap and bias to fixed-point.
    kernels_conv2_sfixed = [
        [list(map(float_to_sfixed, in_ch_taps)) for in_ch_taps in out_ch_kernel]
        for out_ch_kernel in kernels_conv2_float
    ]
    biases_conv2_sfixed = list(map(float_to_sfixed, biases_conv2_float))

    print("\n--- Conv2 Kernels (kernels_conv2_sfixed) ---")
    for oc in range(NUM_OUT_CHANNELS_CONV2):
        print(f"// Kernel for Output Channel {oc}")
        for ic in range(NUM_IN_CHANNELS_CONV2):
            tap_words = (f"16'sd{kernels_conv2_sfixed[oc][ic][k]}" for k in range(KERNEL_SIZE_CONV2))
            tap_str = ", ".join(tap_words)
            print(f"  // InCh {ic}: {{{tap_str}}}")

    print("\n--- Conv2 Biases (biases_conv2_sfixed) ---")
    bias_str = ", ".join(f"16'sd{biases_conv2_sfixed[oc]}" for oc in range(NUM_OUT_CHANNELS_CONV2))
    print(f"{{ {bias_str} }}")

    # Step 4: run the fixed-point reference convolution.
    conv2_golden_output_sfixed = conv1d_fixed_point_manual(
        conv2_input_sfixed,
        kernels_conv2_sfixed,
        biases_conv2_sfixed,
        PADDING_CONV2,
        STRIDE_CONV2,
        KERNEL_SIZE_CONV2,
    )

    # Frame count for the header, derived from the layer constants
    # (expected output shape: 1 channel, 4 frames).
    padded_frames_conv2 = NUM_IN_FRAMES_CONV2 + 2 * PADDING_CONV2
    num_out_fr_conv2 = math.floor((padded_frames_conv2 - KERNEL_SIZE_CONV2) / STRIDE_CONV2 + 1)

    print("\n--- Golden Conv2 Output (conv2_golden_output_sfixed) for Verilog Testbench ---")
    print(f"--- ({NUM_OUT_CHANNELS_CONV2} channel, {num_out_fr_conv2} frames each) ---")
    for oc, frame_values in enumerate(conv2_golden_output_sfixed):  # Should be 1 output channel
        row_sfixed_str = ", ".join(f"16'sd{val}" for val in frame_values)
        approx_floats = [f"{sfixed_to_float(val):.4f}" for val in frame_values]
        print(f"Output Channel {oc}: [{row_sfixed_str}]; \t// Approx Floats: [{', '.join(approx_floats)}]")



--- Generating Bit-Accurate Golden Values for Conv2 Layer ---

--- Input to Conv2 Module (conv2_input_sfixed) ---
--- (2 channels, 4 frames each) ---
Input Channel 0: [16'sd286, 16'sd286, 16'sd295, 16'sd295]; 	// Approx Floats: [1.1172, 1.1172, 1.1523, 1.1523]
Input Channel 1: [16'sd404, 16'sd404, 16'sd402, 16'sd402]; 	// Approx Floats: [1.5781, 1.5781, 1.5703, 1.5703]

--- Conv2 Kernels (kernels_conv2_sfixed) ---
// Kernel for Output Channel 0
  // InCh 0: {16'sd128, 16'sd26, 16'sd51}
  // InCh 1: {16'sd51, 16'sd77, 16'sd128}

--- Conv2 Biases (biases_conv2_sfixed) ---
{ 16'sd18 }

--- Golden Conv2 Output (conv2_golden_output_sfixed) for Verilog Testbench ---
--- (1 channel, 4 frames each) ---
Output Channel 0: [16'sd428, 16'sd652, 16'sd652, 16'sd397]; 	// Approx Floats: [1.6719, 2.5469, 2.5469, 1.5508]


# RELU 3 GOLDEN OUTPUT


In [None]:
import math # For round

# --- Fixed-point format: 16-bit signed word, 8 fractional bits ---
TOTAL_BITS = 16
FRACTIONAL_BITS = 8
INTEGER_BITS = TOTAL_BITS - 1 - FRACTIONAL_BITS  # bits left after the sign and fraction
SCALE = 1 << FRACTIONAL_BITS  # integer code that sfixed_to_float maps to 1.0

# Limits of a TOTAL_BITS-wide two's-complement word.
MAX_SFIXED_VAL = (1 << (TOTAL_BITS - 1)) - 1
MIN_SFIXED_VAL = -MAX_SFIXED_VAL - 1


def sfixed_to_float(val_sfixed, f_bits=FRACTIONAL_BITS):
    """Return the float represented by a scaled signed integer (value / 2**f_bits)."""
    return float(val_sfixed) / (1 << f_bits)

# --- ReLU (3rd Instance) Golden Value Generation ---
if __name__ == "__main__":
    print(f"--- Generating Golden Fixed-Point Values for 3rd ReLU Module ---")
    print(f"--- Input is the output of Conv2 ---")
    print(f"Fixed-Point Config: TOTAL_BITS={TOTAL_BITS}, FRACTIONAL_BITS={FRACTIONAL_BITS} (S{INTEGER_BITS+1}.{FRACTIONAL_BITS}), SCALE={SCALE}\n")

    # Stage input, indexed [channel][frame]. Per the original notes these are
    # the fixed-point integers produced by the Verilog conv2_module
    # simulation: [[428, 652, 652, 397]].
    relu3_input_sfixed = [
        [428, 652, 652, 397],  # Channel 0 (4 frames)
    ]
    num_channels_relu3 = 1
    num_frames_relu3 = 4  # Frames per channel

    print("--- Input to 3rd ReLU Module (relu3_input_sfixed) ---")
    print(f"--- ({num_channels_relu3} channel, {num_frames_relu3} frames each) ---")
    for ch, channel_words in enumerate(relu3_input_sfixed):
        row_sfixed_str = ", ".join(f"16'sd{val}" for val in channel_words)
        approx_floats = [f"{sfixed_to_float(val):.4f}" for val in channel_words]
        print(f"Input Channel {ch}: [{row_sfixed_str}]; \t// Approx Floats: [{', '.join(approx_floats)}]")

    # ReLU on raw fixed-point integers: negatives become 0, the rest pass through.
    relu3_golden_output_sfixed = [
        [0 if relu3_input_sfixed[ch][fr] < 0 else relu3_input_sfixed[ch][fr]
         for fr in range(num_frames_relu3)]
        for ch in range(num_channels_relu3)
    ]

    print("\n--- Golden Output for 3rd ReLU (relu3_golden_output_sfixed) for Verilog Testbench ---")
    print(f"--- ({num_channels_relu3} channel, {num_frames_relu3} frames each) ---")
    print(f"logic signed [15:0] golden_relu3_output [0:{num_channels_relu3-1}][0:{num_frames_relu3-1}];")
    print("initial begin")
    # One assignment line per (channel, frame) element.
    for ch, channel_words in enumerate(relu3_golden_output_sfixed):
        for fr, val in enumerate(channel_words):
            approx_float = sfixed_to_float(val)
            print(f"    golden_relu3_output[{ch}][{fr}] = 16'sd{val}; \t// Approx Float: {approx_float:.4f}")
    print("end")



--- Generating Golden Fixed-Point Values for 3rd ReLU Module ---
--- Input is the output of Conv2 ---
Fixed-Point Config: TOTAL_BITS=16, FRACTIONAL_BITS=8 (S8.8), SCALE=256

--- Input to 3rd ReLU Module (relu3_input_sfixed) ---
--- (1 channel, 4 frames each) ---
Input Channel 0: [16'sd428, 16'sd652, 16'sd652, 16'sd397]; 	// Approx Floats: [1.6719, 2.5469, 2.5469, 1.5508]

--- Golden Output for 3rd ReLU (relu3_golden_output_sfixed) for Verilog Testbench ---
--- (1 channel, 4 frames each) ---
logic signed [15:0] golden_relu3_output [0:0][0:3];
initial begin
    golden_relu3_output[0][0] = 16'sd428; 	// Approx Float: 1.6719
    golden_relu3_output[0][1] = 16'sd652; 	// Approx Float: 2.5469
    golden_relu3_output[0][2] = 16'sd652; 	// Approx Float: 2.5469
    golden_relu3_output[0][3] = 16'sd397; 	// Approx Float: 1.5508
end


# CONV3 GOLDEN OUTPUT

In [None]:
import math

# --- Fixed-point word layout shared by the golden-value cells ---
TOTAL_BITS = 16       # full signed word width
FRACTIONAL_BITS = 8   # bits to the right of the binary point
INTEGER_BITS = TOTAL_BITS - 1 - FRACTIONAL_BITS
SCALE = 1 << FRACTIONAL_BITS

# Smallest and largest integers a TOTAL_BITS-wide two's-complement word holds.
MAX_SFIXED_VAL = (1 << (TOTAL_BITS - 1)) - 1
MIN_SFIXED_VAL = -MAX_SFIXED_VAL - 1


def float_to_sfixed(val_float, f_bits=FRACTIONAL_BITS, total_bits=TOTAL_BITS):
    """Quantize a float to a saturated signed fixed-point integer.

    The input is scaled by 2**f_bits and rounded with the built-in round()
    (ties go to the even integer); results outside the total_bits
    two's-complement range are replaced by the nearest limit.
    """
    one = 1 << f_bits
    upper = (1 << (total_bits - 1)) - 1
    lower = -upper - 1
    quantized = int(round(val_float * one))
    if quantized > upper:
        return upper
    return lower if quantized < lower else quantized


def sfixed_to_float(val_sfixed, f_bits=FRACTIONAL_BITS):
    """Map a scaled signed integer to its float value by dividing by 2**f_bits."""
    divisor = 1 << f_bits
    return float(val_sfixed) / divisor

# --- Conv1D Fixed-Point Manual Calculation Function (reused) ---
def conv1d_fixed_point_manual(input_data_sfixed, # Shape: [NUM_IN_CHANNELS][NUM_IN_FRAMES]
                              kernels_sfixed,    # Shape: [NUM_OUT_CHANNELS][NUM_IN_CHANNELS][KERNEL_SIZE]
                              biases_sfixed,     # Shape: [NUM_OUT_CHANNELS]
                              padding, stride, kernel_size,
                              f_bits=FRACTIONAL_BITS, total_bits=TOTAL_BITS):
    """Compute a 1-D convolution in signed fixed-point integer arithmetic.

    Each input*kernel product is rounded back to ``f_bits`` fractional bits
    before it is accumulated. The bias is added to the accumulated sum and
    only that final value is saturated to ``total_bits``; the running sum
    itself is an unbounded Python int.

    Args:
        input_data_sfixed: Fixed-point integers indexed ``[in_channel][frame]``.
            The frame count is taken from the first channel. May be empty.
        kernels_sfixed: Fixed-point taps indexed ``[out_channel][in_channel][tap]``.
        biases_sfixed: One fixed-point bias per output channel.
        padding: Number of zero frames added to both ends of every channel.
        stride: Frame step between consecutive output positions.
        kernel_size: Number of taps read per input channel.
        f_bits: Number of fractional bits in the fixed-point format.
        total_bits: Word width used for the final saturation.

    Returns:
        A list indexed ``[out_channel][out_frame]`` of saturated fixed-point
        integers. Every channel list is empty when the padded input is
        shorter than the kernel.
    """
    num_in_ch = len(input_data_sfixed)
    num_in_fr = len(input_data_sfixed[0]) if num_in_ch > 0 else 0 # Handle empty input_data_sfixed
    num_out_ch = len(kernels_sfixed)

    scale = 1 << f_bits
    # Half an LSB of the S.F result, added before scaling down (S.2F -> S.F).
    round_const_product = 1 << (f_bits - 1) if f_bits > 0 else 0

    s_max = (1 << (total_bits - 1)) - 1
    s_min = -(1 << (total_bits - 1))

    def _saturate(value):
        return max(s_min, min(s_max, value))

    # Zero-pad both ends of every channel; list() also accepts tuple channels.
    zero_pad = [0] * padding
    padded_input_sfixed = [zero_pad + list(ch_data) + zero_pad for ch_data in input_data_sfixed]

    num_in_fr_padded = num_in_fr + 2 * padding
    if num_in_fr_padded < kernel_size: # check if kernel is larger than padded input
        num_out_fr = 0
    else:
        # Pure integer arithmetic; no float round trip for a frame count.
        num_out_fr = (num_in_fr_padded - kernel_size) // stride + 1

    output_data_sfixed = [[0] * num_out_fr for _ in range(num_out_ch)]

    for oc in range(num_out_ch):
        for f_out_idx in range(num_out_fr):
            receptive_field_start = f_out_idx * stride
            current_sum_s_dot_f = 0
            for ic in range(num_in_ch):
                for k_idx in range(kernel_size):
                    input_val_sfixed = padded_input_sfixed[ic][receptive_field_start + k_idx]
                    kernel_val_sfixed = kernels_sfixed[oc][ic][k_idx]
                    # S.F * S.F gives S.2F; add half an LSB and floor-divide to
                    # return to S.F. Floor division by a power of two matches an
                    # arithmetic right shift for negative products as well, so
                    # no sign-dependent branch is needed.
                    product_s_dot_2f = input_val_sfixed * kernel_val_sfixed
                    current_sum_s_dot_f += (product_s_dot_2f + round_const_product) // scale
            # The bias is already S.F; saturate only the final value.
            output_data_sfixed[oc][f_out_idx] = _saturate(current_sum_s_dot_f + biases_sfixed[oc])
    return output_data_sfixed

# --- Conv3_Output layer geometry ---
NUM_IN_CHANNELS_CONV3 = 1
NUM_OUT_CHANNELS_CONV3 = 2  # N_MELS
KERNEL_SIZE_CONV3 = 1
STRIDE_CONV3 = 1
PADDING_CONV3 = 0
NUM_IN_FRAMES_CONV3 = 4  # Output frames from 3rd ReLU (after Conv2)

if __name__ == "__main__":
    print(f"--- Generating Bit-Accurate Golden Values for Conv3_Output Layer ---")

    # Step 1: stimulus for Conv3 -- 1 channel of 4 frames, labelled in the
    # original notes as the output of the 3rd ReLU stage.
    conv3_input_sfixed = [
        [428, 652, 652, 397],  # Channel 0 from 3rd ReLU
    ]
    print("\n--- Input to Conv3_Output Module (conv3_input_sfixed) ---")
    print("--- (1 channel, 4 frames each) ---")
    for ch, frame_values in enumerate(conv3_input_sfixed):
        row_sfixed_str = ", ".join(f"16'sd{val}" for val in frame_values)
        approx_floats = [f"{sfixed_to_float(val):.4f}" for val in frame_values]
        print(f"Input Channel {ch}: [{row_sfixed_str}]; \t// Approx Floats: [{', '.join(approx_floats)}]")

    # Step 2: float weights and biases (the original notes attribute them to
    # the SimpleDecoder architecture). Indexed [out_ch=2][in_ch=1][k_size=1].
    kernels_conv3_float = [
        [[0.8]],  # Output Channel 0, Input Channel 0, Tap 0
        [[0.9]],  # Output Channel 1, Input Channel 0, Tap 0
    ]
    biases_conv3_float = [0.08, 0.09]  # Two biases for the two output channels

    # Step 3: quantize every tap and bias to fixed-point.
    kernels_conv3_sfixed = [
        [list(map(float_to_sfixed, in_ch_taps)) for in_ch_taps in out_ch_kernel]
        for out_ch_kernel in kernels_conv3_float
    ]
    biases_conv3_sfixed = list(map(float_to_sfixed, biases_conv3_float))

    print("\n--- Conv3_Output Kernels (kernels_conv3_sfixed) ---")
    for oc in range(NUM_OUT_CHANNELS_CONV3):
        print(f"// Kernel for Output Channel {oc}")
        for ic in range(NUM_IN_CHANNELS_CONV3):  # Should be 1 input channel
            tap_words = (f"16'sd{kernels_conv3_sfixed[oc][ic][k]}" for k in range(KERNEL_SIZE_CONV3))
            tap_str = ", ".join(tap_words)
            print(f"  // InCh {ic}: {{{tap_str}}}")

    print("\n--- Conv3_Output Biases (biases_conv3_sfixed) ---")
    bias_str = ", ".join(f"16'sd{biases_conv3_sfixed[oc]}" for oc in range(NUM_OUT_CHANNELS_CONV3))
    print(f"{{ {bias_str} }}")

    # Step 4: run the fixed-point reference convolution.
    conv3_golden_output_sfixed = conv1d_fixed_point_manual(
        conv3_input_sfixed,
        kernels_conv3_sfixed,
        biases_conv3_sfixed,
        PADDING_CONV3,
        STRIDE_CONV3,
        KERNEL_SIZE_CONV3,
    )

    # Frame count for the header, derived from the layer constants
    # (expected output shape: 2 channels, 4 frames).
    padded_frames_conv3 = NUM_IN_FRAMES_CONV3 + 2 * PADDING_CONV3
    num_out_fr_conv3 = math.floor((padded_frames_conv3 - KERNEL_SIZE_CONV3) / STRIDE_CONV3 + 1)

    print("\n--- Golden Conv3_Output (conv3_golden_output_sfixed) for Verilog Testbench ---")
    print(f"--- ({NUM_OUT_CHANNELS_CONV3} channels, {num_out_fr_conv3} frames each) ---")
    for oc, frame_values in enumerate(conv3_golden_output_sfixed):
        row_sfixed_str = ", ".join(f"16'sd{val}" for val in frame_values)
        approx_floats = [f"{sfixed_to_float(val):.4f}" for val in frame_values]
        print(f"Output Channel {oc}: [{row_sfixed_str}]; \t// Approx Floats: [{', '.join(approx_floats)}]")



--- Generating Bit-Accurate Golden Values for Conv3_Output Layer ---

--- Input to Conv3_Output Module (conv3_input_sfixed) ---
--- (1 channel, 4 frames each) ---
Input Channel 0: [16'sd428, 16'sd652, 16'sd652, 16'sd397]; 	// Approx Floats: [1.6719, 2.5469, 2.5469, 1.5508]

--- Conv3_Output Kernels (kernels_conv3_sfixed) ---
// Kernel for Output Channel 0
  // InCh 0: {16'sd205}
// Kernel for Output Channel 1
  // InCh 0: {16'sd230}

--- Conv3_Output Biases (biases_conv3_sfixed) ---
{ 16'sd20, 16'sd23 }

--- Golden Conv3_Output (conv3_golden_output_sfixed) for Verilog Testbench ---
--- (2 channels, 4 frames each) ---
Output Channel 0: [16'sd363, 16'sd542, 16'sd542, 16'sd338]; 	// Approx Floats: [1.4180, 2.1172, 2.1172, 1.3203]
Output Channel 1: [16'sd408, 16'sd609, 16'sd609, 16'sd380]; 	// Approx Floats: [1.5938, 2.3789, 2.3789, 1.4844]


# RELU 4 GOLDEN OUTPUT

In [None]:
import math # For round

# --- Fixed-Point Format Definition (signed, two's-complement word) ---
TOTAL_BITS = 16       # full word width, sign bit included
FRACTIONAL_BITS = 8   # bits to the right of the binary point
INTEGER_BITS = TOTAL_BITS - 1 - FRACTIONAL_BITS  # magnitude bits left of the point (sign excluded)
SCALE = 2 ** FRACTIONAL_BITS  # integer that represents the real value 1.0

# Representable range of the signed word: [-2**(N-1), 2**(N-1) - 1]
_HALF_RANGE = 2 ** (TOTAL_BITS - 1)
MIN_SFIXED_VAL = -_HALF_RANGE
MAX_SFIXED_VAL = _HALF_RANGE - 1

def sfixed_to_float(val_sfixed, f_bits=FRACTIONAL_BITS):
    """Return the real number encoded by a scaled signed integer.

    Args:
        val_sfixed: Fixed-point value stored as an integer scaled by 2**f_bits.
        f_bits: Number of fractional bits in the fixed-point format.

    Returns:
        ``val_sfixed`` divided by ``2**f_bits``, as a float.
    """
    # Weight of the integer 1.0 in this format; computed before the conversion
    # so that an invalid ``f_bits`` is reported first.
    units_per_one = 1 << f_bits
    as_float = float(val_sfixed)
    return as_float / units_per_one

# --- ReLU (4th Instance) Golden Value Generation ---
if __name__ == "__main__":
    # Banner: identify the stage and echo the fixed-point format in use.
    print("--- Generating Golden Fixed-Point Values for 4th (Final) ReLU Module ---")
    print("--- Input is the output of Conv3_Output ---")
    print(f"Fixed-Point Config: TOTAL_BITS={TOTAL_BITS}, FRACTIONAL_BITS={FRACTIONAL_BITS} (S{INTEGER_BITS + 1}.{FRACTIONAL_BITS}), SCALE={SCALE}\n")

    # Step 1 -- stimulus for the 4th ReLU stage.
    # One row per channel, one scaled-integer sample per frame; these are the
    # integers produced by the preceding Conv3_Output stage.
    relu4_input_sfixed = [
        [363, 542, 542, 338],  # Channel 0
        [408, 609, 609, 380],  # Channel 1
    ]
    num_channels_relu4 = 2
    num_frames_relu4 = 4  # samples (frames) in each channel

    print("--- Input to 4th ReLU Module (relu4_input_sfixed) ---")
    print(f"--- ({num_channels_relu4} channels, {num_frames_relu4} frames each) ---")
    for ch_idx in range(num_channels_relu4):
        channel_samples = relu4_input_sfixed[ch_idx]
        # Same samples rendered twice: as Verilog literals and as readable floats.
        literal_list = ", ".join(f"16'sd{sample}" for sample in channel_samples)
        float_list = ", ".join(f"{sfixed_to_float(sample):.4f}" for sample in channel_samples)
        print(f"Input Channel {ch_idx}: [{literal_list}]; \t// Approx Floats: [{float_list}]")

    # Step 2 -- golden ReLU result: negative samples become 0, all others pass
    # through unchanged.
    relu4_golden_output_sfixed = []
    for ch_idx in range(num_channels_relu4):
        source_row = relu4_input_sfixed[ch_idx]
        relu4_golden_output_sfixed.append(
            [
                source_row[fr_idx] if source_row[fr_idx] >= 0 else 0
                for fr_idx in range(num_frames_relu4)
            ]
        )

    # Step 3 -- emit the expected values as a testbench array declaration
    # followed by an `initial` block with one assignment per element.
    print("\n--- Golden Output for 4th ReLU (relu4_golden_output_sfixed) for Verilog Testbench ---")
    print(f"--- ({num_channels_relu4} channels, {num_frames_relu4} frames each) ---")
    print(f"logic signed [15:0] golden_relu4_output [0:{num_channels_relu4 - 1}][0:{num_frames_relu4 - 1}];")
    print("initial begin")
    for ch_idx, golden_row in enumerate(relu4_golden_output_sfixed):
        for fr_idx, golden in enumerate(golden_row):
            print(f"    golden_relu4_output[{ch_idx}][{fr_idx}] = 16'sd{golden}; \t// Approx Float: {sfixed_to_float(golden):.4f}")
    print("end")



--- Generating Golden Fixed-Point Values for 4th (Final) ReLU Module ---
--- Input is the output of Conv3_Output ---
Fixed-Point Config: TOTAL_BITS=16, FRACTIONAL_BITS=8 (S8.8), SCALE=256

--- Input to 4th ReLU Module (relu4_input_sfixed) ---
--- (2 channels, 4 frames each) ---
Input Channel 0: [16'sd363, 16'sd542, 16'sd542, 16'sd338]; 	// Approx Floats: [1.4180, 2.1172, 2.1172, 1.3203]
Input Channel 1: [16'sd408, 16'sd609, 16'sd609, 16'sd380]; 	// Approx Floats: [1.5938, 2.3789, 2.3789, 1.4844]

--- Golden Output for 4th ReLU (relu4_golden_output_sfixed) for Verilog Testbench ---
--- (2 channels, 4 frames each) ---
logic signed [15:0] golden_relu4_output [0:1][0:3];
initial begin
    golden_relu4_output[0][0] = 16'sd363; 	// Approx Float: 1.4180
    golden_relu4_output[0][1] = 16'sd542; 	// Approx Float: 2.1172
    golden_relu4_output[0][2] = 16'sd542; 	// Approx Float: 2.1172
    golden_relu4_output[0][3] = 16'sd338; 	// Approx Float: 1.3203
    golden_relu4_output[1][0] = 16'sd408; 