In [None]:
import json
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import fnmatch
from pathlib import Path
from IPython.display import display, Markdown
from scipy import stats

# Set up plotting style.
# The three style calls run first; the explicit rcParams.update() below runs
# last, so its values are the ones in effect for the keys it sets.
plt.style.use('seaborn-v0_8-paper')
sns.set_palette("colorblind")
sns.set_style('ticks')
plt.rcParams.update({
    'font.family': 'sans-serif',
    'font.size': 7,
    'axes.labelsize': 7,
    'axes.titlesize': 7,
    'xtick.labelsize': 6,
    'ytick.labelsize': 6,
    'legend.fontsize': 6,
    'lines.linewidth': 1.0,
    'lines.markersize': 4.0,
    'axes.linewidth': 0.8,
    'figure.dpi': 300,
    'savefig.dpi': 600,
    'figure.figsize': [3.5, 2.625],
    'figure.constrained_layout.use': True,
    'axes.xmargin': 0.05,
    'axes.ymargin': 0.05,
    'axes.spines.top': False,
    'axes.spines.right': False,
    'axes.grid': True,
    'grid.alpha': 0.3,
    # Constrained-layout padding around the axes. Left disabled: changing
    # these values was observed to have no visible effect.
    #'figure.constrained_layout.h_pad': 0.1,  # Padding in the height (vertical) direction
    #'figure.constrained_layout.w_pad': 0.1,  # Padding in the width (horizontal) direction
    # Reduce space between title and plot
    'axes.titlepad': 1,         # Distance between title and plot
    # Reduce space between labels and ticks
    'axes.labelpad': 1,         # Distance between axis label and ticks
    # Reduce tick parameters
    'xtick.major.pad': 1,       # Distance between tick and tick label
    'ytick.major.pad': 1,
    
    # Adjust subplot spacing
    # NOTE(review): these margins are presumably ignored while
    # 'figure.constrained_layout.use' is True -- confirm before relying on them.
    'figure.subplot.top': 0.95,    # Top margin
    'figure.subplot.bottom': 0.15, # Bottom margin
    'figure.subplot.left': 0.15,   # Left margin
    'figure.subplot.right': 0.95   # Right margin
})

# Global plot saving configuration
SAVE_PLOTS = True  # Master switch read by save_plot(); set to False to never write plot files
PLOT_DIR = Path('/var/home/thorben/git/bachelor_thesis_docs/written_bachelor_thesis/bilder/plots')  # Directory for saved plots
# Directory containing the benchmark JSON files. load_benchmark_results()
# concatenates file names onto this string, so the trailing slash is required.
RESULT_PATH = "/var/home/thorben/git/bachelor_thesis_docs/jupiter/results/"

def save_plot(name, fig=None):
    """Save a figure as a PNG file in PLOT_DIR if SAVE_PLOTS is enabled.

    The file is named "<name>.png"; an existing file with the same name is
    overwritten. Nothing happens when SAVE_PLOTS is falsy.

    Args:
        name: Name of the plot (will be used as filename, without extension)
        fig: Figure to save (defaults to current figure)
    """
    if not SAVE_PLOTS:
        return

    # parents=True: also create missing parent directories instead of failing
    # with FileNotFoundError on a fresh checkout.
    PLOT_DIR.mkdir(parents=True, exist_ok=True)

    # Get current figure if none provided
    if fig is None:
        fig = plt.gcf()

    filename = PLOT_DIR / f"{name}.png"
    fig.savefig(filename, bbox_inches='tight', dpi=600)
    print(f"Saved plot to {filename}")

# Load benchmark results
def load_benchmark_results(files):
    """Load one or more benchmark JSON files from RESULT_PATH.

    Args:
        files: A single file name (str or Path) or an iterable of file names,
            each relative to RESULT_PATH.

    Returns:
        Dict mapping each file's stem (name without ".json") to the parsed
        JSON content. Files with the same stem overwrite each other.
    """
    if isinstance(files, (str, Path)):
        files = [files]

    # Join with pathlib so the result does not depend on RESULT_PATH ending
    # with a path separator.
    base_dir = Path(RESULT_PATH)
    results = {}
    for file in files:
        path = base_dir / file
        with open(path, 'r', encoding='utf-8') as f:
            results[path.stem] = json.load(f)
    return results

def get_title_metadata(result_key: str, with_unterscores=False):
    """Turn a benchmark result key into a short label.

    Args:
        result_key: Key of a loaded benchmark, e.g. "benchmark_dim1024_k100_q"
        with_unterscores: If True keep underscores (suitable for file names);
            if False replace them with spaces (suitable for titles)

    Returns:
        The key with every "benchmark_" occurrence removed and, unless
        with_unterscores is True, the remaining underscores turned into spaces.
    """
    # Strip the prefix first: it contains an underscore itself and would no
    # longer match once underscores were replaced.
    label = result_key.replace("benchmark_", '')
    if not with_unterscores:
        # str.replace returns a new string; the result must be kept.
        label = label.replace("_", " ")
    return label

def calculate_method_memory(method: str, num_vectors, vector_dim):
    """Estimate the storage a search method needs for the whole dataset.

    The method family is recognised by name prefix; the checks run in a fixed
    order, so 'float16' is tested before the more general 'float'.

    Args:
        method: Name of the method
        num_vectors: Number of vectors in the dataset
        vector_dim: Dimension of each vector
    Returns:
        Memory usage in bytes
    Raises:
        ValueError: If the method name matches no known prefix
    """
    if method.startswith('float16'):
        # 2 bytes per component
        return num_vectors * vector_dim * 2

    if method.startswith(('float', 'avx2')):
        # 4 bytes per component
        return num_vectors * vector_dim * 4

    if method.startswith('binary'):
        # 1 bit per component, rounded up to whole bytes per vector
        return num_vectors * ((vector_dim + 7) // 8)

    if method.startswith(('int8', 'mf')):
        # 1 byte per component
        return num_vectors * vector_dim

    if method.startswith('pca'):
        # A numeric suffix ("pca4") is the reduction factor; otherwise 1.
        suffix = method[3:]
        factor = int(suffix) if suffix.isdigit() else 1
        kept_dim = vector_dim // factor
        reduced_vectors = num_vectors * kept_dim * 4  # floats after projection
        projection_matrix = vector_dim * kept_dim * 4  # PCA transformation matrix
        return reduced_vectors + projection_matrix

    if method.startswith('twostep'):
        # Binary copy plus the full float vectors
        packed = num_vectors * ((vector_dim + 7) // 8)
        floats = num_vectors * vector_dim * 4
        return packed + floats

    if method.startswith('ts_mf'):
        # One byte per component ('mf') plus the binary copy
        one_byte = num_vectors * vector_dim
        packed = num_vectors * ((vector_dim + 7) // 8)
        return one_byte + packed

    raise ValueError(f"Unknown method: {method}")

def calculate_bandwidth_memory(method: str, num_vectors, vector_dim, k):
    """Estimate how many bytes one search reads, for bandwidth calculations.

    Single-pass methods read every stored vector once. The two-step methods
    read all binary vectors and then k * rf candidates for rescoring, where
    rf is the integer that follows the method prefix.

    Args:
        method: Name of the method
        num_vectors: Number of vectors in dataset
        vector_dim: Dimension of vectors
        k: Number of neighbors to retrieve
    Returns:
        Memory accessed in bytes
    Raises:
        ValueError: If the method name matches no known prefix, or the
            rescoring factor suffix is not an integer
    """
    if method.startswith('float16'):
        return num_vectors * vector_dim * 2

    if method.startswith(('float', 'avx2')):
        # Full scan over the float vectors
        return num_vectors * vector_dim * 4

    if method.startswith('binary'):
        # Full scan over the bit-packed vectors
        return num_vectors * ((vector_dim + 7) // 8)

    if method.startswith(('int8', 'mf')):
        # Full scan over one-byte-per-component vectors
        return num_vectors * vector_dim

    if method.startswith('pca'):
        # Full scan over the reduced-dimension float vectors
        suffix = method[3:]
        factor = int(suffix) if suffix.isdigit() else 1
        return num_vectors * (vector_dim // factor) * 4

    if method.startswith('twostep_rf'):
        # Phase 1: all binary vectors. Phase 2: k * rf float vectors.
        first_pass = num_vectors * ((vector_dim + 7) // 8)
        rescoring_factor = int(method.replace('twostep_rf', ''))
        second_pass = k * rescoring_factor * vector_dim * 4
        return first_pass + second_pass

    if method.startswith('ts_mf_rf'):
        # Phase 1: all binary vectors. Phase 2: k * rf one-byte vectors.
        first_pass = num_vectors * ((vector_dim + 7) // 8)
        rescoring_factor = int(method.replace('ts_mf_rf', ''))
        second_pass = k * rescoring_factor * vector_dim * 1
        return first_pass + second_pass

    raise ValueError(f"Unknown method: {method}")

# Benchmark result files (relative to RESULT_PATH) analysed below.
# Each one becomes an entry in `results`, keyed by its file name without ".json".
results = load_benchmark_results(
    [
        'benchmark_dim1024_k100_q.json',
        'benchmark_dim1024_k100_re.json',
        'benchmark_dim1024_k25_q.json',
        'benchmark_dim1024_k10_q.json',
        'benchmark_dim768_k100_q.json',
        'benchmark_dim768_k100_re.json',
        'benchmark_results_1733419058.json',
    ]
)

In [None]:
def plot_performance_comparison(results, result_key, skip_methods=None, enable_save_plot=False, name_postfix=""):
    """Draw a bar chart of the mean search time per method.

    Args:
        results: Dictionary containing benchmark results
        result_key: Key of the benchmark results to plot
        skip_methods: fnmatch patterns of method names to leave out
        enable_save_plot: Pass the figure to save_plot() when True
        name_postfix: Text appended to the saved file name
    """
    if skip_methods is None:
        skip_methods = []

    summary = results[result_key]['summary']

    # `method_stats` rather than `stats` so the imported scipy.stats module is
    # not shadowed.
    rows = [
        {
            'Method': method,
            'Mean Time (ms)': method_stats['time_us']['mean'] / 1000,
            'Std Time': method_stats['time_us']['std'] / 1000,
        }
        for method, method_stats in summary.items()
        if not any(fnmatch.fnmatch(method, pattern) for pattern in skip_methods)
    ]

    # An empty frame has no 'Method' column and would make seaborn raise.
    if not rows:
        print("No methods left to plot after applying skip_methods")
        return

    df = pd.DataFrame(rows)

    fig, ax = plt.subplots()

    # NOTE(review): df holds exactly one row per method, so seaborn has a
    # single observation per bar and 'Std Time' is not used by the plot.
    bars = sns.barplot(data=df, x='Method', y='Mean Time (ms)',
                       errorbar=('ci', 68), ax=ax)

    ax.set_xlabel('Method')
    ax.set_ylabel('Search Time (ms)')
    ax.set_title('Search Performance Comparison')

    plt.xticks(rotation=30, ha='right')

    # Print the bar height above every bar
    for bar in bars.containers[0]:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.1f}', ha='center', va='bottom', fontsize=7)

    plt.tight_layout()
    if enable_save_plot:
        save_plot(f'performance_comparison_{get_title_metadata(result_key=result_key, with_unterscores=True)}{name_postfix}')
    plt.show()

# Generate plots: per result set, a reduced view (saved for the 1024/768
# query benchmarks) and, for the 1024-dim sets, a view hiding only float16.
plot_performance_comparison(
    results,
    'benchmark_dim1024_k100_q',
    skip_methods=['float', 'mf', 'float16', 'twostep_rf5', 'twostep_rf10', 'twostep_rf25', 'ts_mf_rf5', 'ts_mf_rf10', 'ts_mf_rf25'],
    enable_save_plot=True,
)
plot_performance_comparison(
    results,
    'benchmark_dim1024_k100_q',
    skip_methods=['float16'],
)
plot_performance_comparison(
    results,
    'benchmark_dim1024_k100_re',
    skip_methods=['float', 'mf', 'float16'],
)
plot_performance_comparison(
    results,
    'benchmark_dim1024_k100_re',
    skip_methods=['float16'],
)
plot_performance_comparison(
    results,
    'benchmark_dim768_k100_q',
    skip_methods=['float', 'mf', 'float16', 'twostep_rf5', 'twostep_rf10', 'twostep_rf25', 'ts_mf_rf5', 'ts_mf_rf10', 'ts_mf_rf25'],
    enable_save_plot=True,
)
plot_performance_comparison(
    results,
    'benchmark_results_1733419058',
    skip_methods=['float', 'mf', 'float16'],
)

In [None]:
def plot_accuracy_heatmap(results, result_key, only_methods=None, enable_save_plot=False, name_postfix=""):
    """Draw a heatmap of NDCG, Jaccard and normalised overlap per method.

    The 'float' method is always left out. A metric missing from a method's
    summary falls back to a perfect score (1.0, or k for the overlap).

    Args:
        results: Dictionary containing benchmark results
        result_key: Key of the benchmark results to plot
        only_methods: fnmatch patterns of methods to include (None = all)
        enable_save_plot: Pass the figure to save_plot() when True
        name_postfix: Text appended to the saved file name
    """
    k = results[result_key]['metadata']['k']

    metrics = []
    # `method_stats` rather than `stats` so the imported scipy.stats module is
    # not shadowed.
    for method, method_stats in results[result_key]['summary'].items():
        if method == 'float':
            continue
        if only_methods is not None and not any(
                fnmatch.fnmatch(method, pattern) for pattern in only_methods):
            continue

        overlap = method_stats.get('overlap', {}).get('mean', k)
        metrics.append({
            'Method': method,
            'NDCG': method_stats.get('ndcg', {}).get('mean', 1.0),
            'Jaccard': method_stats.get('jaccard_index', {}).get('mean', 1.0),
            # Overlap is a count out of k; scale it to the 0..1 colour range
            'Overlap': overlap / k
        })

    if not metrics:
        print("No valid metrics found to create heatmap")
        return

    df = pd.DataFrame(metrics).set_index('Method')

    fig, ax = plt.subplots()

    sns.heatmap(df, annot=True, fmt='.3f', cmap='YlOrRd',
                vmin=0, vmax=1, center=0.5, ax=ax,
                annot_kws={'size': 7},
                cbar=False)  # Disable colorbar

    ax.set_title('Accuracy Metrics Comparison')
    plt.tight_layout()
    if enable_save_plot:
        save_plot(f'accuracy_heatmap_{get_title_metadata(result_key=result_key, with_unterscores=True)}{name_postfix}')
    plt.show()

# Generate plots: one heatmap for the single-pass methods, one for the
# two-step methods (saved with the "_twostep" postfix).
plot_accuracy_heatmap(
    results,
    'benchmark_dim1024_k100_q',
    only_methods=['avx2','binary', 'int8', 'float16', 'mf', 'pca*'],
    enable_save_plot=True,
)
plot_accuracy_heatmap(
    results,
    'benchmark_dim1024_k100_q',
    only_methods=['twostep*','ts*'],
    enable_save_plot=True,
    name_postfix="_twostep",
)
#plot_accuracy_heatmap(results, 'benchmark_dim1024_k100_re')
#plot_accuracy_heatmap(results, 'benchmark_results_1733419058', only_methods=['twostep*','ts*'])

In [None]:
def plot_pareto_frontier(results, result_key, skip_methods=('float',), enable_save_plot=False, name_postfix=""):
    """Create a scatter plot showing the trade-off between speed and accuracy.

    Args:
        results: Dictionary containing benchmark results
        result_key: Key of the benchmark results to plot
        skip_methods: Method names or fnmatch patterns to exclude
        enable_save_plot: Pass the figure to save_plot() when True
        name_postfix: Text appended to the saved file name
    """
    summary = results[result_key]['summary']

    # Label offsets (in points) for methods whose default label position
    # would be hard to read.
    label_offsets = {
        'twostep_rf25': (5, -2),
        'twostep_rf50': (5, 3),
        'twostep_rf2': (5, 3),
    }

    # Collect data points. Patterns are matched with fnmatch, like the other
    # plots; plain method names still match only themselves.
    data = [
        {
            'Method': method,
            'Time (ms)': method_stats['time_us']['mean'] / 1000,
            'NDCG': method_stats.get('ndcg', {}).get('mean', 1.0),
        }
        for method, method_stats in summary.items()
        if not any(fnmatch.fnmatch(method, pattern) for pattern in skip_methods)
    ]

    # An empty frame has no columns and would make seaborn raise.
    if not data:
        print("No methods left to plot after applying skip_methods")
        return

    df = pd.DataFrame(data)

    fig, ax = plt.subplots()

    # Create scatter plot
    sns.scatterplot(data=df, x='Time (ms)', y='NDCG', s=50, ax=ax)

    # Add method labels next to points
    for _, row in df.iterrows():
        ax.annotate(row['Method'],
                    (row['Time (ms)'], row['NDCG']),
                    xytext=label_offsets.get(row['Method'], (5, -5)),
                    textcoords='offset points',
                    fontsize=6)

    ax.set_title('Speed-Accuracy Trade-off Analysis')
    ax.set_xlabel('Search Time (ms)')
    ax.set_ylabel('NDCG Score')

    # Ensure both axes start at 0
    ax.set_ylim(0, 1.05)
    ax.set_xlim(0)

    plt.tight_layout()
    if enable_save_plot:
        save_plot(f'speed_vs_accuracy_{get_title_metadata(result_key=result_key, with_unterscores=True)}{name_postfix}')
    plt.show()

# Generate plots: speed vs. accuracy for three result sets; only the first
# one is written to disk.
plot_pareto_frontier(
    results,
    'benchmark_dim1024_k100_q',
    skip_methods=['float', 'float16', 'mf', 'twostep_rf5', 'twostep_rf10', 'twostep_rf25', 'ts_mf_rf5', 'ts_mf_rf10', 'ts_mf_rf25'],
    enable_save_plot=True,
)
plot_pareto_frontier(
    results,
    'benchmark_dim1024_k100_re',
    skip_methods=['float', 'float16', 'mf'],
)
plot_pareto_frontier(
    results,
    'benchmark_results_1733419058',
    skip_methods=['float', 'float16', 'mf'],
)

In [None]:
def plot_pareto_frontier_memory(results, result_key, skip_methods=('float',), enable_save_plot=False, name_postfix=""):
    """Create a scatter plot showing the trade-off between memory and accuracy.

    Memory is the estimate from calculate_method_memory(), shown in GiB
    (bytes / 1024**3).

    Args:
        results: Dictionary containing benchmark results
        result_key: Key of the benchmark results to plot
        skip_methods: Method names or fnmatch patterns to exclude
        enable_save_plot: Pass the figure to save_plot() when True
        name_postfix: Text appended to the saved file name
    """
    summary = results[result_key]['summary']
    num_vectors = results[result_key]['metadata']['num_vectors']
    vector_dim = results[result_key]['metadata']['vector_dim']

    # Label offsets (in points) for methods whose default label position
    # would be hard to read.
    label_offsets = {
        'twostep_rf25': (5, -2),
        'twostep_rf50': (5, 3),
        'avx2': (-12, -5),
        'mf': (-12, -5),
    }

    # Collect data points. Patterns are matched with fnmatch, like the other
    # plots; plain method names still match only themselves.
    data = [
        {
            'Method': method,
            'Memory (GB)': calculate_method_memory(method, num_vectors, vector_dim) / 1024 / 1024 / 1024,
            'NDCG': method_stats.get('ndcg', {}).get('mean', 1.0),
        }
        for method, method_stats in summary.items()
        if not any(fnmatch.fnmatch(method, pattern) for pattern in skip_methods)
    ]

    # An empty frame has no columns and would make seaborn raise.
    if not data:
        print("No methods left to plot after applying skip_methods")
        return

    df = pd.DataFrame(data)

    fig, ax = plt.subplots()

    # Create scatter plot
    sns.scatterplot(data=df, x='Memory (GB)', y='NDCG', s=50, ax=ax)

    # Add method labels next to points
    for _, row in df.iterrows():
        ax.annotate(row['Method'],
                    (row['Memory (GB)'], row['NDCG']),
                    xytext=label_offsets.get(row['Method'], (5, -5)),
                    textcoords='offset points',
                    fontsize=6)

    ax.set_title('Memory-Accuracy Trade-off Analysis')
    ax.set_xlabel('Used Memory (GB)')
    ax.set_ylabel('NDCG Score')

    # Ensure both axes start at 0
    ax.set_ylim(0, 1.05)
    ax.set_xlim(0)

    plt.tight_layout()
    if enable_save_plot:
        save_plot(f'memory_vs_accuracy_{get_title_metadata(result_key=result_key, with_unterscores=True)}{name_postfix}')
    plt.show()

# Generate plots: memory vs. accuracy for three result sets; only the first
# one is written to disk.
# NOTE(review): '_ts_mf_rf25' has a leading underscore and therefore matches
# no method name -- confirm whether keeping ts_mf_rf25 visible is intended.
plot_pareto_frontier_memory(
    results,
    'benchmark_dim1024_k100_q',
    skip_methods=['float', 'ts_mf_rf5', 'ts_mf_rf10', '_ts_mf_rf25', 'ts_mf_rf50'],
    enable_save_plot=True,
)
plot_pareto_frontier_memory(
    results,
    'benchmark_dim1024_k100_re',
    skip_methods=['float'],
)
plot_pareto_frontier_memory(
    results,
    'benchmark_results_1733419058',
    skip_methods=['float', 'ts_mf_rf5', 'ts_mf_rf10', '_ts_mf_rf25', 'ts_mf_rf50'],
)

In [None]:
def plot_twostep_comparison(results, result_key, enable_save_plot=False, name_postfix=""):
    """Plot comparison of different rescoring factors for twostep methods.

    Creates two plots:
    1. Search time vs rescoring factor
    2. Search quality (NDCG and Jaccard) vs rescoring factor

    Methods named 'twostep_rf<N>' and 'ts_mf_rf<N>' are both included. Each
    family is drawn as its own line: putting both into one series would make
    seaborn average two different methods at every shared rescoring factor.
    When only one family is present the plots look as before (no family
    suffix in the legend).

    Args:
        results: Dictionary containing benchmark results
        result_key: Key of the benchmark results to plot
        enable_save_plot: Pass the figures to save_plot() when True
        name_postfix: Text appended to the saved file names
    """
    summary = results[result_key]['summary']
    family_prefixes = ('twostep_rf', 'ts_mf_rf')

    # Collect data; the rescoring factor is the integer after the prefix.
    data = []
    for family in family_prefixes:
        for method, method_stats in summary.items():
            if not method.startswith(family):
                continue
            data.append({
                'Family': family,
                'Rescoring Factor': int(method.replace(family, '')),
                'Mean Time (ms)': method_stats['time_us']['mean'] / 1000,
                'Mean NDCG': method_stats.get('ndcg', {}).get('mean', 1.0),
                'Jaccard Index': method_stats.get('jaccard_index', {}).get('mean', 1.0)
            })

    if not data:
        print("No twostep methods found in results")
        return

    df = pd.DataFrame(data).sort_values('Rescoring Factor')
    present = set(df['Family'])
    families = [family for family in family_prefixes if family in present]
    several_families = len(families) > 1
    rescoring_factors = sorted(df['Rescoring Factor'].unique())

    # Plot 1: Search Time
    fig1, ax1 = plt.subplots()

    for family in families:
        subset = df[df['Family'] == family]
        # Only label the lines when there is something to tell apart
        line_kwargs = {'label': family} if several_families else {}
        sns.lineplot(data=subset, x='Rescoring Factor', y='Mean Time (ms)',
                     marker='o', ax=ax1, **line_kwargs)

    ax1.set_title('Search Time vs Rescoring Factor')
    ax1.set_xlabel('Rescoring Factor')
    ax1.set_ylabel('Mean Search Time (ms)')

    # Ensure x-axis shows all integer rescoring factors
    ax1.set_xticks(rescoring_factors)
    if several_families:
        ax1.legend()

    plt.tight_layout()
    if enable_save_plot:
        save_plot(f'twostep_time_{result_key}{name_postfix}')
    plt.show()

    # Plot 2: Search Quality
    fig2, ax2 = plt.subplots()

    for family in families:
        subset = df[df['Family'] == family]
        suffix = f' ({family})' if several_families else ''
        sns.lineplot(data=subset, x='Rescoring Factor', y='Mean NDCG',
                     marker='o', label=f'NDCG{suffix}', ax=ax2)
        sns.lineplot(data=subset, x='Rescoring Factor', y='Jaccard Index',
                     marker='o', label=f'Jaccard{suffix}', ax=ax2)

    ax2.set_title('Search Quality vs Rescoring Factor')
    ax2.set_xlabel('Rescoring Factor')
    ax2.set_ylabel('Score')

    # Ensure x-axis shows all integer rescoring factors
    ax2.set_xticks(rescoring_factors)

    # Set y-axis limits for scores
    ax2.set_ylim(0, 1.05)

    ax2.legend()

    plt.tight_layout()
    if enable_save_plot:
        save_plot(f'twostep_comparison_{get_title_metadata(result_key=result_key, with_unterscores=True)}{name_postfix}')
    plt.show()

# Generate plots (displayed only, not saved) for k=100 and k=25
plot_twostep_comparison(
    results,
    'benchmark_dim1024_k100_q',
)
plot_twostep_comparison(
    results,
    'benchmark_dim1024_k25_q',
)

In [None]:
def plot_twostep_comparison_combined(results, result_key, show_method='twostep_rf', enable_save_plot=False, name_postfix=""):
    """Draw time and quality of one two-step method family in a single figure.

    The figure has two y-axes over a shared rescoring-factor x-axis:
    - left: mean search time in milliseconds
    - right: mean NDCG and mean Jaccard index (fixed range 0 to 1.05)

    Args:
        results: Dictionary containing benchmark results
        result_key: Key of the benchmark results to plot
        show_method: Method name prefix; the text after it is parsed as the
            integer rescoring factor
        enable_save_plot: Pass the figure to save_plot() when True
        name_postfix: Text appended to the saved file name
    """
    summary = results[result_key]['summary']

    # Every method whose name starts with the requested prefix
    selected = [name for name in summary.keys() if name.startswith(show_method)]
    if not selected:
        print("No twostep methods found in results")
        return

    # One record per method
    records = []
    for name in selected:
        entry = summary[name]
        records.append({
            'Rescoring Factor': int(name.replace(show_method, '')),
            'Mean Time (ms)': entry['time_us']['mean'] / 1000,
            'Mean NDCG': entry.get('ndcg', {}).get('mean', 1.0),
            'Jaccard Index': entry.get('jaccard_index', {}).get('mean', 1.0)
        })
    table = pd.DataFrame(records).sort_values('Rescoring Factor')
    factors = table['Rescoring Factor']

    # Left axis carries the time, its twin on the right carries the scores
    fig, time_ax = plt.subplots()
    score_ax = time_ax.twinx()

    time_line = time_ax.plot(factors, table['Mean Time (ms)'],
                             marker='o', color='black', label='Time')
    time_ax.set_xlabel('Rescoring Factor')
    time_ax.set_ylabel('Mean Search Time (ms)')

    ndcg_line = score_ax.plot(factors, table['Mean NDCG'],
                              marker='s', linestyle='--', color='gray', label='NDCG')
    jaccard_line = score_ax.plot(factors, table['Jaccard Index'],
                                 marker='^', linestyle=':', color='lightgray', label='Jaccard')
    score_ax.set_ylabel('Score')
    score_ax.set_ylim(0, 1.05)

    # One tick per rescoring factor that was measured
    time_ax.set_xticks(factors)

    # A single legend for the lines of both axes
    handles = time_line + ndcg_line + jaccard_line
    time_ax.legend(handles, [handle.get_label() for handle in handles], loc='center right')

    plt.title('Twostep Method Performance' + ' ' + show_method)
    plt.tight_layout()

    if enable_save_plot:
        save_plot(f'twostep_comparison_combined_{result_key}{name_postfix}')
    plt.show()

# Generate the combined plots: float rescoring first, then the 'ts_mf_rf'
# family (saved with the "_mf" postfix).
# plot_twostep_comparison(results_dict, 'benchmark_results_example')
plot_twostep_comparison_combined(
    results,
    'benchmark_dim1024_k100_q',
    enable_save_plot=True,
)
plot_twostep_comparison_combined(
    results,
    'benchmark_dim1024_k100_q',
    show_method='ts_mf_rf',
    enable_save_plot=True,
    name_postfix="_mf",
)
#plot_twostep_comparison_combined(results, 'benchmark_dim1024_k25_q')
#plot_twostep_comparison_combined(results, 'benchmark_results_1733419058')
#plot_twostep_comparison_combined(results, 'benchmark_results_1733419058', show_method='ts_mf_rf')

In [None]:
def plot_metric_boxplots(results, result_key, metric='ndcg', skip_methods=None, enable_save_plot=False, name_postfix=""):
    """Create box plots for NDCG or Jaccard Index distributions.

    One box per method, built from the per-search metric values of every run.
    The 'float' method is always left out. After the plot a describe()
    summary per method is printed.

    Args:
        results: Dictionary containing benchmark results
        result_key: Key of the benchmark results to plot
        metric: Which metric to plot ('ndcg' or 'jaccard')
        skip_methods: fnmatch patterns of method names to skip in the plot
        enable_save_plot: Pass the figure to save_plot() when True
        name_postfix: Text appended to the saved file name

    Raises:
        ValueError: If metric is neither 'ndcg' nor 'jaccard'
    """
    if skip_methods is None:
        skip_methods = []

    # Configure based on metric
    metric_config = {
        'ndcg': {'key': 'ndcg', 'title': 'NDCG', 'ylabel': 'NDCG Score'},
        'jaccard': {'key': 'jaccard_index', 'title': 'Jaccard Index', 'ylabel': 'Jaccard Index'}
    }

    if metric not in metric_config:
        raise ValueError(f"Metric must be one of {list(metric_config.keys())}")

    config = metric_config[metric]

    # Collect metric data from each run
    plot_data = []
    for run in results[result_key]['runs']:
        for search in run['searches']:
            method = search['method']
            if method == 'float':
                continue
            if any(fnmatch.fnmatch(method, pattern) for pattern in skip_methods):
                continue

            # Only the lookups can raise KeyError; keep the try body minimal.
            try:
                value = search['metrics'][config['key']]
            except KeyError:
                print(f"Warning: Missing {config['key']} metric for method {method}")
                continue

            plot_data.append({
                'Method': method,
                'Value': value
            })

    if not plot_data:
        print(f"No valid {metric} data found to plot")
        return

    df = pd.DataFrame(plot_data)

    plt.figure()

    # Create boxplot: unfilled boxes, everything drawn in black
    sns.boxplot(data=df, x='Method', y='Value',
                showfliers=True,
                width=0.6,
                linewidth=0.8,
                fliersize=1.0,
                fill=False,
                flierprops={"marker": "x", 'markerfacecolor': 'black', 'markeredgecolor': 'black'},
                boxprops={'color': 'black'},  # Set box color to black
                whiskerprops={'color': 'black'},  # Match whisker color
                medianprops={'color': 'black'},  # Match median line color
                capprops={'color': 'black'},
                notch=False)

    # Customize plot
    plt.xticks(rotation=30, ha='right')
    plt.title(f'{config["title"]} Score Distributions')
    plt.ylabel(config['ylabel'])
    plt.xlabel('Method')
    plt.ylim(0, 1.025)  # Both metrics are between 0 and 1

    plt.tight_layout()

    # Save plot if enabled
    if enable_save_plot:
        save_plot(f'{metric}_boxplots_{get_title_metadata(result_key=result_key, with_unterscores=True)}{name_postfix}')

    plt.show()

    # Print statistical summary
    print(f"\n{config['title']} Statistical Summary:")
    summary = df.groupby('Method')['Value'].describe()
    print(summary.round(4).to_string())

# Generate plots: for each result set, an NDCG and a Jaccard box plot of the
# single-pass methods, then the same pair for the two-step methods (saved
# with the "_twostep" postfix).
boxplot_variants = [
    (['ts_mf*', 'twostep_rf*'], ""),
    (['avx2', 'binary', 'int8', 'float16', 'mf', 'pca*'], "_twostep"),
]
for boxplot_key in ('benchmark_dim1024_k100_q', 'benchmark_dim1024_k100_re'):
    for boxplot_skips, boxplot_postfix in boxplot_variants:
        for boxplot_metric in ('ndcg', 'jaccard'):
            plot_metric_boxplots(
                results,
                boxplot_key,
                metric=boxplot_metric,
                skip_methods=boxplot_skips,
                name_postfix=boxplot_postfix,
                enable_save_plot=True,
            )
#plot_metric_boxplots(results, 'benchmark_dim1024_k10_q', metric='ndcg', skip_methods=['ts_mf*', 'twostep_rf*'])
#plot_metric_boxplots(results, 'benchmark_dim1024_k100_re', metric='ndcg')
#plot_metric_boxplots(results, 'benchmark_dim1024_k100_re', metric='jaccard')

In [None]:
def plot_performance_vs_memory(results, result_key, enable_save_plot=False, name_postfix=""):
    """Scatter the speedup over 'avx2' against the estimated memory use.

    Every method except 'float' becomes one labelled marker. Its area grows
    with the cube of the method's mean NDCG, so more accurate methods get
    larger markers.

    Args:
        results: Dictionary containing benchmark results
        result_key: Key of the benchmark results to plot
        enable_save_plot: Pass the figure to save_plot() when True
        name_postfix: Text appended to the saved file name
    """
    benchmark = results[result_key]
    summary = benchmark['summary']
    num_vectors = benchmark['metadata']['num_vectors']
    vector_dim = benchmark['metadata']['vector_dim']

    # All speedups are relative to the mean time of the 'avx2' method
    baseline_time = summary['avx2']['time_us']['mean']

    points = []
    for name, entry in summary.items():
        if name == 'float':
            continue
        score = entry.get('ndcg', {}).get('mean', 1.0)
        points.append({
            'Method': name,
            'Memory (MB)': calculate_method_memory(name, num_vectors, vector_dim) / (1024 * 1024),
            'Speedup': baseline_time / entry['time_us']['mean'],
            'NDCG': score,
            'Size': score * score * score * 250  # Adjusted size scaling
        })

    table = pd.DataFrame(points)

    plt.figure()
    plt.scatter(table['Memory (MB)'], table['Speedup'],
                s=table['Size'], alpha=0.6,
                color='black')  # Single color (black)

    # Label every marker with its method name
    for label, memory_mb, speedup in zip(table['Method'], table['Memory (MB)'], table['Speedup']):
        plt.annotate(label,
                     (memory_mb, speedup),
                     xytext=(5, 5), textcoords='offset points',
                     fontsize=8)

    plt.title('Performance vs Memory Usage')
    plt.xlabel('Memory Usage (MB)')
    plt.ylabel('Speedup vs Baseline (avx2)')

    plt.tight_layout()

    if enable_save_plot:
        save_plot(f'performance_vs_memory_{get_title_metadata(result_key=result_key, with_unterscores=True)}{name_postfix}')

    plt.show()

# Generate plot (displayed only, not saved)
plot_performance_vs_memory(
    results,
    'benchmark_dim1024_k100_q',
)


In [None]:
def create_query_table(results, result_key, num_variance=5, num_extreme=3, num_random=0, random_seed=42):
    """Print a table of per-query NDCG scores for a selection of queries.

    Selected are the queries with the lowest and highest mean NDCG across
    methods, those with the highest standard deviation across methods, and
    optionally some random ones. The 'float' method is left out. Rows are
    sorted by mean NDCG and prefixed with WORST / BEST / VAR / RANDOM.

    Args:
        results: Dictionary containing benchmark results
        result_key: Key of the benchmark results to analyse
        num_variance: Number of high-variance queries to show
        num_extreme: Number of best/worst queries to show
        num_random: Number of random queries to show
        random_seed: Seed for the random selection (None = not reproducible)
    """
    # Private generator: yields the same legacy stream as
    # np.random.seed(random_seed) would, without reseeding NumPy's global
    # state for the rest of the notebook.
    rng = np.random.RandomState(random_seed)

    # Collect per-query performance data
    query_data = []
    for run in results[result_key]['runs']:
        query_text = run['query_text']
        for search in run['searches']:
            if search['method'] == 'float':  # Skip baseline
                continue
            query_data.append({
                'Query': query_text,
                'Method': search['method'],
                'NDCG': search['metrics'].get('ndcg', 1.0)
            })

    if not query_data:
        print("No query data found to build the table")
        return

    df = pd.DataFrame(query_data)

    # One row per query, one column per method
    query_matrix = df.pivot_table(
        values='NDCG',
        index='Query',
        columns='Method',
        aggfunc='first'
    )

    # Mean and standard deviation across methods for each query
    query_stats = pd.DataFrame({
        'mean_ndcg': query_matrix.mean(axis=1),
        'std_ndcg': query_matrix.std(axis=1),
    })

    # Select interesting queries
    high_var_queries = query_stats.nlargest(num_variance, 'std_ndcg').index.tolist()
    best_queries = query_stats.nlargest(num_extreme, 'mean_ndcg').index.tolist()
    worst_queries = query_stats.nsmallest(num_extreme, 'mean_ndcg').index.tolist()

    # Candidates for the random pick, kept in index order. Going through
    # list(set(...)) would give an order that changes between interpreter
    # runs and defeat the seed.
    already_selected = set(high_var_queries) | set(best_queries) | set(worst_queries)
    available_queries = [q for q in query_matrix.index if q not in already_selected]

    random_queries = []
    if num_random > 0 and available_queries:
        picked = rng.choice(available_queries,
                            size=min(num_random, len(available_queries)),
                            replace=False)
        random_queries = [str(q) for q in picked]

    # Combine and deduplicate queries, keeping first occurrence
    selected_queries = list(dict.fromkeys(
        worst_queries + high_var_queries + best_queries + random_queries
    ))

    # Copy so the added columns do not write into a slice of query_matrix
    selected_matrix = query_matrix.loc[selected_queries].copy()

    # Add statistics columns
    selected_matrix['Mean'] = query_stats.loc[selected_queries, 'mean_ndcg']
    selected_matrix['Std'] = query_stats.loc[selected_queries, 'std_ndcg']

    # Sort by mean NDCG
    selected_matrix = selected_matrix.sort_values('Mean', ascending=True)

    # Label each row with its category (first match wins) and shortened text
    query_labels = {}
    for q in selected_matrix.index:
        if q in worst_queries:
            prefix = 'WORST - '
        elif q in best_queries:
            prefix = 'BEST - '
        elif q in high_var_queries:
            prefix = 'VAR - '
        elif q in random_queries:
            prefix = 'RANDOM - '
        else:
            prefix = ''

        short_q = q[:48] + '...' if len(q) > 48 else q
        query_labels[q] = f"{prefix}{short_q}"

    selected_matrix.index = [query_labels[q] for q in selected_matrix.index]

    # Round all numbers to 3 decimal places
    selected_matrix = selected_matrix.round(3)

    # Print the table
    print("\nQuery Performance Analysis:")
    print("=" * 80)
    print(selected_matrix.to_string())
    print("\nMethod Averages:")
    print("-" * 40)
    method_averages = df.groupby('Method')['NDCG'].mean().sort_values(ascending=False)
    for method, avg in method_averages.items():
        print(f"{method:15} {avg:.3f}")

# Generate table: up to 50 high-variance, 30 best, 30 worst and 20 random queries
create_query_table(
    results,
    'benchmark_dim1024_k100_q',
    num_variance=50,
    num_extreme=30,
    num_random=20,
)

In [None]:
def compare_benchmarks(results_dict, benchmark_keys, names=None, skip_methods=('float',), enable_save_plot=False, name_postfix="", plot_title='NDCG Comparison Across Embedding Models'):
    """Compare mean NDCG scores across multiple benchmark results.

    Draws one grouped bar chart (one group per method, one bar per benchmark),
    optionally saves it, and prints timing/NDCG statistics per method for
    every benchmark.

    The method list is taken from the first benchmark; every other benchmark
    must provide a summary entry for each of those methods.

    Args:
        results_dict: Dictionary containing multiple benchmark results. Each
            entry is read through its 'summary' and 'metadata' keys.
        benchmark_keys: List of benchmark result keys to compare.
        names: Optional list of names to use for the benchmarks
            (e.g. 'BERT-1024', 'BERT-768'). Defaults to ``benchmark_keys``.
        skip_methods: Methods to exclude from the plot. Entries are matched
            with ``fnmatch``, so shell-style wildcards work in addition to
            exact names. The printed comparison still lists every method.
        enable_save_plot: When true, the figure is handed to ``save_plot``.
        name_postfix: Text appended to the saved file name.
        plot_title: Title drawn above the chart.

    Raises:
        ValueError: If ``benchmark_keys`` and ``names`` differ in length.
    """
    if names is None:
        names = benchmark_keys

    if len(benchmark_keys) != len(names):
        raise ValueError("Length of benchmark_keys and names must match")

    # Collect metadata and basic statistics for each benchmark
    benchmark_info = {}
    for key, name in zip(benchmark_keys, names):
        method_stats = {}
        # Named `entry` (not `stats`) so the module-level scipy import is not shadowed
        for method, entry in results_dict[key]['summary'].items():
            ndcg_entry = entry.get('ndcg', {})
            method_stats[method] = {
                'mean_time': entry['time_us']['mean'],
                'std_time': entry['time_us']['std'],
                # Methods without an NDCG entry are treated as exact (1.0 ± 0.0)
                'ndcg': ndcg_entry.get('mean', 1.0),
                'ndcg_std': ndcg_entry.get('std', 0.0),
            }

        benchmark_info[name] = {
            'metadata': results_dict[key]['metadata'],
            'stats': method_stats
        }

    # Prepare data for plotting; a single wildcard-aware filter covers exact names too
    methods = list(benchmark_info[names[0]]['stats'].keys())
    methods_filtered = [
        m for m in methods
        if not any(fnmatch.fnmatch(m, pattern) for pattern in skip_methods)
    ]
    x = np.arange(len(methods_filtered))
    width = 0.8 / len(names)  # Width of bars

    # Create NDCG comparison plot
    plt.figure()

    # Plot bars for each benchmark, centring each group on its tick
    for i, name in enumerate(names):
        ndcg_scores = [benchmark_info[name]['stats'][method]['ndcg']
                       for method in methods_filtered]

        plt.bar(x + i*width - width*len(names)/2 + width/2,
                ndcg_scores,
                width,
                label=name)

    plt.xlabel('Method')
    plt.ylabel('NDCG Score')
    plt.title(plot_title)
    plt.xticks(x, methods_filtered, rotation=30, ha='right')
    plt.legend(loc=3, bbox_to_anchor=(-0.1,-0.3))
    plt.grid(True, alpha=0.3)

    plt.tight_layout()

    # Save plot if enabled; the file name is built from at most the first two keys
    if enable_save_plot:
        title_parts = [get_title_metadata(result_key=key, with_unterscores=True)
                       for key in benchmark_keys[:2]]
        save_plot(f"benchmark_comparison_{'_'.join(title_parts)}{name_postfix}")

    plt.show()

    # Print detailed comparison (all methods, including those hidden in the plot)
    print("\nDetailed Comparison:")
    for method in methods:
        print(f"\nMethod: {method}")
        print("-" * 50)
        for name in names:
            method_info = benchmark_info[name]['stats'][method]
            dim = benchmark_info[name]['metadata']['vector_dim']
            print(f"{name} (dim={dim}):")
            print(f"  Time: {method_info['mean_time']:.2f} ± {method_info['std_time']:.2f} μs")
            if method != 'float':
                print(f"  NDCG: {method_info['ndcg']:.3f} ± {method_info['ndcg_std']:.3f}")

# Example usage with multiple benchmark results
# 1) Two embedding models on the '_q' benchmarks; two-step variants hidden, plot saved.
compare_benchmarks(results, 
                   benchmark_keys=['benchmark_dim1024_k100_q', 'benchmark_dim768_k100_q'],
                   names=['mxbai-embed-large-v1', 'all-mpnet-base-v2'],
                   skip_methods=['float', 'twostep_rf5', 'twostep_rf10', 'twostep_rf25', 'ts_mf_rf5', 'ts_mf_rf10', 'ts_mf_rf25'], enable_save_plot=True)
# 2) Same two models on the '_re' benchmarks; only 'float' hidden, plot shown but
#    not saved (enable_save_plot keeps its default of False).
compare_benchmarks(results, 
                   benchmark_keys=['benchmark_dim1024_k100_re', 'benchmark_dim768_k100_re'],
                   names=['mxbai-embed-large-v1', 'all-mpnet-base-v2'],
                   skip_methods=['float'])
# 3) k=100 against k=25 on the 1024-dim '_q' benchmark; saved with the '_cmp_k' postfix.
compare_benchmarks(results, 
                   benchmark_keys=['benchmark_dim1024_k100_q', 'benchmark_dim1024_k25_q'],
                   names=['k=100', 'k=25'],
                   skip_methods=['float', 'twostep_rf5', 'twostep_rf10', 'twostep_rf25', 'ts_mf_rf5', 'ts_mf_rf10', 'ts_mf_rf25'], enable_save_plot=True, plot_title='NDCG Comparison Across k', name_postfix='_cmp_k')

In [None]:
#plot_ndcg_scatter_comparison
def format_method_name(method):
    """Return the display form of a method name for plots and tables.

    Names beginning with 'twostep_rf' are abbreviated to 'ts_rf' followed by
    whatever came after that prefix; every other name is passed through
    unchanged.

    Args:
        method: Original method name
    Returns:
        Formatted method name
    """
    long_prefix = 'twostep_rf'
    if not method.startswith(long_prefix):
        return method

    # Text between the leading prefix and the next occurrence of it (if any)
    suffix = method.split(long_prefix)[1]
    return "ts_rf" + suffix

def plot_ndcg_scatter_comparison(results_dict, key1, key2, name1=None, name2=None, skip_methods=('float',), filename_suffix: str = None, enable_save_plot=False, name_postfix=""):
    """Create scatter plot comparing NDCG scores between two benchmarks.

    Every point is one (query, method) pair: x is the NDCG in the first
    benchmark, y the NDCG in the second. Runs and searches of the two
    benchmarks are paired by position, so both results must list the same
    queries and methods in the same order. After plotting, the per-method
    correlation and mean difference are printed.

    Args:
        results_dict: Dictionary containing benchmark results
        key1: Key of first benchmark result
        key2: Key of second benchmark result
        name1: Display name for first benchmark (optional, defaults to key1)
        name2: Display name for second benchmark (optional, defaults to key2)
        skip_methods: Exact method names to exclude from comparison
        filename_suffix: Optional suffix for the saved file name; an
            underscore is inserted in front of it.
        enable_save_plot: When true, the figure is handed to ``save_plot``.
        name_postfix: Text appended after ``filename_suffix`` in the file name.
    """
    if name1 is None:
        name1 = key1
    if name2 is None:
        name2 = key2

    results1 = results_dict[key1]
    results2 = results_dict[key2]

    # Collect NDCG scores from both benchmarks, one record per (query, method)
    ndcg_data = []
    for run1, run2 in zip(results1['runs'], results2['runs']):
        query = run1['query_text']

        for search1, search2 in zip(run1['searches'], run2['searches']):
            method = search1['method']
            if method in skip_methods:
                continue
            ndcg_data.append({
                'Query': query,
                'Method': format_method_name(method),
                # A missing NDCG metric is treated as a perfect score
                'NDCG_1': search1['metrics'].get('ndcg', 1.0),
                'NDCG_2': search2['metrics'].get('ndcg', 1.0)
            })

    # An empty record list would yield a frame without a 'Method' column and
    # crash below; report it and stop instead.
    if not ndcg_data:
        print("No NDCG data left to compare after applying skip_methods")
        return

    df = pd.DataFrame(ndcg_data)

    plt.figure()

    # Plot diagonal line for reference (equal performance in both benchmarks)
    plt.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Eq. Prf')

    # Plot points for each method with different colors
    for method, method_data in df.groupby('Method', sort=False):
        plt.scatter(method_data['NDCG_1'],
                    method_data['NDCG_2'],
                    alpha=0.25,
                    label=method,
                    s=10)  # Set point size

    plt.xlabel(f'NDCG Score - {name1}')
    plt.ylabel(f'NDCG Score - {name2}')
    plt.title('NDCG Score Comparison')

    plt.xlim(0, 1)
    plt.ylim(0, 1)

    # Legend is placed outside the axes, to the right of the plot
    plt.legend(fontsize=8, bbox_to_anchor=(1, 1), loc='upper left')

    plt.tight_layout()

    filename_suffix = "_" + filename_suffix if filename_suffix is not None else ""
    # Save plot if enabled
    if enable_save_plot:
        save_plot(f'ndcg_scatter_{key1}_vs_{key2}{filename_suffix}{name_postfix}')

    plt.show()

    # Print statistics
    print("\nCorrelation between benchmark NDCG scores by method:")
    for method, method_data in df.groupby('Method', sort=False):
        corr = method_data['NDCG_1'].corr(method_data['NDCG_2'])
        mean_diff = (method_data['NDCG_2'] - method_data['NDCG_1']).mean()
        print(f"\n{method}:")
        print(f"Correlation: {corr:.3f}")
        print(f"Mean difference ({name2} - {name1}): {mean_diff:.3f}")

# Example usage
# First call: the methods left after the skip list are compared and the plot is saved.
plot_ndcg_scatter_comparison(results,
                             key1='benchmark_dim1024_k100_q', key2='benchmark_dim768_k100_q',
                             name1='mxbai-embed-large-v1', name2='all-mpnet-base-v2',
                             skip_methods=['float', 'avx2', 'twostep_rf5', 'twostep_rf25', 'twostep_rf50', 'pca2', 'pca4', 'pca8', 'pca16', 'pca32', 'ts_mf_rf2', 'ts_mf_rf5', 'ts_mf_rf10', 'ts_mf_rf25', 'ts_mf_rf50', 'float16'], enable_save_plot=True)
# Second call: a different skip list for the same benchmark pair.
# NOTE(review): enable_save_plot is not passed, so filename_suffix="pca" has no
# effect here (the suffix is only used when the plot is saved) — confirm intent.
plot_ndcg_scatter_comparison(results,
                             key1='benchmark_dim1024_k100_q', key2='benchmark_dim768_k100_q',
                             name1='mxbai-embed-large-v1', name2='all-mpnet-base-v2',
                             skip_methods=['float', 'avx2', 'binary', 'int8', 'float16', 'mf', 'twostep_rf2', 'twostep_rf5', 'twostep_rf10', 'twostep_rf25', 'twostep_rf50'],
                             filename_suffix="pca")

In [None]:
def plot_pca_analysis(results_dict, result_key, enable_save_plot=False, name_postfix=""):
    """Analyze PCA performance across different reduction factors.

    Creates two plots:
    1. PCA accuracy vs reduction factor
    2. PCA time and speedup vs reduction factor

    Both plots carry a second x-axis on top that shows the reduced dimension
    for each reduction factor. Afterwards a statistics table and the
    correlations of NDCG/time/speedup with the reduction factor are printed.
    If the summary holds no method whose name starts with 'pca', a message is
    printed and nothing is plotted.

    Args:
        results_dict: Dictionary containing benchmark results
        result_key: Key of the benchmark results to plot
        enable_save_plot: When true, both figures are handed to ``save_plot``.
        name_postfix: Text appended to the saved file names.
    """
    results = results_dict[result_key]

    # Baseline for the speedup column and the dimension the factors divide
    float_time = results['summary']['float']['time_us']['mean']
    original_dim = results['metadata']['vector_dim']

    # Collect PCA data
    pca_data = []
    for method, entry in results['summary'].items():
        if not method.startswith('pca'):
            continue
        # 'pca<N>' -> N; a bare or non-numeric suffix counts as factor 1
        factor_text = method[3:]
        reduction_factor = int(factor_text) if factor_text.isdigit() else 1
        mean_time = entry['time_us']['mean']

        pca_data.append({
            'Reduction Factor': reduction_factor,
            'Reduced Dimension': original_dim // reduction_factor,
            'NDCG': entry.get('ndcg', {}).get('mean', 1.0),
            'NDCG_std': entry.get('ndcg', {}).get('std', 0.0),
            'Time (μs)': mean_time,
            'Time_std': entry['time_us']['std'],
            'Speedup': float_time / mean_time
        })

    if not pca_data:
        print("No PCA methods found in the benchmark results")
        return

    df = pd.DataFrame(pca_data).sort_values('Reduction Factor')

    # Plot 1: NDCG vs Reduction Factor
    plt.figure()
    # Keep a handle on the data axes: after twiny() the twin becomes the
    # current axes, so plt.gca() would no longer refer to this one.
    ax_ndcg = plt.gca()

    ax_ndcg.errorbar(df['Reduction Factor'],
                     df['NDCG'],
                     yerr=df['NDCG_std'],
                     marker='o',
                     capsize=5,
                     capthick=1,
                     elinewidth=1,
                     markersize=8,
                     color='black')

    ax_ndcg.set_xlabel('Reduction Factor')
    ax_ndcg.set_ylabel('NDCG Score')
    ax_ndcg.set_title('PCA Accuracy vs Reduction Factor')

    # Add reduced dimension as top axis, aligned with the data axis limits
    ax_top = ax_ndcg.twiny()
    ax_top.set_xlim(ax_ndcg.get_xlim())
    ax_top.set_xticks(df['Reduction Factor'])
    ax_top.set_xticklabels([f'{dim}' for dim in df['Reduced Dimension']])
    ax_top.set_xlabel('Reduced Dimension')

    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    if enable_save_plot: save_plot(f'pca_accuracy_{result_key}{name_postfix}')
    plt.show()

    # Plot 2: Time and Speedup vs Reduction Factor
    plt.figure()

    # Plot time
    ax1 = plt.gca()
    ax1.errorbar(df['Reduction Factor'],
                 df['Time (μs)'],
                 yerr=df['Time_std'],
                 marker='o',
                 capsize=5,
                 capthick=1,
                 elinewidth=1,
                 markersize=8,
                 color='black',
                 label='Time')

    # Add speedup as second y-axis
    ax2 = ax1.twinx()
    ax2.plot(df['Reduction Factor'],
             df['Speedup'],
             color='gray',
             marker='s',
             linestyle='--',
             label='Speedup')

    ax1.set_xlabel('Reduction Factor')
    ax1.set_ylabel('Search Time (μs)')
    ax2.set_ylabel('Speedup vs Float')
    plt.title('PCA Performance vs Reduction Factor')

    # Add reduced dimension as top axis
    ax_top = ax1.twiny()
    ax_top.set_xlim(ax1.get_xlim())
    ax_top.set_xticks(df['Reduction Factor'])
    ax_top.set_xticklabels([f'{dim}' for dim in df['Reduced Dimension']])
    ax_top.set_xlabel('Reduced Dimension')

    # Combine legends of both y-axes into a single box
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines1 + lines2, labels1 + labels2, loc='upper right')

    plt.grid(True, alpha=0.3)
    plt.tight_layout()

    if enable_save_plot: save_plot(f'pca_performance_{result_key}{name_postfix}')
    plt.show()

    # Print statistics
    print("\nDetailed PCA Statistics:")
    stats_df = df[['Reduction Factor', 'Reduced Dimension', 'NDCG', 'Time (μs)', 'Speedup']]
    stats_df = stats_df.set_index('Reduction Factor').round(3)
    print(stats_df.to_string())

    # Calculate correlations
    print("\nCorrelations with Reduction Factor:")
    correlations = {
        'NDCG': df['Reduction Factor'].corr(df['NDCG']),
        'Time': df['Reduction Factor'].corr(df['Time (μs)']),
        'Speedup': df['Reduction Factor'].corr(df['Speedup'])
    }
    for metric, corr in correlations.items():
        print(f"{metric}: {corr:.3f}")


# Run the PCA analysis for the benchmark keys ending in '_re' and '_q';
# plots are shown but not saved (enable_save_plot defaults to False).
plot_pca_analysis(results, 'benchmark_dim1024_k100_re')
plot_pca_analysis(results, 'benchmark_dim1024_k100_q')

In [None]:
def plot_memory_bandwidth(results_dict, result_key, skip_methods=(), enable_save_plot=False, name_postfix=""):
    """Calculate and plot approximate memory bandwidth for each method.

    Bandwidth is the byte count returned by ``calculate_bandwidth_memory``
    divided by the method's mean search time, expressed in GiB-based GB/s.
    A bar chart is drawn, followed by a printed table and each method's
    bandwidth relative to the 'float' method. The relative section is skipped
    when 'float' is not among the plotted methods.

    Args:
        results_dict: Dictionary containing benchmark results
        result_key: Key of the benchmark results to plot
        skip_methods: Exact method names to leave out.
        enable_save_plot: When true, the figure is handed to ``save_plot``.
        name_postfix: Text appended to the saved file name.
    """
    bytes_per_gb = 1024 * 1024 * 1024

    results = results_dict[result_key]
    num_vectors = results['metadata']['num_vectors']
    vector_dim = results['metadata']['vector_dim']
    k = results['metadata']['k']

    # Collect bandwidth data
    bandwidth_data = []
    for method, entry in results['summary'].items():
        if method in skip_methods:
            continue
        mean_time_us = entry['time_us']['mean']
        time_seconds = mean_time_us / 1_000_000  # Convert microseconds to seconds
        memory_gb = calculate_bandwidth_memory(method, num_vectors, vector_dim, k) / bytes_per_gb

        bandwidth_data.append({
            'Method': format_method_name(method),
            'Bandwidth (GB/s)': memory_gb / time_seconds,
            'Memory (GB)': memory_gb,
            'Time (ms)': mean_time_us / 1000
        })

    # Without rows the frame would have no columns and the plotting code would
    # fail with a KeyError; report it and stop instead.
    if not bandwidth_data:
        print("No methods left to plot after applying skip_methods")
        return

    df = pd.DataFrame(bandwidth_data)

    plt.figure()
    bars = plt.bar(df['Method'], df['Bandwidth (GB/s)'])

    # Add value labels on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height,
                 f'{height:.1f}',
                 ha='center', va='bottom',
                 fontsize=5)

    plt.title('Approximate Memory Bandwidth by Method')
    plt.xlabel('Method')
    plt.ylabel('Bandwidth (GB/s)')
    plt.xticks(rotation=30, ha='right')

    plt.tight_layout()
    if enable_save_plot: save_plot(f'memory_bandwidth_{result_key}{name_postfix}')
    plt.show()

    # Print detailed statistics
    print("\nDetailed Bandwidth Analysis:")
    print("\nMethod Statistics:")
    stats_df = df.sort_values('Bandwidth (GB/s)', ascending=False)
    print(stats_df.round(2).to_string(index=False))

    # Calculate relative bandwidth compared to float; the baseline may have
    # been skipped or may not exist in this benchmark.
    float_rows = df.loc[df['Method'] == 'float', 'Bandwidth (GB/s)']
    if float_rows.empty:
        print("\nRelative bandwidth skipped: no 'float' baseline among the plotted methods")
        return

    float_bandwidth = float_rows.iloc[0]
    print("\nRelative Bandwidth (compared to float):")
    for _, row in df.iterrows():
        relative = row['Bandwidth (GB/s)'] / float_bandwidth
        print(f"{row['Method']:15} {relative:.2f}x")


# Example usage: bandwidth plots for several benchmark keys. 'float16' is skipped
# in every call; only the two calls with the longer skip list save their figure.
plot_memory_bandwidth(results, 'benchmark_dim1024_k100_re', ['float16'])
plot_memory_bandwidth(results, 'benchmark_dim1024_k100_q', ['float16'])
plot_memory_bandwidth(results, 'benchmark_dim1024_k100_q', ['float16', 'twostep_rf5', 'twostep_rf10', 'twostep_rf25', 'ts_mf_rf5', 'ts_mf_rf10', 'ts_mf_rf25'], enable_save_plot=True)
plot_memory_bandwidth(results, 'benchmark_dim768_k100_q', ['float16', 'twostep_rf5', 'twostep_rf10', 'twostep_rf25', 'ts_mf_rf5', 'ts_mf_rf10', 'ts_mf_rf25'], enable_save_plot=True)
# NOTE(review): this key follows a different naming scheme than the others —
# confirm it is present in `results`.
plot_memory_bandwidth(results, 'benchmark_results_1733419058', ['float16'])

In [None]:
def show_summary(results, result_key):
    """Display metadata and a per-method summary table for one benchmark.

    Args:
        results: Dictionary containing benchmark results
        result_key: Key of the benchmark result to summarize

    The table holds, per method: mean/std time in ms, NDCG, Jaccard index,
    overlap, memory in GB and approximate bandwidth in GB/s. Missing NDCG and
    Jaccard entries default to 1.0, a missing overlap to the benchmark's k.
    """
    benchmark = results[result_key]
    metadata = benchmark['metadata']
    num_vectors = metadata['num_vectors']
    vector_dim = metadata['vector_dim']
    k = metadata['k']

    rows = []
    for method, entry in benchmark['summary'].items():
        timing = entry['time_us']
        row = {
            'Method': method,
            'Mean Time (ms)': timing['mean'] / 1000,
            'Std Time': timing['std'] / 1000,
            'NDCG': entry.get('ndcg', {}).get('mean', 1.0),
            'Jaccard': entry.get('jaccard_index', {}).get('mean', 1.0),
            'Overlap': entry.get('overlap', {}).get('mean', k),
        }
        row['Memory (GB)'] = calculate_method_memory(method, num_vectors, vector_dim) / (1024**3)
        # bytes / seconds, then scaled to GB
        row['Bandw. (GB/s)'] = (calculate_bandwidth_memory(method, num_vectors, vector_dim, k) / (timing['mean'] * 10**-6)) / (1024**3)
        rows.append(row)
    summary_df = pd.DataFrame(rows)

    metadata_json = json.dumps(metadata, indent=2)
    for markdown_text in (f"## Summary for {result_key}",
                          "### Metadata",
                          "```json\n" + metadata_json + "\n```",
                          "### Results"):
        display(Markdown(markdown_text))
    display(summary_df.round(3))
    
# Render summaries for the two '_re' benchmarks ...
show_summary(results, 'benchmark_dim1024_k100_re')
show_summary(results, 'benchmark_dim768_k100_re')

# ... and for the timestamp-named result set.
show_summary(results, 'benchmark_results_1733419058')

In [None]:
import re

def plot_value_distribution(df, enable_save_plot=False, name_postfix=""):
    """Plot partition sizes over the partition index with a mapped-value axis.

    The bottom x-axis shows the partition index, the twinned top x-axis shows
    the 'Average' value of every 16th partition. Summary numbers are printed
    after the figure is shown.

    Args:
        df: Partition table with 'Partition', 'Size', 'Average',
            'Range_Start' and 'Range_End' columns.
        enable_save_plot: When true, the figure is handed to ``save_plot``.
        name_postfix: Text appended to the saved file name.
    """
    tick_stride = 16  # label only a subset of partitions for readability

    plt.figure()

    # Two x-axes: partition index below, mapped values above
    index_axis = plt.gca()
    value_axis = index_axis.twiny()

    partitions = df['Partition']
    sizes = df['Size']

    # Size distribution as a line with a light fill underneath
    index_axis.plot(partitions, sizes, color='black', linewidth=1, alpha=0.8)
    index_axis.fill_between(partitions, sizes, color='black', alpha=0.1)

    # Align the value axis with the index axis before placing its ticks
    value_axis.set_xlim(index_axis.get_xlim())
    labelled = df.iloc[::tick_stride]
    value_axis.set_xticks(labelled['Partition'])
    value_axis.set_xticklabels(
        ['{:.3f}'.format(avg) for avg in labelled['Average']],
        rotation=30, ha='left')

    # Labels and title
    index_axis.set_title('Value Distribution Across Partitions')
    index_axis.set_xlabel('Partition Index')
    value_axis.set_xlabel('Mapped Values')
    index_axis.set_ylabel('Number of Elements')
    index_axis.set_ylim(0)

    index_axis.grid(True, alpha=0.3)
    plt.tight_layout()

    if enable_save_plot:
        save_plot(f'value_distribution{name_postfix}')
    plt.show()

    # Print statistics
    lowest = df['Range_Start'].min()
    highest = df['Range_End'].max()
    print("\nDistribution Statistics:")
    print(f"Total elements: {sizes.sum():,}")
    print(f"Max partition size: {sizes.max():,}")
    print(f"Average partition size: {sizes.mean():,.1f}")
    print(f"Value range: [{lowest:.6f}, {highest:.6f}]")

# Load and parse data
def parse_partition_data(file_path):
    """Parse the partition data from a file into a DataFrame.

    Only lines that (after stripping whitespace) start with 'Partition' and
    match the form
    ``Partition <n>: size = <count> elements, range [<lo>, <hi>], avg = <mean>``
    are used; everything else is ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        DataFrame with the columns 'Partition', 'Size', 'Range_Start',
        'Range_End' and 'Average', one row per parsed line. The columns are
        present even when no line matched, so downstream column access does
        not fail on an empty result.
    """
    columns = ['Partition', 'Size', 'Range_Start', 'Range_End', 'Average']
    # Compiled once instead of once per line
    line_pattern = re.compile(
        r'Partition\s+(\d+):\s+size\s+=\s+(\d+)\s+elements,\s+range\s+\[(.*?),\s+(.*?)\],\s+avg\s+=\s+(.*?)$'
    )

    partition_data = []
    with open(file_path, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Skip non-partition lines
            if not line.startswith('Partition'):
                continue

            match = line_pattern.search(line)
            if match:
                index, size, range_start, range_end, average = match.groups()
                partition_data.append({
                    'Partition': int(index),
                    'Size': int(size),
                    'Range_Start': float(range_start),
                    'Range_End': float(range_end),
                    'Average': float(average)
                })

    return pd.DataFrame(partition_data, columns=columns)

# Load both partition dumps and plot their value distributions.
# NOTE(review): these paths are relative to the current working directory,
# unlike the absolute RESULT_PATH constant defined at the top — confirm.
df_partitions = parse_partition_data('results/mapped_float_partitions.txt')
df_partitions768 = parse_partition_data('results/mapped_float_partitions_768.txt')
# Only the first plot is saved; the second is shown without saving.
plot_value_distribution(df_partitions, enable_save_plot=True)
plot_value_distribution(df_partitions768)

In [None]:
def plot_cumulative_distribution(df):
    """Plot the running share of elements against each partition's average value.

    Args:
        df: Partition table with 'Size' and 'Average' columns; rows are
            accumulated in the order they appear.
    """
    running_total = df['Size'].cumsum()
    # Scale so the curve tops out at 1
    share = running_total / running_total.max()

    plt.figure()
    plt.plot(df['Average'], share, color='black', linewidth=1)

    plt.title('Cumulative Distribution')
    plt.xlabel('Mapped Float Value')
    plt.ylabel('Cumulative Proportion')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
# Cumulative distribution for the partitions parsed from mapped_float_partitions.txt
plot_cumulative_distribution(df_partitions)

In [None]:
def plot_quantization_analysis(df, enable_save_plot=False, name_postfix=""):
    """Plot the value ranges and averages to show quantization.

    Each partition is drawn as a vertical line from its 'Range_Start' to its
    'Range_End', with the partition's 'Average' marked as a point.

    Args:
        df: Partition table with 'Partition', 'Range_Start', 'Range_End' and
            'Average' columns.
        enable_save_plot: When true, the figure is handed to ``save_plot``.
        name_postfix: Text appended to the saved file name.
    """
    plt.figure()

    # Plot all ranges with one vectorized call instead of one call per row
    plt.vlines(x=df['Partition'],
               ymin=df['Range_Start'],
               ymax=df['Range_End'],
               color='blue', alpha=0.6)

    # Plot averages as points
    plt.scatter(df['Partition'], df['Average'],
                color='black', s=10, alpha=0.3)

    plt.title('Quantization Mapping')
    plt.xlabel('Int8 Value (Partition)')
    plt.ylabel('Float32 Value')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    if enable_save_plot: save_plot(f'quantization_mapping{name_postfix}')
    plt.show()
    
# Save the quantization mapping for both partition dumps; the second file name
# gets the "_768" postfix so it does not reuse the first one's name.
plot_quantization_analysis(df_partitions ,enable_save_plot=True)
plot_quantization_analysis(df_partitions768, enable_save_plot=True, name_postfix="_768")

In [None]:
def plot_value_density(df):
    """Plot elements per unit of value range against each partition's average.

    Note: adds (or overwrites) the 'Range_Size' and 'Density' columns on the
    DataFrame that is passed in.

    Args:
        df: Partition table with 'Size', 'Range_Start', 'Range_End' and
            'Average' columns.
    """
    # Density = number of elements divided by the width of the value range
    range_width = df['Range_End'] - df['Range_Start']
    df['Range_Size'] = range_width
    df['Density'] = df['Size'] / df['Range_Size']

    plt.figure()
    plt.plot(df['Average'], df['Density'], color='black', linewidth=1)

    plt.title('Value Density Distribution')
    plt.xlabel('Float32 Value')
    plt.ylabel('Elements per Value Unit')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
# Density plot for df_partitions (the call also adds helper columns to that frame)
plot_value_density(df_partitions)

In [None]:
def plot_combined_analysis(df):
    """Show partition sizes and the quantization mapping in two stacked panels.

    Args:
        df: Partition table with 'Partition', 'Size', 'Average',
            'Range_Start' and 'Range_End' columns.
    """
    _, panels = plt.subplots(2, 1)
    size_panel, mapping_panel = panels
    partitions = df['Partition']

    # Upper panel: number of elements per partition
    size_panel.bar(partitions, df['Size'], alpha=0.6, color='black', width=1.0)
    size_panel.set_title('Element Distribution')
    size_panel.set_ylabel('Number of Elements')

    # Lower panel: average value per partition plus its value range
    mapping_panel.scatter(partitions, df['Average'], color='black', s=20, alpha=0.6)
    mapping_panel.vlines(partitions, df['Range_Start'], df['Range_End'],
                         color='gray', alpha=0.3)
    mapping_panel.set_xlabel('Int8 Value (Partition)')
    mapping_panel.set_ylabel('Float32 Value')

    plt.tight_layout()
    plt.show()

# Two-panel overview (sizes + quantization mapping) for df_partitions
plot_combined_analysis(df_partitions)

In [None]:
def plot_partition_efficiency(df):
    """Scatter elements-per-value-range against partition average on a log axis.

    Note: adds (or overwrites) the 'Value_Range' and 'Elements_per_Range'
    columns on the DataFrame that is passed in.

    Args:
        df: Partition table with 'Size', 'Range_Start', 'Range_End' and
            'Average' columns.
    """
    # Width of each partition's value range and the element count per unit of it
    span = df['Range_End'] - df['Range_Start']
    df['Value_Range'] = span
    df['Elements_per_Range'] = df['Size'] / df['Value_Range']

    plt.figure()
    plt.scatter(df['Average'], df['Elements_per_Range'],
                color='black', alpha=0.6, s=20)

    plt.title('Partition Efficiency')
    plt.xlabel('Float32 Value')
    plt.ylabel('Elements per Value Range')
    plt.yscale('log')  # logarithmic y-axis
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
# Efficiency scatter for df_partitions (the call also adds helper columns to that frame)
plot_partition_efficiency(df_partitions)

In [None]:
def calculate_distribution_stats(df):
    """Print size, value-range and quantization statistics of a partition table.

    Args:
        df: Partition table with 'Size', 'Range_Start', 'Range_End' and
            'Average' columns. The frame is not modified.
    """
    sizes = df['Size']
    # Width of every partition's value range; reused by several entries below
    widths = df['Range_End'] - df['Range_Start']

    report = [
        # Element counts
        ('Total Elements', sizes.sum()),
        ('Mean Elements per Partition', sizes.mean()),
        ('Median Elements per Partition', sizes.median()),
        ('Std Dev Elements', sizes.std()),
        ('Skewness', sizes.skew()),
        ('Kurtosis', sizes.kurtosis()),
        # Value range statistics
        ('Total Value Range', df['Range_End'].max() - df['Range_Start'].min()),
        ('Mean Value Range per Partition', widths.mean()),
        ('Mean Absolute Value', df['Average'].abs().mean()),
        # Quantization statistics (error taken as half of a range width)
        ('Average Quantization Step', widths.mean()),
        ('Max Quantization Error', widths.max() / 2),
        ('Min Quantization Error', widths.min() / 2),
    ]

    # Print formatted statistics
    print("\nDistribution Statistics:")
    for label, number in report:
        print(f"{label:30s}: {number:,.6f}")
        
# Print the basic distribution statistics for df_partitions
calculate_distribution_stats(df_partitions)

In [None]:
def analyze_partition_balance(df):
    """Print how evenly the elements are spread over the partitions.

    Note: adds (or overwrites) a 'Utilization' column (percentage of all
    elements held by each partition) on the DataFrame that is passed in.

    Args:
        df: Partition table with 'Partition' and 'Size' columns.
    """
    sizes = df['Size']
    # Share of all elements per partition, in percent
    df['Utilization'] = sizes / sizes.sum() * 100
    utilization = df['Utilization']

    fullest_row = sizes.idxmax()
    report = (
        ('Most Populated Partition', df.loc[fullest_row, 'Partition']),
        ('Max Partition Utilization %', utilization.max()),
        ('Min Partition Utilization %', utilization.min()),
        ('Utilization Std Dev %', utilization.std()),
        ('Empty Partitions', (sizes == 0).sum()),
        ('Effective Partitions', (sizes > 0).sum()),
    )

    print("\nPartition Balance Analysis:")
    for label, number in report:
        print(f"{label:30s}: {number:,.6f}")
        
# Print the balance analysis for df_partitions (adds a 'Utilization' column to it)
analyze_partition_balance(df_partitions)

In [None]:
def analyze_value_ranges(df):
    """Print sign split, density and coverage figures for the value ranges.

    Note: adds (or overwrites) the 'Range_Size' and 'Density' columns on the
    DataFrame that is passed in.

    Args:
        df: Partition table with 'Size', 'Range_Start', 'Range_End' and
            'Average' columns.
    """
    df['Range_Size'] = df['Range_End'] - df['Range_Start']
    df['Density'] = df['Size'] / df['Range_Size']

    element_total = df['Size'].sum()
    positive_elements = df.loc[df['Average'] > 0, 'Size'].sum()
    negative_elements = df.loc[df['Average'] < 0, 'Size'].sum()
    # Summed width of the non-empty partitions relative to the overall span
    occupied_width = df.loc[df['Size'] > 0, 'Range_Size'].sum()
    overall_span = df['Range_End'].max() - df['Range_Start'].min()

    report = (
        ('Positive Values %', (positive_elements / element_total) * 100),
        ('Negative Values %', (negative_elements / element_total) * 100),
        ('Mean Value Density', df['Density'].mean()),
        ('Max Value Density', df['Density'].max()),
        ('Value Range Coverage %', (occupied_width / overall_span) * 100),
    )

    print("\nValue Range Analysis:")
    for label, number in report:
        print(f"{label:30s}: {number:,.6f}")
        
# Print the value-range analysis for df_partitions (adds helper columns to it)
analyze_value_ranges(df_partitions)

In [None]:
def analyze_quantization_error(df):
    """Print error bounds derived from the width of each partition's range.

    The per-partition error bound is half of the range width. Note: adds (or
    overwrites) the 'Max_Error' and 'Weighted_Error' columns on the DataFrame
    that is passed in.

    Args:
        df: Partition table with 'Size', 'Range_Start', 'Range_End' and
            'Average' columns.
    """
    df['Max_Error'] = (df['Range_End'] - df['Range_Start']) / 2
    # Error bound weighted by how many elements fall into the partition
    df['Weighted_Error'] = df['Max_Error'] * df['Size']

    bound = df['Max_Error']
    relative_bound = bound / df['Average'].abs()

    report = (
        ('Average Max Error', df['Weighted_Error'].sum() / df['Size'].sum()),
        ('Worst Case Error', bound.max()),
        ('Best Case Error', bound.min()),
        ('Error Std Dev', bound.std()),
        ('Mean Relative Error %', relative_bound.mean() * 100),
    )

    print("\nQuantization Error Analysis:")
    for label, number in report:
        print(f"{label:30s}: {number:,.6f}")
        
# Print the quantization error analysis for df_partitions (adds helper columns to it)
analyze_quantization_error(df_partitions)

In [None]:
def analyze_distribution_shape(df):
    """Print percentile-based shape figures for partition averages and sizes.

    Percentiles are taken over the rows of the table (unweighted), at the
    10/25/50/75/90 levels.

    Args:
        df: Partition table with 'Average' and 'Size' columns.
    """
    levels = [10, 25, 50, 75, 90]
    p10, p25, _, p75, p90 = np.percentile(df['Average'], levels)
    size_levels = np.percentile(df['Size'], levels)

    sizes = df['Size']
    # Share of elements living in partitions that are larger than the mean size
    above_mean_total = df[sizes > sizes.mean()]['Size'].sum()

    report = (
        ('Distribution Mode', df.loc[sizes.idxmax(), 'Average']),
        ('Value Range 90%', p90 - p10),
        ('Size Range 90%', size_levels[-1] - size_levels[0]),
        ('Interquartile Range', p75 - p25),
        ('Value Concentration', above_mean_total / sizes.sum() * 100),
    )

    print("\nDistribution Shape Analysis:")
    for label, number in report:
        print(f"{label:30s}: {number:,.6f}")
        
# Print the shape analysis for df_partitions
analyze_distribution_shape(df_partitions)

In [None]:
def plot_statistical_summary(df, enable_save_plot=False, name_postfix=""):
    """Create a comprehensive statistical summary visualization.

    Draws a 2x2 figure:
    1. size-weighted histogram of the partition averages with P25/P50/P75 lines
    2. normal Q-Q plot of 10,000 averages sampled proportionally to size
    3. box plot of the partition sizes
    4. cumulative share of elements over the partitions

    Args:
        df: Partition table with 'Average' and 'Size' columns.
        enable_save_plot: When true, the figure is handed to ``save_plot``.
        name_postfix: Text appended to the saved file name.
    """
    from scipy import stats

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2)

    # 1. Distribution with percentiles - use weights directly
    sns.histplot(data=df, x='Average', weights='Size',
                 ax=ax1, color='black', alpha=0.6)

    # Weighted percentiles without expanding the data: walk the cumulative
    # size share (sorted by value) and, for each percentile, take the first
    # average whose cumulative share exceeds it. Each percentile is looked up
    # independently so P50/P75 are not filled from the rows right after P25.
    percentiles = [25, 50, 75]
    df_sorted = df.sort_values('Average')
    cum_share = (df_sorted['Size'].cumsum() / df_sorted['Size'].sum()).to_numpy()
    sorted_averages = df_sorted['Average'].to_numpy()

    percentile_values = []
    if len(sorted_averages) > 0:
        last = len(sorted_averages) - 1
        for p in percentiles:
            pos = int(np.searchsorted(cum_share, p / 100, side='right'))
            percentile_values.append(sorted_averages[min(pos, last)])

    # Add percentile lines
    for p, val in zip(percentiles, percentile_values):
        ax1.axvline(val, color='gray', linestyle='--', alpha=0.5)
        ax1.text(val, ax1.get_ylim()[1], f'P{p}', rotation=90, va='top')

    ax1.set_title('Value Distribution with Percentiles')

    # 2. QQ Plot - sample averages with probability proportional to size
    # instead of materializing every element
    sample_size = 10000
    probs = df['Size'] / df['Size'].sum()
    sampled_indices = np.random.choice(len(df), size=sample_size, p=probs)
    sampled_values = df['Average'].iloc[sampled_indices]

    stats.probplot(sampled_values, dist="norm", plot=ax2)
    ax2.set_title('Quantile-Quantile Plot (Sampled)')

    # 3. Box plot of partition sizes
    sns.boxplot(y=df['Size'], ax=ax3, color='lightgray')
    ax3.set_title('Partition Size Distribution')

    # 4. Cumulative density
    # NOTE(review): sizes are accumulated in the frame's row order, not sorted
    # by size as a Lorenz curve usually is — confirm the title is intended.
    size_share = df['Size'].cumsum() / df['Size'].sum()
    ax4.plot([0, 1], [0, 1], 'r--', alpha=0.5, label='Perfect Equality')
    ax4.plot(np.linspace(0, 1, len(size_share)), size_share, 'black', label='Actual Distribution')
    ax4.set_title('Lorenz Curve (Distribution Equality)')
    ax4.legend()

    plt.tight_layout()
    if enable_save_plot: save_plot(f'statistical_summary{name_postfix}')
    plt.show()
    
# Statistical summary figure for df_partitions (shown, not saved)
plot_statistical_summary(df_partitions)

In [None]:
def perform_statistical_analysis(df, sample_size=10000, random_state=None):
    """Print normality tests, distribution fits and shape statistics.

    Instead of expanding every row ``Size`` times, a weighted random sample
    of the ``Average`` column is drawn (each row is picked with probability
    proportional to its ``Size``) and all tests run on that sample.

    Args:
        df: DataFrame with an ``Average`` column (the values to analyse) and
            a ``Size`` column (the non-negative weight of each row).
        sample_size: Number of weighted draws used for the tests.
        random_state: Optional seed for a reproducible sample. When ``None``
            the global NumPy random state is used, as before.

    Returns:
        None. All results are printed to stdout.

    Raises:
        ValueError: If ``df`` has no rows or the total ``Size`` is not
            positive, so no sampling distribution can be built.
    """
    total_size = df['Size'].sum()
    # `not total_size > 0` also catches a NaN total.
    if len(df) == 0 or not total_size > 0:
        raise ValueError(
            "df must contain at least one row and a positive total 'Size'"
        )

    # Normalize the weights into probabilities; the second division removes
    # floating-point drift so np.random's sum-to-one check always passes.
    probs = (df['Size'] / total_size).values
    probs = probs / probs.sum()

    # Create the weighted sample (seeded generator only when requested).
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    samples = rng.choice(df['Average'].values, size=sample_size, p=probs)

    # 1. Normality tests
    # SciPy documents the Shapiro-Wilk p-value as unreliable for N > 5000.
    # The draws are i.i.d., so a prefix of the sample is itself a valid
    # random sample.
    shapiro_limit = 5000
    normality_tests = {
        'Shapiro-Wilk Test': stats.shapiro(samples[:shapiro_limit]),
        "D'Agostino K^2 Test": stats.normaltest(samples),
        # Anderson-Darling is handled separately: it reports critical
        # values instead of a p-value.
    }

    # 2. Distribution fitting
    # NOTE: the parameters are estimated from the same sample that is then
    # tested, so the KS p-values are optimistic; use them to compare the
    # candidate distributions rather than as exact significance levels.
    distributions = ['norm', 'laplace', 'logistic']
    dist_fits = {}
    for dist_name in distributions:
        params = getattr(stats, dist_name).fit(samples)
        d_stat, p_val = stats.kstest(samples, dist_name, args=params)
        dist_fits[dist_name] = {
            'parameters': params,
            'ks_statistic': d_stat,
            'p_value': p_val,
        }

    # 3. Shape analysis (kurtosis is SciPy's default Fisher/excess kurtosis;
    # the variance ratio is computed on the raw sizes, not on the sample).
    shape_stats = {
        'Skewness': stats.skew(samples),
        'Kurtosis': stats.kurtosis(samples),
        'Variance Ratio': df['Size'].var() / df['Size'].mean(),
    }

    # Print results
    print("\nNormality Tests (based on sampling):")
    for test_name, result in normality_tests.items():
        print(f"{test_name}:")
        print(f"  Statistic: {float(result.statistic):.6f}")
        print(f"  p-value: {float(result.pvalue):.6f}")

    # Anderson-Darling: the result is (statistic, critical_values,
    # significance_level). Access the fields by name so each critical value
    # is printed next to the significance level (in percent) it belongs to.
    ad_result = stats.anderson(samples)
    print("\nAnderson-Darling Test:")
    print(f"  Statistic: {float(ad_result.statistic):.6f}")
    print("  Critical Values:")
    for sig_level, crit_val in zip(ad_result.significance_level,
                                   ad_result.critical_values):
        print(f"    {sig_level}%: {crit_val:.6f}")

    print("\nDistribution Fitting:")
    for dist_name, result in dist_fits.items():
        print(f"\n{dist_name} distribution:")
        print(f"  KS statistic: {result['ks_statistic']:.6f}")
        print(f"  p-value: {result['p_value']:.6f}")
        print(f"  Parameters: {', '.join(f'{float(p):.6f}' for p in result['parameters'])}")

    print("\nShape Analysis:")
    for stat_name, value in shape_stats.items():
        print(f"{stat_name}: {float(value):.6f}")


# Plotting stays disabled in this cell; the summary figure is already drawn
# by the plot_statistical_summary(df_partitions) call in the previous cell.
#plot_statistical_summary(df_partitions)
# Print the sampled normality tests, distribution fits and shape statistics.
perform_statistical_analysis(df_partitions)