In [1]:
import subprocess
import os
import time
import logging
from typing import List, Dict

# Constants
BINARY_FOLDER = '/home/vinh/Q32024/CuckooHeavyKeeper/build/release/bin/release'
LOG_FOLDER = '/home/vinh/Q32024/CuckooHeavyKeeper/experiments/'
BYTE_SIZES = 1024
THETA = 0.0005
BASE_FLAGS = f"--app.line_read=5000000 --app.theta {THETA}"

# Set up a global console handler
console_handler = logging.StreamHandler()
console_handler.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S'))


def setup_logger(name: str, log_file: str, level=logging.INFO) -> logging.Logger:
    """
    Set up a logger with file handler and use the global console handler.
    
    Args:
        name (str): Name of the logger.
        log_file (str): Path to the log file.
        level: Logging level.
    
    Returns:
        logging.Logger: Configured logger object.
    """
    
    logger = logging.getLogger(name)
    logger.setLevel(level)
    
    # Check if the logger already has handlers and remove them
    if logger.hasHandlers():
        logger.handlers.clear()
    
    # File handler
    # file_handler = logging.FileHandler(log_file, mode='a')
    file_handler = logging.FileHandler(log_file, mode='w')
    file_handler.setFormatter(logging.Formatter('%(asctime)s - %(message)s', datefmt='%Y-%m-%d %H:%M:%S'))
    logger.addHandler(file_handler)
    
    # Add the global console handler
    logger.addHandler(console_handler)
    
    # Prevent the logger from propagating messages to the root logger
    logger.propagate = False
    
    return logger

def run_command(command: List[str], cwd: str, logger: logging.Logger) -> None:
    """
    Run a shell command, log its output, and print to console.
    
    Args:
        command (List[str]): The command to run as a list of strings.
        cwd (str): The current working directory for the command.
        logger (logging.Logger): Logger object to use for this command.
    """
    logger.info(f"Running command: {' '.join(command)}")
    
    start_time = time.time()
    logger.info(f"Command started at: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    
    try:
        process = subprocess.Popen(
            command,
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            bufsize=1,
            universal_newlines=True
        )
        
        for line in process.stdout: # type: ignore
            line = line.strip()
            logger.info(line)
        
        process.wait()
        
        if process.returncode != 0:
            logger.error(f"Command failed with return code {process.returncode}")
        else:
            logger.info("Command completed successfully")
        
    except subprocess.CalledProcessError as e:
        logger.error(f"Command failed with return code {e.returncode}")
        logger.error(e.output)
    except Exception as e:
        logger.error(f"An error occurred: {str(e)}")
    
    end_time = time.time()
    duration = end_time - start_time
    logger.info(f"Command finished. Duration: {duration:.2f} seconds")

def run_sketch_command(command_name: str, additional_flags: str = "") -> None:
    """
    Run a sketch command and log its output to a specific file.
    
    Args:
        command_name (str): Name of the command (used for executable and log file).
        additional_flags (str): Additional command-line flags, if any.
    """
    log_file = os.path.join(LOG_FOLDER, f'{command_name}.log')
    logger = setup_logger(command_name, log_file)
    
    command = [f'./{command_name}'] + BASE_FLAGS.split() + additional_flags.split()
    run_command(command, BINARY_FOLDER, logger)



In [2]:
# some other information

# cuckoo heavy keeper size = bucket_size * bucket_num * 2
# bucket_size = (NUM_ENTRY_PER_BUCKET - 1) * sizeof(big entry) + sizeof(small entry)
# = 2 * (2(fingerprint) + 4(counter)) + (2 + 1) = 15
cuckoo_heavy_keeper_bucket_size = 15

# heav keeper size = bucket_size * bucket_num (bucket_num = w * d, d=2 by default)
# bucket_size = fingerprint + counter = 2 + 4 = 6
heavy_keeper_bucket_size = 6

# count-min sketch size = bucket_size * w * d 
# bucket_size = 4 (counter)
count_min_sketch_bucket_size = 4

# augmented sketch = count-min sketch size + filter size 
# = count-min sketch size + (2 + 4)*16 
# = count-min sketch size + 96
augmented_sketch_bucket_size = count_min_sketch_bucket_size

# space saving = bucket_size * bucket_num
# bucket_size = 2 (fingerprint) + 4 (counter) = 6
space_saving_bucket_size = 6

# HeapHashMapSpaceSaving = (bucket_size + hashmap_entry_size) * bucket_num
# old calculation: -> wrong
# # (bucket_size + hashmap_entry_size) = (2 + 4) * 2 = 12 
# heap_hashmap_space_saving_bucket_size = 12
# item size + vector item size, item map size = 12 + 4 + 8 = 24 

# if using max 1kb:
base_unit = 1
cuckoo_heavy_keeper_bucket_num = 32 * base_unit # 960 bytes, bucket_num needs to be 2^x 
# cuckoo_heavy_keeper_bucket_num = 24 * base_unit # if 4 buckets 
heavy_keeper_bucket_num = 85 * base_unit # 1020 bytes
count_min_sketch_bucket_d = 8 * base_unit
count_min_sketch_bucket_w = 32 * base_unit # 1024 bytes
augmented_sketch_bucket_d = 8 * base_unit
augmented_sketch_bucket_w = 29 * base_unit # 1024 bytes
space_saving_bucket_num = 170 * base_unit # 1020 bytes
heap_hashmap_space_saving_bucket_num = 43 * base_unit # 1020 bytes



In [3]:
# Dictionary of commands to run

COMMANDS: Dict[str, str] = {
    "example_count_min": f"--countmin.width={count_min_sketch_bucket_w} --countmin.depth={count_min_sketch_bucket_d}",
    "example_heavy_keeper": f"--heavykeeper.k={heavy_keeper_bucket_num}",
    "example_cuckoo_heavy_keeper": f"--cuckooheavykeeper.bucket_num={cuckoo_heavy_keeper_bucket_num} --cuckooheavykeeper.theta={THETA}",
    "example_augmented_sketch": f"--augmentedsketch.width={augmented_sketch_bucket_w} --augmentedsketch.depth={augmented_sketch_bucket_d}",
    "example_heap_hashmap_space_saving": f"--spacesaving.k={heap_hashmap_space_saving_bucket_num}",
}

if __name__ == "__main__":
    for cmd_name, cmd_flags in COMMANDS.items():
        run_sketch_command(cmd_name, cmd_flags)

2024-09-04 13:51:51 - example_count_min - Running command: ./example_count_min --app.line_read=100000 --app.theta 0.0005 --countmin.width=32 --countmin.depth=8
2024-09-04 13:51:51 - example_count_min - Command started at: 2024-09-04 13:51:51
2024-09-04 13:51:51 - example_count_min - +------------------------------+
2024-09-04 13:51:51 - example_count_min - | CountMinConfig               |
2024-09-04 13:51:51 - example_count_min - +------------------------------+
2024-09-04 13:51:51 - example_count_min - | WIDTH           : 32         |
2024-09-04 13:51:51 - example_count_min - | DEPTH           : 8          |
2024-09-04 13:51:51 - example_count_min - | DELTA           : 0.010000   |
2024-09-04 13:51:51 - example_count_min - | EPSILON         : 0.010000   |
2024-09-04 13:51:51 - example_count_min - | CALCULATE_FROM  : WIDTH_DEPTH|
2024-09-04 13:51:51 - example_count_min - +------------------------------+
2024-09-04 13:51:51 - example_count_min - 
2024-09-04 13:51:51 - example_count_min 

In [3]:
import os
import re
import glob
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def parse_log_file(file_path):
    data = []
    pattern = r'RESULT_SUMMARY: FrequencyEstimator=(\w+) TotalHeavyHitters=(\d+) TotalHeavyHitterCandidates=(\d+) Precision=([\d.]+) Recall=([\d.]+) ARE=([\d.]+) AAE=([\d.]+) ExecutionTime=([\d.]+)'
    
    with open(file_path, 'r') as file:
        for line in file:
            match = re.search(pattern, line)
            if match:
                data.append({
                    'FrequencyEstimator': match.group(1),
                    'TotalHeavyHitters': int(match.group(2)),
                    'TotalHeavyHitterCandidates': int(match.group(3)),
                    'Precision': float(match.group(4)),
                    'Recall': float(match.group(5)),
                    'ARE': float(match.group(6)),
                    'AAE': float(match.group(7)),
                    'ExecutionTime': float(match.group(8))
                })
    return data

def process_log_files(directory):
    all_data = []
    for file_path in glob.glob(os.path.join(directory, '*.log')):
        all_data.extend(parse_log_file(file_path))
    return pd.DataFrame(all_data)

def load_material_colors(file_path):
    with open(file_path, 'r') as f:
        return json.load(f)

def plot_metrics(df, colors, save_path='metrics_comparison.png'):
    metrics = ['Precision', 'Recall', 'ARE', 'AAE', 'ExecutionTime']
    color_palette = [colors['blue']['500'], colors['red']['500'], colors['green']['500'], 
                     colors['orange']['500'], colors['purple']['500']]
    
    fig, axes = plt.subplots(len(metrics), 1, figsize=(12, 4*len(metrics)), sharex=True)
    fig.suptitle('Comparison of Metrics Across Frequency Estimators', fontsize=16, y=1.02)
    
    for i, metric in enumerate(metrics):
        sns.boxplot(x='FrequencyEstimator', y=metric, data=df, ax=axes[i], 
              palette=color_palette, width=0.3)
        axes[i].set_title(f'{metric} Comparison', fontsize=14, y=1.05)  # Increase the y value to move the title higher
        axes[i].set_xlabel('')
        axes[i].tick_params(axis='both', which='major', labelsize=10)
        
        if metric in ['Precision', 'Recall']:
            axes[i].set_ylim(0, df[metric].max() * 1.1)
        elif metric == 'ARE':
            axes[i].set_ylim(0, df['ARE'].max() * 1.1)
        elif metric == 'AAE':
            axes[i].set_ylim(0, df['AAE'].max() * 1.1)
        elif metric == 'ExecutionTime':
            axes[i].set_ylim(0, df['ExecutionTime'].max() * 1.1)
        
        # Add value labels with mean ± std as percentage
        for j, estimator in enumerate(df['FrequencyEstimator'].unique()):
            estimator_data = df[df['FrequencyEstimator'] == estimator][metric]
            mean = estimator_data.mean()
            std = estimator_data.std()
            if metric in ['Precision', 'Recall', 'ARE']:
                label = f"{mean:.2f}±{std:.2f}"
            elif metric == 'AAE':
                label = f"{mean:.0f}±{std:.0f}"
            else:  # ExecutionTime
                label = f"{mean:.0f}±{std:.0f}"
            axes[i].text(j, axes[i].get_ylim()[1], label, ha='center', va='bottom', fontweight='bold', fontsize=10)
        
    plt.tight_layout(pad=1.5)
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()

def experiment(save_path='metrics_comparison.png'):
    log_directory = '/home/vinh/Q32024/CuckooHeavyKeeper/experiments/'
    material_colors_file = 'material-colors.json'  # Ensure this file is in the same directory as the script
    
    df = process_log_files(log_directory)
    colors = load_material_colors(material_colors_file)
    
    # Print summary statistics
    print(df.groupby('FrequencyEstimator').agg({
        'Precision': ['mean', 'std'],
        'Recall': ['mean', 'std'],
        'ARE': ['mean', 'std'],
        'AAE': ['mean', 'std'],
        'ExecutionTime': ['mean', 'std']
    }))
    
    # Create visualizations
    plot_metrics(df, colors, save_path)
    
    print("Analysis complete. Check 'metrics_comparison.png' for the visualization.")

if __name__ == "__main__":
    experiment()

                         Precision            Recall                 ARE  \
                              mean       std    mean       std      mean   
FrequencyEstimator                                                         
AugmentedSketch           0.017764  0.000034  1.0000  0.000000  6.856961   
CountMinSketch            0.017847  0.000027  1.0000  0.000000  6.452594   
CuckooHeavyKeeper         0.997529  0.005210  0.6608  0.021400  0.336559   
HeapHashMapSpaceSavingV2  0.018171  0.000000  1.0000  0.000000  1.349690   
HeavyKeeper               0.998529  0.004650  0.5336  0.013091  0.477003   

                                         AAE            ExecutionTime  \
                               std      mean        std          mean   
FrequencyEstimator                                                      
AugmentedSketch           0.120234  717.2248  12.095349      20.06296   
CountMinSketch            0.185016  729.3200   9.782181      39.15070   
CuckooHeavyKeeper         


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='FrequencyEstimator', y=metric, data=df, ax=axes[i],

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='FrequencyEstimator', y=metric, data=df, ax=axes[i],

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='FrequencyEstimator', y=metric, data=df, ax=axes[i],

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='FrequencyEstimator', y=metric, data=df, ax=axes[i],

Passing `palette` without assigning `hue` is deprecated

Analysis complete. Check 'metrics_comparison.png' for the visualization.


In [5]:
def reproduce(save_path=None):
  if save_path is None:
    assert False, "Please provide a path to save the metrics comparison plot."
  # calculate dist_param, base_unit and theta from save_path
  save_path_split = save_path.split('_')
  dist_param = float(save_path_split[4])
  base_unit = int(save_path_split[7])
  theta = float(save_path_split[9].split('.png')[0])
  print(f"dist_param: {dist_param}, base_unit: {base_unit}, theta: {theta}")
  
  BASE_FLAGS = f"--app.line_read=100000 --app.theta {theta} --app.dist_param {dist_param}"
            
  COMMANDS: Dict[str, str] = {
      # "example_count_min": f"--countmin.width={count_min_sketch_bucket_w*base_unit} --countmin.depth={count_min_sketch_bucket_d}",
      "example_heavy_keeper": f"--heavykeeper.m2={heavy_keeper_bucket_num*base_unit}",
      # "example_cuckoo_heavy_keeper": f"--cuckooheavykeeper.bucket_num={cuckoo_heavy_keeper_bucket_num*base_unit} --cuckooheavykeeper.theta={theta}",
      # "example_augmented_sketch": f"--augmentedsketch.width={augmented_sketch_bucket_w*base_unit} --augmentedsketch.depth={augmented_sketch_bucket_d}",
      # "example_heap_hashmap_space_saving": f"--spacesaving.k={heap_hashmap_space_saving_bucket_num*base_unit}",
  }
  
  for cmd_name, cmd_flags in COMMANDS.items():
      run_sketch_command(cmd_name, cmd_flags)
  experiment(save_path)
  
reproduce('/home/vinh/Q32024/CuckooHeavyKeeper/notebooks/metrics_comparison_dist_param_0.8_base_unit_2_theta_0.001.png')

2024-09-04 13:52:03 - example_heavy_keeper - Running command: ./example_heavy_keeper --app.line_read=100000 --app.theta 0.0005 --heavykeeper.m2=170
2024-09-04 13:52:03 - example_heavy_keeper - Command started at: 2024-09-04 13:52:03
2024-09-04 13:52:03 - example_heavy_keeper - +----------------------+
2024-09-04 13:52:03 - example_heavy_keeper - | HeavyKeeperConfig    |
2024-09-04 13:52:03 - example_heavy_keeper - +----------------------+
2024-09-04 13:52:03 - example_heavy_keeper - | K               : 100|
2024-09-04 13:52:03 - example_heavy_keeper - | M2              : 170|
2024-09-04 13:52:03 - example_heavy_keeper - +----------------------+
2024-09-04 13:52:03 - example_heavy_keeper - 
2024-09-04 13:52:03 - example_heavy_keeper - +---------------------------+
2024-09-04 13:52:03 - example_heavy_keeper - | AppConfig                 |
2024-09-04 13:52:03 - example_heavy_keeper - +---------------------------+
2024-09-04 13:52:03 - example_heavy_keeper - | MODE            : count   |
2

dist_param: 0.8, base_unit: 2, theta: 0.001


2024-09-04 13:52:03 - example_heavy_keeper - ## Run 0
2024-09-04 13:52:03 - example_heavy_keeper - # sketch + heap heavy hitters
2024-09-04 13:52:03 - example_heavy_keeper - Execution Time: 40.6947
2024-09-04 13:52:03 - example_heavy_keeper - Count distinct: 7368
2024-09-04 13:52:03 - example_heavy_keeper - Total heavy hitters: 125
2024-09-04 13:52:03 - example_heavy_keeper - Total heavy_hitter candidates: 65
2024-09-04 13:52:03 - example_heavy_keeper - # sample set: true heavy_hitter_counter
2024-09-04 13:52:03 - example_heavy_keeper - ARE: 0.480484
2024-09-04 13:52:03 - example_heavy_keeper - AAE: 42.432
2024-09-04 13:52:03 - example_heavy_keeper - Precision: 65 over 65
2024-09-04 13:52:03 - example_heavy_keeper - Recall: 65 over 125
2024-09-04 13:52:03 - example_heavy_keeper - ---------------------
2024-09-04 13:52:03 - example_heavy_keeper - # sample set: heavy_hitter candidates
2024-09-04 13:52:03 - example_heavy_keeper - ARE: 0.0746934
2024-09-04 13:52:03 - example_heavy_keeper -

                         Precision            Recall                 ARE  \
                              mean       std    mean       std      mean   
FrequencyEstimator                                                         
AugmentedSketch           0.017764  0.000034  1.0000  0.000000  6.856961   
CountMinSketch            0.017847  0.000027  1.0000  0.000000  6.452594   
CuckooHeavyKeeper         0.997529  0.005210  0.6608  0.021400  0.336559   
HeapHashMapSpaceSavingV2  0.018171  0.000000  1.0000  0.000000  1.349690   
HeavyKeeper               0.998529  0.004650  0.5336  0.013091  0.477003   

                                         AAE            ExecutionTime  \
                               std      mean        std          mean   
FrequencyEstimator                                                      
AugmentedSketch           0.120234  717.2248  12.095349      20.06296   
CountMinSketch            0.185016  729.3200   9.782181      39.15070   
CuckooHeavyKeeper         


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='FrequencyEstimator', y=metric, data=df, ax=axes[i],

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x='FrequencyEstimator', y=metric, data=df, ax=axes[i],


Analysis complete. Check 'metrics_comparison.png' for the visualization.


In [5]:
# running grid experiment for different --app.dist_param 

dist_params = [0.8, 1, 1.2, 1.5]
base_units = [2, 4, 8, 16]
thetas = [0.0005, 0.001, 0.005, 0.01]

for dist_param in dist_params:
    for base_unit in base_units:
        for theta in thetas:
            BASE_FLAGS = f"--app.line_read=5000000 --app.theta {theta} --app.dist_param {dist_param}"
            
            COMMANDS: Dict[str, str] = {
                "example_count_min": f"--countmin.width={count_min_sketch_bucket_w*base_unit} --countmin.depth={count_min_sketch_bucket_d}",
                "example_heavy_keeper": f"--heavykeeper.m2={heavy_keeper_bucket_num*base_unit}",
                "example_cuckoo_heavy_keeper": f"--cuckooheavykeeper.bucket_num={cuckoo_heavy_keeper_bucket_num*base_unit} --cuckooheavykeeper.theta={theta}",
                "example_augmented_sketch": f"--augmentedsketch.width={augmented_sketch_bucket_w*base_unit} --augmentedsketch.depth={augmented_sketch_bucket_d}",
                "example_heap_hashmap_space_saving": f"--spacesaving.k={heap_hashmap_space_saving_bucket_num*base_unit}",
            }
            for cmd_name, cmd_flags in COMMANDS.items():
                run_sketch_command(cmd_name, cmd_flags)
                
            save_path = f'metrics_comparison_dist_param_{dist_param}_base_unit_{base_unit}_theta_{theta}.png'
            experiment(save_path)
            
            

2024-09-04 14:31:55 - example_count_min - Running command: ./example_count_min --app.line_read=5000000 --app.theta 0.0005 --app.dist_param 0.8 --countmin.width=64 --countmin.depth=8
2024-09-04 14:31:55 - example_count_min - Command started at: 2024-09-04 14:31:55
2024-09-04 14:31:55 - example_count_min - +------------------------------+
2024-09-04 14:31:55 - example_count_min - | CountMinConfig               |
2024-09-04 14:31:55 - example_count_min - +------------------------------+
2024-09-04 14:31:55 - example_count_min - | WIDTH           : 64         |
2024-09-04 14:31:55 - example_count_min - | DEPTH           : 8          |
2024-09-04 14:31:55 - example_count_min - | DELTA           : 0.010000   |
2024-09-04 14:31:55 - example_count_min - | EPSILON         : 0.010000   |
2024-09-04 14:31:55 - example_count_min - | CALCULATE_FROM  : WIDTH_DEPTH|
2024-09-04 14:31:55 - example_count_min - +------------------------------+
2024-09-04 14:31:55 - example_count_min - 
2024-09-04 14:31:5