In [45]:
!uv add pandas numpy matplotlib seaborn pyarrow

[2mResolved [1m51 packages[0m [2min 9ms[0m[0m
[2mAudited [1m16 packages[0m [2min 0.04ms[0m[0m


In [46]:
import os
from enum import Enum
import logging
import re # Added import

# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [47]:
class LogType(Enum):
    BASELINE = "baseline"
    CVM = "cvm"
    MISC = "misc"

In [48]:
def get_log_files(log_type: LogType) -> list[str]:
    """Gets a list of log file paths from the specified log type directory.

    Args:
        log_type: The type of logs to retrieve (LogType.BASELINE, LogType.CVM, LogType.MISC).

    Returns:
        A list of full file paths for the logs found in the directory.
        Returns an empty list if the directory doesn't exist or is empty.
    """
    base_log_path = "/home/ybyan/cvm-net-perf/echo/logs"
    log_folder = log_type.value
    target_path = os.path.join(base_log_path, log_folder)
    
    log_files = []
    if not os.path.isdir(target_path):
        logging.warning(f"Directory not found: {target_path}")
        return log_files
        
    try:
        for filename in os.listdir(target_path):
            file_path = os.path.join(target_path, filename)
            if os.path.isfile(file_path):
                log_files.append(file_path)
        logging.info(f"Found {len(log_files)} log files in {target_path}")
    except OSError as e:
        logging.error(f"Error accessing directory {target_path}: {e}")
        
    return log_files

In [49]:
def get_specific_log_file(log_type: LogType, filename: str) -> str | None:
    """Gets the full path to a specific log file within the log type directory.
    
    Args:
        log_type: The type of logs directory to search in.
        filename: The name of the log file.
        
    Returns:
        The full path to the log file if found, otherwise None.
    """
    base_log_path = "/home/ybyan/cvm-net-perf/echo/logs"
    log_folder = log_type.value
    target_path = os.path.join(base_log_path, log_folder, filename)
    
    if os.path.isfile(target_path):
        logging.info(f"Found specific log file: {target_path}")
        return target_path
    else:
        logging.warning(f"Specific log file not found: {target_path}")
        return None

In [50]:
def read_log_data(log_type: LogType, filename: str) -> list[str]:
    """Reads the content of a specific log file.
    
    Args:
        log_type: The type of logs directory.
        filename: The name of the log file.
        
    Returns:
        A list of strings, where each string is a line from the file.
        Returns an empty list if the file cannot be found or read.
    """
    file_path = get_specific_log_file(log_type, filename)
    lines = []
    if not file_path:
        # get_specific_log_file already logs a warning
        return lines
        
    try:
        with open(file_path, 'r') as f:
            lines = f.readlines()
        logging.info(f"Successfully read {len(lines)} lines from {log_type.name}/{filename}")
    except IOError as e:
        logging.error(f"Error reading file {log_type.name}/{filename} (path: {file_path}): {e}")
        
    return lines

In [51]:
def extract_log_info(log_type: LogType, filename: str, patterns: list[str]) -> list[str]:
    """Extracts lines from a specific log file that match given regex patterns.
    
    Args:
        log_type: The type of logs directory.
        filename: The name of the log file.
        patterns: A list of regex patterns (strings) to search for.
        
    Returns:
        A list of lines from the log file that match any of the patterns.
    """
    file_path = get_specific_log_file(log_type, filename)
    if not file_path:
        return []
        
    log_data = read_log_data(log_type, filename)
    if not log_data:
        return []
        
    matching_lines = []
    compiled_patterns = []
    for pattern in patterns:
        try:
            compiled_patterns.append(re.compile(pattern))
        except re.error as e:
            logging.error(f"Invalid regex pattern '{pattern}': {e}")
            continue 
            
    for line in log_data:
        for compiled_pattern in compiled_patterns:
            if compiled_pattern.search(line):
                matching_lines.append(line.strip())
                break 
                
    logging.info(f"Extracted {len(matching_lines)} lines matching patterns from {log_type.name}/{filename}")
    return matching_lines

In [52]:
def parse_results_section(log_type: LogType, filename: str) -> dict:
    """Parses the 'Results' section of a specific log file using simplified keys.
    
    Args:
        log_type: The type of logs directory.
        filename: The name of the log file.
        
    Returns:
        A dictionary containing the key-value pairs from the Results section,
        using simplified, programmatic keys.
        Returns an empty dictionary if the file/section is not found or parsing fails.
    """
    file_path = get_specific_log_file(log_type, filename)
    if not file_path:
        return {}
        
    log_data = read_log_data(log_type, filename)
    if not log_data:
        return {}
        
    results = {}
    in_results_section = False
    latency_section_active = False
    
    # Mapping from log file keys to simplified dictionary keys
    key_map = {
        "Mode": "mode",
        "Clients": "clients",
        "Total Requests Completed": "total_requests",
        "Test Duration": "duration_sec",
        "Achieved Throughput": "throughput_rps",
        "Target Rate (per client)": "target_rate_client_hz",
        "Target Rate (total)": "target_rate_total_hz",
        "Latency Average": "latency_avg_us",
        "Latency p50": "latency_p50_us",
        "Latency p90": "latency_p90_us",
        "Latency p95": "latency_p95_us",
        "Latency p99": "latency_p99_us",
    }
    
    # Regex patterns (unchanged)
    kv_pattern = re.compile(r"^([^:\s][^:]*):\s+(.*)$")
    latency_pattern = re.compile(r"^\s+([\w\.]+):\s+([\d\.]+)$")
    latency_header_pattern = re.compile(r"Latency \(microseconds\)")
    
    for line in log_data:
        line = line.rstrip()
        stripped_line = line.strip()
        
        if "---- Results ----" in stripped_line:
            in_results_section = True
            continue
            
        if "-------------------------------------------------" in stripped_line and in_results_section:
            break 
            
        if not in_results_section:
            continue
            
        if latency_header_pattern.search(stripped_line):
            latency_section_active = True
            continue
            
        parsed_key = None
        value_str = None
        original_key = None
        
        if latency_section_active:
            latency_match = latency_pattern.match(line)
            if latency_match:
                original_key = f"Latency {latency_match.group(1)}"
                value_str = latency_match.group(2)
                parsed_key = key_map.get(original_key)
                if not parsed_key:
                     # Fallback for unmapped latency keys (e.g., Latency p99.9)
                     parsed_key = f"latency_{latency_match.group(1).lower().replace('.', '_')}_us"
                     logging.debug(f"Using fallback key '{parsed_key}' for unmapped latency key '{original_key}'")
                
        # Check general KV pattern if not a latency line or if latency match failed
        if not parsed_key:
            kv_match = kv_pattern.match(stripped_line)
            if kv_match:
                original_key = kv_match.group(1).strip()
                value_str = kv_match.group(2).strip()
                parsed_key = key_map.get(original_key)
                if not parsed_key:
                    # Fallback for unmapped general keys
                    parsed_key = original_key.lower().replace(' ', '_').replace('(', '').replace(')', '')
                    logging.debug(f"Using fallback key '{parsed_key}' for unmapped key '{original_key}'")
        
        # If a key-value pair was successfully parsed (either latency or general)
        if parsed_key and value_str is not None:
            try:
                # Value conversion logic (prioritize units)
                if ' req/sec' in value_str:
                   results[parsed_key] = float(value_str.replace(' req/sec',''))
                elif ' Hz' in value_str:
                   results[parsed_key] = float(value_str.replace(' Hz',''))
                elif ' seconds' in value_str:
                   results[parsed_key] = int(value_str.replace(' seconds',''))
                elif '.' in value_str:
                    results[parsed_key] = float(value_str)
                else:
                    results[parsed_key] = int(value_str) 
            except ValueError:
                 results[parsed_key] = value_str # Keep as string if all conversions fail
            continue # Move to next line once processed
                 
    if not results:
        logging.warning(f"Could not find or parse Results section in {log_type.name}/{filename}")
    else:
        logging.info(f"Successfully parsed Results section from {log_type.name}/{filename} using simplified keys")
        
    return results

In [53]:
# Updated call
r = parse_results_section(LogType.BASELINE, "closed_2c.log")
r

2025-04-30 19:59:24,449 - INFO - Found specific log file: /home/ybyan/cvm-net-perf/echo/logs/baseline/closed_2c.log
2025-04-30 19:59:24,453 - INFO - Found specific log file: /home/ybyan/cvm-net-perf/echo/logs/baseline/closed_2c.log
2025-04-30 19:59:24,455 - INFO - Successfully read 22 lines from BASELINE/closed_2c.log
2025-04-30 19:59:24,457 - INFO - Successfully parsed Results section from BASELINE/closed_2c.log using simplified keys


{'mode': 'Closed Loop',
 'clients': 2,
 'total_requests': 229466,
 'duration_sec': 10,
 'throughput_rps': 22946.6,
 'latency_avg_us': 86.5831,
 'latency_p50_us': 85,
 'latency_p90_us': 95,
 'latency_p95_us': 98,
 'latency_p99_us': 109}

In [54]:
# Updated call
r = parse_results_section(LogType.BASELINE, "open_5c_1000p.log")
r

2025-04-30 19:59:24,471 - INFO - Found specific log file: /home/ybyan/cvm-net-perf/echo/logs/baseline/open_5c_1000p.log
2025-04-30 19:59:24,474 - INFO - Found specific log file: /home/ybyan/cvm-net-perf/echo/logs/baseline/open_5c_1000p.log
2025-04-30 19:59:24,476 - INFO - Successfully read 26 lines from BASELINE/open_5c_1000p.log
2025-04-30 19:59:24,477 - INFO - Successfully parsed Results section from BASELINE/open_5c_1000p.log using simplified keys


{'mode': 'Open Loop (2 Threads/Client, ID Matching)',
 'clients': 5,
 'target_rate_client_hz': 1000.0,
 'target_rate_total_hz': 5000.0,
 'total_requests': 50153,
 'duration_sec': 10,
 'throughput_rps': 5015.3,
 'latency_avg_us': 132.684,
 'latency_p50_us': 130,
 'latency_p90_us': 167,
 'latency_p95_us': 187,
 'latency_p99_us': 199}

In [55]:
import pandas as pd

def parse_all_logs_to_dataframe() -> pd.DataFrame:
    """
    Parses the 'Results' section from all log files across all LogType directories
    and compiles them into a single pandas DataFrame.

    Returns:
        A pandas DataFrame containing the parsed results from all log files,
        including columns for 'log_type' and 'filename'. Returns an empty
        DataFrame if no logs are found or no results can be parsed.
    """
    all_results = []
    
    for log_type in LogType:
        log_files = get_log_files(log_type)
        logging.info(f"Processing {len(log_files)} files for log type: {log_type.name}")
        
        for file_path in log_files:
            filename = os.path.basename(file_path)
            # parse_results_section already logs info/warnings about parsing
            parsed_data = parse_results_section(log_type, filename)
            
            if parsed_data:
                # Add metadata to the parsed results
                parsed_data['log_type'] = log_type.value
                parsed_data['filename'] = filename
                all_results.append(parsed_data)
            else:
                logging.warning(f"No results parsed from {log_type.name}/{filename}")

    if not all_results:
        logging.warning("No results were parsed from any log files.")
        return pd.DataFrame() # Return empty DataFrame

    # Convert the list of dictionaries to a DataFrame
    results_df = pd.DataFrame(all_results)
    logging.info(f"Successfully compiled results from {len(results_df)} log files into a DataFrame.")
    
    # Optional: Reorder columns to put metadata first
    cols = ['log_type', 'filename'] + [col for col in results_df.columns if col not in ['log_type', 'filename']]
    results_df = results_df[cols]
    
    return results_df

all_data_df = parse_all_logs_to_dataframe()
print(f"Created DataFrame with shape: {all_data_df.shape}")

2025-04-30 19:59:24,495 - INFO - Found 34 log files in /home/ybyan/cvm-net-perf/echo/logs/baseline
2025-04-30 19:59:24,497 - INFO - Processing 34 files for log type: BASELINE
2025-04-30 19:59:24,498 - INFO - Found specific log file: /home/ybyan/cvm-net-perf/echo/logs/baseline/open_5c_1500p.log
2025-04-30 19:59:24,499 - INFO - Found specific log file: /home/ybyan/cvm-net-perf/echo/logs/baseline/open_5c_1500p.log
2025-04-30 19:59:24,500 - INFO - Successfully read 25 lines from BASELINE/open_5c_1500p.log
2025-04-30 19:59:24,501 - INFO - Successfully parsed Results section from BASELINE/open_5c_1500p.log using simplified keys
2025-04-30 19:59:24,502 - INFO - Found specific log file: /home/ybyan/cvm-net-perf/echo/logs/baseline/open_5c_2000p.log
2025-04-30 19:59:24,504 - INFO - Found specific log file: /home/ybyan/cvm-net-perf/echo/logs/baseline/open_5c_2000p.log
2025-04-30 19:59:24,505 - INFO - Successfully read 25 lines from BASELINE/open_5c_2000p.log
2025-04-30 19:59:24,506 - INFO - Succe

Created DataFrame with shape: (70, 14)


In [59]:
import numpy as np

# Clean up the 'mode' column
all_data_df['mode'] = np.where(all_data_df['mode'].str.contains("Open Loop", case=False, na=False), 
                               "open", 
                               all_data_df['mode'])
all_data_df['mode'] = np.where(all_data_df['mode'].str.contains("Closed Loop", case=False, na=False), 
                               "close", 
                               all_data_df['mode'])

# Convert the 'mode' column to categorical type
all_data_df['mode'] = all_data_df['mode'].astype('category')

# Display the unique values and dtype to verify
print("Unique values in 'mode' column after cleaning:", all_data_df['mode'].unique())
print("Data type of 'mode' column:", all_data_df['mode'].dtype)
# Display the first few rows with the updated column
print(all_data_df[['filename', 'mode']].head())

Unique values in 'mode' column after cleaning: ['open', 'close']
Categories (2, object): ['close', 'open']
Data type of 'mode' column: category
            filename   mode
0  open_5c_1500p.log   open
1  open_5c_2000p.log   open
2     closed_16c.log  close
3  open_5c_8000p.log   open
4  open_5c_5000p.log   open


In [60]:
all_data_df.to_parquet("echo_logs.parquet")

In [61]:
all_data_df

Unnamed: 0,log_type,filename,mode,clients,target_rate_client_hz,target_rate_total_hz,total_requests,duration_sec,throughput_rps,latency_avg_us,latency_p50_us,latency_p90_us,latency_p95_us,latency_p99_us
0,baseline,open_5c_1500p.log,open,5,1500.0,7500.0,75304,10,7530.4,125.367,124,157,165,187
1,baseline,open_5c_2000p.log,open,5,2000.0,10000.0,100303,10,10030.3,121.792,118,157,164,182
2,baseline,closed_16c.log,close,16,,,634515,10,63451.5,253.563,250,288,302,337
3,baseline,open_5c_8000p.log,open,5,8000.0,40000.0,398605,10,39860.5,165.395,154,244,272,318
4,baseline,open_5c_5000p.log,open,5,5000.0,25000.0,250692,10,25069.2,123.732,119,160,170,198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
65,cvm,open_5c_6000p.log,open,5,6000.0,30000.0,301593,10,30159.3,148.481,148,196,224,313
66,cvm,open_5c_2500p.log,open,5,2500.0,12500.0,125376,10,12537.6,135.055,144,162,170,190
67,cvm,open_20c_2500p.log,open,20,2500.0,50000.0,493675,10,49367.5,337.896,285,574,794,1048
68,misc,open_20c_2500p_with_hires.log,open,20,2500.0,50000.0,395262,10,39526.2,813.678,826,1103,1173,1307
