In [None]:
import matplotlib.pyplot as plt
import numpy as np
from collections import defaultdict

def _parse_wbm_line(line):
    """Parse one log line into ``(client_id, key, timestamp_s, value_mb)`` samples.

    Two comma-separated record shapes are recognized on lines starting with
    ``wbm``:

    * 7 fields with ``res`` in the fourth: the sixth field is the current
      value in bytes and the seventh names the pool (``steady`` or ``global``).
    * 8 fields with ``free`` in the fourth: the last two fields look like
      ``global:<bytes>`` and ``steady:<bytes>`` and yield one sample each.

    Anything else, including records whose numeric fields do not parse,
    yields an empty list so the caller can simply skip it.
    """
    if not line.startswith("wbm"):
        return []

    bytes_per_mb = 1024 * 1024
    parts = line.strip().split(',')
    try:
        if len(parts) == 7 and parts[3] == 'res':
            timestamp = int(parts[1]) / 1000  # ms -> s
            client_id = int(parts[2])
            value_mb = int(parts[5]) / bytes_per_mb
            key = parts[6].strip()
            if key in ('steady', 'global'):
                return [(client_id, key, timestamp, value_mb)]
        elif len(parts) == 8 and parts[3] == 'free':
            timestamp = int(parts[1]) / 1000  # ms -> s
            client_id = int(parts[2])
            # Parse both values before emitting either, so a malformed record
            # never contributes only half of its samples.
            global_mb = int(parts[6].split(':')[1]) / bytes_per_mb
            steady_mb = int(parts[7].split(':')[1]) / bytes_per_mb
            return [
                (client_id, 'global', timestamp, global_mb),
                (client_id, 'steady', timestamp, steady_mb),
            ]
    except (ValueError, IndexError):
        pass
    return []


def _latest_value_series(samples, query_times):
    """Return the most recent sample value at or before each query time.

    ``samples`` is a list of ``(time, value)`` pairs. Query times that precede
    every sample map to 0, i.e. a pool that has not reported yet counts as
    empty. Samples are ordered by time first (stable, so file order breaks
    ties) and looked up with a binary search instead of one boolean mask per
    query time.
    """
    if not samples:
        return np.zeros(len(query_times))
    data = np.asarray(samples, dtype=float)
    order = np.argsort(data[:, 0], kind='stable')
    times = data[order, 0]
    values = data[order, 1]
    last_idx = np.searchsorted(times, query_times, side='right') - 1
    return np.where(last_idx >= 0, values[np.maximum(last_idx, 0)], 0.0)


def plot_wbm_data(file_path):
    """Plot per-client and aggregate usage over time from a wbm stats log.

    Every client gets one line showing its latest ``global`` plus ``steady``
    value (in MB) at each timestamp seen anywhere in the log; three further
    lines show the sum over all clients of the total, the steady part and the
    global part. Time is shown in seconds relative to the earliest sample.

    Args:
        file_path: Path of the log file to read.

    Returns:
        None. The figure is displayed with ``plt.show()``. If the file holds
        no parsable wbm record, a notice is printed and nothing is plotted.
    """
    client_data = defaultdict(lambda: {'global': [], 'steady': []})
    with open(file_path, 'r') as file:
        for line in file:
            for client_id, key, timestamp, value_mb in _parse_wbm_line(line):
                client_data[client_id][key].append((timestamp, value_mb))

    if not client_data:
        # Without this guard the min() below raises on an empty sequence.
        print(f"No wbm records found in {file_path}; nothing to plot.")
        return

    # Rebase on the true earliest sample so no relative time is negative,
    # even if the log is not strictly chronological.
    start_time = min(
        ts
        for series in client_data.values()
        for samples in series.values()
        for ts, _ in samples
    )
    rebased = {
        client_id: {
            key: [(ts - start_time, value) for ts, value in samples]
            for key, samples in series.items()
        }
        for client_id, series in client_data.items()
    }

    # Common time grid: every distinct timestamp of any client and pool.
    all_timestamps = np.unique([
        ts
        for series in rebased.values()
        for samples in series.values()
        for ts, _ in samples
    ])

    # Evaluate each client's step function once and reuse it for both the
    # per-client lines and the aggregate lines.
    grid_size = len(all_timestamps)
    sum_global_values = np.zeros(grid_size)
    sum_steady_values = np.zeros(grid_size)
    total_usage_values = np.zeros(grid_size)
    client_total_usage = {}
    for client_id, series in rebased.items():
        global_series = _latest_value_series(series['global'], all_timestamps)
        steady_series = _latest_value_series(series['steady'], all_timestamps)
        client_usage = global_series + steady_series
        client_total_usage[client_id] = client_usage
        total_usage_values += client_usage
        sum_global_values += global_series
        sum_steady_values += steady_series

    # Plot the data
    plt.figure(figsize=(12, 6))

    # Per-client total usage
    for client_id, usage in client_total_usage.items():
        plt.plot(all_timestamps, usage, label=f"Client {client_id} Total Usage")

    # Summed lines
    plt.plot(all_timestamps, total_usage_values, label="All Clients Total Usage", color="black", linestyle="-", linewidth=2)
    plt.plot(all_timestamps, sum_steady_values, label="Total Steady Usage", color="green", linestyle="--", linewidth=2)
    plt.plot(all_timestamps, sum_global_values, label="Total Global Usage", color="blue", linestyle=":", linewidth=2)

    # Labels, legend, and title
    plt.xlabel("Time Since Start (s)")
    plt.ylabel("Usage (MB)")
    plt.title("Usage Over Time")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    # plt.ylim(0, 256)
    # plt.xlim(0, 25)

    plt.show()

# Run the plot for this cell. plot_wbm_data opens the path as given, so a
# relative path is resolved against the current working directory.
file_path = "logs/memtable_stats.txt"  # Replace with your file's path
plot_wbm_data(file_path)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Configuration: payload size of one operation of each kind, in KB.
READ_SIZE_KB = 1            # one READ
INSERT_SIZE_KB = 1          # one INSERT
INSERT_BATCH_SIZE_KB = 100  # one INSERT_BATCH

# The same sizes in MB, used to turn ops/sec into MB/s.
READ_SIZE_MB, INSERT_SIZE_MB, INSERT_BATCH_SIZE_MB = (
    size_kb / 1024
    for size_kb in (READ_SIZE_KB, INSERT_SIZE_KB, INSERT_BATCH_SIZE_KB)
)

# Load the per-client stats log (CSV with a header row).
df = pd.read_csv('logs/client_stats.log')

# Rebase timestamps on the first sample; the /1000 treats them as milliseconds.
df['relative_time_ms'] = df['timestamp'] - df['timestamp'].min()
df['relative_time_s'] = df['relative_time_ms'] / 1000.0

# Sort so that diff() measures the gap between consecutive samples of the same
# (client, op) stream; the first sample of each stream gets NaN.
df = df.sort_values(by=['client_id', 'op_type', 'timestamp'])
df['time_diff_s'] = df.groupby(['client_id', 'op_type'])['relative_time_s'].diff()

# Throughput in ops/sec over the interval since the previous sample, then
# scaled to MB/s with the configured per-operation size. Op types without a
# configured size keep the 0.0 initial value.
df['throughput'] = df['count'] / df['time_diff_s']
df['throughput_mb_s'] = 0.0  # initialize as float

df.loc[df['op_type'] == 'READ', 'throughput_mb_s'] = df['throughput'] * READ_SIZE_MB
df.loc[df['op_type'] == 'INSERT', 'throughput_mb_s'] = df['throughput'] * INSERT_SIZE_MB
df.loc[df['op_type'] == 'INSERT_BATCH', 'throughput_mb_s'] = df['throughput'] * INSERT_BATCH_SIZE_MB

# Drop rows without a throughput (the first sample of each stream).
df_throughput = df.dropna(subset=['throughput_mb_s'])

# Separate dataframes for different operation categories. The latency frames
# are taken from the unfiltered data, so they keep each stream's first sample.
# df_read gets new columns below, so take an explicit copy: assigning into a
# filtered slice triggers pandas' SettingWithCopyWarning.
df_read = df_throughput[df_throughput['op_type'] == 'READ'].copy()
df_insert = df_throughput[df_throughput['op_type'] == 'INSERT']
df_insert_batch = df_throughput[df_throughput['op_type'] == 'INSERT_BATCH']
df_queue_latency = df[df['op_type'] == 'QUEUE']
df_read_latency = df[df['op_type'] == 'READ']
df_insert_latency = df[df['op_type'] == 'INSERT']

# Cache hit rate (percent) for READ samples; NaN where no lookups were recorded.
df_read['user_cache_total'] = df_read['user_cache_hits'] + df_read['user_cache_misses']
df_read['user_cache_hit_rate'] = np.where(
    df_read['user_cache_total'] > 0,
    (df_read['user_cache_hits'] / df_read['user_cache_total']) * 100.0,
    np.nan
)

# Wide table (time x client) of cache usage for the stacked area plot; a
# client with no sample at a given time contributes 0.
df_cache_usage = df_read[['relative_time_s', 'client_id', 'user_cache_usage']].dropna()
df_cache_usage_pivot = df_cache_usage.pivot_table(
    index='relative_time_s',
    columns='client_id',
    values='user_cache_usage',
    aggfunc='mean',
    fill_value=0
)

def _plot_per_client(ax, frame, y_column, label_suffix=''):
    """Draw one line per client_id of ``frame`` on ``ax`` against relative time."""
    for cid, rows in frame.groupby('client_id'):
        ax.plot(rows['relative_time_s'], rows[y_column], label=f'Client {cid}{label_suffix}')


def _label_panel(ax, title, ylabel):
    """Apply the title, y label and legend placement shared by all panels."""
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.legend(loc='upper right')


# Eight stacked panels on a shared time axis. Only the first six are drawn
# while the two cache panels at the bottom stay disabled.
fig, axes = plt.subplots(nrows=8, ncols=1, figsize=(10, 24), sharex=True)
fig.subplots_adjust(hspace=0.4)
ax1, ax2, ax3, ax4, ax5, ax6 = axes[:6]

# 1. Per-Client READ Throughput
_plot_per_client(ax1, df_read, 'throughput_mb_s')
_label_panel(ax1, 'Per-Client READ Throughput', 'Throughput (MB/s)')

# 2. Per-Client WRITE Throughput (INSERT & INSERT_BATCH)
_plot_per_client(ax2, df_insert, 'throughput_mb_s', ' INSERT')
_plot_per_client(ax2, df_insert_batch, 'throughput_mb_s', ' INSERT_BATCH')
_label_panel(ax2, 'Per-Client WRITE Throughput (INSERT & INSERT_BATCH)', 'Throughput (MB/s)')

# 3. Combined READ & WRITE Throughput (one line per client and op type)
combined_ops = df_throughput[df_throughput['op_type'].isin(['READ', 'INSERT', 'INSERT_BATCH'])]
for (client_id, op_type), grp in combined_ops.groupby(['client_id', 'op_type']):
    ax3.plot(grp['relative_time_s'], grp['throughput_mb_s'], label=f'Client {client_id}, {op_type}')
_label_panel(ax3, 'Combined READ & WRITE Throughput', 'Throughput (MB/s)')

# 4. P99 READ Latency
_plot_per_client(ax4, df_read_latency, '99p')
_label_panel(ax4, 'Per-Client READ P99 Latency', 'Latency (ms)')

# 5. P99 INSERT Latency
_plot_per_client(ax5, df_insert_latency, '99p')
_label_panel(ax5, 'Per-Client INSERT P99 Latency', 'Latency (ms)')

# 6. P99 QUEUE Latency
_plot_per_client(ax6, df_queue_latency, '99p')
_label_panel(ax6, 'Per-Client QUEUE P99 Latency', 'Latency (ms)')

# # 7. User Cache Hit Rate (per-client line)
# ax7 = axes[6]
# for client_id, grp in df_read.groupby('client_id'):
#     valid_grp = grp.dropna(subset=['user_cache_hit_rate'])
#     if not valid_grp.empty:
#         ax7.plot(valid_grp['relative_time_s'], valid_grp['user_cache_hit_rate'], label=f'Client {client_id}')
# ax7.set_title('Per-Client User Cache Hit Rate')
# ax7.set_ylabel('Hit Rate (%)')
# ax7.legend(loc='upper right')

# # 8. User Cache Usage (Stacked)
# ax8 = axes[7]
# x = df_cache_usage_pivot.index
# y = df_cache_usage_pivot.values.T  # Each row in y is a series for one client
# ax8.stackplot(x, y, labels=df_cache_usage_pivot.columns)
# ax8.set_title('Per-Client User Cache Usage (Stacked)')
# ax8.set_xlabel('Time (s)')
# ax8.set_ylabel('Cache Usage')
# ax8.legend(loc='upper right')

plt.tight_layout()
plt.show()


In [None]:
# Overall throughputs (client + system)

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datetime import datetime
import time

# Device ceilings; the plotting functions divide measured rates by these to
# obtain utilization fractions.
max_read_tput, max_write_tput = 700, 400        # divisors for rMB/s and wMB/s
max_read_iops, max_write_iops = 180000, 100000  # divisors for r/s and w/s

def timestamp_to_seconds(timestamp_str):
  """Convert a ``YYYY-MM-DD HH:MM:SS.ffffff`` string to seconds since 1970-01-01.

  Trailing whitespace (such as a newline) is ignored. The value is parsed as
  a naive datetime and measured against a naive 1970-01-01 00:00:00, so no
  timezone conversion takes place.
  """
  parsed = datetime.strptime(timestamp_str.rstrip(), '%Y-%m-%d %H:%M:%S.%f')
  since_epoch = parsed - datetime(1970, 1, 1)
  return since_epoch.total_seconds()

def plot_overall_tputs(output_file, axs, start_time_shift, xlim, fig_loc, csv_path="iostat_results.csv"):
  """Plot read/write MB/s from an iostat CSV and append the series to a file.

  Args:
    output_file: Path of the text file the series are appended to.
    axs: Indexable collection of matplotlib axes.
    start_time_shift: Offset added to every row index to form its x value.
    xlim: Passed to ``set_xlim`` on the target axes.
    fig_loc: Sequence whose first element selects the target axes in ``axs``.
    csv_path: CSV file to read; needs ``rMB/s`` and ``wMB/s`` columns.

  Appended to ``output_file``, in this order: ``rMB_rate``, ``wMB_rate``,
  ``rMB_util`` and ``wMB_util``. Each metric is three lines: a ``metric-``
  header, ``time_points:`` and ``data_points:``. The util series are the rates
  divided by ``max_read_tput`` / ``max_write_tput``.
  """
  df_from_csv = pd.read_csv(csv_path)

  # print(f"Timestamp: {timestamp_to_seconds(df_from_csv['Timestamp'][0])}")

  # The x value of a row is its index plus the shift (presumably one row per
  # second of iostat output -- confirm against how the CSV is produced).
  # .tolist() yields plain Python numbers, so the text written below does not
  # contain np.int64(...) reprs under NumPy 2.
  time_seconds = (np.arange(len(df_from_csv)) + start_time_shift).tolist()

  ax = axs[fig_loc[0]]
  read_mb = df_from_csv["rMB/s"]
  write_mb = df_from_csv["wMB/s"]

  # Plotting
  ax.plot(time_seconds, read_mb, label='Read MB/s', marker='o', color='tab:green')
  ax.plot(time_seconds, write_mb, label='Write MB/s', marker='o', color='tab:red')

  ax.set_title('SSD Throughput Over Time')
  ax.set_xlabel('Time (s)')
  ax.set_ylabel('MB/s')
  ax.set_xlim(xlim)
  # ax.set_ylim(0, 520)
  ax.legend(loc='upper left')
  ax.grid(True)

  # # Creating a second y-axis
  # ax2 = ax.twinx()
  # # Plotting on the secondary y-axis
  # ax2.plot(time_seconds, read_mb/max_read_tput, label='Read Util', marker='x', linestyle='--', color='tab:green')
  # ax2.plot(time_seconds, write_mb/max_write_tput, label='Write Util', marker='+', linestyle='--', color='tab:red')
  # ax2.set_ylabel('Utilization (based on tput)')
  # ax2.legend(loc='upper right')
  # ax2.set_ylim(0,1)

  # Adjust the right margin to accommodate the second y-axis legend
  plt.subplots_adjust(right=0.85)

  # Series are pulled out of the f-strings so no quote is reused inside a
  # replacement field, which is a SyntaxError before Python 3.12.
  metrics = (
    ("rMB_rate", read_mb),
    ("wMB_rate", write_mb),
    ("rMB_util", read_mb / max_read_tput),
    ("wMB_util", write_mb / max_write_tput),
  )
  with open(output_file, 'a') as outfile:
    for metric_name, series in metrics:
      outfile.write(f"metric-{metric_name}\n")
      outfile.write(f"time_points:{time_seconds}\n")
      outfile.write(f"data_points:{series.tolist()}\n")

def plot_overall_iops(output_file, axs, start_time_shift, xlim, fig_loc, csv_path="iostat_results.csv"):
  """Plot read/write IOPS and their utilization, and append the series to a file.

  Rates go on the primary y axis; rates divided by ``max_read_iops`` /
  ``max_write_iops`` go on a twin y axis fixed to the range 0..1.

  Args:
    output_file: Path of the text file the series are appended to.
    axs: Indexable collection of matplotlib axes.
    start_time_shift: Offset added to every row index to form its x value.
    xlim: Passed to ``set_xlim`` on the target axes.
    fig_loc: Sequence whose first element selects the target axes in ``axs``.
    csv_path: CSV file to read; needs ``r/s`` and ``w/s`` columns.

  Appended to ``output_file``, in this order: ``rIOP_rate``, ``wIOP_rate``,
  ``rIOP_util`` and ``wIOP_util``. Each metric is three lines: a ``metric-``
  header, ``time_points:`` and ``data_points:``.
  """
  df_from_csv = pd.read_csv(csv_path)
  # Row index plus shift as x value; .tolist() yields plain Python numbers so
  # the text written below has no np.int64(...) reprs under NumPy 2.
  time_seconds = (np.arange(len(df_from_csv)) + start_time_shift).tolist()

  ax = axs[fig_loc[0]]
  read_iops = df_from_csv["r/s"]
  write_iops = df_from_csv["w/s"]
  read_util = read_iops / max_read_iops
  write_util = write_iops / max_write_iops

  ax.plot(time_seconds, read_iops, label='Read IOPS', marker='o', color='tab:green')
  ax.plot(time_seconds, write_iops, label='Write IOPS ', marker='o', color='tab:red')

  ax.set_title('SSD IOPS Over Time')
  ax.set_xlabel('Time (s)')
  ax.set_ylabel('IOPS')
  ax.set_xlim(xlim)
  # ax.set_ylim(0,16)
  # Upper left, because the twin axis below puts its own legend in the upper
  # right and would otherwise be drawn over this one.
  ax.legend(loc='upper left')
  ax.grid(True)

  # Creating a second y-axis
  ax2 = ax.twinx()
  # Plotting on the secondary y-axis
  ax2.plot(time_seconds, read_util, label='Read IOPS Util', marker='x', linestyle='--', color='tab:green')
  ax2.plot(time_seconds, write_util, label='Write IOPS Util', marker='+', linestyle='--', color='tab:red')
  ax2.set_ylabel('Utilization (based on iops)')
  ax2.legend(loc='upper right')
  ax2.set_ylim(0,1)

  # Adjust the right margin to accommodate the second y-axis legend
  plt.subplots_adjust(right=0.85)

  # Series are pulled out of the f-strings so no quote is reused inside a
  # replacement field, which is a SyntaxError before Python 3.12.
  metrics = (
    ("rIOP_rate", read_iops),
    ("wIOP_rate", write_iops),
    ("rIOP_util", read_util),
    ("wIOP_util", write_util),
  )
  with open(output_file, 'a') as outfile:
    for metric_name, series in metrics:
      outfile.write(f"metric-{metric_name}\n")
      outfile.write(f"time_points:{time_seconds}\n")
      outfile.write(f"data_points:{series.tolist()}\n")

def plot_io_waittimes(output_file, axs, start_time_shift, xlim, fig_loc, csv_path="iostat_results.csv"):
  """Plot read await per request and per KB, and append both series to a file.

  The per-request await goes on the primary y axis. The per-KB await (await
  divided by the average read request size) goes on a twin y axis. Rows whose
  request size is not positive keep the raw await instead of being divided.

  Args:
    output_file: Path of the text file the series are appended to.
    axs: Indexable collection of matplotlib axes.
    start_time_shift: Offset added to every row index to form its x value.
    xlim: Passed to ``set_xlim`` on the target axes.
    fig_loc: Sequence whose first element selects the target axes in ``axs``.
    csv_path: CSV file to read; needs ``r_await`` and ``rareq-sz`` columns.

  Appended to ``output_file``, in this order: ``r_await`` and
  ``r_await_per_kb``. Each metric is three lines: a ``metric-`` header,
  ``time_points:`` and ``data_points:``.
  """
  df_from_csv = pd.read_csv(csv_path)
  # Row index plus shift as x value; .tolist() yields plain Python numbers so
  # the text written below has no np.int64(...) reprs under NumPy 2.
  time_seconds = (np.arange(len(df_from_csv)) + start_time_shift).tolist()

  ax = axs[fig_loc[0]]
  r_await = df_from_csv["r_await"]
  read_size = df_from_csv["rareq-sz"]
  # Computed once and reused for both the plot and the file. Where the size
  # is not > 0 (zero or missing) the raw await is kept.
  r_await_per_kb = (r_await / read_size).where(read_size > 0, r_await).tolist()

  ax.plot(time_seconds, r_await, label='Read Await (per req)', marker='o', color='tab:green')
  # ax.plot(time_seconds, df_from_csv["w_await"], label='Write Await (per req)', marker='o', color='tab:red')

  ax.set_title('IO Wait Times (queueing + servicing)')
  ax.set_xlabel('Time (s)')
  ax.set_ylabel('Wait Time (ms)')
  ax.set_xlim(xlim)
  # ax.set_ylim(0,1)
  ax.legend(loc='upper left')
  ax.grid(True)

  ax2 = ax.twinx()
  ax2.plot(time_seconds, r_await_per_kb, label='Read Await (per KB)', marker='x', color='tab:green')
  # The write-side equivalent (w_await / wareq-sz, color tab:red) is currently disabled.

  ax2.set_ylabel('IO Wait Times per KB')
  ax2.legend(loc='upper right')
  plt.subplots_adjust(right=0.85)

  # Values are pulled out of the f-strings so no quote is reused inside a
  # replacement field, which is a SyntaxError before Python 3.12.
  metrics = (
    ("r_await", r_await.tolist()),
    ("r_await_per_kb", r_await_per_kb),
  )
  with open(output_file, 'a') as outfile:
    for metric_name, data_points in metrics:
      outfile.write(f"metric-{metric_name}\n")
      outfile.write(f"time_points:{time_seconds}\n")
      outfile.write(f"data_points:{data_points}\n")

def plot_io_reqsize(output_file, axs, start_time_shift, xlim, fig_loc, csv_path="iostat_results.csv"):
  """Plot average read/write request sizes and append the series to a file.

  Args:
    output_file: Path of the text file the series are appended to.
    axs: Indexable collection of matplotlib axes.
    start_time_shift: Offset added to every row index to form its x value.
    xlim: Passed to ``set_xlim`` on the target axes.
    fig_loc: Sequence whose first element selects the target axes in ``axs``.
    csv_path: CSV file to read; needs ``rareq-sz`` and ``wareq-sz`` columns.

  Appended to ``output_file``, in this order: ``r_size`` and ``w_size``. Each
  metric is three lines: a ``metric-`` header, ``time_points:`` and
  ``data_points:``.
  """
  df_from_csv = pd.read_csv(csv_path)
  # Row index plus shift as x value; .tolist() yields plain Python numbers so
  # the text written below has no np.int64(...) reprs under NumPy 2.
  time_seconds = (np.arange(len(df_from_csv)) + start_time_shift).tolist()

  ax = axs[fig_loc[0]]
  read_size = df_from_csv["rareq-sz"]
  write_size = df_from_csv["wareq-sz"]

  ax.plot(time_seconds, read_size, label='Avg Read Size', marker='o', color='tab:green')
  ax.plot(time_seconds, write_size, label='Avg Write Size', marker='o', color='tab:red')

  ax.set_title('Avg IO Sizes')
  ax.set_xlabel('Time (s)')
  ax.set_ylabel('Size (KB)')
  ax.set_xlim(xlim)
  # ax.set_ylim(0,1)
  ax.legend(loc='upper right')
  ax.grid(True)

  # Series are pulled out of the f-strings so no quote is reused inside a
  # replacement field, which is a SyntaxError before Python 3.12.
  metrics = (
    ("r_size", read_size),
    ("w_size", write_size),
  )
  with open(output_file, 'a') as outfile:
    for metric_name, series in metrics:
      outfile.write(f"metric-{metric_name}\n")
      outfile.write(f"time_points:{time_seconds}\n")
      outfile.write(f"data_points:{series.tolist()}\n")