### Bandwidth Test Observability

Steps to get observability
*   Get test duration (from propagation logs)
*   Get time series metric values (based on CPU usage, Bytes Transmitted, Bytes Received, and Memory usage)
*   Get metric values that match the time series



Get all the required libraries

In [None]:
import pandas as pd
import numpy as np
import json, re
from pandas import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import gaussian_kde

# Colab-only step: mount Google Drive at /content/drive. The notebook later
# changes into a folder under this mount point and reads its CSV files from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Go to root directory

In [None]:
# Make the bandwidth-experiment folder the working directory; the CSV files
# below are opened by bare file name relative to it.
# NOTE(review): absolute, user-specific Drive path - adjust when running elsewhere.
%cd /content/drive/My Drive/Colab Notebooks/bcgossip/sim/gpbc/cnsim_plosone/bandwidth

/content/drive/My Drive/Colab Notebooks/bcgossip/sim/gpbc/cnsim_plosone/bandwidth


Get the duration of each bandwidth test from its propagation-time log. These time windows are then used to extract the matching metrics from Metrics Explorer.

In [None]:
def get_test_duration(file_path):
    """
    Compute the time window of every test run recorded in a propagation-log CSV.

    The CSV must contain a 'message' column and a 'received_timestamp' column.
    Messages are expected to look like ``cubaan<N>-<k>``, where ``<N>`` is the
    number of nodes; 'received_timestamp' is interpreted as nanoseconds since
    the Unix epoch (UTC).

    Rows whose message ends with "-0" (convergence phase) are ignored, as are
    rows whose message is missing or does not contain ``cubaan<N>-``.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        pandas.DataFrame or None: One row per node count with the columns
            'num_nodes', 'min', 'max' (raw timestamps) and
            'min_datetime_myt', 'max_datetime_myt' (strings formatted as
            "M/D/YYYY HH:MM:SS" in Asia/Kuala_Lumpur time). None is returned,
            after printing a message, if the file is missing, a required
            column is absent, or any other error occurs.
    """
    # Get source file
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None

    try:
        # Remove rows where 'message' ends with "-0" (convergence phase).
        # na=False keeps a missing message from turning the mask into NaN,
        # which would make the boolean negation below fail.
        is_convergence = df['message'].str.endswith("-0", na=False)
        df = df.loc[~is_convergence].copy()

        # Extract number of nodes from 'message'; rows without a match are
        # dropped instead of aborting the whole file on the int conversion.
        node_counts = df['message'].str.extract(r'cubaan(\d+)-', expand=False)
        df = df.loc[node_counts.notna()].copy()
        df['num_nodes'] = node_counts.dropna().astype(int)

        # Group by 'num_nodes' and find the min and max of 'received_timestamp'
        duration_df = df.groupby('num_nodes')['received_timestamp'].agg(['min', 'max']).reset_index()

        for bound in ('min', 'max'):
            # Epoch nanoseconds -> UTC -> Malaysia time
            local_ts = (pd.to_datetime(duration_df[bound], unit='ns', utc=True)
                        .dt.tz_convert('Asia/Kuala_Lumpur'))

            # Format as "M/D/YYYY HH:MM:SS". Month and day are assembled by hand
            # because the unpadded '%-m' / '%-d' directives are platform-specific.
            duration_df[f'{bound}_datetime_myt'] = (
                local_ts.dt.month.astype(str) + '/'
                + local_ts.dt.day.astype(str) + '/'
                + local_ts.dt.strftime('%Y %H:%M:%S')
            )

        return duration_df

    except KeyError as e:
        print(f"Error: Column not found. {e}")
        return None
    except Exception as e:
        # Best-effort contract of this notebook helper: report and return None.
        print(f"An unexpected error occurred: {e}")
        return None

In [None]:
## Get default bandwidth test duration
# One row per node count with the first and last 'received_timestamp' of that run;
# get_test_duration returns None (after printing an error) if the CSV cannot be processed.
df_def_duration = get_test_duration('test-default-bwidth-10X.csv')
df_def_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt
0,10,1.742951e+18,1.742951e+18,3/26/2025 08:56:05,3/26/2025 08:57:19
1,50,1.742951e+18,1.742951e+18,3/26/2025 09:00:21,3/26/2025 09:01:42
2,100,1.742951e+18,1.742951e+18,3/26/2025 09:04:32,3/26/2025 09:05:56
3,200,1.742951e+18,1.742951e+18,3/26/2025 09:07:54,3/26/2025 09:09:28
4,400,1.742952e+18,1.742952e+18,3/26/2025 09:11:57,3/26/2025 09:13:52
5,600,1.742952e+18,1.742952e+18,3/26/2025 09:22:59,3/26/2025 09:25:18


In [None]:
## Get 5Mbps bandwidth test duration
# One row per node count with the first and last 'received_timestamp' of that run;
# get_test_duration returns None (after printing an error) if the CSV cannot be processed.
df_5M_duration = get_test_duration('test-bwidth-5M-10X.csv')
df_5M_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt
0,10,1.742872e+18,1.742872e+18,3/25/2025 11:01:49,3/25/2025 11:03:03
1,50,1.742873e+18,1.742873e+18,3/25/2025 11:23:00,3/25/2025 11:24:18
2,100,1.742873e+18,1.742873e+18,3/25/2025 11:27:11,3/25/2025 11:28:37
3,200,1.742874e+18,1.742874e+18,3/25/2025 11:40:19,3/25/2025 11:41:57
4,400,1.742874e+18,1.742874e+18,3/25/2025 11:44:47,3/25/2025 11:46:51
5,600,1.742875e+18,1.742875e+18,3/25/2025 11:55:27,3/25/2025 11:57:59


In [None]:
## Get 30Mbps bandwidth test duration
# One row per node count with the first and last 'received_timestamp' of that run;
# get_test_duration returns None (after printing an error) if the CSV cannot be processed.
df_30M_duration = get_test_duration('test-bwidth-30M-10X.csv')
df_30M_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt
0,10,1.742877e+18,1.742877e+18,3/25/2025 12:25:42,3/25/2025 12:26:56
1,50,1.742877e+18,1.742877e+18,3/25/2025 12:21:53,3/25/2025 12:23:10
2,100,1.742876e+18,1.742876e+18,3/25/2025 12:18:30,3/25/2025 12:19:54
3,200,1.742876e+18,1.742876e+18,3/25/2025 12:14:36,3/25/2025 12:16:11
4,400,1.742876e+18,1.742876e+18,3/25/2025 12:10:51,3/25/2025 12:12:53
5,600,1.742875e+18,1.742876e+18,3/25/2025 12:04:57,3/25/2025 12:07:22


In [None]:
def get_timeseries(file_path):
    """
    Load a time-series CSV and normalise its 'TimeSeries ID' column.

    The 'TimeSeries ID' strings are parsed with the format
    '%a %b %d %Y %H:%M:%S GMT%z (Malaysia Time)', converted to the
    Asia/Kuala_Lumpur timezone and stored as timezone-naive datetimes
    (local Malaysia wall-clock time). All other columns are returned as read.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        pandas.DataFrame or None: The DataFrame with 'TimeSeries ID' converted
            to datetime. None is returned, after printing a message, if the
            file is missing, the 'TimeSeries ID' column is absent, or the
            datetime conversion fails.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None

    # Define the datetime format string to match the data
    datetime_format = '%a %b %d %Y %H:%M:%S GMT%z (Malaysia Time)'

    try:
        parsed_utc = pd.to_datetime(df['TimeSeries ID'], format=datetime_format, utc=True)

        # Convert to MYT and drop the timezone directly. This yields the same
        # naive local timestamps as formatting to text and re-parsing, without
        # depending on month/day inference when the text is parsed again.
        df['TimeSeries ID'] = (parsed_utc
                               .dt.tz_convert('Asia/Kuala_Lumpur')
                               .dt.tz_localize(None))
    except KeyError:
        print("Error: 'TimeSeries ID' column not found in the CSV file.")
        return None
    except ValueError as e:
        print(f"Error: Datetime conversion failed. {e}")
        return None

    return df

a. Observability for Bytes transmitted (with PromQL filtering)
```
sum(rate(kubernetes_io:pod_network_sent_bytes_count{monitored_resource="k8s_pod"}[${__interval}]))
```
Open metrics explorer and extract all data scraped by this PromQL to csv. Refer [here](https://drive.google.com/file/d/1P3t_etDRgOxGrtgxYQF3bTLfhBf9-qYT/view?usp=sharing) for the steps. Load this csv to pandas dataframe.



In [None]:
# Load the Bytes-transmitted CSV exports (from Metrics Explorer) for the
# default, 5Mbps and 30Mbps bandwidth tests.
# Each export is produced by entering the test's min/max datetime (from the
# duration tables above) in Metrics Explorer together with the (a) PromQL query.
# get_timeseries returns None (after printing an error) if a file cannot be loaded.

df_def_transmit = get_timeseries('defaultBandwidthBytestransmitted.csv')

df_5M_transmit = get_timeseries('5MBandwidthBytestransmitted.csv')

df_30M_transmit = get_timeseries('30MBandwidthBytestransmitted.csv')

In [None]:
# Function to find the max value of a metric column within a given time range
def find_max_telemetry(min_time, max_time, df_util, value_col='telemetry-explorer-0-0'):
    """
    Return the largest metric sample recorded inside a time window.

    Args:
        min_time: Start of the window (inclusive); anything comparable with
            the 'TimeSeries ID' column, e.g. a datetime string.
        max_time: End of the window (inclusive), same kind of value as min_time.
        df_util (pandas.DataFrame): Time series with a 'TimeSeries ID'
            datetime column and the metric column named by value_col.
        value_col (str): Name of the metric column to take the maximum of.
            Defaults to 'telemetry-explorer-0-0'.

    Returns:
        The maximum of value_col over the rows whose 'TimeSeries ID' lies in
        [min_time, max_time], or None if no row falls inside the window.
    """
    in_window = df_util['TimeSeries ID'].between(min_time, max_time)
    if not in_window.any():
        return None  # Return None if no data within the range
    return df_util.loc[in_window, value_col].max()

In [None]:
## Getting Bytes transmitted for Default Test

# For each node-count row of df_def_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
# NOTE(review): the values displayed for this column are identical to those shown
# further down for memory_usage_default, and are GB-scale while the 5Mbps/30Mbps
# runs are MB-scale - presumably 'defaultBandwidthBytestransmitted.csv' holds a
# memory export; verify the source file.
df_def_duration['bytes_transmit_default'] = df_def_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_def_transmit), axis=1
)

df_def_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_default
0,10,1.742951e+18,1.742951e+18,3/26/2025 08:56:05,3/26/2025 08:57:19,9329238016
1,50,1.742951e+18,1.742951e+18,3/26/2025 09:00:21,3/26/2025 09:01:42,11595186176
2,100,1.742951e+18,1.742951e+18,3/26/2025 09:04:32,3/26/2025 09:05:56,14180278272
3,200,1.742951e+18,1.742951e+18,3/26/2025 09:07:54,3/26/2025 09:09:28,19956338688
4,400,1.742952e+18,1.742952e+18,3/26/2025 09:11:57,3/26/2025 09:13:52,31008026624
5,600,1.742952e+18,1.742952e+18,3/26/2025 09:22:59,3/26/2025 09:25:18,42139561984


In [None]:
## Getting Bytes transmitted for 5Mbps Test

# For each node-count row of df_5M_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
df_5M_duration['bytes_transmit_5Mbps'] = df_5M_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_5M_transmit), axis=1
)

df_5M_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_5Mbps
0,10,1.742872e+18,1.742872e+18,3/25/2025 11:01:49,3/25/2025 11:03:03,4208669.0
1,50,1.742873e+18,1.742873e+18,3/25/2025 11:23:00,3/25/2025 11:24:18,4725583.0
2,100,1.742873e+18,1.742873e+18,3/25/2025 11:27:11,3/25/2025 11:28:37,5261640.0
3,200,1.742874e+18,1.742874e+18,3/25/2025 11:40:19,3/25/2025 11:41:57,6264489.0
4,400,1.742874e+18,1.742874e+18,3/25/2025 11:44:47,3/25/2025 11:46:51,9036683.0
5,600,1.742875e+18,1.742875e+18,3/25/2025 11:55:27,3/25/2025 11:57:59,12000800.0


In [None]:
## Getting Bytes transmitted for 30Mbps Test

# For each node-count row of df_30M_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
df_30M_duration['bytes_transmit_30Mbps'] = df_30M_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_30M_transmit), axis=1
)

df_30M_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_30Mbps
0,10,1.742877e+18,1.742877e+18,3/25/2025 12:25:42,3/25/2025 12:26:56,4101905.0
1,50,1.742877e+18,1.742877e+18,3/25/2025 12:21:53,3/25/2025 12:23:10,4589670.0
2,100,1.742876e+18,1.742876e+18,3/25/2025 12:18:30,3/25/2025 12:19:54,4853697.0
3,200,1.742876e+18,1.742876e+18,3/25/2025 12:14:36,3/25/2025 12:16:11,6951564.0
4,400,1.742876e+18,1.742876e+18,3/25/2025 12:10:51,3/25/2025 12:12:53,8770121.0
5,600,1.742875e+18,1.742876e+18,3/25/2025 12:04:57,3/25/2025 12:07:22,12360720.0


b. Observability for Bytes received (with PromQL filtering)
```
sum(rate(kubernetes_io:pod_network_received_bytes_count{monitored_resource="k8s_pod"}[${__interval}]))
```
Open metrics explorer and extract all data scraped by this PromQL to csv. Refer [here](https://drive.google.com/file/d/1P3t_etDRgOxGrtgxYQF3bTLfhBf9-qYT/view?usp=sharing) for the steps. Load this csv to pandas dataframe.

In [None]:
# Load the Bytes-received CSV exports (from Metrics Explorer) for the
# default, 5Mbps and 30Mbps bandwidth tests.
# Each export is produced by entering the test's min/max datetime (from the
# test-duration tables) in Metrics Explorer together with the (b) PromQL query.
# get_timeseries returns None (after printing an error) if a file cannot be loaded.

df_def_received = get_timeseries('defaultBandwidthBytesReceived.csv')

df_5M_received = get_timeseries('5MBandwidthBytesReceived.csv')

df_30M_received = get_timeseries('30MBandwidthBytesReceived.csv')

In [None]:
## Getting Bytes received for default Test

# For each node-count row of df_def_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
df_def_duration['bytes_received_default'] = df_def_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_def_received), axis=1
)
# Display the full duration table including the new column
df_def_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_default,bytes_received_default
0,10,1.742951e+18,1.742951e+18,3/26/2025 08:56:05,3/26/2025 08:57:19,9329238016,47174980.0
1,50,1.742951e+18,1.742951e+18,3/26/2025 09:00:21,3/26/2025 09:01:42,11595186176,209090300.0
2,100,1.742951e+18,1.742951e+18,3/26/2025 09:04:32,3/26/2025 09:05:56,14180278272,7418050.0
3,200,1.742951e+18,1.742951e+18,3/26/2025 09:07:54,3/26/2025 09:09:28,19956338688,23331430.0
4,400,1.742952e+18,1.742952e+18,3/26/2025 09:11:57,3/26/2025 09:13:52,31008026624,2219467.0
5,600,1.742952e+18,1.742952e+18,3/26/2025 09:22:59,3/26/2025 09:25:18,42139561984,2723310.0


In [None]:
## Getting Bytes received for 5Mbps test

# For each node-count row of df_5M_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
df_5M_duration['bytes_received_5Mbps'] = df_5M_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_5M_received), axis=1
)

# Display the full duration table including the new column
df_5M_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_5Mbps,bytes_received_5Mbps
0,10,1.742872e+18,1.742872e+18,3/25/2025 11:01:49,3/25/2025 11:03:03,4208669.0,160078000.0
1,50,1.742873e+18,1.742873e+18,3/25/2025 11:23:00,3/25/2025 11:24:18,4725583.0,346329500.0
2,100,1.742873e+18,1.742873e+18,3/25/2025 11:27:11,3/25/2025 11:28:37,5261640.0,11452100.0
3,200,1.742874e+18,1.742874e+18,3/25/2025 11:40:19,3/25/2025 11:41:57,6264489.0,12020380.0
4,400,1.742874e+18,1.742874e+18,3/25/2025 11:44:47,3/25/2025 11:46:51,9036683.0,6826106.0
5,600,1.742875e+18,1.742875e+18,3/25/2025 11:55:27,3/25/2025 11:57:59,12000800.0,9122132.0


In [None]:
## Getting Bytes received for 30Mbps Test

# For each node-count row of df_30M_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
df_30M_duration['bytes_received_30Mbps'] = df_30M_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_30M_received), axis=1
)

# Display the full duration table including the new column
df_30M_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_30Mbps,bytes_received_30Mbps
0,10,1.742877e+18,1.742877e+18,3/25/2025 12:25:42,3/25/2025 12:26:56,4101905.0,1035093.0
1,50,1.742877e+18,1.742877e+18,3/25/2025 12:21:53,3/25/2025 12:23:10,4589670.0,1613858.0
2,100,1.742876e+18,1.742876e+18,3/25/2025 12:18:30,3/25/2025 12:19:54,4853697.0,2055274.0
3,200,1.742876e+18,1.742876e+18,3/25/2025 12:14:36,3/25/2025 12:16:11,6951564.0,5399986.0
4,400,1.742876e+18,1.742876e+18,3/25/2025 12:10:51,3/25/2025 12:12:53,8770121.0,6834280.0
5,600,1.742875e+18,1.742876e+18,3/25/2025 12:04:57,3/25/2025 12:07:22,12360720.0,10061080.0


c. Observability for CPU usage (with PromQL filtering)
```
sum(rate(container_cpu_usage_seconds_total[${__interval}]))
```
Open metrics explorer and extract all data scraped by this PromQL to csv. Refer [here](https://drive.google.com/file/d/1P3t_etDRgOxGrtgxYQF3bTLfhBf9-qYT/view?usp=sharing) for the steps. Load this csv to pandas dataframe.

In [None]:
# Load the CPU-usage time series CSV exports (from Metrics Explorer) for the
# default, 5Mbps and 30Mbps bandwidth tests.
# Each export is produced by entering the test's min/max datetime (from the
# test-duration tables) in Metrics Explorer together with the (c) PromQL query.
# get_timeseries returns None (after printing an error) if a file cannot be loaded.

df_def_cpu = get_timeseries('defaultBandwidthCPUusage.csv')

df_5M_cpu = get_timeseries('5MBandwidthCPUusage.csv')

df_30M_cpu = get_timeseries('30MBandwidthCPUusage.csv')

In [None]:
## Getting CPU usage for default Test

# For each node-count row of df_def_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
df_def_duration['cpu_usage_default'] = df_def_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_def_cpu), axis=1
)
# Display the full duration table including the new column
df_def_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_default,bytes_received_default,cpu_usage_default
0,10,1.742951e+18,1.742951e+18,3/26/2025 08:56:05,3/26/2025 08:57:19,9329238016,47174980.0,9.050088
1,50,1.742951e+18,1.742951e+18,3/26/2025 09:00:21,3/26/2025 09:01:42,11595186176,209090300.0,16.925751
2,100,1.742951e+18,1.742951e+18,3/26/2025 09:04:32,3/26/2025 09:05:56,14180278272,7418050.0,12.981212
3,200,1.742951e+18,1.742951e+18,3/26/2025 09:07:54,3/26/2025 09:09:28,19956338688,23331430.0,25.300223
4,400,1.742952e+18,1.742952e+18,3/26/2025 09:11:57,3/26/2025 09:13:52,31008026624,2219467.0,34.152766
5,600,1.742952e+18,1.742952e+18,3/26/2025 09:22:59,3/26/2025 09:25:18,42139561984,2723310.0,49.263881


In [None]:
## Getting CPU usage for 5Mbps Test

# For each node-count row of df_5M_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
df_5M_duration['cpu_usage_5Mbps'] = df_5M_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_5M_cpu), axis=1
)
# Display the full duration table including the new column
df_5M_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_5Mbps,bytes_received_5Mbps,cpu_usage_5Mbps
0,10,1.742872e+18,1.742872e+18,3/25/2025 11:01:49,3/25/2025 11:03:03,4208669.0,160078000.0,17.225606
1,50,1.742873e+18,1.742873e+18,3/25/2025 11:23:00,3/25/2025 11:24:18,4725583.0,346329500.0,29.90123
2,100,1.742873e+18,1.742873e+18,3/25/2025 11:27:11,3/25/2025 11:28:37,5261640.0,11452100.0,13.592442
3,200,1.742874e+18,1.742874e+18,3/25/2025 11:40:19,3/25/2025 11:41:57,6264489.0,12020380.0,17.060342
4,400,1.742874e+18,1.742874e+18,3/25/2025 11:44:47,3/25/2025 11:46:51,9036683.0,6826106.0,46.178877
5,600,1.742875e+18,1.742875e+18,3/25/2025 11:55:27,3/25/2025 11:57:59,12000800.0,9122132.0,36.297967


In [None]:
## Getting CPU usage for 30Mbps Test

# For each node-count row of df_30M_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
df_30M_duration['cpu_usage_30Mbps'] = df_30M_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_30M_cpu), axis=1
)
# Display the full duration table including the new column
df_30M_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_30Mbps,bytes_received_30Mbps,cpu_usage_30Mbps
0,10,1.742877e+18,1.742877e+18,3/25/2025 12:25:42,3/25/2025 12:26:56,4101905.0,1035093.0,11.814776
1,50,1.742877e+18,1.742877e+18,3/25/2025 12:21:53,3/25/2025 12:23:10,4589670.0,1613858.0,14.886363
2,100,1.742876e+18,1.742876e+18,3/25/2025 12:18:30,3/25/2025 12:19:54,4853697.0,2055274.0,18.997858
3,200,1.742876e+18,1.742876e+18,3/25/2025 12:14:36,3/25/2025 12:16:11,6951564.0,5399986.0,31.798056
4,400,1.742876e+18,1.742876e+18,3/25/2025 12:10:51,3/25/2025 12:12:53,8770121.0,6834280.0,46.888647
5,600,1.742875e+18,1.742876e+18,3/25/2025 12:04:57,3/25/2025 12:07:22,12360720.0,10061080.0,63.597267


d. Observability for Memory usage (with PromQL filtering)
```
sum(avg_over_time(kubernetes_io:container_memory_used_bytes{monitored_resource="k8s_container"}[${__interval}]))
```
Open metrics explorer and extract all data scraped by this PromQL to csv. Refer [here](https://drive.google.com/file/d/1P3t_etDRgOxGrtgxYQF3bTLfhBf9-qYT/view?usp=sharing) for the steps. Load this csv to pandas dataframe.

In [None]:
# Load the memory-usage time series CSV exports (from Metrics Explorer) for the
# default, 5Mbps and 30Mbps bandwidth tests.
# Each export is produced by entering the test's min/max datetime (from the
# test-duration tables) in Metrics Explorer together with the (d) PromQL query.
# get_timeseries returns None (after printing an error) if a file cannot be loaded.

df_def_memory = get_timeseries('defaultBandwidthUsageMemory.csv')

df_5M_memory = get_timeseries('5MBandwidthUsageMemory.csv')

df_30M_memory = get_timeseries('30MBandwidthUsageMemory.csv')

In [None]:
## Getting memory usage for default Test

# For each node-count row of df_def_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
df_def_duration['memory_usage_default'] = df_def_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_def_memory), axis=1
)
# Display the full duration table including the new column
df_def_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_default,bytes_received_default,cpu_usage_default,memory_usage_default
0,10,1.742951e+18,1.742951e+18,3/26/2025 08:56:05,3/26/2025 08:57:19,9329238016,47174980.0,9.050088,9329238016
1,50,1.742951e+18,1.742951e+18,3/26/2025 09:00:21,3/26/2025 09:01:42,11595186176,209090300.0,16.925751,11595186176
2,100,1.742951e+18,1.742951e+18,3/26/2025 09:04:32,3/26/2025 09:05:56,14180278272,7418050.0,12.981212,14180278272
3,200,1.742951e+18,1.742951e+18,3/26/2025 09:07:54,3/26/2025 09:09:28,19956338688,23331430.0,25.300223,19956338688
4,400,1.742952e+18,1.742952e+18,3/26/2025 09:11:57,3/26/2025 09:13:52,31008026624,2219467.0,34.152766,31008026624
5,600,1.742952e+18,1.742952e+18,3/26/2025 09:22:59,3/26/2025 09:25:18,42139561984,2723310.0,49.263881,42139561984


In [None]:
## Getting memory usage for 5Mbps Test

# For each node-count row of df_5M_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
df_5M_duration['memory_usage_5Mbps'] = df_5M_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_5M_memory), axis=1
)
# Display the full duration table including the new column
df_5M_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_5Mbps,bytes_received_5Mbps,cpu_usage_5Mbps,memory_usage_5Mbps
0,10,1.742872e+18,1.742872e+18,3/25/2025 11:01:49,3/25/2025 11:03:03,4208669.0,160078000.0,17.225606,23012179968
1,50,1.742873e+18,1.742873e+18,3/25/2025 11:23:00,3/25/2025 11:24:18,4725583.0,346329500.0,29.90123,24056582144
2,100,1.742873e+18,1.742873e+18,3/25/2025 11:27:11,3/25/2025 11:28:37,5261640.0,11452100.0,13.592442,27909603328
3,200,1.742874e+18,1.742874e+18,3/25/2025 11:40:19,3/25/2025 11:41:57,6264489.0,12020380.0,17.060342,32409198592
4,400,1.742874e+18,1.742874e+18,3/25/2025 11:44:47,3/25/2025 11:46:51,9036683.0,6826106.0,46.178877,44773908480
5,600,1.742875e+18,1.742875e+18,3/25/2025 11:55:27,3/25/2025 11:57:59,12000800.0,9122132.0,36.297967,57669419008


In [None]:
## Getting memory usage for 30Mbps Test

# For each node-count row of df_30M_duration, take the largest sample that
# find_max_telemetry finds inside that run's [min_datetime_myt, max_datetime_myt]
# window (None when the window contains no sample). Adds the column in place.
df_30M_duration['memory_usage_30Mbps'] = df_30M_duration.apply(
    lambda row: find_max_telemetry(row['min_datetime_myt'], row['max_datetime_myt'], df_30M_memory), axis=1
)
# Display the full duration table including the new column
df_30M_duration

Unnamed: 0,num_nodes,min,max,min_datetime_myt,max_datetime_myt,bytes_transmit_30Mbps,bytes_received_30Mbps,cpu_usage_30Mbps,memory_usage_30Mbps
0,10,1.742877e+18,1.742877e+18,3/25/2025 12:25:42,3/25/2025 12:26:56,4101905.0,1035093.0,11.814776,22754201600
1,50,1.742877e+18,1.742877e+18,3/25/2025 12:21:53,3/25/2025 12:23:10,4589670.0,1613858.0,14.886363,25108324352
2,100,1.742876e+18,1.742876e+18,3/25/2025 12:18:30,3/25/2025 12:19:54,4853697.0,2055274.0,18.997858,26949328896
3,200,1.742876e+18,1.742876e+18,3/25/2025 12:14:36,3/25/2025 12:16:11,6951564.0,5399986.0,31.798056,34674401280
4,400,1.742876e+18,1.742876e+18,3/25/2025 12:10:51,3/25/2025 12:12:53,8770121.0,6834280.0,46.888647,45562920960
5,600,1.742875e+18,1.742876e+18,3/25/2025 12:04:57,3/25/2025 12:07:22,12360720.0,10061080.0,63.597267,57723617280


In [None]:
# Build one comparison table: keep the node count plus the four peak metrics
# of each scenario, then join the three scenarios on 'num_nodes'.
df_default = df_def_duration.loc[:, [
    'num_nodes',
    'bytes_transmit_default',
    'bytes_received_default',
    'cpu_usage_default',
    'memory_usage_default',
]]

df_5M = df_5M_duration.loc[:, [
    'num_nodes',
    'bytes_transmit_5Mbps',
    'bytes_received_5Mbps',
    'cpu_usage_5Mbps',
    'memory_usage_5Mbps',
]]

df_30M = df_30M_duration.loc[:, [
    'num_nodes',
    'bytes_transmit_30Mbps',
    'bytes_received_30Mbps',
    'cpu_usage_30Mbps',
    'memory_usage_30Mbps',
]]

# Outer joins, so a node count missing from one scenario still gets a row
df_all = (
    df_default
    .merge(df_5M, on='num_nodes', how='outer')
    .merge(df_30M, on='num_nodes', how='outer')
)
df_all

Unnamed: 0,num_nodes,bytes_transmit_default,bytes_received_default,cpu_usage_default,memory_usage_default,bytes_transmit_5Mbps,bytes_received_5Mbps,cpu_usage_5Mbps,memory_usage_5Mbps,bytes_transmit_30Mbps,bytes_received_30Mbps,cpu_usage_30Mbps,memory_usage_30Mbps
0,10,9329238016,47174980.0,9.050088,9329238016,4208669.0,160078000.0,17.225606,23012179968,4101905.0,1035093.0,11.814776,22754201600
1,50,11595186176,209090300.0,16.925751,11595186176,4725583.0,346329500.0,29.90123,24056582144,4589670.0,1613858.0,14.886363,25108324352
2,100,14180278272,7418050.0,12.981212,14180278272,5261640.0,11452100.0,13.592442,27909603328,4853697.0,2055274.0,18.997858,26949328896
3,200,19956338688,23331430.0,25.300223,19956338688,6264489.0,12020380.0,17.060342,32409198592,6951564.0,5399986.0,31.798056,34674401280
4,400,31008026624,2219467.0,34.152766,31008026624,9036683.0,6826106.0,46.178877,44773908480,8770121.0,6834280.0,46.888647,45562920960
5,600,42139561984,2723310.0,49.263881,42139561984,12000800.0,9122132.0,36.297967,57669419008,12360720.0,10061080.0,63.597267,57723617280


In [None]:
def convert_units(df):
    """
    Return a copy of a metrics table with its values rendered as readable text.

    Columns are selected by name prefix:
      * 'bytes_transmit_', 'bytes_received_' and 'memory_usage_' columns are
        scaled by powers of 1024 and labelled B / KB / MB / GB / TB
        (values of 1024 TB or more stay expressed in TB);
      * 'cpu_usage_' columns are formatted with two decimals and a '%' suffix.
    Missing values (None or NaN) become "N/A". All other columns are copied
    through unchanged. The input DataFrame is not modified, so the call can be
    repeated on the same numeric table.

    Args:
        df (pd.DataFrame): The input DataFrame with numeric metric columns.

    Returns:
        pd.DataFrame: A new DataFrame with the metric columns converted to strings.
    """
    byte_units = ['B', 'KB', 'MB', 'GB', 'TB']

    def human_readable_bytes(bytes_val):
        # pd.isna covers both None and the NaN that pandas stores for gaps
        if pd.isna(bytes_val):
            return "N/A"
        bytes_val = float(bytes_val)
        i = 0
        # Stop at the largest known unit instead of indexing past the list
        while bytes_val >= 1024 and i < len(byte_units) - 1:
            bytes_val /= 1024
            i += 1
        return f"{bytes_val:.2f} {byte_units[i]}"

    def human_readable_cpu(cpu_val):
        # NOTE(review): the notebook's CPU query is a sum of
        # rate(container_cpu_usage_seconds_total), which looks like CPU cores
        # rather than a percentage - confirm that '%' is the intended unit.
        if pd.isna(cpu_val):
            return "N/A"
        return f"{cpu_val:.2f}%"

    # Memory values use the same byte scaling as the network counters
    formatters = (
        ('bytes_transmit_', human_readable_bytes),
        ('bytes_received_', human_readable_bytes),
        ('memory_usage_', human_readable_bytes),
        ('cpu_usage_', human_readable_cpu),
    )

    converted = df.copy()
    for column in converted.columns:
        for prefix, formatter in formatters:
            if str(column).startswith(prefix):
                converted[column] = converted[column].apply(formatter)
                break

    return converted

In [None]:
# Convert units on a copy so df_all keeps its numeric values and this cell
# can be re-run without feeding already-formatted strings back in
df_converted = convert_units(df_all.copy())
df_converted

Unnamed: 0,num_nodes,bytes_transmit_default,bytes_received_default,cpu_usage_default,memory_usage_default,bytes_transmit_5Mbps,bytes_received_5Mbps,cpu_usage_5Mbps,memory_usage_5Mbps,bytes_transmit_30Mbps,bytes_received_30Mbps,cpu_usage_30Mbps,memory_usage_30Mbps
0,10,8.69 GB,44.99 MB,9.05%,8.69 GB,4.01 MB,152.66 MB,17.23%,21.43 GB,3.91 MB,1010.83 KB,11.81%,21.19 GB
1,50,10.80 GB,199.40 MB,16.93%,10.80 GB,4.51 MB,330.29 MB,29.90%,22.40 GB,4.38 MB,1.54 MB,14.89%,23.38 GB
2,100,13.21 GB,7.07 MB,12.98%,13.21 GB,5.02 MB,10.92 MB,13.59%,25.99 GB,4.63 MB,1.96 MB,19.00%,25.10 GB
3,200,18.59 GB,22.25 MB,25.30%,18.59 GB,5.97 MB,11.46 MB,17.06%,30.18 GB,6.63 MB,5.15 MB,31.80%,32.29 GB
4,400,28.88 GB,2.12 MB,34.15%,28.88 GB,8.62 MB,6.51 MB,46.18%,41.70 GB,8.36 MB,6.52 MB,46.89%,42.43 GB
5,600,39.25 GB,2.60 MB,49.26%,39.25 GB,11.44 MB,8.70 MB,36.30%,53.71 GB,11.79 MB,9.59 MB,63.60%,53.76 GB
