This is a helper script that allows you to fetch prometheus data after the fact.
(e.g., when the data you collected while running the experiments were incorrect.)

In [1]:

import itertools
import json
import pandas as pd
from datetime import datetime
import os
import math

from gssi_experiment.util.prometheus_helper import (
    LatestCpuUtilizationFetcher,
    TIME_FORMAT,
)
from gssi_experiment.util.util import iterate_through_nested_folders


  warn(f"Class {__name__} is deprecated.")


In [2]:
# Fetches prometheus data.

output_path = "./posterior_prometheus_fetcher_data.csv"

time_window_in_minutes = 4 * 24 * 60 + 12 * 60
print(f'{time_window_in_minutes=}')

fetcher = LatestCpuUtilizationFetcher(output_path, time_window_in_minutes)
fetcher.fetch_latest_cpu_utilization()

time_window_in_minutes=6480
Wrote 527452 CPU utilization entries.


In [3]:
# Loads the dataset without ruining RAM

df: pd.DataFrame = None
for i, chunk in enumerate(pd.read_csv(output_path, header=0, chunksize=40000)):
    print(f'loaded chunk {i + 1}')
    if df is None:
        df = chunk
    else:
        df = pd.concat([df, chunk])

loaded chunk 1
loaded chunk 2
loaded chunk 3
loaded chunk 4
loaded chunk 5
loaded chunk 6
loaded chunk 7
loaded chunk 8
loaded chunk 9
loaded chunk 10
loaded chunk 11
loaded chunk 12
loaded chunk 13
loaded chunk 14


In [7]:
def reshape_timestamp(series: pd.Series):
    data = []
    for ele in series.values:
        try:
            q = datetime.strptime(ele, "%Y-%m-%d %H:%M:%S.%f")
        except:
            q = ""
        data.append(q)
    ser = pd.Series(data)
    return ser


df["parsed_timestamp"] = df["timestamp"].transform(reshape_timestamp)

# Removes entries with invalid datetime formats;
# which are marked by empty strings.
invalid_dates = df[df["parsed_timestamp"] == ""]
broken_entries_count = len(invalid_dates)
print(f"{broken_entries_count=}")
df = df.drop(invalid_dates.index)
print(f'{len(df)=}')

0        2023-11-19 20:42:50.002
1        2023-11-19 20:42:51.266
2        2023-11-19 20:42:51.421
3        2023-11-19 20:42:51.778
4        2023-11-19 20:42:53.606
                   ...          
526904   2023-11-24 08:42:39.573
526905   2023-11-24 08:42:40.107
526906   2023-11-24 08:42:40.300
526907   2023-11-24 08:42:41.082
526908   2023-11-24 08:42:41.536
Length: 526909, dtype: datetime64[ns]


ValueError: Function did not transform

In [9]:
def reset_column_counters(df: pd.DataFrame):
    """Resets the counter suffix in the column names."""
    base_names = [col.split(".")[0] for col in df.columns]
    counters = {un: 0 for un in set(base_names)}
    new_columns = []
    for column in base_names:
        if counters[column] == 0:
            # Adds the cleaned up name if there are no duplicates.
            new_columns.append(column)
        else:
            # Adds the index in the suffix again if its a duplicate.
            new_columns.append(f"{column}.{counters[column]}")
        counters[column] += 1
    df.columns = new_columns


def sample_relevant_data(experiment_folder: str):
    """Creates a .csv file for each experiment containing
    only relevant data to that experiment."""
    # Identifies the start and end time of the experiment.
    metadata_path = f"{experiment_folder}/metadata.json"
    with open(metadata_path, "r", encoding="utf-8") as metadata_file:
        metadata = json.loads(metadata_file.read())
    start_time = datetime.strptime(metadata["start_time"], TIME_FORMAT)
    end_time = datetime.strptime(metadata["end_time"], TIME_FORMAT)

    # Samples all data based on those times.
    sub_df: pd.DataFrame = df[df["parsed_timestamp"] >= start_time]
    sub_df = sub_df[sub_df["parsed_timestamp"] <= end_time]

    # Drops useless / duplicate data.
    sub_df.dropna(how="all", axis=1, inplace=True)
    sub_df = sub_df.drop(["parsed_timestamp"], axis=1)

    # Resets column names and row indices.
    reset_column_counters(sub_df)
    sub_df.reset_index()

    # Writes it to the output.
    cpu_utilization_path = f"{experiment_folder}/cpu_utilization_raw.csv"
    sub_df.to_csv(cpu_utilization_path, index=False)


ga_folder = os.path.abspath("../gateway_aggregator/results/pinciroli_replication_1")
go_folder = os.path.abspath("../gateway_offloading/results/pinciroli_replication_1")
pnfj_folder = os.path.abspath(
    "../pipes_and_filters/pipes_and_filters_joint/results/pinciroli_replication_1"
)
pnfs_folder = os.path.abspath(
    "../pipes_and_filters/pipes_and_filters_separated/results/pinciroli_replication_1"
)

experiment_folders = itertools.chain(
    iterate_through_nested_folders(ga_folder, max_depth=2),
    iterate_through_nested_folders(go_folder, max_depth=3),
    iterate_through_nested_folders(pnfj_folder, max_depth=2),
    iterate_through_nested_folders(pnfs_folder, max_depth=2),
)

count = 0
for experiment_folder in experiment_folders:
    print(experiment_folder)
    sample_relevant_data(experiment_folder)
    count += 1
print(f'Replaced {count} cpu utilization files.')

/workspaces/muBench-experiment/gssi_experiment/gateway_aggregator/results/pinciroli_replication_1/experiment_1/2023_11_21/0_steps
/workspaces/muBench-experiment/gssi_experiment/gateway_aggregator/results/pinciroli_replication_1/experiment_1/2023_11_21/1_steps
/workspaces/muBench-experiment/gssi_experiment/gateway_aggregator/results/pinciroli_replication_1/experiment_1/2023_11_21/2_steps
/workspaces/muBench-experiment/gssi_experiment/gateway_aggregator/results/pinciroli_replication_1/experiment_1/2023_11_21/3_steps
/workspaces/muBench-experiment/gssi_experiment/gateway_aggregator/results/pinciroli_replication_1/experiment_1/2023_11_21/4_steps
/workspaces/muBench-experiment/gssi_experiment/gateway_aggregator/results/pinciroli_replication_1/experiment_1/2023_11_21/5_steps
/workspaces/muBench-experiment/gssi_experiment/gateway_aggregator/results/pinciroli_replication_1/experiment_1/2023_11_23/0_steps
/workspaces/muBench-experiment/gssi_experiment/gateway_aggregator/results/pinciroli_replic