# Energy Measurement Evaluation

This notebook evaluates power consumption and energy trends from experiment results.  
The data is collected from multiple nodes and analyzed for insights into power usage, voltage, and energy consumption.

In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
from typing import List, Dict, Set, Optional

# ----------------------------- #
# CONFIGURATION & SETUP
# ----------------------------- #
# Root directory that holds the experiment result folders; relative paths typed
# by the user are resolved against it.
BASE_RESULT_FOLDER = "/srv/testbed/results/warmuth/default/"
# Result sub-folder (under BASE_RESULT_FOLDER) offered as the default choice.
DEFAULT_SUBFOLDER = "2025-03-19_11-59-58_754222"
# Global plot styling shared by every figure created in this notebook.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"axes.titlesize": 14, "axes.labelsize": 12})


# ----------------------------- #
# INPUT AND UI UTILITIES
# ----------------------------- #
def get_user_input(prompt: str, default: str) -> str:
    """Ask the user for a value and fall back to ``default``.

    The default is returned when the user enters nothing (or only whitespace)
    and also when reading from stdin fails for any reason, in which case a
    warning is printed.
    """
    try:
        answer = input(f"{prompt}\n[{default}]\n> ").strip()
    except Exception:
        # Best effort: non-interactive environments may not provide stdin.
        print("[Warning] Failed to read user input, using default.")
        return default
    return answer if answer else default


def make_link(text: str, url: str) -> str:
    """Return an HTML anchor for ``url`` labelled ``text``.

    The sentinel URL ``"Unknown"`` is passed through as the plain string
    ``"Unknown"`` instead of being turned into a link.
    """
    if url == "Unknown":
        return "Unknown"
    return f'<a href="{url}" target="_blank">{text}</a>'


def make_clickable(path: str) -> str:
    """Return an "Open PDF" HTML anchor pointing at ``path``.

    The sentinel string ``"None"`` yields the text ``"No topology available"``.
    """
    if path == "None":
        return "No topology available"
    return f'<a href="{path}" target="_blank">Open PDF</a>'


# ----------------------------- #
# FOLDER AND RUN HANDLING
# ----------------------------- #
def resolve_result_folder() -> str:
    """Ask for the result folder and return its validated path.

    The prompt defaults to ``BASE_RESULT_FOLDER``/``DEFAULT_SUBFOLDER``.
    A relative answer is interpreted relative to ``BASE_RESULT_FOLDER``.

    Raises:
        FileNotFoundError: if the resolved path does not exist.
    """
    fallback = os.path.join(BASE_RESULT_FOLDER, DEFAULT_SUBFOLDER)
    chosen = get_user_input("Enter result folder path (leave empty to use default):", fallback)
    folder = chosen if os.path.isabs(chosen) else os.path.join(BASE_RESULT_FOLDER, chosen)
    if os.path.exists(folder):
        print(f"[Info] Using result folder:\n{folder}")
        return folder
    raise FileNotFoundError(f"Result folder does not exist: {folder}")


def detect_runs(energy_folder: str) -> List[str]:
    """Detect available run IDs based on CSV filenames in the energy folder.

    Every sub-directory of ``energy_folder`` is scanned for files named
    ``*_run<ID>.csv``; ``<ID>`` is the text between the last ``_run`` and the
    following dot.

    Args:
        energy_folder: Directory containing one sub-directory per node.

    Returns:
        The distinct run IDs as strings. Purely numeric IDs come first in
        numeric order (so "2" precedes "10"), followed by any non-numeric
        IDs in alphabetical order.
    """
    def _sort_key(run_id: str):
        # Numeric IDs are compared by value; the raw string breaks ties ("01" vs "1").
        if run_id.isdecimal():
            return (0, int(run_id), run_id)
        return (1, 0, run_id)

    runs = set()
    for node in os.listdir(energy_folder):
        node_path = os.path.join(energy_folder, node)
        if not os.path.isdir(node_path):
            continue
        for filename in os.listdir(node_path):
            if filename.endswith(".csv") and "_run" in filename:
                runs.add(filename.split("_run")[-1].split(".")[0])
    return sorted(runs, key=_sort_key)


def select_from_list(available_items: List[str], label: str) -> Set[str]:
    """Prompt the user to pick a subset of ``available_items``.

    Args:
        available_items: The values the user may choose from.
        label: Human-readable name of the items, used in the prompt and in
            error messages.

    Returns:
        The selected items. If the user enters nothing (or only commas and
        whitespace), all available items are returned.

    Raises:
        ValueError: if any entered value is not in ``available_items``.
    """
    print(f"Available {label}:\n{', '.join(available_items)}")
    user_input = get_user_input(f"Enter {label} to include (comma-separated), or press Enter to include all:", "")
    # Discard empty tokens so a trailing or doubled comma is not reported as invalid.
    selected = {token.strip() for token in user_input.split(',')} - {""}
    if not selected:
        return set(available_items)
    invalid = selected - set(available_items)
    if invalid:
        # Sorted so the error message is reproducible.
        raise ValueError(f"Invalid {label} entered: {', '.join(sorted(invalid))}")
    return selected


# ----------------------------- #
# RO-CRATE PARSER
# ----------------------------- #
class ROCrateParser:
    """Read creator, node and hardware information from RO-Crate metadata.

    The metadata file is loaded once on construction. If it is missing or is
    not valid JSON, an error is printed and ``extract_creators`` /
    ``extract_nodes`` return empty lists instead of raising.
    """

    # Keys of the dictionary returned by extract_hardware_info.
    _HW_FIELDS = ("CPU", "Cores", "Threads", "Memory", "NICs")

    def __init__(self, metadata_path: str):
        """Load the RO-Crate metadata JSON located at ``metadata_path``."""
        self.metadata = self._load_json(metadata_path)

    def _load_json(self, path: str) -> dict:
        """Load and return a JSON object from a file path.

        Returns an empty dict (after printing an error) when the file does not
        exist or cannot be parsed as JSON.
        """
        try:
            # JSON is read as UTF-8 explicitly, independent of the platform locale.
            with open(path, 'r', encoding="utf-8") as f:
                return json.load(f)
        except (FileNotFoundError, json.JSONDecodeError) as e:
            print(f"[Error] Failed to load JSON from {path}: {e}")
            return {}

    @staticmethod
    def _ref_id(value) -> Optional[str]:
        """Return the ``@id`` of an entity reference, or None if there is none.

        Accepts a single reference (dict) or a list of references, in which
        case the first entry carrying an ``@id`` is used. Any other value
        (missing key, plain string, ...) yields None instead of raising.
        """
        if isinstance(value, dict):
            return value.get("@id")
        if isinstance(value, list):
            for entry in value:
                if isinstance(entry, dict) and entry.get("@id"):
                    return entry["@id"]
        return None

    def extract_creators(self) -> List[Dict]:
        """Extract and format creator information.

        A creator is a graph entity of ``@type`` ``"Person"`` whose ``tags``
        contain ``"creator"``.

        Returns:
            One dict per creator with the keys "Creator Name", "ORCID",
            "Contribution (CRediT)", "Affiliation Name", "Affiliation ROR"
            and "Affiliation URL". Link-valued fields are HTML anchors built
            with ``make_link``; missing values are the string "Unknown".
        """
        creators = []
        credit_url = "https://www.elsevier.com/researcher/author/policies-and-guidelines/credit-author-statement"
        graph = self.metadata.get("@graph", [])

        # Index entities by @id once (first occurrence wins) so each affiliation
        # lookup does not rescan the whole graph.
        entities_by_id = {}
        for entity in graph:
            entity_id = entity.get("@id")
            if entity_id is not None:
                entities_by_id.setdefault(entity_id, entity)

        for item in graph:
            if item.get("@type") != "Person" or "creator" not in item.get("tags", []):
                continue

            description = item.get("description", "Unknown")
            contribution_link = make_link(description, credit_url) if description != "Unknown" else "Unknown"

            affiliation_info = {"Affiliation Name": "Unknown", "Affiliation ROR": "Unknown", "Affiliation URL": "Unknown"}
            aff = self._ref_id(item.get("affiliation"))
            org = entities_by_id.get(aff) if aff else None
            if org is not None:
                affiliation_info = {
                    "Affiliation Name": org.get("name", "Unknown"),
                    "Affiliation ROR": make_link("ROR ID", org.get("@id", "Unknown")),
                    "Affiliation URL": make_link("Website", org.get("url", "Unknown"))
                }

            creators.append({
                "Creator Name": item.get("name", "Unknown"),
                "ORCID": make_link("ORCID Profile", item.get("@id", "Unknown")),
                "Contribution (CRediT)": contribution_link,
                **affiliation_info
            })
        return creators

    def extract_nodes(self, result_folder: str) -> List[Dict]:
        """Extract node-level metadata including attached hardware and topologies.

        A node is a graph entity whose ``tags`` contain ``"node"``.

        Args:
            result_folder: Folder against which the ``@id`` of the referenced
                topology and hardware files is resolved.

        Returns:
            One dict per node with the keys "name", "fqdn", "topology_pdf" and
            "hardware_json". The two path fields hold the string "None" when
            the reference is absent or the file does not exist.
        """
        nodes = []
        for item in self.metadata.get("@graph", []):
            if "node" not in item.get("tags", []):
                continue
            node = {
                "name": item.get("name", "Unknown"),
                "fqdn": item.get("fqdn", "Unknown"),
                "topology_pdf": "None",
                "hardware_json": "None"
            }
            for key, local_key in [("visualizedTopology", "topology_pdf"), ("hardware", "hardware_json")]:
                obj_id = self._ref_id(item.get(key))
                if obj_id:
                    path = os.path.join(result_folder, obj_id)
                    if os.path.exists(path):
                        node[local_key] = path
            nodes.append(node)
        return nodes

    def extract_hardware_info(self, path: str) -> Dict:
        """Extract hardware information from a JSON file.

        Args:
            path: Path to the hardware description JSON. A falsy or
                non-existent path (including the "None" placeholder produced
                by ``extract_nodes``) is accepted.

        Returns:
            A dict with the keys "CPU", "Cores", "Threads", "Memory" and
            "NICs". Every value is "Unknown" when the file is missing, cannot
            be parsed, or does not have the expected structure.
        """
        unknown = {field: "Unknown" for field in self._HW_FIELDS}
        if not path or not os.path.exists(path):
            return unknown
        try:
            data = self._load_json(path)
            if not data:
                # Unreadable or empty file: report "Unknown" rather than blank cells.
                return unknown
            cpus = data.get("processor", [])
            cpu_model = ", ".join(cpu.get("model", "Unknown") for cpu in cpus) or "Unknown"
            cores = ", ".join(str(cpu.get("cores", "Unknown")) for cpu in cpus) or "Unknown"
            threads = ", ".join(str(cpu.get("threads", "Unknown")) for cpu in cpus) or "Unknown"
            mem = data.get("memory", {})
            mem_str = f"{mem.get('installed_capacity_human_val', 'Unknown')} {mem.get('installed_capacity_human_unit', '')}".strip()
            # NICs are separated with <br> because the value is rendered as HTML.
            nics = "<br>".join(n.get("model", "") for n in data.get("network", []) if isinstance(n, dict)) or "No NICs detected"
            return {"CPU": cpu_model, "Cores": cores, "Threads": threads, "Memory": mem_str, "NICs": nics}
        except (AttributeError, TypeError, ValueError, OSError):
            # Unexpected JSON structure or an unreadable file.
            return unknown


# ----------------------------- #
# ENERGY DATA LOADING
# ----------------------------- #
def load_energy_data_from_csv(result_folder: str, selected_nodes: Set[str], selected_runs: Set[str]) -> pd.DataFrame:
    """Read the per-node energy CSVs below ``<result_folder>/energy``.

    Only files named ``*_run<ID>.csv`` inside node sub-directories are read.
    Each file's ``timestamp`` column is parsed with the format
    ``%Y%m%d%H%M%S%f``, ``node`` and ``run`` columns are attached, and the
    ``_0``/``_1`` channel columns are merged via ``merge_energy_channels``.

    Args:
        result_folder: Experiment result folder containing ``energy/``.
        selected_nodes: Node names to keep; an empty set keeps every node.
        selected_runs: Run IDs to keep; an empty set keeps every run.

    Returns:
        All matching files concatenated into one frame with a fresh index.

    Raises:
        ValueError: if no matching CSV file was found.
    """
    energy_root = os.path.join(result_folder, "energy")
    frames = []
    for node_name in os.listdir(energy_root):
        node_dir = os.path.join(energy_root, node_name)
        if (selected_nodes and node_name not in selected_nodes) or not os.path.isdir(node_dir):
            continue
        for csv_name in os.listdir(node_dir):
            if not (csv_name.endswith(".csv") and "_run" in csv_name):
                continue
            run_id = csv_name.split("_run")[-1].split(".")[0]
            if selected_runs and run_id not in selected_runs:
                continue
            frame = pd.read_csv(os.path.join(node_dir, csv_name))
            frame["timestamp"] = pd.to_datetime(frame["timestamp"], format="%Y%m%d%H%M%S%f")
            frame["node"] = node_name
            frame["run"] = run_id
            frames.append(merge_energy_channels(frame))
    if frames:
        return pd.concat(frames, ignore_index=True)
    raise ValueError("No valid CSV files found.")


def load_energy_data(source: str = "csv", **kwargs) -> pd.DataFrame:
    """Load energy data from the backend named by ``source``.

    Only ``"csv"`` is supported; the keyword arguments are forwarded to
    ``load_energy_data_from_csv``.

    Raises:
        ValueError: for any other ``source`` value.
    """
    if source != "csv":
        raise ValueError(f"Unsupported energy data source: {source}")
    return load_energy_data_from_csv(**kwargs)


def merge_energy_channels(df: pd.DataFrame) -> pd.DataFrame:
    """Merge ``<name>_0``/``<name>_1`` column pairs into one ``<name>`` column.

    For every prefix where both channel columns exist, their element-wise sum
    is stored under the prefix name and the two channel columns are dropped.
    Columns with only one of the two suffixes are left untouched.

    The frame is modified in place and also returned. Merged columns are
    appended in the order in which their prefix first appears in the input,
    so the resulting column order is reproducible.
    """
    # NOTE(review): every paired quantity is summed, which presumably suits
    # additive values (current, power, energy) -- confirm it is intended for
    # non-additive ones such as voltage.
    # dict.fromkeys de-duplicates while keeping first-seen order (a set would not).
    prefixes = dict.fromkeys(
        col.rsplit('_', 1)[0] for col in df.columns if col.endswith(('_0', '_1'))
    )
    for prefix in prefixes:
        col0, col1 = f"{prefix}_0", f"{prefix}_1"
        if col0 in df.columns and col1 in df.columns:
            df[prefix] = df[col0] + df[col1]
            df.drop(columns=[col0, col1], inplace=True)
    return df

def remove_outliers(df: pd.DataFrame) -> pd.DataFrame:
    """Drop rows lying outside the 1.5 * IQR fences of any numeric column.

    Columns are processed one after another on a copy of ``df``: the
    quartiles of each column are computed on the rows that survived the
    previous columns, so the result depends on column order. Rows whose
    value in a numeric column is NaN are dropped as well, because they fail
    the range comparison. The input frame is not modified.
    """
    kept = df.copy()
    for column in kept.select_dtypes(include=[np.number]).columns:
        lower_q = kept[column].quantile(0.25)
        upper_q = kept[column].quantile(0.75)
        spread = upper_q - lower_q
        # between() is inclusive on both ends, matching ">= lower and <= upper".
        kept = kept[kept[column].between(lower_q - 1.5 * spread, upper_q + 1.5 * spread)]
    return kept

def compute_corrected_energy(df: pd.DataFrame) -> pd.DataFrame:
    """Compute energy usage for each (node, run), accounting for resets.

    The ``energy_counter_Wh`` readings of each group are taken in row order.
    A reading lower than its predecessor is treated as a counter reset, after
    which the counter is assumed to restart from zero. The energy used is
    the counter growth before the first reset plus the value reached at the
    end of every later segment, so consumption before a reset is not lost
    and any number of resets is handled. Without a reset this is simply
    ``last - first``.

    Args:
        df: Frame with the columns "node", "run" and "energy_counter_Wh".

    Returns:
        A frame with the columns "node", "run" and "energy_used" (one row per
        group); the columns are present even when ``df`` has no rows.
    """
    records = []
    for (node, run), group in df.groupby(["node", "run"]):
        counter = group["energy_counter_Wh"].to_numpy()
        # Positions i where counter[i + 1] < counter[i]: last reading before a reset.
        reset_at = np.flatnonzero(np.diff(counter) < 0)
        # Final reading of every segment; the last segment ends at the last row.
        segment_ends = np.append(counter[reset_at], counter[-1])
        # Only the first segment starts from a non-zero baseline.
        used = segment_ends.sum() - counter[0]
        records.append({"node": node, "run": run, "energy_used": used})
    return pd.DataFrame(records, columns=["node", "run", "energy_used"])

def preprocess_energy_data(df: pd.DataFrame, smoothing_window: int = 5) -> pd.DataFrame:
    """Add the derived columns needed for plotting.

    Works on a copy; the input frame is not modified.

    Added columns:
        timestamp_relative: seconds elapsed since the earliest timestamp in
            the whole frame.
        energy_counter_mWh: ``energy_counter_Wh`` multiplied by 1000.
        voltage_V_smoothed: rolling mean of ``voltage_V`` over
            ``smoothing_window`` rows (fewer at the start of a series).

    The rolling mean is computed separately for each node/run (using
    whichever of the "node" and "run" columns exist), so samples of one
    series never leak into the smoothed values of the next. If neither
    column exists, the whole frame is smoothed as one series.

    Args:
        df: Frame with "timestamp", "energy_counter_Wh" and "voltage_V".
        smoothing_window: Number of rows in the rolling window.
    """
    df = df.copy()
    df["timestamp_relative"] = (df["timestamp"] - df["timestamp"].min()).dt.total_seconds()
    df["energy_counter_mWh"] = df["energy_counter_Wh"] * 1000

    def _smooth(series: pd.Series) -> pd.Series:
        return series.rolling(window=smoothing_window, min_periods=1).mean()

    group_keys = [key for key in ("node", "run") if key in df.columns]
    if group_keys:
        # transform() returns values aligned with the original rows.
        grouped_voltage = df.groupby(group_keys, sort=False, dropna=False)["voltage_V"]
        df["voltage_V_smoothed"] = grouped_voltage.transform(_smooth)
    else:
        df["voltage_V_smoothed"] = _smooth(df["voltage_V"])
    return df

def compute_total_corrected_energy(df: pd.DataFrame) -> pd.DataFrame:
    """Sum the corrected per-run energy of every node.

    Returns a frame with one row per node and the columns "node" and
    "energy_used", where "energy_used" is the total over that node's runs as
    computed by ``compute_corrected_energy``.
    """
    per_run = compute_corrected_energy(df)
    return per_run.groupby("node")["energy_used"].sum().reset_index()

# ----------------------------- #
# VISUALIZATION FUNCTIONS
# ----------------------------- #
def plot_bar_with_labels(data: pd.DataFrame, x: str, y: str, title: str, ylabel: str) -> None:
    """Create a bar plot with a numeric label above each bar.

    Args:
        data: Frame with one row per bar.
        x: Column providing the bar categories (also used as x-axis label).
        y: Column providing the bar heights.
        title: Plot title.
        ylabel: Label of the y-axis.

    Returns:
        None. The figure is created through pyplot and left open; it is
        presumably rendered by the notebook backend at the end of the cell.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=data, x=x, y=y, hue=x, dodge=False, legend=False, ax=ax)
    # Bars sit at positions 0..n-1 in row order, so label by position rather than
    # by index label (which is wrong for any frame without a default RangeIndex).
    for position, value in enumerate(data[y]):
        ax.text(position, value + 0.01, f"{value:.2f}", ha="center", va="bottom", fontsize=10)
    ax.set_title(title)
    ax.set_xlabel(x)
    ax.set_ylabel(ylabel)
    fig.tight_layout()


def plot_time_series(df: pd.DataFrame, x: str, y: str, hue: str, title: str, ylabel: str) -> None:
    """Plot ``y`` against ``x`` as one line per distinct value of ``hue``.

    Args:
        df: Frame holding the columns named by ``x``, ``y`` and ``hue``.
        x: Column for the x-axis (also used as the x-axis label).
        y: Column for the y-axis.
        hue: Column whose values define the individual lines and the legend title.
        title: Plot title.
        ylabel: Label of the y-axis.

    Returns:
        None. The figure is created through pyplot and left open.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(data=df, x=x, y=y, hue=hue, linewidth=2, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)
    # Place the legend outside the axes, to the right of the plot area.
    ax.legend(title=hue, bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()


def plot_raw_energy_counter(df: pd.DataFrame) -> None:
    """Plot the unprocessed ``energy_counter_Wh`` readings over time, one line per node.

    Args:
        df: Frame with the columns "node", "timestamp" and "energy_counter_Wh".

    Returns:
        None. The figure is created through pyplot and left open.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    # All runs of a node are drawn as a single line, in row order.
    for node, group in df.groupby("node"):
        ax.plot(group["timestamp"], group["energy_counter_Wh"], label=node, marker="o")
    ax.set_title("Raw Energy Counter Over Time")
    ax.set_xlabel("Timestamp")
    ax.set_ylabel("Energy (Wh)")
    ax.tick_params(axis='x', rotation=45)
    # Place the legend outside the axes, to the right of the plot area.
    ax.legend(title="Node", bbox_to_anchor=(1.05, 1), loc='upper left')
    fig.tight_layout()

# ----------------------------- #
# BASIC DATAFRAME DISPLAY HELPERS
# ----------------------------- #
def display_df_head_tail(df):
    """Show the first and last rows of ``df``, each under an HTML heading."""
    sections = (("First Rows", df.head()), ("Last Rows", df.tail()))
    for heading, rows in sections:
        display(HTML(f"<h3>{heading}</h3>"))
        display(rows)

def display_summary_statistics(df):
    """Show ``describe()`` statistics of ``df`` under an HTML heading.

    The "node" and "run" columns (if present) and datetime columns are left
    out of the summary.
    """
    summary = df.drop(columns=['node', 'run'], errors='ignore').describe(exclude=[np.datetime64])
    display(HTML("<h3>Summary Statistics</h3>"))
    display(summary)

# ----------------------------- #
# STYLED HTML TABLE DISPLAY
# ----------------------------- #
def display_styled_html_table(df, width="90%"):
    """Render ``df`` as an HTML table preceded by a small stylesheet.

    Cell contents are not HTML-escaped, so cells containing markup (such as
    anchor tags) are rendered as HTML. The index is omitted.

    Args:
        df: Frame to display.
        width: CSS width applied to the table.
    """
    markup = df.to_html(escape=False, index=False)
    stylesheet = f"""
    <style>
        table {{
            width: {width};
            border-collapse: collapse;
            margin: 20px 0;
        }}
        th, td {{
            padding: 8px 12px;
            border: 1px solid #ddd;
            text-align: left;
        }}
        th {{
            background-color: #f4f4f4;
            font-weight: bold;
        }}
    </style>
    """
    display(HTML(stylesheet + markup + "\n    "))

## Specify the Result Folder

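Before loading data, enter the path to your experiment result folder.  
By default, the preconfigured path (`BASE_RESULT_FOLDER` joined with `DEFAULT_SUBFOLDER`) is shown, but you can change it to any valid directory; relative paths are resolved against `BASE_RESULT_FOLDER`.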

In [None]:
# Resolve the result folder interactively; the per-node CSVs live in its "energy" sub-folder.
result_folder = resolve_result_folder()
energy_folder = os.path.join(result_folder, "energy")

# Let the user restrict the analysis to specific runs (empty input keeps all of them).
runs = detect_runs(energy_folder)
selected_runs = select_from_list(runs, "run IDs")

# Every sub-directory of the energy folder is treated as one node.
nodes = sorted([d for d in os.listdir(energy_folder) if os.path.isdir(os.path.join(energy_folder, d))])
selected_nodes = select_from_list(nodes, "node names")

# Metadata parser used for the creator and node tables below.
parser = ROCrateParser(os.path.join(result_folder, "ro-crate-metadata.json"))

## Creator Information

The following table presents details about the experiment's creator, extracted from the **RO-Crate metadata**.

- **Name:** The name of the creator.
- **ORCID:** A unique researcher identifier, linked to the official ORCID profile.
- **Affiliation:** The institution the creator is affiliated with.
- **Affiliation ROR:** A **Research Organization Registry (ROR) ID**, used for standard identification of research institutions.
- **Affiliation URL:** A direct link to the institution's website.

In [None]:
# One row per creator; link columns contain HTML anchors, which the styled table renders unescaped.
creator_df = pd.DataFrame(parser.extract_creators())
display_styled_html_table(creator_df)

## Node Information & Topology Visualization

Each experiment setup includes metadata about the participating nodes.  
This section extracts details such as:
- Node names
- Links to the Testbed -> Entrypoint to Testbed
- Fully Qualified Domain Names (FQDN)
- Topology information (if available).

If a **topology visualization** is provided in the RO-Crate metadata, it is linked below.

In [None]:
nodes_info = parser.extract_nodes(result_folder)

if nodes_info:
    # Hardware details are read from each node's hardware JSON (all "Unknown" if unavailable).
    hw_details = [parser.extract_hardware_info(n["hardware_json"]) for n in nodes_info]

    df_nodes = pd.DataFrame(nodes_info).drop(columns="hardware_json")
    df_hw = pd.DataFrame(hw_details)

    # Both frames have one row per node in the same order, so they are joined column-wise.
    full_node_df = pd.concat([df_nodes, df_hw], axis=1)
    full_node_df["topology_pdf"] = full_node_df["topology_pdf"].apply(make_clickable)

    display_styled_html_table(full_node_df)
else:
    # Without any node entry the frame has no columns and the steps above would raise KeyError.
    print("[Warning] No node entries found in the RO-Crate metadata; skipping node table.")

## Loading and Previewing Data

The energy measurement data is stored in CSV format, with each node having its own folder inside the `energy` directory.

The dataset includes:
- **Timestamp** (`timestamp`): Time when the measurement was recorded.
- **Current** (`current_mA`): Measured current in milliamps (mA).
- **Voltage** (`voltage_V`): Measured voltage in volts (V).
- **Power Consumption** (`power_active_W`): Active power in watts (W).
- **Energy Counter** (`energy_counter_Wh`): Cumulative energy usage in watt-hours (Wh).

Below, we load the data and display a preview.

In [None]:
# Load the CSVs of the selected nodes and runs into a single frame.
df = load_energy_data(
    source='csv',
    result_folder=result_folder,
    selected_nodes=selected_nodes,
    selected_runs=selected_runs
)
# Drop rows outside the 1.5 * IQR fences of any numeric column.
# NOTE(review): this presumably also filters on the cumulative energy counter, before
# the energy totals are computed from it -- confirm that this is intended.
df = remove_outliers(df)

display_df_head_tail(df)

## Common Set of Statistical Evaluations

After loading the data, the notebook performs a standard set of statistical evaluations  
to understand the structure and integrity of the dataset. This includes:

- Summary statistics of numerical columns  
- Detection of missing values  
- Identification of outliers or unusual value ranges  
- Checks for consistency across runs and nodes  

These steps support further analysis by providing insights into data quality and distribution.

In [None]:
# describe() statistics of the loaded data, excluding the node/run labels and datetime columns.
display_summary_statistics(df)

## Energy Data Visualizations

This section presents several plots to help analyze the energy consumption behavior across different nodes and over time.

In [None]:
# Add the derived plotting columns (relative time, mWh counter, smoothed voltage).
df = preprocess_energy_data(df)
# Per-node energy totals, summed over the selected runs.
df_grouped = compute_total_corrected_energy(df)

## Total Corrected Energy Consumption Per Node

This bar plot shows the total energy consumed by each node across all selected experiment runs.
The values are computed from the energy counter, with corrections to account for possible counter resets.
It provides a straightforward comparison of total energy usage per node during the observed workload.

In [None]:
# One bar per node, labelled with its total energy.
plot_bar_with_labels(df_grouped, "node", "energy_used", "Total Corrected Energy Consumption Per Node", "Total Energy (Wh)")

## Power Consumption Over Time

The following plot shows the power consumption trends over time for different nodes.  
This helps us observe variations in power usage and detect potential anomalies.

In [None]:
# Power against wall-clock time, then against seconds since the earliest sample in the data.
plot_time_series(df, "timestamp", "power_active_W", "node", "Power Over Time", "Power (W)")
plot_time_series(df, "timestamp_relative", "power_active_W", "node", "Relative Power Over Time", "Power (W)")

## Cumulative Energy Consumption

The energy counter represents the cumulative energy consumed over time.  
This plot provides insights into the total energy usage per node and how it changes over the experiment duration.

In [None]:
# Energy counter converted to mWh, one line per node.
plot_time_series(df, "timestamp", "energy_counter_mWh", "node", "Cumulative Energy", "Energy (mWh)")

## Current and Voltage Trends

To better understand the electrical characteristics, we visualize:
- **Current (mA) over time** to see how power draw fluctuates.
- **Voltage (V) over time** to ensure stability across measurements.

In [None]:
# Raw current, followed by the rolling-mean voltage computed in preprocess_energy_data.
plot_time_series(df, "timestamp", "current_mA", "node", "Current Over Time", "Current (mA)")
plot_time_series(df, "timestamp", "voltage_V_smoothed", "node", "Smoothed Voltage", "Voltage (V)")

### Energy Consumption Rate Over Time (TODO)

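This plot is intended to show the **rate at which energy is consumed over time (e.g. in mWh/s)**; until that is implemented, the cell below plots the raw energy counter per node as a placeholder.  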
Instead of cumulative energy, this visualization helps identify **periods of high workload**.  
A higher energy rate means that the system was **actively consuming more power**,  
which may indicate high CPU load or network traffic.

In [None]:
# Placeholder for the energy-rate plot (still TODO): shows the raw counter readings per node.
plot_raw_energy_counter(df)