# **CPU Energy Model Construction Notebook**

This notebook builds a server-specific energy model by analyzing stress experiment runs.  
The aim is to estimate the **per-core power consumption** based on controlled workloads, and use this to form a predictive model for energy estimation.

We assume:
- Each experiment run uses a known number of CPU cores.
- Power consumption is measured over time.
- The base (idle) power consumption of the system is known or measured separately.

In [None]:
import os
import json
from datetime import datetime
from typing import Dict, List, Optional, Tuple

import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
import seaborn as sns

# === Constants ===
POWER_COLUMN = "power_active_W"
CPU_MODEL_FOLDER = "cpu_models"

# === Build User Config ===
def build_config(
    server_name=None,
    result_subfolder=None,
    idle_power=None,
    base_result_path="/srv/testbed/results/warmuth/default",
):
    """Assemble the experiment configuration, prompting for anything not supplied.

    Args:
        server_name: Name of the measured server. Prompted for when empty.
        result_subfolder: Folder name below ``base_result_path`` holding the
            experiment results. Prompted for when empty.
        idle_power: Idle power in watts (number or numeric string). Prompted
            for only when it is ``None``.
        base_result_path: Directory that contains the result subfolders.

    Returns:
        The configuration dictionary consumed by the rest of the notebook.

    Raises:
        ValueError: If the idle power cannot be parsed as a float, or the
            resulting result folder does not exist.
    """
    if not server_name:
        server_name = input("Enter the server name (e.g., riga): ").strip()
    if not result_subfolder:
        result_subfolder = input("Enter the result subfolder name (e.g., 2024_stress_4_cores): ").strip()

    # Compare against None rather than using truthiness: 0.0 is a valid
    # caller-supplied value and must not trigger a prompt.
    if idle_power is None:
        idle_power = input("Enter the idle power in watts (e.g., 58.0): ").strip()
    try:
        idle_power = float(idle_power)
    except ValueError:
        print("Invalid number. Please enter a valid float value (e.g., 58.0).")
        raise

    full_result_folder = os.path.join(base_result_path, result_subfolder)

    if not os.path.exists(full_result_folder):
        raise ValueError(f"Warning: The folder {full_result_folder} does not exist.")

    return {
        "result_folder": str(full_result_folder),
        "server_name": server_name,
        "idle_power": idle_power,
        "csv_base_name": "measurement_run",
        "csv_extension": ".csv",
        "map_to": "cores",
        "model_output": "cpu_model",
        "fix_idle": False
    }

# === Configuration Utilities ===
def validate_config(config: Dict, required_keys: List[str]) -> None:
    """Ensure every key in ``required_keys`` is present in ``config``.

    Raises:
        ValueError: Listing, in the order requested, the keys that are absent.
    """
    absent = []
    for name in required_keys:
        if name not in config:
            absent.append(name)

    if absent:
        raise ValueError(f"Missing required config keys: {absent}")

def get_csv_folder(config: Dict) -> str:
    """Return the ``<result_folder>/energy/<server_name>`` path for ``config``."""
    results_root = config["result_folder"]
    server = config["server_name"]
    return os.path.join(results_root, "energy", server)

# === Data Loading and Processing ===
def generate_run_mapping(config: Dict, fallback_cores: int = 4) -> Dict[str, int]:
    """Map each measurement CSV filename to the core count of that run.

    Run ``i`` (zero-based) is taken to have used ``i + 1`` cores. When
    ``start_index``/``end_index`` are missing from ``config`` they are derived
    from the CPU core count in the hardware metadata and written back into
    ``config``.

    Args:
        config: Experiment configuration; needs ``csv_base_name``,
            ``csv_extension`` and ``map_to``.
        fallback_cores: Core count to assume when the hardware metadata does
            not report one.

    Returns:
        ``{filename: cores}`` for every run index from start to end inclusive.
    """
    if "start_index" not in config or "end_index" not in config:
        cpu_info = load_cpu_info(config)
        # load_cpu_info always returns a "cores" key, with value None when the
        # metadata lacks it, so a plain .get() default would never be used.
        cores = cpu_info.get("cores") or fallback_cores
        config["start_index"] = 0
        config["end_index"] = cores - 1

    start = config["start_index"]
    end = config["end_index"]

    # Indices are zero-padded to the width of the largest run number (end + 1).
    max_digits = len(str(end + 1))

    mapping = {
        f"{config['csv_base_name']}{str(i).zfill(max_digits)}{config['csv_extension']}": i + 1
        for i in range(start, end + 1)
    }

    print(f"Run mapping for '{config['map_to']}':\n{json.dumps(mapping, indent=2)}")
    return mapping

def load_power_data(
    csv_folder: str,
    run_mapping: Dict[str, int],
    p_base: float,
    power_column: str = POWER_COLUMN
) -> pd.DataFrame:
    """Summarise every mapped measurement CSV into one row per run.

    For each ``filename -> cores`` entry the file is read from ``csv_folder``
    and the mean of ``power_column`` is taken. The per-core share is
    ``(mean - p_base) / cores``. Both figures are rounded to two decimals.

    Returns:
        A frame with ``filename``, ``cores``, ``avg_power`` and
        ``per_core_power`` columns, in mapping order.
    """
    def summarise(filename: str, cores: int) -> Dict:
        # One CSV per run; only the power column is needed.
        frame = pd.read_csv(os.path.join(csv_folder, filename))
        mean_power = frame[power_column].mean()
        return {
            "filename": filename,
            "cores": cores,
            "avg_power": round(mean_power, 2),
            "per_core_power": round((mean_power - p_base) / cores, 2),
        }

    rows = [summarise(name, cores) for name, cores in run_mapping.items()]
    print(f"Loaded data for {len(rows)} runs.")
    return pd.DataFrame(rows)

# === Modeling ===
def fit_models(
    model_df: pd.DataFrame,
    p_base: float,
    fix_idle: bool = False
) -> Tuple[LinearRegression, LinearRegression, PolynomialFeatures]:
    """Fit a linear and a degree-2 polynomial model of average power vs. cores.

    ``model_df`` is modified in place: ``predicted_linear``, ``error_linear``,
    ``predicted_poly`` and ``error_poly`` columns are added. Errors are
    absolute deviations from ``avg_power`` in percent.

    Args:
        model_df: Frame with ``cores`` and ``avg_power`` columns.
        p_base: Idle power in watts; only used when ``fix_idle`` is true.
        fix_idle: When true, both models are fitted to ``avg_power - p_base``
            with no free offset, so they predict exactly ``p_base`` at zero
            cores. The returned regressors then model power above idle;
            ``p_base`` is added back only for the columns stored in
            ``model_df``.

    Returns:
        ``(linear_regressor, polynomial_regressor, polynomial_features)``.
        The polynomial regressor's coefficients follow the feature order
        ``[constant, cores, cores ** 2]``.
    """
    X = model_df[["cores"]]
    y = model_df["avg_power"]

    if fix_idle:
        reg_linear = LinearRegression(fit_intercept=False).fit(X, y - p_base)
        model_df["predicted_linear"] = reg_linear.predict(X) + p_base
    else:
        reg_linear = LinearRegression().fit(X, y)
        model_df["predicted_linear"] = reg_linear.predict(X)

    model_df["error_linear"] = (
        (model_df["avg_power"] - model_df["predicted_linear"]).abs()
        / model_df["avg_power"] * 100
    )

    poly = PolynomialFeatures(degree=2)
    X_poly = poly.fit_transform(X)

    if fix_idle:
        # PolynomialFeatures emits a constant column of ones. Left in place it
        # acts as a free intercept even with fit_intercept=False, so the curve
        # would not be pinned to idle power. Zero it for the fit only.
        X_fit = X_poly.copy()
        X_fit[:, 0] = 0.0
        reg_poly = LinearRegression(fit_intercept=False).fit(X_fit, y - p_base)
        # The all-zero column carries no information; force its weight to
        # exactly zero so predictions on the untouched features stay pinned.
        reg_poly.coef_[0] = 0.0
        model_df["predicted_poly"] = reg_poly.predict(X_poly) + p_base
    else:
        reg_poly = LinearRegression().fit(X_poly, y)
        model_df["predicted_poly"] = reg_poly.predict(X_poly)

    model_df["error_poly"] = (
        (model_df["avg_power"] - model_df["predicted_poly"]).abs()
        / model_df["avg_power"] * 100
    )

    print("Model fitting complete.")
    return reg_linear, reg_poly, poly

def predict_cpu_power_detailed(
    per_core_loads: List[float],
    per_core_powers: List[float],
    p_base: float
) -> float:
    """Return ``p_base`` plus the load-weighted sum of per-core powers.

    Loads and powers are paired positionally; pairing stops at the shorter
    of the two lists.
    """
    dynamic_power = sum(
        load * power
        for load, power in zip(per_core_loads, per_core_powers)
    )
    return p_base + dynamic_power

# === Visualization ===
def plot_predicted_load_levels(
    num_cores: Optional[int] = None,
    p_core: Optional[float] = None,
    p_base: Optional[float] = None,
    load_levels: Optional[List[float]] = None,
    labels: Optional[List[str]] = None,
    cpu_model: Optional[Dict] = None,
    method: str = "linear",
    per_core_steps: bool = False
) -> None:
    """
    Plot predicted total server power at different CPU load levels
    or for each number of active cores.

    Predictions come from ``cpu_model`` (via ``predict_power``) when one is
    given, otherwise from ``p_base + p_core * active_cores``.

    Args:
        num_cores: Total core count. Taken from ``cpu_model["cpu_info"]``
            when the model provides it.
        p_core: Per-core power in watts; required without ``cpu_model``.
        p_base: Idle power in watts; required without ``cpu_model``.
        load_levels: Load fractions to plot; defaults to 0, 25, 50 and 100 %.
        labels: Bar labels; generated when omitted.
        cpu_model: Saved model dictionary.
        method: ``"linear"`` or ``"polynomial"``, forwarded to ``predict_power``.
        per_core_steps: Plot one bar per active-core count instead of per
            load level.

    Raises:
        ValueError: If the core count cannot be determined, or neither a
            model nor both ``p_core`` and ``p_base`` are supplied.
    """
    if cpu_model:
        # Prefer the model's core count, but keep an explicitly passed
        # num_cores when the model carries no CPU metadata.
        model_cores = cpu_model.get("cpu_info", {}).get("cores")
        if model_cores is not None:
            num_cores = model_cores

    if num_cores is None:
        raise ValueError("Missing 'num_cores' information.")

    # Fail before any plotting with a clear message instead of a TypeError
    # from arithmetic on None.
    if not cpu_model and (p_core is None or p_base is None):
        raise ValueError("Either 'cpu_model' or both 'p_core' and 'p_base' must be provided.")

    if per_core_steps:
        core_counts = list(range(1, num_cores + 1))
        labels = [f"{cores} cores" for cores in core_counts]
        predicted = []
        for cores in core_counts:
            if cpu_model:
                predicted_power = predict_power(cpu_model, num_cores=cores, method=method)
            else:
                predicted_power = p_base + (p_core * cores)
            predicted.append(predicted_power)
    else:
        load_levels = load_levels or [0.0, 0.25, 0.5, 1.0]
        labels = labels or [f"{int(load * 100)}% Load" if load > 0 else "Idle" for load in load_levels]
        predicted = []
        for load in load_levels:
            # Fractional load is expressed as an equivalent number of fully
            # loaded cores.
            active_cores = num_cores * load
            if cpu_model:
                if active_cores == 0:
                    # Idle is reported as the stored idle power, regardless
                    # of the selected method.
                    predicted_power = cpu_model["linear_model"]["p_base"]
                else:
                    predicted_power = predict_power(cpu_model, num_cores=active_cores, method=method)
            else:
                predicted_power = p_base + (p_core * active_cores)
            predicted.append(predicted_power)

    plt.figure(figsize=(8, 5))
    bars = plt.bar(labels, predicted, color="skyblue", edgecolor="black")
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width() / 2, yval + 1, f"{yval:.1f} W",
                 ha='center', va='bottom', fontsize=10)
    title_mode = "Per Core Steps @100% Load" if per_core_steps else f"{method.capitalize()} Model"
    plt.title(f"Predicted Server Power ({title_mode})")
    plt.ylabel("Power (W)")
    plt.ylim(0, max(predicted) + 20)
    plt.grid(axis="y", linestyle="--", alpha=0.7)
    plt.tight_layout()
    plt.show()

def plot_model_fit(
    model_df: pd.DataFrame,
    reg_linear: LinearRegression,
    reg_poly: LinearRegression,
    poly: PolynomialFeatures
) -> None:
    """Plot measured average power against both fitted curves.

    The curves are evaluated on 100 evenly spaced points between the smallest
    and largest measured core count, and drawn exactly as the regressors
    predict them.

    NOTE(review): regressors fitted on idle-subtracted data (the ``fix_idle``
    path of ``fit_models``) predict power above idle, so their curves would
    sit below the measured points by the idle power - confirm before relying
    on the plot in that mode.
    """
    lowest = model_df["cores"].min()
    highest = model_df["cores"].max()
    grid = pd.DataFrame({"cores": np.linspace(lowest, highest, 100)})

    linear_curve = reg_linear.predict(grid)
    poly_curve = reg_poly.predict(poly.transform(grid))

    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(8, 5))
    sns.scatterplot(data=model_df, x="cores", y="avg_power", s=80, label="Measured")

    curves = (
        (linear_curve, "red", "Linear Fit"),
        (poly_curve, "blue", "Polynomial Fit (Degree 2)"),
    )
    for values, colour, legend_label in curves:
        plt.plot(grid, values, color=colour, label=legend_label, linewidth=2)

    plt.title("Measured Power vs. Model Predictions")
    plt.xlabel("CPU Cores Used")
    plt.ylabel("Power (W)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

def display_model_errors(model_df: pd.DataFrame) -> None:
    """Show a formatted copy of ``model_df`` without touching the original.

    The two error columns are rendered as percentage strings with two
    decimals; the core count, measured power and both prediction columns are
    rounded to two decimals when present.
    """
    table = model_df.copy()

    for error_col in ("error_linear", "error_poly"):
        table[error_col] = table[error_col].map(lambda value: f"{value:.2f}%")

    numeric_cols = ("cores", "avg_power", "predicted_linear", "predicted_poly")
    for name in numeric_cols:
        if name not in table.columns:
            continue
        table[name] = table[name].round(2)

    display(table)

# === Model Persistence ===
def load_cpu_info(config: Dict) -> Dict:
    """Read core count and model name of the first processor from hardware.json.

    The file is expected at
    ``<result_folder>/config/<server_name>/hardware.json``.

    Returns:
        ``{"cores": ..., "model": ...}``. Either value is ``None`` when the
        metadata does not provide it, including when the ``processor`` list
        is missing or empty.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    path = os.path.join(config["result_folder"], "config", config["server_name"], "hardware.json")
    with open(path, "r", encoding="utf-8") as file:
        hardware = json.load(file)

    # A missing, null or empty processor list is treated as one processor
    # with no details, rather than raising IndexError.
    processors = hardware.get("processor") or [{}]
    cpu = processors[0]
    return {"cores": cpu.get("cores"), "model": cpu.get("model")}

def save_cpu_model(
    config: Dict,
    p_core: float,
    intercept: float,
    poly_coeffs: Optional[Tuple[float, float, float]] = None,
    mapping: Optional[Dict] = None,
    cpu_info: Optional[Dict] = None
) -> Dict:
    """Build the model dictionary, write it as JSON and return it.

    The file is written to ``CPU_MODEL_FOLDER`` (created if needed) as
    ``<model_output>_<server_name>.json``.

    Args:
        config: Supplies ``model_output``, ``server_name``, ``idle_power``
            and optionally ``fix_idle``.
        p_core: Per-core power of the linear model.
        intercept: Intercept stored as ``fitted_intercept``.
        poly_coeffs: Optional ``(a, b, c)`` stored under ``polynomial_model``.
        mapping: Optional source-file mapping stored under ``source_files``.
        cpu_info: Optional CPU metadata stored under ``cpu_info``.

    Returns:
        The dictionary that was written.
    """
    output_path = os.path.join(
        CPU_MODEL_FOLDER,
        f"{config['model_output']}_{config['server_name']}.json",
    )

    linear_part = {
        "p_base": config["idle_power"],
        "p_core": p_core,
        "fitted_intercept": intercept
    }

    model_type = "linear_free_fit"
    if config.get("fix_idle"):
        model_type = "linear_fixed_idle"

    model = {
        "model_type": model_type,
        "linear_model": linear_part,
        "timestamp": datetime.now().isoformat(),
        "node_name": config["server_name"],
        "source_files": mapping or {},
        "cpu_info": cpu_info or {}
    }

    if poly_coeffs is not None:
        quadratic, linear, constant = poly_coeffs
        model["polynomial_model"] = {"a": quadratic, "b": linear, "c": constant}

    os.makedirs(CPU_MODEL_FOLDER, exist_ok=True)
    with open(output_path, "w") as handle:
        json.dump(model, handle, indent=2)
    print(f"Saved model to {output_path}")
    return model

def load_cpu_model(config: Dict) -> Dict:
    """Load ``<model_output>_<server_name>.json`` from ``CPU_MODEL_FOLDER``."""
    path = os.path.join(
        CPU_MODEL_FOLDER,
        f"{config['model_output']}_{config['server_name']}.json",
    )
    with open(path, "r") as handle:
        loaded = json.load(handle)
        print(f"Loaded model from {path}")
    return loaded

def predict_power(model: Dict, num_cores: int, method: str = "linear") -> float:
    """Predict total power for ``num_cores`` active cores from a saved model.

    Args:
        model: Model dictionary with ``model_type`` and ``linear_model``, and
            ``polynomial_model`` for the polynomial method.
        num_cores: Number of active cores.
        method: ``"linear"`` or ``"polynomial"``.

    Returns:
        ``p_core * n + offset`` for the linear method, where the offset is the
        stored idle power for ``linear_fixed_idle`` models and the fitted
        intercept for ``linear_free_fit`` models; ``a * n**2 + b * n + c``
        for the polynomial method.

    Raises:
        ValueError: For an unknown method or linear model type, or when
            polynomial coefficients are requested but not stored.
    """
    if method not in ("linear", "polynomial"):
        raise ValueError(f"Unknown prediction method: {method}")

    if method == "polynomial":
        if "polynomial_model" not in model:
            raise ValueError("Polynomial model coefficients not available in model.")
        coeffs = model["polynomial_model"]
        a = coeffs["a"]
        b = coeffs["b"]
        c = coeffs["c"]
        return a * (num_cores ** 2) + b * num_cores + c

    # Linear method: the model type decides which stored value is the offset.
    offset_key_by_type = {
        "linear_fixed_idle": "p_base",
        "linear_free_fit": "fitted_intercept",
    }
    model_type = model["model_type"]
    if model_type not in offset_key_by_type:
        raise ValueError(f"Unknown linear model type: {model['model_type']}")

    p_core = model["linear_model"]["p_core"]
    offset = model["linear_model"][offset_key_by_type[model_type]]
    return p_core * num_cores + offset


## Experiment Configuration (CONFIG)

The CONFIG dictionary defines the core setup for running a CPU power modeling experiment.  
It controls where the data comes from, how the models are built, and where results are saved.

### Configuration Fields

| Key               | Type    | Description 
|------------------|---------|-------------
| result_folder  | str   | Root path to the experiment results. This is used to locate CSV files (under energy/) and system metadata (hardware.json under config/). 
| server_name    | str   | Logical name or hostname of the test server. Used to build subpaths to results and metadata folders. 
| idle_power     | float | Idle (baseline) power consumption in watts. Used as a reference point to calculate per-core dynamic power. 
| csv_base_name  | str   | Prefix for measurement CSV filenames (e.g., measurement_run0.csv, measurement_run1.csv, ...). 
| csv_extension  | str   | File extension for the measurement files. Typically .csv. 
| map_to         | str   | Logical label for mapping measurements to a parameter (e.g., "cores" to track how many CPU cores were active). Mostly used in logging and plots. 
| model_output   | str   | Base filename for the saved CPU power model. Final output will be stored in the cpu_models/ folder as model_output_server_name.json. 
| fix_idle       | bool  | If True, forces the model to use a fixed intercept at idle_power. If False, the model fits the intercept freely based on data. 


### Additional Behavior

- If start_index and end_index are not provided, they are automatically inferred from the number of CPU cores (based on hardware.json).
- Models are saved as JSON files and include timestamps, node info, and fitted parameters.
- Filenames for data are automatically constructed using {csv_base_name}{index}{csv_extension}, where the index is zero-padded to the number of digits in end_index + 1.

In [None]:
# Collect the experiment settings (prompting for any value not passed in),
# then map each measurement CSV filename to the core count of that run.
CONFIG = build_config()
run_mapping = generate_run_mapping(CONFIG)

# Load Stress Run Data

Each CSV file represents a stress run using a known number of CPU cores.  
We compute the average power and derive the per-core contribution using:

$$
P_{\text{core}} = \frac{\bar{P}_{\text{measured}} - P_{\text{base}}}{\text{\# active cores}}
$$

In [None]:
# Measurement CSVs live under <result_folder>/energy/<server_name>.
csv_folder = get_csv_folder(CONFIG)
# One row per run: average power and the per-core share above idle power.
model_df = load_power_data(csv_folder, run_mapping, CONFIG["idle_power"])

# Model Construction

We fit two models to understand how CPU usage affects total power consumption:

- **Linear Regression**: Assumes each core adds a constant amount of power.
- **Polynomial Regression (Degree 2)**: Allows for non-linear effects like saturation or thermal throttling.

The model can be used in this generalized prediction formula:

$$
P_{\text{server}} = P_{\text{base}} + \sum_{i=1}^{n} \lambda_i \cdot P_{\text{core}, i}
$$

Where:

- $P_{\text{base}}$: Idle power consumption of the server  
- $n$: Number of active cores  
- $\lambda_i$: Load factor (0–1) of core $i$  
- $P_{\text{core}, i}$: Power used by core $i$ at full load

In [None]:
# Fit both models; this also adds prediction and error columns to model_df.
reg_linear, reg_poly, poly = fit_models(model_df, CONFIG["idle_power"], fix_idle=CONFIG["fix_idle"])

# Save Model

Export the linear model (with or without fixed idle intercept), together with the polynomial coefficients, as a reusable .json file.  
This can later be used for power estimation during experiments.

In [None]:
# Linear model: slope per active core, plus the offset used for prediction.
p_core = reg_linear.coef_[0]
intercept = CONFIG["idle_power"] if CONFIG["fix_idle"] else reg_linear.intercept_

# Polynomial model: P(n) = a * n**2 + b * n + c, with regressor coefficients
# ordered [constant, n, n**2]. The constant term is the estimator intercept
# plus the weight of the constant feature column. With a fixed idle the
# regressor was fitted on idle-subtracted power, so idle is added back.
poly_constant = reg_poly.intercept_ + reg_poly.coef_[0]
if CONFIG["fix_idle"]:
    poly_constant += CONFIG["idle_power"]
poly_coeffs = (reg_poly.coef_[2], reg_poly.coef_[1], poly_constant)

# Load CPU metadata
cpu_info = load_cpu_info(CONFIG)

# Save model including polynomial model
cpu_model = save_cpu_model(
    config=CONFIG,
    p_core=p_core,
    intercept=intercept,
    poly_coeffs=poly_coeffs,
    mapping=run_mapping,
    cpu_info=cpu_info
)

# Model Fit Visualization

We compare the measured power values to the predictions from:

- **Linear Regression**  
- **Polynomial Regression (Degree 2)**

This plot helps assess how well each model captures the real trend in power usage as more CPU cores are utilized.

We plot:

- Measured values as scatter points  
- Linear model as a red line  
- Polynomial model as a blue curve

In [None]:
# Scatter of measured averages with the linear and polynomial curves on top.
plot_model_fit(model_df, reg_linear, reg_poly, poly)

# Model Predictions

Use the model to predict total server power at different CPU load levels (e.g. 25%, 50%, 100%).  
This helps in estimating energy use under different workload intensities.

In [None]:
# Reload the model from disk so the plots reflect what was actually saved.
model = load_cpu_model(CONFIG)
# Load-level bars using the linear model (the default method).
plot_predicted_load_levels(cpu_model=model)
# Same load levels using the polynomial model.
plot_predicted_load_levels(cpu_model=model, method='polynomial')
# One bar per number of active cores, linear model.
plot_predicted_load_levels(cpu_model=model, per_core_steps=True)

## Prediction Accuracy: Linear vs. Polynomial

Below is a comparison table showing the measured average power of each run (avg_power), the predicted values from both models, and their corresponding percentage errors.
This helps evaluate how well each model fits the data.


In [None]:
# Render the per-run table with errors shown as percentages.
display_model_errors(model_df)