In [4]:
import csv
import math
import re
from pathlib import Path
from collections import defaultdict
import numpy as np
import time

# Root directory of the "spec" suite data; the driver loop looks for one
# sub-directory per app name underneath it.
SPEC_ROOT_DIR = Path("/home/ac.zzheng/power/GPGPU/data/H100/spec_power_motif")
# App sub-directories processed for the "spec" suite.
SPEC_APPS = ['lbm', 'cloverleaf', 'tealeaf', 'minisweep', 'pot3d', 'hpgmg']
# Power caps kept for the "spec" suite; runs at any other cap are filtered out.
SPEC_FOCUS_POWER_CAPS = [800, 900, 1000, 1200, 1400, 1600, 2000]

# Root directory of the "ml" suite data (same one-sub-directory-per-app layout).
ML_ROOT_DIR = Path("/home/ac.zzheng/power/GPGPU/data/H100/ml_power_motif")
# Set to None to auto-discover app folders under ML_ROOT_DIR.
ML_APPS = None
# Power caps kept for the "ml" suite; runs at any other cap are filtered out.
ML_FOCUS_POWER_CAPS = [400, 500, 600, 700, 800, 900, 1000, 1200, 1400, 1600, 1800]

# One entry per suite, consumed by the driver loop at the bottom of the cell:
#   suite       - label used in log output and matched against SINGLE_CASE
#   root_dir    - directory containing the per-app sub-directories
#   apps        - explicit app list, or None to use every sub-directory
#   focus_caps  - power caps to keep
#   target_csv  - file name (inside each app directory) holding the targets
#   target_kind - "runtime" (lower target ranks better) or "throughput"
#                 (higher target ranks better); also selects the CSV schema
WORKLOADS = [
    {
        "suite": "spec",
        "root_dir": SPEC_ROOT_DIR,
        "apps": SPEC_APPS,
        "focus_caps": SPEC_FOCUS_POWER_CAPS,
        "target_csv": "runtime.csv",
        "target_kind": "runtime",
    },
    {
        "suite": "ml",
        "root_dir": ML_ROOT_DIR,
        "apps": ML_APPS,
        "focus_caps": ML_FOCUS_POWER_CAPS,
        "target_csv": "throughput.csv",
        "target_kind": "throughput",
    },
]

# Run only one case when set; set to None to run all.
# SINGLE_CASE = {"suite": "ml", "app": "bert", "cap": 800}
SINGLE_CASE = None

# A GPU counts as "active" when its mean `GPU{n}_Power (W)` over the profiling
# window is at least ACTIVE_POWER_TH, or its mean `GPU{n}_SM_Clock (MHz)` is at
# least ACTIVE_SM_TH.
ACTIVE_POWER_TH = 120.0
ACTIVE_SM_TH = 400.0
# Length of the profiling window, measured from the earliest retained
# `Time (s)` value; only samples inside it feed the features.
PROFILE_SECONDS = 5.0

# Metric file names look like `<cap>_<gpus>_gpu_metrics.csv`. The pattern is
# applied with `.match`, so the name must begin with the cap digits.
FILE_RE = re.compile(r"(?P<cap>\d+)_(?P<gpus>\d+)_gpu_metrics\.csv$")


def f(x):
    """Parse *x* as a float, returning ``None`` when it is not numeric.

    The value is converted with ``str`` and stripped of surrounding
    whitespace first, so numbers, numeric strings and padded CSV cells are
    all accepted. ``None``, empty strings and non-numeric text yield ``None``.

    Only conversion failures are swallowed; unrelated exceptions such as
    ``KeyboardInterrupt`` propagate instead of being silently turned into
    ``None``.
    """
    try:
        return float(str(x).strip())
    except (TypeError, ValueError):
        return None


def mean(xs):
    """Return the arithmetic mean of *xs*, or NaN when *xs* is empty."""
    if not xs:
        return float("nan")
    total = sum(xs)
    return total / len(xs)


def sigmoid(z):
    """Logistic function of *z* (scalar or array-like).

    The input is clipped to [-40, 40] before exponentiation.
    """
    clipped = np.clip(z, -40, 40)
    denominator = 1.0 + np.exp(-clipped)
    return 1.0 / denominator


def read_target_lookup(path, target_kind):
    """Load ground-truth targets keyed by ``(power_cap, gpu_count)``.

    Args:
        path: ``pathlib.Path`` of the target CSV. A missing file yields an
            empty lookup.
        target_kind: ``"runtime"`` reads ``power_cap`` / ``gpu_count`` /
            ``runtime_seconds``; ``"throughput"`` reads ``total_gpu_cap`` /
            ``gpu_count`` and the first present of
            ``throughput_images_per_sec`` / ``throughput_tokens_per_sec``.
            Any other value yields an empty lookup.

    Returns:
        Dict mapping ``(int cap, int gpu_count)`` to the float target value.
        It is empty when a required column is missing. Later rows overwrite
        earlier rows that share a key.

    For throughput files that have a ``status`` column, only rows whose
    status is ``ok`` (case-insensitive, whitespace-trimmed) are used. Rows
    whose cap or GPU count is unparsable or non-finite (``nan`` / ``inf``)
    are skipped rather than aborting the load.
    """
    lookup = {}
    if not path.exists():
        return lookup

    with path.open(newline="") as fh:
        reader = csv.DictReader(fh)
        cols = set(reader.fieldnames or [])

        # Resolve the per-kind column names once so one loop serves both kinds.
        if target_kind == "runtime":
            cap_col = "power_cap"
            value_col = "runtime_seconds"
        elif target_kind == "throughput":
            cap_col = "total_gpu_cap"
            value_col = next(
                (
                    c
                    for c in ("throughput_images_per_sec", "throughput_tokens_per_sec")
                    if c in cols
                ),
                None,
            )
        else:
            return lookup

        if value_col is None or not {cap_col, "gpu_count", value_col}.issubset(cols):
            return lookup

        # The status filter only ever applied to throughput files.
        check_status = target_kind == "throughput" and "status" in cols

        for row in reader:
            if check_status and str(row.get("status", "")).strip().lower() != "ok":
                continue
            cap = f(row.get(cap_col))
            gpus = f(row.get("gpu_count"))
            value = f(row.get(value_col))
            if cap is None or gpus is None or value is None:
                continue
            # int(round(nan)) / int(round(inf)) raise; such a row cannot form
            # a valid key, so drop it instead of failing the whole file.
            if not (math.isfinite(cap) and math.isfinite(gpus)):
                continue
            lookup[(int(round(cap)), int(round(gpus)))] = value
    return lookup


def analyze_metric_file(path, target_lookup, target_kind):
    """Summarise one ``<cap>_<gpus>_gpu_metrics.csv`` trace into a feature row.

    The power cap and GPU count are taken from the file name. When the file
    has a ``GPU0_DRAM_Active`` column, rows whose value there is unparsable
    or exactly zero are dropped. Per-GPU means of power, SM clock and DRAM
    activity are computed over the samples whose ``Time (s)`` lies within
    ``PROFILE_SECONDS`` of the earliest remaining timestamp, then averaged
    over the active GPUs (those meeting ``ACTIVE_POWER_TH`` or
    ``ACTIVE_SM_TH``; every GPU if none does).

    Args:
        path: ``pathlib.Path`` of the metrics CSV.
        target_lookup: dict mapping ``(cap, gpu_count)`` to the target value.
        target_kind: ``"runtime"`` or ``"throughput"``. For ``"runtime"`` a
            missing lookup entry falls back to the time span of the retained
            samples; for anything else a missing entry makes the file skip.

    Returns:
        A dict with the cap, GPU count, target, target kind, the three
        active-GPU averages and the same averages multiplied by the GPU
        count; or ``None`` when the file name does not match, the file has no
        usable rows / ``Time (s)`` values, or the target is unavailable.
    """
    name_match = FILE_RE.match(path.name)
    if name_match is None:
        return None
    cap = int(name_match.group("cap"))
    gcount = int(name_match.group("gpus"))

    with path.open(newline="") as fh:
        reader = csv.DictReader(fh)
        samples = list(reader)
        cols = reader.fieldnames or []

    if "Time (s)" not in cols or not samples:
        return None

    def parsed(row, column):
        # Float value of one cell, or None when it cannot be parsed.
        return f(row.get(column))

    def numeric_column(source_rows, column):
        # All parsable values of `column`, in row order.
        cells = (parsed(row, column) for row in source_rows)
        return [value for value in cells if value is not None]

    # Drop idle points where GPU0 DRAM activity is zero (or unparsable).
    if "GPU0_DRAM_Active" in cols:
        def is_busy(row):
            dram = parsed(row, "GPU0_DRAM_Active")
            return dram is not None and dram != 0.0

        samples = [row for row in samples if is_busy(row)]

    if not samples:
        return None

    # GPU indices are recovered from column names of the form GPU<id>_<metric>.
    gpu_ids = sorted({
        int(col.partition("_")[0].replace("GPU", ""))
        for col in cols
        if col.startswith("GPU") and "_" in col
    })

    times = numeric_column(samples, "Time (s)")
    if not times:
        return None

    # Restrict feature extraction to the first PROFILE_SECONDS of the trace
    # (online-like profiling).
    earliest = min(times)
    window_end = earliest + PROFILE_SECONDS

    def in_window(row):
        stamp = parsed(row, "Time (s)")
        return stamp is not None and stamp <= window_end

    profile_rows = [row for row in samples if in_window(row)] or samples

    # Per-GPU means over the profiling window only.
    p_avg = {gid: mean(numeric_column(profile_rows, f"GPU{gid}_Power (W)")) for gid in gpu_ids}
    sm_avg = {gid: mean(numeric_column(profile_rows, f"GPU{gid}_SM_Clock (MHz)")) for gid in gpu_ids}
    dr_avg = {gid: mean(numeric_column(profile_rows, f"GPU{gid}_DRAM_Active")) for gid in gpu_ids}

    def reaches(value, threshold):
        return not math.isnan(value) and value >= threshold

    active = [
        gid
        for gid in gpu_ids
        if reaches(p_avg[gid], ACTIVE_POWER_TH) or reaches(sm_avg[gid], ACTIVE_SM_TH)
    ] or list(gpu_ids)

    target_val = target_lookup.get((cap, gcount))
    if target_val is None:
        if target_kind != "runtime":
            # For ML throughput, skip runs without throughput CSV entry.
            return None
        target_val = max(times) - earliest

    avg_power = mean([p_avg[gid] for gid in active])
    avg_sm = mean([sm_avg[gid] for gid in active])
    avg_dram = mean([dr_avg[gid] for gid in active])

    summary = {
        "power_cap": cap,
        "gpu_count": gcount,
        "target": target_val,
        "target_kind": target_kind,
        "avg_power": avg_power,
        "avg_sm": avg_sm,
        "avg_dram": avg_dram,
    }
    # Scaled variants: each active-GPU average multiplied by the GPU count
    # parsed from the file name.
    summary["avg_power_x_gpu"] = avg_power * gcount
    summary["avg_sm_x_gpu"] = avg_sm * gcount
    summary["avg_dram_x_gpu"] = avg_dram * gcount
    return summary


def load_runs(app_dir, target_csv, target_kind):
    """Analyse every metrics file in *app_dir* and return the feature rows.

    Directory entries are visited in sorted order; only names matching
    ``FILE_RE`` are analysed, and files for which the analysis returns
    ``None`` are dropped. The result is ordered by ``(power_cap, gpu_count)``.
    """
    lookup = read_target_lookup(target_csv, target_kind)
    metric_files = (
        entry for entry in sorted(app_dir.iterdir()) if FILE_RE.match(entry.name)
    )
    analysed = (
        analyze_metric_file(entry, lookup, target_kind) for entry in metric_files
    )
    return sorted(
        (run for run in analysed if run is not None),
        key=lambda run: (run["power_cap"], run["gpu_count"]),
    )


def filter_runs_by_caps(rows, focus_caps=None):
    """Keep only the runs whose ``power_cap`` is listed in *focus_caps*.

    When *focus_caps* is ``None`` the input list is returned unchanged (the
    same object); otherwise a new list is returned. Caps on both sides are
    compared after conversion with ``int``.
    """
    if focus_caps is None:
        return rows
    wanted = set(map(int, focus_caps))
    kept = []
    for run in rows:
        if int(run["power_cap"]) in wanted:
            kept.append(run)
    return kept


def fit_and_rank_single_cap(cap_rows, higher_is_better=False, lr=0.08, epochs=5000, l2=1e-3):
    """Rank the GPU counts of one power cap with a pairwise logistic model.

    Every unordered pair of runs contributes two training samples: the
    feature difference ``feats[i] - feats[j]`` labelled 1 when run ``i`` has
    the better target, and the negated difference with the complementary
    label. Differences are standardised per feature and a logistic
    regression is fitted by full-batch gradient descent. Each run is then
    scored by summing its predicted win probabilities against every other
    run.

    Args:
        cap_rows: feature dicts for a single cap, each providing
            ``gpu_count``, ``target``, ``avg_power_x_gpu``, ``avg_sm_x_gpu``
            and ``avg_dram_x_gpu``.
        higher_is_better: ``True`` when a larger target is better
            (throughput), ``False`` when a smaller one is (runtime).
        lr: gradient-descent step size.
        epochs: number of gradient-descent iterations.
        l2: L2 penalty applied to the weights (not to the bias).

    Returns:
        ``(pred_rank, true_rank)``: GPU counts ordered best-first by model
        score and by the actual target respectively.

    With fewer than two runs there is no pair to learn from, so both
    rankings are simply the GPU counts that are present (possibly none).
    Previously this case raised ``TypeError``, because the statistics of an
    empty pair matrix are scalars and cannot take the ``sd[sd == 0]``
    assignment.

    NOTE(review): when two runs have equal targets the pair is labelled as
    if the second run were better; confirm whether ties need special
    handling.
    """
    cap_rows = sorted(cap_rows, key=lambda r: r["gpu_count"])

    if higher_is_better:
        true_rank = [r["gpu_count"] for r in sorted(cap_rows, key=lambda x: x["target"], reverse=True)]
    else:
        true_rank = [r["gpu_count"] for r in sorted(cap_rows, key=lambda x: x["target"])]

    n_runs = len(cap_rows)
    if n_runs < 2:
        # Nothing to compare: the ranking is trivially the runs we have.
        return [r["gpu_count"] for r in cap_rows], true_rank

    feats = [np.array([
        r["gpu_count"],
        r["avg_power_x_gpu"],
        r["avg_sm_x_gpu"],
        r["avg_dram_x_gpu"],
    ], dtype=float) for r in cap_rows]

    # Build the symmetric pairwise training set.
    X, y = [], []
    for i in range(n_runs):
        for j in range(i + 1, n_runs):
            diff = feats[i] - feats[j]
            if higher_is_better:
                better = cap_rows[i]["target"] > cap_rows[j]["target"]
            else:
                better = cap_rows[i]["target"] < cap_rows[j]["target"]
            label = 1.0 if better else 0.0
            X.append(diff)
            y.append(label)
            X.append(-diff)
            y.append(1.0 - label)

    X = np.array(X, dtype=float)
    y = np.array(y, dtype=float)

    # Standardise each feature difference; constant columns keep unit scale
    # to avoid dividing by zero.
    mu = X.mean(axis=0)
    sd = X.std(axis=0)
    sd[sd == 0] = 1.0
    Z = (X - mu) / sd

    w = np.zeros(Z.shape[1], dtype=float)
    b = 0.0
    n = len(y)

    for _ in range(epochs):
        p = sigmoid(Z @ w + b)
        grad_w = (Z.T @ (p - y)) / n + l2 * w
        grad_b = np.mean(p - y)
        w -= lr * grad_w
        b -= lr * grad_b

    # Score each run by its summed win probability against all the others.
    scores = np.zeros(n_runs, dtype=float)
    for i in range(n_runs):
        for j in range(n_runs):
            if i == j:
                continue
            diff = (feats[i] - feats[j] - mu) / sd
            scores[i] += sigmoid(diff @ w + b)

    order = np.argsort(-scores)
    pred_rank = [cap_rows[k]["gpu_count"] for k in order]
    return pred_rank, true_rank


# Driver: for every configured suite/app, load the runs, rank the GPU counts
# independently per power cap, print a per-cap summary and write two CSVs
# into the app's own directory.
# NOTE(review): summary_rows is never appended to in this cell.
summary_rows = []

for workload in WORKLOADS:
    suite = workload["suite"]
    # With SINGLE_CASE set, only the matching suite is processed.
    if SINGLE_CASE is not None and suite != str(SINGLE_CASE.get("suite", "")):
        continue
    root_dir = workload["root_dir"]
    focus_caps = workload["focus_caps"]
    target_csv_name = workload["target_csv"]
    target_kind = workload["target_kind"]
    # Throughput targets rank high-to-low; runtime targets rank low-to-high.
    higher_is_better = (target_kind == "throughput")

    # apps=None means: use every sub-directory of root_dir (none if it is missing).
    if workload["apps"] is None:
        apps = sorted([p.name for p in root_dir.iterdir() if p.is_dir()]) if root_dir.exists() else []
    else:
        apps = workload["apps"]

    if SINGLE_CASE is not None:
        apps = [a for a in apps if a == str(SINGLE_CASE.get("app", ""))]

    print(f"\n######## Suite: {suite} ########")

    for app in apps:
        app_dir = root_dir / app
        target_csv = app_dir / target_csv_name

        if not app_dir.exists():
            print(f"[skip] app dir not found: {app_dir}")
            continue

        runs = load_runs(app_dir, target_csv, target_kind)
        cap_filter = focus_caps
        if SINGLE_CASE is not None:
            # NOTE(review): int(None) raises TypeError if SINGLE_CASE has no "cap" key.
            cap_filter = [int(SINGLE_CASE.get("cap"))]
        runs = filter_runs_by_caps(runs, cap_filter)

        if not runs:
            print(f"[skip] no runs after filter for suite={suite}, app={app}")
            continue

        # Group the runs by power cap so each cap is ranked on its own.
        by_cap = defaultdict(list)
        for r in runs:
            by_cap[r["power_cap"]].append(r)

        caps = sorted(by_cap.keys())
        print(f"\n=== {suite}/{app} ===")
        print(f"Focused power caps: {caps}")

        results = []

        # NOTE(review): this helper is redefined on every app iteration; it
        # does not depend on loop state.
        def improvement_pct(selected, baseline, higher_is_better):
            """Percent improvement of *selected* over *baseline*.

            Positive means the selected value is better in the direction given
            by *higher_is_better*. Returns NaN when the baseline is ``None``
            or not strictly positive.
            """
            if baseline is None or baseline <= 0:
                return float("nan")
            if higher_is_better:
                return (selected - baseline) / baseline * 100.0
            return (baseline - selected) / baseline * 100.0

        print("Independent per-cap ranking:")
        for cap in caps:
            pred_rank, true_rank = fit_and_rank_single_cap(by_cap[cap], higher_is_better=higher_is_better)

            cap_rows = list(by_cap[cap])
            # If a GPU count appears more than once for a cap, the last run wins here.
            target_by_gpu = {r["gpu_count"]: r["target"] for r in cap_rows}
            # The model's top-ranked GPU count is the "selected" configuration.
            selected_gpu = pred_rank[0]
            selected_val = target_by_gpu[selected_gpu]
            # Baseline: the run that used the largest GPU count present for this cap.
            max_gpu = max(target_by_gpu.keys())
            max_gpu_val = target_by_gpu[max_gpu]

            # Fixed baselines for 1-4 GPUs; None when that GPU count has no run.
            g1 = target_by_gpu.get(1)
            g2 = target_by_gpu.get(2)
            g3 = target_by_gpu.get(3)
            g4 = target_by_gpu.get(4)

            improve_vs_max_gpu_pct = improvement_pct(selected_val, max_gpu_val, higher_is_better)
            improve_vs_1gpu_pct = improvement_pct(selected_val, g1, higher_is_better)
            improve_vs_2gpu_pct = improvement_pct(selected_val, g2, higher_is_better)
            improve_vs_3gpu_pct = improvement_pct(selected_val, g3, higher_is_better)
            improve_vs_4gpu_pct = improvement_pct(selected_val, g4, higher_is_better)

            results.append({
                "power_cap": cap,
                "pred_rank": pred_rank,
                "true_rank": true_rank,
                "pred_optimal_gpu": pred_rank[0],
                "true_optimal_gpu": true_rank[0],
                "selected_target": selected_val,
                "max_gpu": max_gpu,
                "max_gpu_target": max_gpu_val,
                "target_1gpu": g1,
                "target_2gpu": g2,
                "target_3gpu": g3,
                "target_4gpu": g4,
                "improve_vs_max_gpu_pct": improve_vs_max_gpu_pct,
                "improve_vs_1gpu_pct": improve_vs_1gpu_pct,
                "improve_vs_2gpu_pct": improve_vs_2gpu_pct,
                "improve_vs_3gpu_pct": improve_vs_3gpu_pct,
                "improve_vs_4gpu_pct": improve_vs_4gpu_pct,
            })
            print(
                f"  cap {cap}: pred={pred_rank} | true={true_rank} "
                f"| vs_max_gpu={improve_vs_max_gpu_pct:.2f}% "
                f"| vs_1gpu={improve_vs_1gpu_pct:.2f}% "
                f"| vs_2gpu={improve_vs_2gpu_pct:.2f}% "
                f"| vs_3gpu={improve_vs_3gpu_pct:.2f}% "
                f"| vs_4gpu={improve_vs_4gpu_pct:.2f}%"
            )


        # Per-run feature table, written next to the input data and
        # overwritten on every execution.
        out_feat_csv = app_dir / f"{app}_rank_features_scaled.csv"
        with out_feat_csv.open("w", newline="") as fh:
            wcsv = csv.writer(fh)
            wcsv.writerow([
                "power_cap", "gpu_count", "target", "target_kind",
                "avg_power_active", "avg_sm_active", "avg_dram_active",
                "avg_power_x_gpu", "avg_sm_x_gpu", "avg_dram_x_gpu"
            ])
            for r in sorted(runs, key=lambda x: (x["power_cap"], x["gpu_count"])):
                wcsv.writerow([
                    r["power_cap"], r["gpu_count"], r["target"], r["target_kind"],
                    r["avg_power"], r["avg_sm"], r["avg_dram"],
                    r["avg_power_x_gpu"], r["avg_sm_x_gpu"], r["avg_dram_x_gpu"]
                ])

        # Per-cap ranking table; the rank lists are stored as their str() form.
        out_rank_csv = app_dir / f"{app}_independent_per_cap_rankings.csv"
        with out_rank_csv.open("w", newline="") as fh:
            wcsv = csv.writer(fh)
            wcsv.writerow([
                "power_cap", "pred_rank", "true_rank", "pred_optimal_gpu", "true_optimal_gpu",
                "selected_target", "max_gpu", "max_gpu_target",
                "target_1gpu", "target_2gpu", "target_3gpu", "target_4gpu",
                "improve_vs_max_gpu_pct", "improve_vs_1gpu_pct", "improve_vs_2gpu_pct", "improve_vs_3gpu_pct", "improve_vs_4gpu_pct"
            ])
            for row in sorted(results, key=lambda x: x["power_cap"]):
                wcsv.writerow([
                    row["power_cap"], str(row["pred_rank"]), str(row["true_rank"]),
                    row["pred_optimal_gpu"], row["true_optimal_gpu"],
                    row["selected_target"], row["max_gpu"], row["max_gpu_target"],
                    row["target_1gpu"], row["target_2gpu"], row["target_3gpu"], row["target_4gpu"],
                    row["improve_vs_max_gpu_pct"], row["improve_vs_1gpu_pct"], row["improve_vs_2gpu_pct"], row["improve_vs_3gpu_pct"], row["improve_vs_4gpu_pct"]
                ])





######## Suite: spec ########

=== spec/lbm ===
Focused power caps: [800, 900, 1000, 1200, 1400, 1600, 2000]
Independent per-cap ranking:
  cap 800: pred=[4, 3, 2, 1] | true=[4, 3, 2, 1] | vs_max_gpu=0.00% | vs_1gpu=60.46% | vs_2gpu=25.35% | vs_3gpu=2.56% | vs_4gpu=0.00%
  cap 900: pred=[4, 3, 2, 1] | true=[4, 3, 2, 1] | vs_max_gpu=0.00% | vs_1gpu=61.94% | vs_2gpu=29.35% | vs_3gpu=3.12% | vs_4gpu=0.00%
  cap 1000: pred=[4, 3, 2, 1] | true=[4, 3, 2, 1] | vs_max_gpu=0.00% | vs_1gpu=64.28% | vs_2gpu=33.51% | vs_3gpu=8.31% | vs_4gpu=0.00%
  cap 1200: pred=[4, 3, 2, 1] | true=[4, 3, 2, 1] | vs_max_gpu=0.00% | vs_1gpu=66.61% | vs_2gpu=37.91% | vs_3gpu=17.18% | vs_4gpu=0.00%
  cap 1400: pred=[4, 3, 2, 1] | true=[4, 3, 2, 1] | vs_max_gpu=0.00% | vs_1gpu=66.70% | vs_2gpu=37.16% | vs_3gpu=13.55% | vs_4gpu=0.00%
  cap 1600: pred=[4, 3, 2, 1] | true=[4, 3, 2, 1] | vs_max_gpu=0.00% | vs_1gpu=66.44% | vs_2gpu=37.30% | vs_3gpu=13.57% | vs_4gpu=0.00%
  cap 2000: pred=[4, 3, 2, 1] | true=[4, 3, 2, 1] 