In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Raise the per-column character limit so the long sentence-style strings shown
# in the DataFrames below are not cut off with an ellipsis when rendered.
pd.options.display.max_colwidth = 800

### Review the workload context and usage assumptions provided in the notebook

In [None]:
# ---------------------------------------------------------
# Review the workload context and usage assumptions provided in the notebook.
# ---------------------------------------------------------
# TODO:
# 1) Create a dictionary called `workload_context` with the following keys:
#    - workload_name
#    - traffic_pattern
#    - requests_per_day
#    - avg_input_tokens
#    - avg_output_tokens
#    - sla_notes
#    - kg_co2e_per_kwh
# 2) Display the dictionary as a one-row DataFrame.

# workload_context = {
#     ...
# }

# display(pd.DataFrame([workload_context]))


### Load the baseline inference metrics for the simulated generative AI workload

In [None]:
# ---------------------------------------------------------
# Instruction Step 3
# Load the baseline inference metrics for the simulated workload.
# ---------------------------------------------------------
# Relative path: it is resolved against the kernel's current working directory.
# NOTE(review): presumably the CSV ships next to this notebook -- confirm.
baseline_path = Path("baseline_inference_metrics.csv")

# TODO:
# 1) Add a simple check that raises a helpful error if the CSV is not found.
# 2) Read the CSV into a DataFrame called `baseline_df`.
#    (The analysis cell that follows reads its `metric` and `baseline_value` columns.)
# 3) Display `baseline_df`.

# if not baseline_path.exists():
#     raise FileNotFoundError("...")

# baseline_df = pd.read_csv(baseline_path)
# display(baseline_df)

### Analyze the baseline metrics to identify potential efficiency concerns

In [None]:
# Metric-name -> baseline-value lookup; later cells index into this dict.
baseline = dict(zip(baseline_df["metric"], baseline_df["baseline_value"]))

# One plain-language observation per metric that is present in the table.
# A metric missing from the table is skipped silently.
baseline_findings = []

# Tail latency: values above 1200 ms are flagged as a possible SLA concern.
p95 = baseline.get("p95_latency_ms")
if p95 is not None:
    baseline_findings.append(
        f"p95 latency is {p95:.0f} ms, which may be high for an interactive SLA."
        if p95 > 1200
        else f"p95 latency is {p95:.0f} ms, within the stated interactive target range."
    )

# Memory: three bands, checked from the highest threshold (24 GB) down to 16 GB.
mem = baseline.get("average_memory_gb")
if mem is not None:
    if mem >= 24:
        mem_note = f"Average memory is {mem:.1f} GB, which may force larger (more expensive) GPUs."
    elif mem >= 16:
        mem_note = f"Average memory is {mem:.1f} GB; watch GPU sizing and headroom for spikes."
    else:
        mem_note = f"Average memory is {mem:.1f} GB, leaving reasonable headroom for common GPU tiers."
    baseline_findings.append(mem_note)

# Cost and energy are reported as-is; no threshold is applied to either.
cost = baseline.get("cost_per_1k_requests_usd")
energy = baseline.get("estimated_energy_kwh_per_1k_requests")

if cost is not None:
    baseline_findings.append(
        f"Estimated cost is ${cost:.2f} per 1k requests. Scale impact depends on daily volume."
    )

if energy is not None:
    baseline_findings.append(
        f"Estimated energy is {energy:.2f} kWh per 1k requests. Consider energy as a constraint at scale."
    )

# Last expression of the cell: render the findings as a one-column table.
pd.DataFrame({"baseline_findings": baseline_findings})

### Review the simulated optimization scenario and its post-optimization metrics

#### Optimization Scenario 

For this exercise, assume the following optimization has already been applied to the generative AI inference system.

##### Optimization Applied
**INT8 quantization with dynamic batching**

##### Description
- Reduced-precision (INT8) inference is used to lower memory usage and improve throughput.
- Dynamic batching groups multiple requests together to increase hardware utilization under steady traffic.

##### Expected Benefits
- Lower GPU memory footprint
- Higher sustained throughput
- Reduced cost per request
- Reduced energy consumption per request

##### Known Tradeoffs
- Median latency may increase slightly due to batching overhead
- Output quality impact is expected to be minor but not zero
- Occasional formatting drift or subtle tone changes may occur for longer outputs

##### Your Task
You are **not** asked to design or implement this optimization.

Your task is to **evaluate whether this optimization is acceptable** for the workload using the provided performance, cost, and energy metrics.


In [None]:
# Description of the already-applied optimization. The `notes` list is displayed
# by this cell; `quality_impact_note` is read again in the tradeoffs cell.
optimization_scenario = dict(
    name="INT8 quantization + dynamic batching",
    notes=[
        "Reduced precision inference lowers memory and can improve throughput.",
        "Batching can increase throughput but may increase median latency under certain traffic patterns.",
        "Quality impact is assumed to be minimal but not zero (monitor formatting + factuality).",
    ],
    quality_impact_note="Minor: occasional slight tone shift and rare formatting drift under long outputs.",
)

# Last expression of the cell: show the notes as a one-column table, one note per row.
pd.DataFrame(optimization_scenario["notes"], columns=["scenario_notes"])

### Construct a comparison table that summarizes baseline and optimized metrics side by side

In [None]:
# ---------------------------------------------------------
# Construct a comparison table (baseline vs optimized).
# ---------------------------------------------------------
# TODO:
# 1) Create a dictionary called `optimized_metrics` with the SAME metric names as baseline_df["metric"].
#    The values should represent the optimized scenario.
# 2) Convert it into a DataFrame called `optimized_df` with columns:
#    - metric
#    - optimized_value
# 3) Merge baseline_df and optimized_df into `comparison`.
# 4) Add:
#    - delta (optimized - baseline)
#    - pct_change ((delta / baseline) * 100)

# optimized_metrics = {
#     ...
# }
# optimized_df = pd.DataFrame({
#     "metric": ...,
#     "optimized_value": ...
# })
# comparison = baseline_df.merge(optimized_df, on="metric", how="left")
# comparison["delta"] = ...
# comparison["pct_change"] = ...
# display(comparison)


### Calculate and summarize the differences between baseline and optimized metrics

In [None]:
def _scale_to_daily(value_per_1k, daily_requests):
    """Convert a per-1,000-requests figure into a daily total for `daily_requests`."""
    return (value_per_1k * daily_requests) / 1000


requests_per_day = workload_context["requests_per_day"]

# Daily cost (USD) for the baseline and optimized configurations.
baseline_cost_per_day = _scale_to_daily(baseline["cost_per_1k_requests_usd"], requests_per_day)
optimized_cost_per_day = _scale_to_daily(optimized_metrics["cost_per_1k_requests_usd"], requests_per_day)

# Daily energy (kWh) for the baseline and optimized configurations.
baseline_energy_per_day = _scale_to_daily(
    baseline["estimated_energy_kwh_per_1k_requests"], requests_per_day
)
optimized_energy_per_day = _scale_to_daily(
    optimized_metrics["estimated_energy_kwh_per_1k_requests"], requests_per_day
)

# CO2e is optional: it is only estimated when the workload context supplies an
# emissions factor; otherwise both daily figures stay None.
kg_co2e_per_kwh = workload_context.get("kg_co2e_per_kwh")
if kg_co2e_per_kwh is None:
    baseline_co2e_kg_per_day = None
    optimized_co2e_kg_per_day = None
else:
    baseline_co2e_kg_per_day = baseline_energy_per_day * kg_co2e_per_kwh
    optimized_co2e_kg_per_day = optimized_energy_per_day * kg_co2e_per_kwh

# Savings are baseline minus optimized, so a positive number means the
# optimized configuration uses less.
scale_summary = {
    "requests_per_day": requests_per_day,
    "baseline_cost_per_day_usd": baseline_cost_per_day,
    "optimized_cost_per_day_usd": optimized_cost_per_day,
    "daily_cost_savings_usd": baseline_cost_per_day - optimized_cost_per_day,
    "baseline_energy_kwh_per_day": baseline_energy_per_day,
    "optimized_energy_kwh_per_day": optimized_energy_per_day,
    "daily_energy_savings_kwh": baseline_energy_per_day - optimized_energy_per_day,
}

# Append the CO2e rows only when they were actually computed.
if baseline_co2e_kg_per_day is not None:
    scale_summary["assumed_kg_co2e_per_kwh"] = kg_co2e_per_kwh
    scale_summary["baseline_co2e_kg_per_day"] = baseline_co2e_kg_per_day
    scale_summary["optimized_co2e_kg_per_day"] = optimized_co2e_kg_per_day
    scale_summary["daily_co2e_savings_kg"] = baseline_co2e_kg_per_day - optimized_co2e_kg_per_day

# Last expression of the cell: transpose the one-row frame so each metric is a
# row, and label the single column "value".
(
    pd.DataFrame([scale_summary])
    .T
    .rename(columns={0: "value"})
)

### Evaluate the tradeoffs introduced by the optimization

In [None]:
# One record per tradeoff area: what changed (baseline -> optimized) and how to read it.
# NOTE(review): the "interpretation" strings are fixed text and are not derived
# from the numbers -- confirm they still match the values in `optimized_metrics`.
tradeoffs = [
    # Latency: median (p50) and tail (p95) reported together.
    {
        "area": "Latency",
        "what_changed": f"p50: {baseline['p50_latency_ms']} -> {optimized_metrics['p50_latency_ms']} ms, "
                        f"p95: {baseline['p95_latency_ms']} -> {optimized_metrics['p95_latency_ms']} ms",
        "interpretation": "Median latency slightly increased (batching overhead), but tail latency improved.",
    },
    # Throughput in requests per second.
    {
        "area": "Throughput",
        "what_changed": f"{baseline['throughput_requests_per_sec']} -> {optimized_metrics['throughput_requests_per_sec']} req/s",
        "interpretation": "Higher throughput reduces queueing risk and can stabilize tail latency under load.",
    },
    # Average memory footprint in GB.
    {
        "area": "Memory",
        "what_changed": f"{baseline['average_memory_gb']} -> {optimized_metrics['average_memory_gb']} GB",
        "interpretation": "Lower memory enables smaller instances or more headroom, improving cost flexibility.",
    },
    # Output quality: taken from the scenario description, not from a measured metric.
    {
        "area": "Output Quality",
        "what_changed": optimization_scenario["quality_impact_note"],
        "interpretation": "Treat as a monitoring requirement. Add regression tests for tone, formatting, and factuality.",
    },
]

# Last expression of the cell: render the tradeoffs as a table.
pd.DataFrame(tradeoffs)

### Short analysis and clear recommendation 

In [None]:
# ---------------------------------------------------------
# Write a short analysis summarizing wins, risks, and acceptability.
# ---------------------------------------------------------
# TODO:
# Write a short markdown-style summary (as a Python multi-line string)
# that includes:
# - the biggest improvements (cost, energy, throughput, etc.)
# - the biggest tradeoffs (latency, quality, flexibility)
# - whether the optimization is acceptable for this workload, and why
#
# Tip: Use f-strings and the baseline/optimized values.

# analysis = f"""
# ...
# """.strip()
# print(analysis)

In [None]:
# ---------------------------------------------------------
# Conclude with a clear recommendation supported by the data.
# ---------------------------------------------------------
# TODO:
# Create a dictionary called `recommendation` with:
# - decision: one of ["Adopt", "Adopt with guardrails", "Do not adopt"]
# - guardrails: list of strings (if applicable)
# - why: short justification tied to your results
#
# Display it as a one-row DataFrame.

# recommendation = {
#     "decision": "...",
#     "guardrails": ["...", "..."],
#     "why": "..."
# }
# display(pd.DataFrame({
#     "decision": [recommendation["decision"]],
#     "why": [recommendation["why"]],
#     "guardrails": ["; ".join(recommendation.get("guardrails", []))]
# }))
