## Adjusting CAP6 baselines by interpolating to 5-year steps

In [35]:
"""
Interpolate the existing CAP6 SSP_baselines.csv (decadal points) to 5-year steps.

Input format (as given):
year,2020,2030,2040,2050,2060,2070,2080,2090,2100
SSP1,39.804013,34.734424,26.509183,17.963539,10.527979,4.476328,0.0,0.0,0.0
...

Output format:
year,2020,2025,2030,2035,...,2095,2100
SSP1,...
...

Notes
-----
- Pure linear interpolation between the given decadal points.
- Leaves the original values unchanged at the decade years.
- Units are preserved (GtCO2/yr).
- Optional `CLIP_NONNEGATIVE` to clip any negative interpolated values to 0.
"""

import numpy as np
import pandas as pd
from typing import List

In [36]:
# File locations: the decadal CAP6 baseline that is read, and the
# 5-year-grid file that is written.
INPUT_PATH = "/user/tlm2160/cap6_tm/data/SSP_baselines.csv"
OUTPUT_PATH = "/user/tlm2160/cap6_tm/data/SSP_baselines_5y.csv"

# Target grid definition; consumed as range(START_YEAR, END_YEAR + 1, STEP).
START_YEAR, END_YEAR, STEP = 2020, 2100, 5

# Switch for flooring interpolated values at zero (True enforces >= 0).
CLIP_NONNEGATIVE = False

In [37]:
# Read the original decadal baseline (rows = SSP names; columns = years as strings).
df = pd.read_csv(INPUT_PATH)
# The first column is headed "year" but actually holds the scenario names
# (SSP1..SSP5), so use it as the row index.
df = df.set_index(df.columns[0])

# Put the year columns in chronological order. np.interp needs increasing
# x-coordinates and pairs them with each row's values by position, so sorting
# only the list of years (and not the columns themselves) would mispair years
# and values whenever the CSV columns are not already ascending.
df = df[sorted(df.columns, key=int)]

# Source years (decadal) and target years (5-year grid)
src_years: List[int] = [int(c) for c in df.columns]
tgt_years: List[int] = list(range(START_YEAR, END_YEAR + 1, STEP))

# Interpolate each SSP row
out_rows = {}
src_years_np = np.array(src_years, dtype=int)

for ssp, row in df.iterrows():
    src_vals = row.values.astype(float)
    # Linear interpolation onto the target grid. Target years outside the
    # source range are not extrapolated: np.interp holds the first/last value.
    tgt_vals = np.interp(tgt_years, src_years_np, src_vals)

    # Optional: clip negatives
    if CLIP_NONNEGATIVE:
        tgt_vals = np.maximum(tgt_vals, 0.0)

    out_rows[ssp] = tgt_vals

# Assemble output in CAP6 shape: one row per scenario, one column per target year
out = pd.DataFrame(out_rows, index=tgt_years).T
out.index.name = "scenario"
out.columns = [str(y) for y in tgt_years]

# Write with header "year,2020,2025,...,2100"; values use 6 decimal places.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write(",".join(["year"] + list(out.columns)) + "\n")
    for ssp in out.index:
        # str() keeps the join working even if a scenario label is not a string.
        cells = [str(ssp)] + [f"{out.loc[ssp, str(y)]:.6f}" for y in tgt_years]
        f.write(",".join(cells) + "\n")

print(f"Wrote {OUTPUT_PATH}")

Wrote /user/tlm2160/cap6_tm/data/SSP_baselines_5y.csv


## Building SSP baselines from the SSP IAM database marker models (TBD)

In [32]:
"""
Create CAP6-style SSP_baselines.csv on a 5-year grid (2020..2100) using
the **marker models per Riahi et al.**, with units in GtCO2/yr and
negative emissions clipped to zero.

- Source: SSP IAM Database V2 CSV (wide format with year columns).
- REGION == "World", VARIABLE == "Emissions|CO2" (total CO2).
- For each SSP, pick the marker MODEL and its "SSP*-Baseline" SCENARIO.
- Convert MtCO2/yr -> GtCO2/yr by dividing by 1e3.
- Clip negatives to zero (authors' stated alteration).
- Interpolate linearly to a **5-year grid** between 2020 and 2100.
- Write CSV with header: year,2020,2025,...,2100 and rows SSP1..SSP5.

Option (to mirror original CAP6 quirk):
- Set FORCE_SSP1_ZERO_FROM_2080 = True to force SSP1 to 0 from 2080 onward.

Run:
    python make_ssp_baselines_markers_5y.py
"""

import re
import numpy as np
import pandas as pd
from typing import List

In [None]:
# ---------------- CONFIG ----------------
# Source database export and the CAP6-style file produced from it
# (5-year grid, 2020..2100).
INPUT_PATH = "data/SSP_IAM_V2_201811/SSP_IAM_V2_201811.csv"
OUTPUT_PATH = "data/SSP_baselines.csv"

# Row filter applied to the database: global region, total CO2
# (not harmonized, not subsets).
REGION, VARIABLE = "World", "Emissions|CO2"

# Marker model for each SSP, per Riahi et al.
MARKER_MODEL = {
    "SSP1": "IMAGE",
    "SSP2": "MESSAGE-GLOBIOM",
    "SSP3": "AIM/CGE",
    "SSP4": "GCAM4",
    "SSP5": "REMIND-MAGPIE",
}
# Scenario processing / output row order.
SSPS = [f"SSP{i}" for i in range(1, 6)]

# Database timesteps that are interpolated *from* (decadal) ...
YEARS_DB = list(range(2020, 2100 + 1, 10))
# ... and the 5-year grid that is interpolated *to*.
YEARS_5Y = list(range(2020, 2100 + 1, 5))

# Optional CAP6 quirk: when True, SSP1 is set to exactly 0 from 2080 onward.
FORCE_SSP1_ZERO_FROM_2080 = False

def find_year_columns(df: pd.DataFrame) -> List[str]:
    """
    Collect the column labels of `df` that look like four-digit years.

    A label qualifies when its string form consists of exactly four digits.
    The qualifying labels are returned in ascending numeric order.

    Raises
    ------
    ValueError
        If no column label qualifies.
    """
    year_pattern = re.compile(r"\d{4}")
    matches = [label for label in df.columns if year_pattern.fullmatch(str(label))]
    if not matches:
        raise ValueError("No year columns found in the IAM CSV.")
    matches.sort(key=int)
    return matches

def get_marker_series(df_world_total: pd.DataFrame, ssp: str, year_cols: List[str]) -> pd.Series:
    """
    Extract the marker model's baseline values for one SSP.

    Row selection (MODEL must equal MARKER_MODEL[ssp] in both cases):
    1. SCENARIO exactly equal to f"{ssp}-Baseline".
    2. Only if (1) matches nothing: SCENARIO starting with `ssp` and
       containing 'Baseline'.

    The selected rows' `year_cols` are cast to float and divided by 1e3
    (the Mt -> Gt conversion), the per-column median across the selected rows
    is taken, and finally any negative median is replaced by 0.0.

    Returns
    -------
    pd.Series
        One value per entry of `year_cols`, indexed by `year_cols`.

    Raises
    ------
    RuntimeError
        If neither selection step matches any row.
    """
    marker_model = MARKER_MODEL[ssp]
    baseline_name = f"{ssp}-Baseline"

    is_marker = df_world_total["MODEL"] == marker_model
    scenario = df_world_total["SCENARIO"]

    selected = df_world_total[is_marker & (scenario == baseline_name)]
    if selected.empty:
        # Looser match, evaluated only when the exact scenario name is absent.
        looks_like_baseline = (
            scenario.str.startswith(ssp, na=False)
            & scenario.str.contains("Baseline", na=False)
        )
        selected = df_world_total[is_marker & looks_like_baseline]
    if selected.empty:
        raise RuntimeError(f"No marker baseline found for {ssp} with MODEL='{model}' and SCENARIO like '{scen_exact}'.".replace("{model}", "{model}")) if False else RuntimeError(
            "No marker baseline found for {0} with MODEL='{1}' and SCENARIO like '{2}'.".format(
                ssp, marker_model, baseline_name
            )
        ) if False else RuntimeError(
            f"No marker baseline found for {ssp} with MODEL='{marker_model}' and SCENARIO like '{baseline_name}'."
        )

    # Scale first, then collapse duplicate rows with the median, then floor at zero.
    medians = selected[year_cols].astype(float).div(1e3).median(axis=0)
    return medians.mask(medians < 0.0, 0.0)

In [34]:
# Load the SSP IAM database export (wide format: one column per year).
df = pd.read_csv(INPUT_PATH)

# Fail early if any of the identifying columns used below is absent.
for col in ["MODEL", "SCENARIO", "REGION", "VARIABLE"]:
    if col not in df.columns:
        raise ValueError(f"Missing required column '{col}' in {INPUT_PATH}")

# Every database timestep we interpolate from must exist as a year column.
year_cols_all = find_year_columns(df)
need_db_cols = [str(y) for y in YEARS_DB]
missing = [c for c in need_db_cols if c not in year_cols_all]
if missing:
    raise ValueError(f"Required database years absent in CSV: {missing}")

# Filter to World total CO2
sub = df[(df["REGION"] == REGION) & (df["VARIABLE"] == VARIABLE)].copy()
if sub.empty:
    raise ValueError(f"No rows for REGION='{REGION}' and VARIABLE='{VARIABLE}'.")

# Build marker series from database years, then interpolate to 5-year grid
out = {}
db_years = np.array(YEARS_DB, dtype=int)
for ssp in SSPS:
    series_db = get_marker_series(sub, ssp, need_db_cols)  # index=str years in YEARS_DB
    y_db = series_db.values.astype(float)                  # GtCO2/yr at 10y points

    # np.interp propagates NaN into every neighbouring target year, which would
    # end up as literal "nan" cells in the CSV. Stop with a clear message instead.
    if np.isnan(y_db).any():
        gaps = series_db.index[series_db.isna()].tolist()
        raise ValueError(f"Marker baseline for {ssp} has missing values for database years: {gaps}")

    # Interpolate to 5-year grid (2020..2100)
    y_5 = np.interp(YEARS_5Y, db_years, y_db)

    # Optional: force SSP1 to 0 from 2080 onward (to mirror original CAP6 file)
    if FORCE_SSP1_ZERO_FROM_2080 and ssp == "SSP1":
        mask = np.array(YEARS_5Y) >= 2080
        y_5[mask] = 0.0

    # Re-clip so no value below zero is ever written
    y_5 = np.maximum(y_5, 0.0)

    out[ssp] = y_5

# Assemble output (CAP6 expects header 'year,2020,2025,...,2100')
out_df = pd.DataFrame(out, index=YEARS_5Y).T
out_df.index.name = "scenario"
out_df.columns = [str(y) for y in YEARS_5Y]

# One line per scenario, in SSPS order; values use 6 decimal places.
# (`out` has exactly one entry per element of SSPS, so every row exists.)
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    f.write(",".join(["year"] + list(out_df.columns)) + "\n")
    for ssp in SSPS:
        row = [ssp] + [f"{out_df.loc[ssp, str(y)]:.6f}" for y in YEARS_5Y]
        f.write(",".join(row) + "\n")

print(f"Wrote {OUTPUT_PATH}")
print("Columns:", ", ".join(out_df.columns))

Wrote data/SSP_baselines.csv
Columns: 2020, 2025, 2030, 2035, 2040, 2045, 2050, 2055, 2060, 2065, 2070, 2075, 2080, 2085, 2090, 2095, 2100
