# OOF EDA: Setup & Imports


In [6]:
# -------------------------
# 0) CONFIG
# -------------------------
import os, sys

OOF_PATH = "/notebooks/kaggle/csiro/oof/oof_preds.csv"  # <- set this
BUILD_OOF = False  # set True if you want to rebuild OOF here

# Optional OOF rebuild settings
CSIRO_CODE_DIR = "/notebooks/CSIRO"
DINO_REPO = "/notebooks/dinov3"
DINO_WEIGHTS = "/notebooks/kaggle/csiro/weights/dinov3/dinov3_vitb16_pretrain.pth"
PT_PATH = "/notebooks/kaggle/csiro/output/v7_tile_swapTruetiled_inpTruen_models2_e15.pt"
DATA_ROOT = "/notebooks/kaggle/csiro"
TRAIN_CSV = f"{DATA_ROOT}/train.csv"
IMG_SIZE = 512
BATCH_SIZE = 64
NUM_WORKERS = max(0, (os.cpu_count() or 0) - 2)
DEVICE = "cuda"
CV_PARAMS = dict(mode="gkf", cv_seed=126015, n_splits=5)
CSIRO_CODE_DIR = "/notebooks/CSIRO"
sys.path.insert(0, CSIRO_CODE_DIR)

# Column guesses
DATE_COL = "Sampling_Date"
STATE_COL = "State"
SITE_CANDIDATES = ["Site", "Site_ID", "Location", "Location_ID", "Farm", "Plot"]

MIN_N = 30  # minimum group size for reporting


In [4]:
# -------------------------
# 1) Imports
# -------------------------
import numpy as np
import pandas as pd

from csiro.config import TARGETS, DEFAULT_LOSS_WEIGHTS


In [None]:
# -------------------------
# 2) Optional OOF rebuild
# -------------------------
if BUILD_OOF:
    import sys
    import torch

    sys.path.insert(0, DINO_REPO)
    os.environ["DINO_B_WEIGHTS_PATH"] = DINO_WEIGHTS

    from csiro.config import DEFAULT_MODEL_SIZE, DEFAULT_PLUS, dino_hub_name
    from csiro.data import BiomassTiledCached, load_train_wide
    from csiro.utils_v2 import make_oof_predictions

    wide_df = load_train_wide(TRAIN_CSV, root=DATA_ROOT)
    dataset = BiomassTiledCached(wide_df, img_size=IMG_SIZE)

    backbone = torch.hub.load(
        DINO_REPO,
        dino_hub_name(model_size=str(DEFAULT_MODEL_SIZE), plus=str(DEFAULT_PLUS)),
        source="local",
        weights=DINO_WEIGHTS,
    ).to(DEVICE)
    backbone.eval()

    oof = make_oof_predictions(
        dataset=dataset,
        wide_df=wide_df,
        backbone=backbone,
        pt_paths=[PT_PATH],
        cv_params=CV_PARAMS,
        batch_size=BATCH_SIZE,
        num_workers=NUM_WORKERS,
        device=DEVICE,
        inner_agg="mean",
        outer_agg="mean",
    )

    preds = oof["preds"].numpy()
    fold_id = oof["fold_id"]

    df_out = wide_df.copy()
    df_out["fold_id"] = fold_id
    for i, t in enumerate(TARGETS):
        df_out[f"{t}_pred"] = preds[:, i]

    os.makedirs(os.path.dirname(OOF_PATH), exist_ok=True)
    df_out.to_csv(OOF_PATH, index=False)
    print("Wrote", OOF_PATH)


# OOF EDA: Fold Predictions


In [8]:
# -------------------------
# 3) Load OOF table
# -------------------------
df = pd.read_csv(OOF_PATH)
print("rows", len(df))
df.head(3)


rows 357


Unnamed: 0,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm,Dry_Clover_g,Dry_Dead_g,Dry_Green_g,Dry_Total_g,GDM_g,abs_path,fold_id,Dry_Green_g_pred,Dry_Clover_g_pred,Dry_Dead_g_pred,GDM_g_pred,Dry_Total_g_pred
0,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,0.0,31.9984,16.2751,48.2735,16.275,/notebooks/kaggle/csiro/train/ID1011485656.jpg,0,17.118351,0.305687,36.007355,17.51256,51.881493
1,train/ID1012260530.jpg,2015/4/1,NSW,Lucerne,0.55,16.0,0.0,0.0,7.6,7.6,7.6,/notebooks/kaggle/csiro/train/ID1012260530.jpg,1,7.722764,0.062323,0.068389,8.825125,9.653315
2,train/ID1025234388.jpg,2015/9/1,WA,SubcloverDalkeith,0.38,1.0,6.05,0.0,0.0,6.05,6.05,/notebooks/kaggle/csiro/train/ID1025234388.jpg,1,0.012798,6.840681,0.230506,6.244605,7.056628


In [10]:
# -------------------------
# 4) Sanity checks + derived columns
# -------------------------
for t in TARGETS:
    if t not in df.columns:
        raise ValueError(f"Missing target column: {t}")
    if f"{t}_pred" not in df.columns:
        raise ValueError(f"Missing pred column: {t}_pred")

if "fold_id" not in df.columns:
    raise ValueError("Missing fold_id column in OOF table.")

if DATE_COL in df.columns:
    df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
    df["month"] = df[DATE_COL].dt.month
    df["year"] = df[DATE_COL].dt.year
    def _season(m):
        if pd.isna(m):
            return None
        m = int(m)
        if m in (12, 1, 2):
            return "summer"
        if m in (3, 4, 5):
            return "autumn"
        if m in (6, 7, 8):
            return "winter"
        return "spring"
    df["season"] = df["month"].apply(_season)

site_col = None
for c in SITE_CANDIDATES:
    if c in df.columns:
        site_col = c
        break
print("site_col:", site_col)


site_col: None


In [11]:
# -------------------------
# 5) Metrics helpers
# -------------------------
W = np.asarray(DEFAULT_LOSS_WEIGHTS, dtype=np.float32)


def r2_score(y, p):
    y = np.asarray(y, dtype=np.float32)
    p = np.asarray(p, dtype=np.float32)
    ss_res = np.square(y - p).sum()
    mu = y.mean()
    ss_tot = np.square(y - mu).sum()
    return 1.0 - ss_res / (ss_tot + 1e-12)


def w_r2(y, p, w):
    y = np.asarray(y, dtype=np.float32)
    p = np.asarray(p, dtype=np.float32)
    w = np.asarray(w, dtype=np.float32).reshape(1, -1)
    ss_res = (w * np.square(y - p)).sum()
    sum_w = w.sum() * y.shape[0]
    sum_wy = (w * y).sum()
    sum_wy2 = (w * y * y).sum()
    mu = sum_wy / (sum_w + 1e-12)
    ss_tot = sum_wy2 - sum_w * mu * mu
    return 1.0 - ss_res / (ss_tot + 1e-12)


def mae(y, p):
    y = np.asarray(y, dtype=np.float32)
    p = np.asarray(p, dtype=np.float32)
    return np.abs(y - p).mean()


def group_metrics(df_in, group_col, min_n=MIN_N):
    rows = []
    for g, dfg in df_in.groupby(group_col):
        if len(dfg) < min_n:
            continue
        y = dfg[TARGETS].values
        p = dfg[[f"{t}_pred" for t in TARGETS]].values
        row = {
            group_col: g,
            "n": len(dfg),
            "wR2": w_r2(y, p, W),
        }
        for t in TARGETS:
            row[f"{t}_R2"] = r2_score(dfg[t].values, dfg[f"{t}_pred"].values)
            row[f"{t}_MAE"] = mae(dfg[t].values, dfg[f"{t}_pred"].values)
        rows.append(row)
    return pd.DataFrame(rows).sort_values("wR2", ascending=False)


# OOF EDA: Diagnostics


In [12]:
# -------------------------
# 6) Overall + per-target metrics
# -------------------------
y_all = df[TARGETS].values
p_all = df[[f"{t}_pred" for t in TARGETS]].values

print("Overall wR2:", w_r2(y_all, p_all, W))
for t in TARGETS:
    r2_t = r2_score(df[t].values, df[f"{t}_pred"].values)
    mae_t = mae(df[t].values, df[f"{t}_pred"].values)
    print(f"{t}: R2={r2_t:.3f} | MAE={mae_t:.3f}")


Overall wR2: 0.95156765
Dry_Green_g: R2=0.942 | MAE=3.699
Dry_Clover_g: R2=0.934 | MAE=1.428
Dry_Dead_g: R2=0.881 | MAE=2.672
GDM_g: R2=0.947 | MAE=3.625
Dry_Total_g: R2=0.934 | MAE=5.103


In [13]:
# -------------------------
# 7) Slice metrics by State / Season / Month / Site
# -------------------------
if STATE_COL in df.columns:
    display(group_metrics(df, STATE_COL))

if "season" in df.columns:
    display(group_metrics(df, "season"))

if "month" in df.columns:
    display(group_metrics(df, "month"))

if site_col is not None:
    display(group_metrics(df, site_col))


Unnamed: 0,State,n,wR2,Dry_Green_g_R2,Dry_Green_g_MAE,Dry_Clover_g_R2,Dry_Clover_g_MAE,Dry_Dead_g_R2,Dry_Dead_g_MAE,GDM_g_R2,GDM_g_MAE,Dry_Total_g_R2,Dry_Total_g_MAE
0,NSW,75,0.947346,0.895112,6.193196,0.677878,0.236583,0.8981445,2.911901,0.913582,5.908791,0.917746,6.527842
3,WA,32,0.947018,0.944983,2.538841,0.827978,4.719428,-20164070000000.0,0.500694,0.93758,3.247122,0.943659,3.352941
2,Vic,112,0.941918,0.875468,3.993036,0.936847,1.641834,0.7828405,2.581783,0.940302,3.551911,0.910343,4.804412
1,Tas,138,0.936939,0.927692,2.37264,0.966848,1.138566,0.8685887,3.117251,0.936799,2.531307,0.903945,4.977506


Unnamed: 0,season,n,wR2,Dry_Green_g_R2,Dry_Green_g_MAE,Dry_Clover_g_R2,Dry_Clover_g_MAE,Dry_Dead_g_R2,Dry_Dead_g_MAE,GDM_g_R2,GDM_g_MAE,Dry_Total_g_R2,Dry_Total_g_MAE
2,summer,41,0.963385,0.912048,6.069418,0.693058,0.30083,0.8214,2.150946,0.92614,5.475066,0.941856,5.103185
0,autumn,52,0.947886,0.952582,3.134916,0.45464,0.358245,0.902759,2.860172,0.946953,3.33983,0.91697,6.01912
1,spring,133,0.939824,0.920612,4.134541,0.92804,2.108093,0.872339,3.332964,0.93709,4.053189,0.917628,6.091959
3,winter,131,0.932885,0.900245,2.737567,0.922035,1.514761,0.863455,2.088193,0.923332,2.725027,0.892851,3.736008


Unnamed: 0,month,n,wR2,Dry_Green_g_R2,Dry_Green_g_MAE,Dry_Clover_g_R2,Dry_Clover_g_MAE,Dry_Dead_g_R2,Dry_Dead_g_MAE,GDM_g_R2,GDM_g_MAE,Dry_Total_g_R2,Dry_Total_g_MAE
4,9,67,0.952648,0.924644,4.109186,0.898288,2.226834,0.951401,1.929714,0.953675,3.580014,0.933171,5.254139
5,11,37,0.94673,0.926336,2.667274,0.957795,2.498839,0.797611,4.037508,0.947305,3.247264,0.925744,5.661002
0,5,42,0.946524,0.950826,3.05383,0.440355,0.428433,0.884852,3.471737,0.945476,3.058511,0.918319,6.032393
3,8,37,0.928512,0.868816,3.933647,0.879013,3.290029,0.747959,1.832995,0.821934,4.19073,0.880337,3.770337
2,7,41,0.92384,0.847187,2.406284,0.973438,1.093662,0.90218,1.441487,0.914892,2.207855,0.862869,3.382716
1,6,53,0.923278,0.930076,2.158842,0.953461,0.60118,0.800737,2.766632,0.913692,2.101878,0.865761,3.985344


In [14]:
# -------------------------
# 8) Interaction slice: State ? Season
# -------------------------
if STATE_COL in df.columns and "season" in df.columns:
    df_tmp = df.copy()
    df_tmp["state_season"] = df_tmp[STATE_COL].astype(str) + "__" + df_tmp["season"].astype(str)
    display(group_metrics(df_tmp, "state_season"))


Unnamed: 0,state_season,n,wR2,Dry_Green_g_R2,Dry_Green_g_MAE,Dry_Clover_g_R2,Dry_Clover_g_MAE,Dry_Dead_g_R2,Dry_Dead_g_MAE,GDM_g_R2,GDM_g_MAE,Dry_Total_g_R2,Dry_Total_g_MAE
0,NSW__summer,41,0.963385,0.912048,6.069418,0.693058,0.30083,0.8214,2.150946,0.92614,5.475066,0.941856,5.103185
1,Tas__spring,71,0.942912,0.917795,2.575346,0.962506,1.732459,0.875106,3.44993,0.935174,2.945393,0.918149,5.229776
3,Vic__spring,39,0.934126,0.877473,5.431769,0.933801,2.525734,0.790941,3.155072,0.939056,4.519428,0.868937,6.664508
4,Vic__winter,73,0.930568,0.843947,3.224397,0.807905,1.169613,0.747918,2.275505,0.899201,3.035018,0.883063,3.810662
2,Tas__winter,38,0.919309,0.924852,1.923201,0.962927,0.52216,0.886105,2.526744,0.896526,1.926783,0.852306,4.093919


In [15]:
# -------------------------
# 9) Bias diagnostics (mean error by group)
# -------------------------
if STATE_COL in df.columns:
    bias = df.groupby(STATE_COL).apply(
        lambda d: pd.Series({
            "n": len(d),
            **{f"{t}_bias": float((d[f"{t}_pred"] - d[t]).mean()) for t in TARGETS}
        })
    )
    display(bias.sort_values(by="n", ascending=False))


  bias = df.groupby(STATE_COL).apply(


Unnamed: 0_level_0,n,Dry_Green_g_bias,Dry_Clover_g_bias,Dry_Dead_g_bias,GDM_g_bias,Dry_Total_g_bias
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Tas,138.0,-0.72105,-0.078376,-0.060893,-0.077461,2.06507
Vic,112.0,-2.049189,-0.066972,-0.070385,-0.550931,2.453816
NSW,75.0,-2.61951,0.092981,-0.854726,-1.412431,0.877272
WA,32.0,-1.34358,0.155956,0.500694,-1.091552,1.057339


In [16]:
# -------------------------
# 10) Biomass regime diagnostics (by target quantiles)
# -------------------------
for t in TARGETS:
    q = pd.qcut(df[t], q=4, labels=False, duplicates="drop")
    df_tmp = df.copy()
    df_tmp["bin"] = q
    out = df_tmp.groupby("bin").apply(
        lambda d: pd.Series({
            "n": len(d),
            f"{t}_R2": r2_score(d[t].values, d[f"{t}_pred"].values),
            f"{t}_MAE": mae(d[t].values, d[f"{t}_pred"].values),
        })
    )
    print("", t)
    display(out)


 Dry_Green_g


  out = df_tmp.groupby("bin").apply(


Unnamed: 0_level_0,n,Dry_Green_g_R2,Dry_Green_g_MAE
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,90.0,0.742893,0.855025
1,89.0,0.186991,2.569668
2,89.0,-0.508913,4.08361
3,89.0,0.82225,7.317785


 Dry_Clover_g


  out = df_tmp.groupby("bin").apply(


Unnamed: 0_level_0,n,Dry_Clover_g_R2,Dry_Clover_g_MAE
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,179.0,-0.551545,0.275637
1,89.0,0.196276,1.172374
2,89.0,0.847116,4.00104


 Dry_Dead_g


  out = df_tmp.groupby("bin").apply(


Unnamed: 0_level_0,n,Dry_Dead_g_R2,Dry_Dead_g_MAE
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,90.0,-0.816595,0.977945
1,89.0,-2.28816,1.857685
2,89.0,-0.764574,2.754205
3,89.0,0.618262,5.115527


 GDM_g


  out = df_tmp.groupby("bin").apply(


Unnamed: 0_level_0,n,GDM_g_R2,GDM_g_MAE
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,90.0,0.733346,1.562081
1,89.0,-0.109613,2.536358
2,89.0,0.155465,3.516624
3,89.0,0.812744,6.908975


 Dry_Total_g


  out = df_tmp.groupby("bin").apply(


Unnamed: 0_level_0,n,Dry_Total_g_R2,Dry_Total_g_MAE
bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,90.0,0.384606,3.524159
1,89.0,-0.824692,4.066088
2,89.0,-0.664046,5.38417
3,89.0,0.823404,7.45646


In [17]:
# -------------------------
# 11) Generalization gap (seen vs unseen)
# -------------------------
# For each fold, mark values not present in the training folds.

def unseen_mask_by_fold(df_in, col):
    out = np.zeros(len(df_in), dtype=bool)
    for f in sorted(df_in["fold_id"].unique()):
        train_vals = set(df_in.loc[df_in["fold_id"] != f, col].dropna().unique())
        mask = df_in["fold_id"] == f
        out[mask] = ~df_in.loc[mask, col].isin(train_vals).values
    return out

for col in [STATE_COL, "month", site_col]:
    if col is None or col not in df.columns:
        continue
    df_tmp = df.copy()
    df_tmp["unseen"] = unseen_mask_by_fold(df_tmp, col)
    seen = df_tmp[~df_tmp["unseen"]]
    unseen = df_tmp[df_tmp["unseen"]]
    if len(unseen) < MIN_N:
        print(f"{col}: not enough unseen samples ({len(unseen)}).")
        continue
    y_seen = seen[TARGETS].values
    p_seen = seen[[f"{t}_pred" for t in TARGETS]].values
    y_unseen = unseen[TARGETS].values
    p_unseen = unseen[[f"{t}_pred" for t in TARGETS]].values
    print(f"{col} seen vs unseen")
    print("seen wR2:", w_r2(y_seen, p_seen, W))
    print("unseen wR2:", w_r2(y_unseen, p_unseen, W))


State: not enough unseen samples (0).
month: not enough unseen samples (27).
