# CV Split Search (v1)


In [None]:
# -------------------------
# 0) Config
# -------------------------
import os


def unset_settings(settings):
    """Return the names in ``settings`` whose value is ``None``.

    Args:
        settings: Mapping of setting name to its configured value.

    Returns:
        A list of the names mapped to ``None``, in the mapping's own order.
        The list is empty when every setting has a value.
    """
    return [name for name, val in settings.items() if val is None]


# Filesystem locations. TRAIN_CSV is derived from DATA_ROOT with an f-string,
# so it is never None by itself: an unset DATA_ROOT would silently become the
# path "None/train.csv". DATA_ROOT is therefore validated explicitly below.
CSIRO_CODE_DIR = "/notebooks/CSIRO"
DATA_ROOT = "/notebooks/kaggle/csiro"
TRAIN_CSV = f"{DATA_ROOT}/train.csv"

# Search settings; the "Run search" cell forwards them to search_cv_splits().
N_TRIALS = 300
TOP_K = 10
SEED_START = 0
N_SPLITS = 5

# Constraints / scoring (also forwarded to search_cv_splits()).
MIN_FOLD_N = None          # None = auto
MIN_TARGET_VAR = 1e-3
MIN_STATES_PER_FOLD = None
MIN_SEASONS_PER_FOLD = 2
N_BINS = 4
MIN_BIN_N = 5

GROUP_MODE = "state_quarter"  # "state_quarter" or "date"
DATE_COL = "Sampling_Date"
STATE_COL = "State"

# Destination of the CSV written by the "Save top-K" cell.
OUT_PATH = "/notebooks/cv/cv_split_search_v1.csv"

# Guard rails: fail before any work is done if a required path is unset.
# OUT_PATH is included so a missing output path is reported here rather than
# after the search has already run.
_unset = unset_settings(
    {
        "CSIRO_CODE_DIR": CSIRO_CODE_DIR,
        "DATA_ROOT": DATA_ROOT,
        "TRAIN_CSV": TRAIN_CSV,
        "OUT_PATH": OUT_PATH,
    }
)
if _unset:
    # Report the first offender, keeping the original message format.
    raise ValueError(f"{_unset[0]} is None; set it before running.")


In [None]:
# -------------------------
# 1) Imports
# -------------------------
import sys
import pandas as pd

# Put CSIRO_CODE_DIR (set in the config cell) at the front of the module
# search path. This has to run before the `csiro` imports below; presumably
# that directory is where the `csiro` package lives -- confirm against the
# checkout layout.
sys.path.insert(0, CSIRO_CODE_DIR)

# Project-local helpers used by the later cells: the training-data loader and
# the CV split search routine.
from csiro.data import load_train_wide
from csiro.utils_v2 import search_cv_splits


In [None]:
# -------------------------
# 2) Load data
# -------------------------
# Load the training table through the project loader, using the CSV path and
# data root set in the config cell, then report how many rows came back.
wide_df = load_train_wide(TRAIN_CSV, root=DATA_ROOT)
print(f"rows {len(wide_df)}")


In [None]:
# -------------------------
# 3) Run search
# -------------------------
# Every tunable from the config cell, keyed by the keyword name that
# search_cv_splits() receives it under.
search_options = dict(
    n_splits=N_SPLITS,
    n_trials=N_TRIALS,
    seed_start=SEED_START,
    top_k=TOP_K,
    group_mode=GROUP_MODE,
    date_col=DATE_COL,
    state_col=STATE_COL,
    min_fold_n=MIN_FOLD_N,
    min_target_var=MIN_TARGET_VAR,
    min_states_per_fold=MIN_STATES_PER_FOLD,
    min_seasons_per_fold=MIN_SEASONS_PER_FOLD,
    n_bins=N_BINS,
    min_bin_n=MIN_BIN_N,
)
results = search_cv_splits(wide_df, **search_options)

print(f"found {len(results)} candidates")
# Trailing expression: the notebook shows the first three candidates as the
# cell output.
results[:3]


In [None]:
# -------------------------
# 4) Save top-K
# -------------------------
import os
import pandas as pd

# Make sure the output directory exists. os.path.dirname() returns "" when
# OUT_PATH is a bare filename, and os.makedirs("") raises FileNotFoundError,
# so the directory is only created when OUT_PATH actually names one.
out_dir = os.path.dirname(OUT_PATH)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

if results:
    # Write the candidates returned by the search as a CSV without the index
    # column.
    df_out = pd.DataFrame(results)
    df_out.to_csv(OUT_PATH, index=False)
    print("Wrote", OUT_PATH)
else:
    # Nothing passed the constraints; no file is written.
    print("No candidates found; relax constraints.")
