In [309]:
import pandas as pd

df_train = pd.read_csv("/kaggle/input/axion-2025-dataset/train.csv")
df_test = pd.read_csv("/kaggle/input/axion-2025-dataset/test.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53448 entries, 0 to 53447
Data columns (total 12 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Timestamp                       53448 non-null  object 
 1   Record number                   53448 non-null  int64  
 2   Average Water Speed             53229 non-null  float64
 3   Average Water Direction         53447 non-null  float64
 4   Chlorophyll                     51744 non-null  float64
 5   Temperature                     44234 non-null  float64
 6   Dissolved Oxygen                49139 non-null  float64
 7   Dissolved Oxygen (%Saturation)  47690 non-null  float64
 8   pH                              52355 non-null  float64
 9   Salinity                        49481 non-null  float64
 10  Specific Conductance            52072 non-null  float64
 11  Turbidity                       47382 non-null  float64
dtypes: float64(10), int64(1), object

Unnamed: 0,Timestamp,Record number,Average Water Speed,Average Water Direction,Chlorophyll,Temperature,Dissolved Oxygen,Dissolved Oxygen (%Saturation),pH,Salinity,Specific Conductance,Turbidity
0,2023-08-04 23:00:00,1468,4.834,73.484,1.621,20.018,7.472,101.175,8.176,35.215,53.262,2.068
1,2023-08-04 23:30:00,1469,2.544,106.424,1.959,19.986,7.455,100.884,8.175,35.209,53.254,1.994
2,2023-08-04 23:00:00,1470,1.26,156.755,1.62,20.001,7.43,100.571,8.171,35.207,53.252,2.03
3,2023-08-04 23:30:00,1471,0.76,281.754,1.761,19.983,7.419,100.398,8.171,35.211,53.257,1.973
4,2023-08-04 23:00:00,1472,3.397,244.637,1.635,19.986,7.429,100.538,8.171,35.208,53.253,1.944


In [310]:
# Cek Top Corr
import numpy as np
import pandas as pd

# opsional (p-value); fallback ke Pandas-only jika SciPy tak tersedia
try:
    from scipy.stats import pearsonr
    SCIPY_OK = True
except Exception:
    SCIPY_OK = False

# Path sesuai instruksi
df_train = pd.read_csv("/kaggle/input/axion-2025-dataset/train.csv")
df_test  = pd.read_csv("/kaggle/input/axion-2025-dataset/test.csv")  # tidak dipakai untuk korelasi

# Blok 2 — Fungsi korelasi target-wise (pairwise NaN-safe)
def pearson_target_corr(df, target="Turbidity", exclude_cols=("Timestamp", "Record number")):
    # pilih kolom numerik
    num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    # drop kolom yang ingin dikecualikan + target sendiri
    features = [c for c in num_cols if c != target and c not in exclude_cols]

    rows = []
    for col in features:
        pair = df[[target, col]].dropna()
        n = len(pair)
        # hindari error pada kolom konstan atau sampel terlalu kecil
        if n < 3 or pair[col].nunique(dropna=True) < 2 or pair[target].nunique(dropna=True) < 2:
            r, p = np.nan, np.nan
        else:
            if SCIPY_OK:
                r, p = pearsonr(pair[target].values, pair[col].values)
            else:
                # fallback tanpa p-value
                r = pair[target].corr(pair[col], method="pearson")
                p = np.nan
        rows.append({
            "feature": col,
            "n_pair": n,
            "pearson_r": r,
            "abs_r": abs(r) if pd.notnull(r) else np.nan,
            "p_value": p
        })

    out = pd.DataFrame(rows).sort_values("abs_r", ascending=False, na_position="last").reset_index(drop=True)
    return out

# Blok 3 — Jalankan & tampilkan hasil
corr_tbl = pearson_target_corr(df_train, target="Turbidity", exclude_cols=("Timestamp","Record number"))
display_cols = ["feature", "n_pair", "pearson_r", "p_value"]
print(corr_tbl[display_cols].to_string(index=False))

# opsional: simpan ke CSV
corr_tbl.to_csv("turbidity_pearson_correlations.csv", index=False)


                       feature  n_pair  pearson_r       p_value
          Specific Conductance   46317  -0.179238  0.000000e+00
           Average Water Speed   47349   0.173904 3.912140e-318
                      Salinity   44413  -0.168092 8.846373e-279
Dissolved Oxygen (%Saturation)   42444  -0.158935 3.912737e-238
                            pH   46577   0.113211 1.124025e-132
              Dissolved Oxygen   43550  -0.105243 1.721306e-107
                   Temperature   38260  -0.066157  2.224704e-38
                   Chlorophyll   45738   0.031591  1.401403e-11
       Average Water Direction   47381  -0.008140  7.641617e-02


In [311]:
# Blok 1 — Setup + Paths + Load Data (gabung)
import os, sys, gc, warnings, math, json, random
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error

warnings.filterwarnings("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

SEED = 42
def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
set_seed(SEED)

# Paths
DATA_DIR = Path("/kaggle/input/axion-2025-dataset")
WORK_DIR = Path("/kaggle/working")
TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH  = DATA_DIR / "test.csv"
SAMPLE_SUB_PATH = DATA_DIR / "sample_submission.csv"

# Kolom penting
TARGET_COL = "Turbidity"
ID_COL     = "Record number"
TIME_COL   = "Timestamp"
RAW_DIR_COL= "Average Water Direction"   # circular (0-360)

# Seleksi fitur (di Blok 6 semua ranking dicetak penuh)
# 1 = Pearson, 2 = Spearman, [1,2] = gabungan (porsi adil)
FEATURE_METHODS = [1, 2]
TOP_K = 25
ALWAYS_KEEP = []  # misal: ["hour_sin","hour_cos"]

# Imputasi & winsor
KNN_K = 5
WINSOR_Q_LOW, WINSOR_Q_HIGH = 0.005, 0.995

# Load
df_train = pd.read_csv(TRAIN_PATH)
df_test  = pd.read_csv(TEST_PATH)
sample_sub = pd.read_csv(SAMPLE_SUB_PATH)

# pastikan kolom unik
df_train = df_train.loc[:, ~df_train.columns.duplicated()].copy()
df_test  = df_test.loc[:,  ~df_test.columns.duplicated()].copy()

print("Loaded:", df_train.shape, df_test.shape, sample_sub.shape)


Loaded: (53448, 12) (14610, 11) (14610, 2)


In [312]:
# Blok 2 — Utilities (metrik, normalisasi angka koma→titik, FE waktu, winsor, korelasi)
from scipy.stats import pearsonr, spearmanr

def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

def mape(y_true, y_pred, eps=1e-6):
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    denom = np.clip(np.abs(y_true), eps, None)
    return np.mean(np.abs((y_true - y_pred) / denom)) * 100.0

def smape(y_true, y_pred, eps=1e-6):
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    denom = np.clip((np.abs(y_true) + np.abs(y_pred)), eps, None) / 2.0
    return np.mean(np.abs(y_pred - y_true) / denom) * 100.0

def normalize_comma_decimals(df: pd.DataFrame, cols):
    d = df.copy()
    for c in cols:
        if c in d.columns:
            if d[c].dtype == object:
                d[c] = d[c].str.replace(" ", "", regex=False)
                d[c] = d[c].str.replace(",", ".", regex=False)
            d[c] = pd.to_numeric(d[c], errors="coerce")
    return d

def add_time_features(df: pd.DataFrame, time_col: str = "Timestamp"):
    d = df.copy()
    d[time_col] = pd.to_datetime(d[time_col], errors="coerce", utc=False)
    d["year"] = d[time_col].dt.year
    d["month"] = d[time_col].dt.month
    d["dayofweek"] = d[time_col].dt.dayofweek
    d["hour"] = d[time_col].dt.hour
    d["dayofyear"] = d[time_col].dt.dayofyear
    # cyclical dari waktu (aman karena tidak bergantung sensor)
    d["hour_sin"]  = np.sin(2*np.pi*(d["hour"].fillna(0)/24))
    d["hour_cos"]  = np.cos(2*np.pi*(d["hour"].fillna(0)/24))
    d["month_sin"] = np.sin(2*np.pi*((d["month"].fillna(1)-1)/12))
    d["month_cos"] = np.cos(2*np.pi*((d["month"].fillna(1)-1)/12))
    d["doy_sin"]   = np.sin(2*np.pi*((d["dayofyear"].fillna(1)-1)/365))
    d["doy_cos"]   = np.cos(2*np.pi*((d["dayofyear"].fillna(1)-1)/365))
    return d

def make_winsor_bounds(df: pd.DataFrame, cols, q_low=0.005, q_high=0.995):
    bounds = {}
    for c in cols:
        lo = df[c].quantile(q_low)
        hi = df[c].quantile(q_high)
        bounds[c] = (lo, hi)
    return bounds

def corr_with_target(df, features, target, method="pearson"):
    rows = []
    for c in features:
        x = df[c].values
        y = df[target].values
        mask = np.isfinite(x) & np.isfinite(y)
        n_pair = int(mask.sum())
        if n_pair < 3 or np.unique(x[mask]).size < 2:
            r, p = np.nan, np.nan
        else:
            if method == "pearson":
                r, p = pearsonr(x[mask], y[mask])
            elif method == "spearman":
                r, p = spearmanr(x[mask], y[mask])
            else:
                raise ValueError("Unknown method")
        rows.append({"feature": c, "n_pair": n_pair, "r": r, "abs_r": abs(r) if pd.notnull(r) else np.nan, "p_value": p})
    out = pd.DataFrame(rows).sort_values("abs_r", ascending=False).reset_index(drop=True)
    return out

print("Utilities ready")


Utilities ready


In [313]:
# Blok 3 — Normalisasi angka ("," → ".") untuk semua kolom numerik (train & test)
numeric_cols_all = [
    "Average Water Speed",
    "Average Water Direction",
    "Chlorophyll",
    "Temperature",
    "Dissolved Oxygen",
    "Dissolved Oxygen (%Saturation)",
    "pH",
    "Salinity",
    "Specific Conductance",
    "Turbidity"  # train only; di test akan diabaikan jika tak ada
]

df_train = normalize_comma_decimals(df_train, numeric_cols_all)
df_test  = normalize_comma_decimals(df_test,  numeric_cols_all)

print("Dtypes (train numerics):")
print(df_train[[c for c in numeric_cols_all if c in df_train.columns]].dtypes)


Dtypes (train numerics):
Average Water Speed               float64
Average Water Direction           float64
Chlorophyll                       float64
Temperature                       float64
Dissolved Oxygen                  float64
Dissolved Oxygen (%Saturation)    float64
pH                                float64
Salinity                          float64
Specific Conductance              float64
Turbidity                         float64
dtype: object


In [314]:
# Blok 4 — FE waktu (aman) + kunci grup untuk imputasi slice waktu
train_time = add_time_features(df_train, TIME_COL)
test_time  = add_time_features(df_test,  TIME_COL)

for d in (train_time, test_time):
    d["_month"] = d["month"].astype("Int64")
    d["_hour"]  = d["hour"].astype("Int64")

print("Time features ready. Keys for grouping added (_month, _hour).")


Time features ready. Keys for grouping added (_month, _hour).


In [315]:
# # Blok 5 — Imputasi BASE numerik → KNN → clip pH → winsor + diagnostik
# base_numeric_cols = [
#     "Average Water Speed",
#     "Average Water Direction",
#     "Chlorophyll",
#     "Temperature",
#     "Dissolved Oxygen",
#     "Dissolved Oxygen (%Saturation)",
#     "pH",
#     "Salinity",
#     "Specific Conductance",
# ]

# missing_before_base = df_train[base_numeric_cols].isna().sum().sort_values(ascending=False)
# print("\n[Diagnostik] Missing BASE sebelum imputasi (TRAIN):")
# print(missing_before_base[missing_before_base>0].to_string())

# # rangkai data dengan kunci waktu
# train_b = pd.concat([train_time[["_month","_hour"]], df_train[base_numeric_cols]], axis=1)
# test_b  = pd.concat([test_time[["_month","_hour"]],  df_test[base_numeric_cols]],  axis=1)

# # Tahap-1: median per slice waktu
# group_keys = ["_month","_hour"]
# group_medians = {c: train_b.groupby(group_keys, observed=True)[c].median() for c in base_numeric_cols}
# global_medians = {c: train_b[c].median() for c in base_numeric_cols}

# def apply_group_median_fill_base(df_slice: pd.DataFrame):
#     d = df_slice.copy()
#     keys = list(zip(d["_month"].fillna(0).astype(int), d["_hour"].fillna(0).astype(int)))
#     for c in base_numeric_cols:
#         gm = group_medians[c]
#         fill_vals = [gm.get((m, h), np.nan) for (m, h) in keys]
#         fill_vals = pd.Series(fill_vals, index=d.index)
#         d[c] = d[c].fillna(fill_vals)
#         d[c] = d[c].fillna(global_medians[c])
#     return d

# train_b1 = apply_group_median_fill_base(train_b)
# test_b1  = apply_group_median_fill_base(test_b)
# missing_after_stage1_base = train_b1[base_numeric_cols].isna().sum().sort_values(ascending=False)

# # Tahap-2: KNNImputer (fit di TRAIN)
# scaler_base = StandardScaler()
# imputer_base = KNNImputer(n_neighbors=KNN_K, weights="uniform")

# scaler_base.fit(train_b1[base_numeric_cols])
# train_scaled = scaler_base.transform(train_b1[base_numeric_cols])
# test_scaled  = scaler_base.transform(test_b1[base_numeric_cols])

# imputer_base.fit(train_scaled)
# train_imputed_scaled = imputer_base.transform(train_scaled)
# test_imputed_scaled  = imputer_base.transform(test_scaled)

# train_b2 = train_b1.copy()
# test_b2  = test_b1.copy()
# train_b2[base_numeric_cols] = scaler_base.inverse_transform(train_imputed_scaled)
# test_b2[base_numeric_cols]  = scaler_base.inverse_transform(test_imputed_scaled)

# # Domain-clip pH
# for d in (train_b2, test_b2):
#     d["pH"] = d["pH"].clip(0, 14)

# # Winsorize ringan (bounds dari TRAIN)
# winsor_cols = [
#     "Average Water Speed","Chlorophyll","Temperature",
#     "Dissolved Oxygen","Dissolved Oxygen (%Saturation)","pH",
#     "Salinity","Specific Conductance","Average Water Direction"
# ]
# def apply_winsor_bounds_inplace(df_slice, bounds, cols):
#     for c in cols:
#         lo, hi = bounds[c]
#         df_slice[c] = df_slice[c].clip(lo, hi)

# winsor_bounds = make_winsor_bounds(train_b2, winsor_cols, WINSOR_Q_LOW, WINSOR_Q_HIGH)
# apply_winsor_bounds_inplace(train_b2, winsor_bounds, winsor_cols)
# apply_winsor_bounds_inplace(test_b2,  winsor_bounds, winsor_cols)

# # Fallback median bila masih ada residual NaN
# for c in base_numeric_cols:
#     med = train_b2[c].median()
#     train_b2[c] = train_b2[c].fillna(med)
#     test_b2[c]  = test_b2[c].fillna(med)

# missing_after_final_base = train_b2[base_numeric_cols].isna().sum().sort_values(ascending=False)

# # Diagnostik lengkap fase imputasi base
# filled_stage1 = (missing_before_base - missing_after_stage1_base).clip(lower=0)
# filled_stage2 = (missing_after_stage1_base - missing_after_final_base).clip(lower=0)
# filled_total  = (missing_before_base - missing_after_final_base).clip(lower=0)

# print("\n[Diagnostik] Imputasi BASE per kolom (TRAIN):")
# print(pd.DataFrame({
#     "before": missing_before_base,
#     "after_stage1": missing_after_stage1_base,
#     "after_final": missing_after_final_base,
#     "filled_stage1": filled_stage1,
#     "filled_stage2": filled_stage2,
#     "filled_total":  filled_total
# }).loc[missing_before_base.index].to_string())

# if int(missing_after_final_base.sum()) == 0:
#     print("\n[OK] Tidak ada missing tersisa pada BASE numerik (TRAIN).")
# else:
#     print("\n[Warning] Masih ada missing di BASE numerik (TRAIN):")
#     print(missing_after_final_base[missing_after_final_base>0].to_string())


In [316]:
# # Blok 5 — Imputasi BASE numerik (per kolom; sesuai saran) + clip pH + winsor + diagnostik
# from sklearn.ensemble import RandomForestRegressor

# base_numeric_cols = [
#     "Average Water Speed",
#     "Average Water Direction",
#     "Chlorophyll",
#     "Temperature",
#     "Dissolved Oxygen",
#     "Dissolved Oxygen (%Saturation)",
#     "pH",
#     "Salinity",
#     "Specific Conductance",
# ]

# # ====== 0) Diagnostik awal ======
# missing_before_base = df_train[base_numeric_cols].isna().sum().sort_values(ascending=False)
# print("\n[Diagnostik] Missing BASE sebelum imputasi (TRAIN):")
# print(missing_before_base[missing_before_base>0].to_string())

# # rangkai data dengan kunci waktu & waktu-bantu (_month, _hour) yg sudah dibuat di blok sebelumnya
# train_b = pd.concat([df_train[[TIME_COL]], train_time[["_month","_hour"]], df_train[base_numeric_cols]], axis=1)
# test_b  = pd.concat([df_test[[TIME_COL]],  test_time[["_month","_hour"]],  df_test[base_numeric_cols]],  axis=1)

# # ====== 1) Helper per-metode ======
# def _time_interp_linear_train(s: pd.Series, t: pd.Series) -> pd.Series:
#     d = pd.DataFrame({TIME_COL: t, "val": s}).sort_values(TIME_COL).set_index(TIME_COL)
#     d["val"] = d["val"].interpolate(method="time", limit_direction="both")
#     d["val"] = d["val"].ffill().bfill()
#     return d["val"].reset_index(drop=True)

# def _time_interp_linear_test(train_s: pd.Series, train_t: pd.Series,
#                              test_s: pd.Series,  test_t: pd.Series) -> pd.Series:
#     tr = pd.DataFrame({TIME_COL: train_t, "val": train_s, "_is_test": 0})
#     te = pd.DataFrame({TIME_COL: test_t,  "val": test_s,  "_is_test": 1})
#     both = pd.concat([tr, te], axis=0, ignore_index=True).sort_values(TIME_COL).set_index(TIME_COL)
#     mask_te = both["_is_test"]==1
#     both["val"] = both["val"].interpolate(method="time", limit_direction="forward")
#     both["val"] = both["val"].ffill()
#     out = both.loc[mask_te, "val"].reset_index(drop=True)
#     return out

# def _median_by_month(train_col: pd.Series, train_month: pd.Series,
#                      test_col: pd.Series,  test_month: pd.Series) -> tuple[pd.Series, pd.Series]:
#     m_map = train_col.groupby(train_month).median()
#     g_med = float(train_col.median())
#     tr = train_col.copy()
#     te = test_col.copy()
#     tr = tr.fillna(train_month.map(m_map))
#     tr = tr.fillna(g_med)
#     te = te.fillna(test_month.map(m_map))
#     te = te.fillna(g_med)
#     return tr, te

# def _rolling_impute(train_s: pd.Series, train_t: pd.Series,
#                     test_s: pd.Series,  test_t: pd.Series,
#                     win: int = 5, how: str = "median") -> tuple[pd.Series, pd.Series]:
#     # gabungkan, urut waktu → rolling past-only
#     tr = pd.DataFrame({TIME_COL: train_t, "val": train_s, "_is_test":0})
#     te = pd.DataFrame({TIME_COL: test_t,  "val": test_s,  "_is_test":1})
#     both = pd.concat([tr, te], axis=0, ignore_index=True).sort_values(TIME_COL)
#     if how == "median":
#         roll = both["val"].rolling(win, min_periods=1).median()
#     else:
#         roll = both["val"].rolling(win, min_periods=1).mean()
#     both["val"] = both["val"].fillna(roll)
#     # fallback sedikit agar aman
#     g_med = float(pd.to_numeric(train_s, errors="coerce").median())
#     both["val"] = both["val"].fillna(g_med)
#     # pisah kembali
#     tr_out = both.loc[both["_is_test"]==0, "val"].reset_index(drop=True)
#     te_out = both.loc[both["_is_test"]==1, "val"].reset_index(drop=True)
#     return tr_out, te_out

# # mapping strategi sesuai "BEST METHOD PER COLUMN"
# strategy = {
#     "Average Water Direction": "rf_per_column",
#     "Average Water Speed": "median_by_month",
#     "Chlorophyll": "rolling_median_w5",
#     "Dissolved Oxygen": "rf_per_column",
#     "Dissolved Oxygen (%Saturation)": "rf_per_column",
#     "pH": "rolling_mean_w5",
#     "Salinity": "rf_per_column",
#     "Specific Conductance": "time_interpolate_linear",
#     "Temperature": "time_interpolate_linear",
# }

# # ====== 2) Tahap NON-RF — isi kolom non-RF sesuai metode ======
# train_step = train_b[base_numeric_cols].copy()
# test_step  = test_b[base_numeric_cols].copy()

# orig_missing_train = {c: train_step[c].isna().values for c in base_numeric_cols}
# orig_missing_test  = {c: test_step[c].isna().values  for c in base_numeric_cols}

# for c in base_numeric_cols:
#     meth = strategy.get(c, "median_by_month")
#     if meth == "time_interpolate_linear":
#         train_step[c] = _time_interp_linear_train(train_b[c], train_b[TIME_COL])
#         test_step[c]  = _time_interp_linear_test(train_b[c], train_b[TIME_COL], test_b[c], test_b[TIME_COL])
#     elif meth == "median_by_month":
#         tr, te = _median_by_month(train_b[c], train_b["_month"], test_b[c], test_b["_month"])
#         train_step[c], test_step[c] = tr, te
#     elif meth == "rolling_median_w5":
#         tr, te = _rolling_impute(train_b[c], train_b[TIME_COL], test_b[c], test_b[TIME_COL], win=5, how="median")
#         train_step[c], test_step[c] = tr, te
#     elif meth == "rolling_mean_w5":
#         tr, te = _rolling_impute(train_b[c], train_b[TIME_COL], test_b[c], test_b[TIME_COL], win=5, how="mean")
#         train_step[c], test_step[c] = tr, te
#     elif meth == "rf_per_column":
#         # akan diisi pada Tahap RF; sementara biarkan (akan ada fallback median sementara untuk fitur lain)
#         pass
#     else:
#         # fallback aman
#         tr, te = _median_by_month(train_b[c], train_b["_month"], test_b[c], test_b["_month"])
#         train_step[c], test_step[c] = tr, te

# missing_after_nonrf = train_step[base_numeric_cols].isna().sum().sort_values(ascending=False)

# # ====== 3) Tahap RF per kolom ======
# rf_cols = [c for c,m in strategy.items() if m=="rf_per_column" and c in base_numeric_cols]
# # siapkan fitur RF: semua kolom base (hasil non-RF) + _month, _hour
# Xtr_base = pd.concat([train_time[["_month","_hour"]].reset_index(drop=True), train_step.reset_index(drop=True)], axis=1)
# Xte_base = pd.concat([test_time[["_month","_hour"]].reset_index(drop=True),  test_step.reset_index(drop=True)],  axis=1)

# # Isi sementara NA prediktor dengan median TRAIN (agar model bisa fit)
# med_map = {c: pd.to_numeric(Xtr_base[c], errors="coerce").median() for c in Xtr_base.columns if c not in ["_month","_hour"]}
# for D in (Xtr_base, Xte_base):
#     for c in D.columns:
#         if c in ["_month","_hour"]: 
#             D[c] = pd.to_numeric(D[c], errors="coerce").fillna(0).astype(int)
#         else:
#             D[c] = pd.to_numeric(D[c], errors="coerce").fillna(med_map.get(c, 0.0))

# for c in rf_cols:
#     # target y
#     y_tr = pd.to_numeric(df_train[c], errors="coerce")
#     mask_train_known = y_tr.notna()
#     if mask_train_known.sum() == 0:
#         # bila seluruhnya NaN, skip dan pakai median
#         fill_val = float(y_tr.median() if pd.notna(y_tr.median()) else 0.0)
#         train_step[c] = train_step[c].fillna(fill_val)
#         test_step[c]  = test_step[c].fillna(fill_val)
#         continue

#     # fitur = semua prediktor kecuali target kolom c
#     feat_cols_rf = [col for col in Xtr_base.columns if col not in [c]]
#     X_tr_rf = Xtr_base.loc[mask_train_known, feat_cols_rf]
#     y_tr_rf = y_tr.loc[mask_train_known].astype(float)

#     rf = RandomForestRegressor(
#         n_estimators=300, max_depth=12, min_samples_leaf=3,
#         random_state=SEED, n_jobs=-1
#     )
#     rf.fit(X_tr_rf, y_tr_rf)

#     # prediksi untuk baris yang tadinya NA saja
#     if orig_missing_train[c].any():
#         idx_na_tr = np.where(orig_missing_train[c])[0]
#         if len(idx_na_tr):
#             pred_tr = rf.predict(Xtr_base.iloc[idx_na_tr][feat_cols_rf])
#             train_step.loc[idx_na_tr, c] = pred_tr.astype(float)
#     if orig_missing_test[c].any():
#         idx_na_te = np.where(orig_missing_test[c])[0]
#         if len(idx_na_te):
#             pred_te = rf.predict(Xte_base.iloc[idx_na_te][feat_cols_rf])
#             test_step.loc[idx_na_te, c] = pred_te.astype(float)

# # ====== 4) Domain-clip pH & Winsorize ringan ======
# for d in (train_step, test_step):
#     d["pH"] = pd.to_numeric(d["pH"], errors="coerce").clip(0, 14)

# winsor_cols = [
#     "Average Water Speed","Chlorophyll","Temperature",
#     "Dissolved Oxygen","Dissolved Oxygen (%Saturation)","pH",
#     "Salinity","Specific Conductance","Average Water Direction"
# ]
# winsor_bounds = make_winsor_bounds(train_step, winsor_cols, WINSOR_Q_LOW, WINSOR_Q_HIGH)
# def _apply_winsor(df_slice, bounds, cols):
#     for cc in cols:
#         lo, hi = bounds.get(cc, (np.nan, np.nan))
#         if pd.notna(lo) and pd.notna(hi) and cc in df_slice.columns:
#             df_slice[cc] = pd.to_numeric(df_slice[cc], errors="coerce").clip(lo, hi)
# _apply_winsor(train_step, winsor_bounds, winsor_cols)
# _apply_winsor(test_step,  winsor_bounds, winsor_cols)

# # ====== 5) Fallback median jika masih ada NA ======
# for c in base_numeric_cols:
#     med = pd.to_numeric(train_step[c], errors="coerce").median()
#     train_step[c] = pd.to_numeric(train_step[c], errors="coerce").fillna(med)
#     test_step[c]  = pd.to_numeric(test_step[c],  errors="coerce").fillna(med)

# # ====== 6) Output akhir + diagnostik ======
# train_b2 = train_step.copy()
# test_b2  = test_step.copy()

# missing_after_final_base = train_b2[base_numeric_cols].isna().sum().sort_values(ascending=False)

# filled_stage1 = (missing_before_base - missing_after_nonrf).clip(lower=0)
# filled_stage2 = (missing_after_nonrf - missing_after_final_base).clip(lower=0)
# filled_total  = (missing_before_base - missing_after_final_base).clip(lower=0)

# print("\n[Diagnostik] Imputasi BASE per kolom (TRAIN):")
# print(pd.DataFrame({
#     "before": missing_before_base,
#     "after_stage1(nonRF)": missing_after_nonrf,
#     "after_final": missing_after_final_base,
#     "filled_stage1": filled_stage1,
#     "filled_stage2(RF+post)": filled_stage2,
#     "filled_total":  filled_total
# }).loc[missing_before_base.index].to_string())

# if int(missing_after_final_base.sum()) == 0:
#     print("\n[OK] Tidak ada missing tersisa pada BASE numerik (TRAIN).")
# else:
#     print("\n[Warning] Masih ada missing di BASE numerik (TRAIN):")
#     print(missing_after_final_base[missing_after_final_base>0].to_string())

# # (Compat) Buat scaler_base dummy agar blok penyimpanan artefak tidak error
# class _DummyScaler: pass
# scaler_base = _DummyScaler()
# scaler_base.mean_  = train_b2[base_numeric_cols].mean().values
# _s = train_b2[base_numeric_cols].std(ddof=0).replace(0, 1.0)
# scaler_base.scale_ = _s.values


In [317]:
# Blok 5 — Imputasi BASE numerik (per kolom) + clip pH + winsor + imputasi TARGET orde-2 + diagnostik
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import pandas as pd

# ---- Konfigurasi umum ----
base_numeric_cols = [
    "Average Water Speed",
    "Average Water Direction",
    "Chlorophyll",
    "Temperature",
    "Dissolved Oxygen",
    "Dissolved Oxygen (%Saturation)",
    "pH",
    "Salinity",
    "Specific Conductance",
]

# Mapping strategi sesuai “BEST METHOD PER COLUMN”
strategy = {
    "Average Water Direction": "rf_per_column",
    "Average Water Speed": "median_by_month",
    "Chlorophyll": "rolling_median_w5",
    "Dissolved Oxygen": "rf_per_column",
    "Dissolved Oxygen (%Saturation)": "rf_per_column",
    "pH": "rolling_mean_w5",
    "Salinity": "rf_per_column",
    "Specific Conductance": "time_interpolate_linear",
    "Temperature": "time_interpolate_linear",
}

# Toggle target imputasi sebagai label (default False = label asli saja)
USE_IMPUTED_TARGET_AS_LABEL = False

# ===== A) Diagnostik awal =====
missing_before_base = df_train[base_numeric_cols].isna().sum().sort_values(ascending=False)
print("\n[Diagnostik] Missing BASE sebelum imputasi (TRAIN):")
print(missing_before_base[missing_before_base>0].to_string())

# rangkai data dengan waktu bantu (_month, _hour) dari blok waktu sebelumnya
train_b = pd.concat([df_train[[TIME_COL]], train_time[["_month","_hour"]], df_train[base_numeric_cols]], axis=1)
test_b  = pd.concat([df_test[[TIME_COL]],  test_time[["_month","_hour"]],  df_test[base_numeric_cols]],  axis=1)

# ===== B) Helper per-metode =====
def _time_interp_linear_train(s: pd.Series, t: pd.Series) -> pd.Series:
    d = pd.DataFrame({TIME_COL: t, "val": s}).sort_values(TIME_COL).set_index(TIME_COL)
    d["val"] = d["val"].interpolate(method="time", limit_direction="both")
    d["val"] = d["val"].ffill().bfill()
    return d["val"].reset_index(drop=True)

def _time_interp_linear_test(train_s: pd.Series, train_t: pd.Series,
                             test_s: pd.Series,  test_t: pd.Series) -> pd.Series:
    tr = pd.DataFrame({TIME_COL: train_t, "val": train_s, "_is_test": 0})
    te = pd.DataFrame({TIME_COL: test_t,  "val": test_s,  "_is_test": 1})
    both = pd.concat([tr, te], axis=0, ignore_index=True).sort_values(TIME_COL).set_index(TIME_COL)
    mask_te = both["_is_test"]==1
    both["val"] = both["val"].interpolate(method="time", limit_direction="forward")
    both["val"] = both["val"].ffill()
    out = both.loc[mask_te, "val"].reset_index(drop=True)
    return out

def _median_by_month(train_col: pd.Series, train_month: pd.Series,
                     test_col: pd.Series,  test_month: pd.Series) -> tuple[pd.Series, pd.Series]:
    m_map = train_col.groupby(train_month).median()
    g_med = float(train_col.median())
    tr = train_col.copy()
    te = test_col.copy()
    tr = tr.fillna(train_month.map(m_map))
    tr = tr.fillna(g_med)
    te = te.fillna(test_month.map(m_map))
    te = te.fillna(g_med)
    return tr, te

def _rolling_impute(train_s: pd.Series, train_t: pd.Series,
                    test_s: pd.Series,  test_t: pd.Series,
                    win: int = 5, how: str = "median") -> tuple[pd.Series, pd.Series]:
    # gabungkan, urut waktu → rolling past-only
    tr = pd.DataFrame({TIME_COL: train_t, "val": train_s, "_is_test":0})
    te = pd.DataFrame({TIME_COL: test_t,  "val": test_s,  "_is_test":1})
    both = pd.concat([tr, te], axis=0, ignore_index=True).sort_values(TIME_COL)
    if how == "median":
        roll = both["val"].rolling(win, min_periods=1).median()
    else:
        roll = both["val"].rolling(win, min_periods=1).mean()
    both["val"] = both["val"].fillna(roll)
    # fallback sedikit agar aman
    g_med = float(pd.to_numeric(train_s, errors="coerce").median())
    both["val"] = both["val"].fillna(g_med)
    # pisah kembali
    tr_out = both.loc[both["_is_test"]==0, "val"].reset_index(drop=True)
    te_out = both.loc[both["_is_test"]==1, "val"].reset_index(drop=True)
    return tr_out, te_out

# ===== C) Tahap NON-RF — isi kolom non-RF sesuai metode =====
train_step = train_b[base_numeric_cols].copy()
test_step  = test_b[base_numeric_cols].copy()

orig_missing_train = {c: train_step[c].isna().values for c in base_numeric_cols}
orig_missing_test  = {c: test_step[c].isna().values  for c in base_numeric_cols}

for c in base_numeric_cols:
    meth = strategy.get(c, "median_by_month")
    if meth == "time_interpolate_linear":
        train_step[c] = _time_interp_linear_train(train_b[c], train_b[TIME_COL])
        test_step[c]  = _time_interp_linear_test(train_b[c], train_b[TIME_COL], test_b[c], test_b[TIME_COL])
    elif meth == "median_by_month":
        tr, te = _median_by_month(train_b[c], train_b["_month"], test_b[c], test_b["_month"])
        train_step[c], test_step[c] = tr, te
    elif meth == "rolling_median_w5":
        tr, te = _rolling_impute(train_b[c], train_b[TIME_COL], test_b[c], test_b[TIME_COL], win=5, how="median")
        train_step[c], test_step[c] = tr, te
    elif meth == "rolling_mean_w5":
        tr, te = _rolling_impute(train_b[c], train_b[TIME_COL], test_b[c], test_b[TIME_COL], win=5, how="mean")
        train_step[c], test_step[c] = tr, te
    elif meth == "rf_per_column":
        # akan diisi pada Tahap RF; biarkan dulu
        pass
    else:
        # fallback aman
        tr, te = _median_by_month(train_b[c], train_b["_month"], test_b[c], test_b["_month"])
        train_step[c], test_step[c] = tr, te

missing_after_nonrf = train_step[base_numeric_cols].isna().sum().sort_values(ascending=False)

# ===== D) Tahap RF per kolom =====
rf_cols = [c for c,m in strategy.items() if m=="rf_per_column" and c in base_numeric_cols]

# fitur RF: semua kolom base (hasil non-RF) + _month, _hour
Xtr_base = pd.concat([train_time[["_month","_hour"]].reset_index(drop=True), train_step.reset_index(drop=True)], axis=1)
Xte_base = pd.concat([test_time[["_month","_hour"]].reset_index(drop=True),  test_step.reset_index(drop=True)],  axis=1)

# Isi sementara NA prediktor dengan median TRAIN (agar model bisa fit)
med_map = {c: pd.to_numeric(Xtr_base[c], errors="coerce").median() for c in Xtr_base.columns if c not in ["_month","_hour"]}
for D in (Xtr_base, Xte_base):
    for c in D.columns:
        if c in ["_month","_hour"]:
            D[c] = pd.to_numeric(D[c], errors="coerce").fillna(0).astype(int)
        else:
            D[c] = pd.to_numeric(D[c], errors="coerce").fillna(med_map.get(c, 0.0))

for c in rf_cols:
    # target y = nilai asli kolom c (TRAIN)
    y_tr = pd.to_numeric(df_train[c], errors="coerce")
    mask_train_known = y_tr.notna()
    if mask_train_known.sum() == 0:
        fill_val = float(y_tr.median() if pd.notna(y_tr.median()) else 0.0)
        train_step[c] = train_step[c].fillna(fill_val)
        test_step[c]  = test_step[c].fillna(fill_val)
        continue

    feat_cols_rf = [col for col in Xtr_base.columns if col != c]
    X_tr_rf = Xtr_base.loc[mask_train_known, feat_cols_rf]
    y_tr_rf = y_tr.loc[mask_train_known].astype(float)

    rf = RandomForestRegressor(
        n_estimators=300, max_depth=12, min_samples_leaf=3,
        random_state=SEED, n_jobs=-1
    )
    rf.fit(X_tr_rf, y_tr_rf)

    # prediksi hanya untuk posisi yang tadinya NA
    if orig_missing_train[c].any():
        idx_na_tr = np.where(orig_missing_train[c])[0]
        if len(idx_na_tr):
            pred_tr = rf.predict(Xtr_base.iloc[idx_na_tr][feat_cols_rf])
            train_step.loc[idx_na_tr, c] = pred_tr.astype(float)
    if orig_missing_test[c].any():
        idx_na_te = np.where(orig_missing_test[c])[0]
        if len(idx_na_te):
            pred_te = rf.predict(Xte_base.iloc[idx_na_te][feat_cols_rf])
            test_step.loc[idx_na_te, c] = pred_te.astype(float)

# ===== E) Domain-clip pH & Winsorize ringan =====
for d in (train_step, test_step):
    d["pH"] = pd.to_numeric(d["pH"], errors="coerce").clip(0, 14)

winsor_cols = [
    "Average Water Speed","Chlorophyll","Temperature",
    "Dissolved Oxygen","Dissolved Oxygen (%Saturation)","pH",
    "Salinity","Specific Conductance","Average Water Direction"
]
winsor_bounds = make_winsor_bounds(train_step, winsor_cols, WINSOR_Q_LOW, WINSOR_Q_HIGH)

def _apply_winsor(df_slice, bounds, cols):
    for cc in cols:
        lo, hi = bounds.get(cc, (np.nan, np.nan))
        if pd.notna(lo) and pd.notna(hi) and cc in df_slice.columns:
            df_slice[cc] = pd.to_numeric(df_slice[cc], errors="coerce").clip(lo, hi)

_apply_winsor(train_step, winsor_bounds, winsor_cols)
_apply_winsor(test_step,  winsor_bounds, winsor_cols)

# ===== F) Fallback median jika masih ada NA =====
for c in base_numeric_cols:
    med = pd.to_numeric(train_step[c], errors="coerce").median()
    train_step[c] = pd.to_numeric(train_step[c], errors="coerce").fillna(med)
    test_step[c]  = pd.to_numeric(test_step[c],  errors="coerce").fillna(med)

# Output akhir fitur imputasi
train_b2 = train_step.copy()
test_b2  = test_step.copy()

missing_after_final_base = train_b2[base_numeric_cols].isna().sum().sort_values(ascending=False)

# ===== G) Imputasi TARGET orde-2 (context) + toggle label =====
tgt_df = df_train[[TIME_COL, TARGET_COL]].copy().sort_values(TIME_COL).set_index(TIME_COL)
missing_before_tgt = int(tgt_df[TARGET_COL].isna().sum())
try:
    tgt_df[TARGET_COL] = tgt_df[TARGET_COL].interpolate(method="spline", order=2, limit_direction="both")
except Exception:
    tgt_df[TARGET_COL] = tgt_df[TARGET_COL].interpolate(method="polynomial", order=2, limit_direction="both")
tgt_df[TARGET_COL] = tgt_df[TARGET_COL].ffill().bfill()
df_train["Turbidity_ctx"] = tgt_df.reset_index()[TARGET_COL].astype(float).values
missing_after_tgt = int(df_train["Turbidity_ctx"].isna().sum())

if USE_IMPUTED_TARGET_AS_LABEL:
    df_train["Turbidity_filled"] = (
        pd.to_numeric(df_train[TARGET_COL], errors="coerce")
        .fillna(df_train["Turbidity_ctx"])
        .astype(float)
    )
    label_note = "Label latihan = Turbidity_filled (pakai imputasi)."
else:
    df_train["Turbidity_filled"] = pd.to_numeric(df_train[TARGET_COL], errors="coerce").astype(float)
    label_note = "Label latihan = nilai asli (NaN tidak dilatih)."

# ===== H) Diagnostik lengkap =====
filled_stage1 = (missing_before_base - missing_after_nonrf).clip(lower=0)
filled_stage2 = (missing_after_nonrf - missing_after_final_base).clip(lower=0)
filled_total  = (missing_before_base - missing_after_final_base).clip(lower=0)

print("\n[Diagnostik] Imputasi BASE per kolom (TRAIN):")
print(pd.DataFrame({
    "before": missing_before_base,
    "after_stage1(nonRF)": missing_after_nonrf,
    "after_final": missing_after_final_base,
    "filled_stage1": filled_stage1,
    "filled_stage2(RF+post)": filled_stage2,
    "filled_total":  filled_total
}).loc[missing_before_base.index].to_string())

if int(missing_after_final_base.sum()) == 0:
    print("\n[OK] Tidak ada missing tersisa pada BASE numerik (TRAIN).")
else:
    print("\n[Warning] Masih ada missing di BASE numerik (TRAIN):")
    print(missing_after_final_base[missing_after_final_base>0].to_string())

print(f"\n[TARGET_CTX] Missing sebelum={missing_before_tgt} | sesudah={missing_after_tgt} (ctx encoder).")
print(f"[TARGET_LABEL] {label_note}")

# (Compat) Buat scaler_base dummy agar blok penyimpanan artefak tidak error
class _DummyScaler: pass
scaler_base = _DummyScaler()
scaler_base.mean_  = train_b2[base_numeric_cols].mean().values
_s = train_b2[base_numeric_cols].std(ddof=0).replace(0, 1.0)
scaler_base.scale_ = _s.values

# (Opsional) Cetak ringkas strategi yang dipakai
print("\n[STRATEGY] Imputasi per kolom:")
for k in base_numeric_cols:
    print(f"  - {k}: {strategy.get(k, 'median_by_month')}")



[Diagnostik] Missing BASE sebelum imputasi (TRAIN):
Temperature                       9214
Dissolved Oxygen (%Saturation)    5758
Dissolved Oxygen                  4309
Salinity                          3967
Chlorophyll                       1704
Specific Conductance              1376
pH                                1093
Average Water Speed                219
Average Water Direction              1


ValueError: time-weighted interpolation only works on Series or DataFrames with a DatetimeIndex

In [None]:
# Blok 6 — FE turunan (SETELAH imputasi) + rakit train_ready/test_ready + candidate_features
# (PATCH: dedup kolom + akses kolom arah yang robust)

# Satukan waktu + base terimput
train_base = pd.concat(
    [train_time.reset_index(drop=True), train_b2[[
        "Average Water Speed","Average Water Direction","Chlorophyll","Temperature",
        "Dissolved Oxygen","Dissolved Oxygen (%Saturation)","pH","Salinity","Specific Conductance"
    ]].reset_index(drop=True)],
    axis=1
)
test_base  = pd.concat(
    [test_time.reset_index(drop=True),  test_b2[[
        "Average Water Speed","Average Water Direction","Chlorophyll","Temperature",
        "Dissolved Oxygen","Dissolved Oxygen (%Saturation)","pH","Salinity","Specific Conductance"
    ]].reset_index(drop=True)],
    axis=1
)

# >>>> NEW: pastikan tidak ada kolom duplikat (keep first) <<<<
train_base = train_base.loc[:, ~train_base.columns.duplicated()].copy()
test_base  = test_base.loc[:,  ~test_base.columns.duplicated()].copy()

# >>>> NEW: helper agar selalu dapat Series 1D biarpun ada duplikat tersisa <<<<
def _get_series(df: pd.DataFrame, col: str) -> pd.Series:
    obj = df[col]
    # Jika karena suatu alasan masih DataFrame (duplikat), ambil kolom pertama
    if isinstance(obj, pd.DataFrame):
        return obj.iloc[:, 0]
    return obj

# FE sirkular & turunan penting (aman karena semua base sudah diimput)
def add_sensor_fe(df: pd.DataFrame) -> pd.DataFrame:
    d = df.copy()
    # arah (pakai accessor robust)
    dir_deg = pd.to_numeric(_get_series(d, "Average Water Direction"), errors="coerce")
    rad = np.deg2rad(dir_deg)
    d["dir_sin"] = np.sin(rad)
    d["dir_cos"] = np.cos(rad)

    # interaksi dan nonlin
    spd = pd.to_numeric(_get_series(d, "Average Water Speed"), errors="coerce")
    tmp = pd.to_numeric(_get_series(d, "Temperature"), errors="coerce")
    sal = pd.to_numeric(_get_series(d, "Salinity"), errors="coerce")
    sc  = pd.to_numeric(_get_series(d, "Specific Conductance"), errors="coerce")
    do_sat = pd.to_numeric(_get_series(d, "Dissolved Oxygen (%Saturation)"), errors="coerce")

    d["speed_x_dir_sin"] = spd * d["dir_sin"]
    d["speed_x_dir_cos"] = spd * d["dir_cos"]
    d["temp2"] = tmp ** 2
    d["cond_per_sal"] = sc / sal.replace(0, np.nan)
    d["do_sat_diff"] = do_sat - 100.0
    return d

train_base = add_sensor_fe(train_base)
test_base  = add_sensor_fe(test_base)

# Kandidat fitur (tanpa Average Water Direction mentah — sudah diwakili sin/cos)
base_sensor_cols = [
    "Average Water Speed","Chlorophyll","Temperature","Dissolved Oxygen",
    "Dissolved Oxygen (%Saturation)","pH","Salinity","Specific Conductance"
]
eng_time = [
    "hour","month","dayofweek","dayofyear",
    "hour_sin","hour_cos","month_sin","month_cos","doy_sin","doy_cos",
]
eng_circ = ["dir_sin","dir_cos"]
eng_extra = ["speed_x_dir_sin","speed_x_dir_cos","temp2","cond_per_sal","do_sat_diff"]

candidate_features = base_sensor_cols + eng_time + eng_circ + eng_extra

# Rakit frame siap modeling (BELUM seleksi fitur)
train_ready = train_base.copy()
train_ready[TARGET_COL] = df_train[TARGET_COL].values
test_ready  = test_base.copy()

# Safety FE: ubah inf→NaN lalu isi median berdasar TRAIN
for D in (train_ready, test_ready):
    D.replace([np.inf, -np.inf], np.nan, inplace=True)

med_map_full = train_ready[candidate_features].median(numeric_only=True).to_dict()
for D in (train_ready, test_ready):
    for c in candidate_features:
        if D[c].isna().any():
            D[c] = D[c].fillna(med_map_full.get(c, 0.0))

print(f"[FE] Candidate features: {len(candidate_features)} -> {candidate_features}")


In [None]:
train_ready.info()

In [None]:
# Blok 7 — Ranking korelasi (FULL) + seleksi fitur + build X/y/X_test
methods_map = {1: "pearson", 2: "spearman"}
chosen_methods = [methods_map[m] for m in FEATURE_METHODS]

features_available = [c for c in candidate_features if c in train_ready.columns]
if len(features_available) < len(candidate_features):
    missing_feat = [c for c in candidate_features if c not in train_ready.columns]
    print("[Warn] Fitur tidak ditemukan (skip):", missing_feat)

top_cap = min(TOP_K, len(features_available))

# Basis korelasi: drop baris target NaN
corr_base = train_ready[features_available + [TARGET_COL]].copy()
corr_base = corr_base[corr_base[TARGET_COL].notna()].reset_index(drop=True)

# Hitung & CETAK ranking penuh + simpan CSV
corr_outputs = {}
for m in chosen_methods:
    corr_tbl = corr_with_target(corr_base, features_available, TARGET_COL, method=m)
    corr_outputs[m] = corr_tbl
    print(f"\n[Ranking Korelasi — {m.upper()}] (abs_r desc)")
    print(corr_tbl[["feature","n_pair","r","p_value","abs_r"]].to_string(index=True))
    corr_tbl.to_csv(WORK_DIR / f"corr_ranking_{m}.csv", index=False)

# Seleksi fitur: porsi adil per metode, sisanya isi rata-rata ranking
selected = set()
k_per_method = max(1, top_cap // max(1, len(chosen_methods)))
for m in chosen_methods:
    selected.update(corr_outputs[m].head(k_per_method)["feature"].tolist())

if len(selected) < top_cap:
    feature_scores = {}
    for m, cdf in corr_outputs.items():
        cdf = cdf.reset_index(drop=False).rename(columns={"index": "rank0"})
        cdf["rank"] = cdf["rank0"] + 1
        for _, row in cdf.iterrows():
            feature_scores.setdefault(row["feature"], []).append(row["rank"])
    avg_rank = {f: np.mean(rks) for f, rks in feature_scores.items()}
    fill_pool = [f for f in features_available if f not in selected]
    fill_order = sorted(fill_pool, key=lambda x: avg_rank.get(x, 1e9))
    for f in fill_order:
        if len(selected) >= top_cap:
            break
        selected.add(f)

selected = list(sorted(set(list(selected) + list(ALWAYS_KEEP))))
selected = [f for f in selected if f in features_available][:top_cap]
print(f"\n[SELECTED] features ({len(selected)}): {selected}")

# Build X, y, X_test (dipakai blok modeling)
X_all = train_ready[selected].copy()
y_all = train_ready[TARGET_COL].copy()
mask_train = y_all.notna().values

X = X_all.loc[mask_train].astype(np.float32).copy()
y = y_all.loc[mask_train].astype(np.float32).copy()
X_test = test_ready[selected].astype(np.float32).copy()

# Safety terakhir
for D in (X, X_test):
    if D.isna().any().any():
        med_map = X.median(numeric_only=True).to_dict()
        for c in D.columns:
            if D[c].isna().any():
                D[c] = D[c].fillna(med_map.get(c, 0.0))

print(f"\n[Check Modeling] Rows (train, after drop NaN target): {len(X)}")
print(f"[Check Modeling] Missing in X: {int(X.isna().sum().sum())} | Missing in y: {int(y.isna().sum())}")


In [None]:
# Blok 8 — PEMODELAN (konfig di sini; mudah diganti)
import lightgbm as lgb

MODEL_NAME = "lgbm"
MODEL_PARAMS = {
    "objective": "regression",
    "boosting_type": "gbdt",
    "metric": "rmse",
    "n_estimators": 5000,   # dipangkas oleh early_stopping saat validasi
    "learning_rate": 0.03,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "bagging_freq": 1,
    "min_data_in_leaf": 40,
    "reg_alpha": 0.0,
    "reg_lambda": 0.0,
    "random_state": SEED,
    "n_jobs": -1,
    "verbose": -1
}

def make_model(name: str, params: dict):
    if name == "lgbm":
        return lgb.LGBMRegressor(**params)
    else:
        raise NotImplementedError(f"Model '{name}' belum diimplementasikan.")

print(f"Model ready: {MODEL_NAME}")


In [None]:
# Blok 9 — VALIDASI (CV 5-fold) + OOF metrics + FI
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
oof_pred = np.zeros(len(X), dtype=float)
models = []
best_iters = []
fold_metrics = []

for fold, (tr_idx, va_idx) in enumerate(kf.split(X, y), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    model = make_model(MODEL_NAME, MODEL_PARAMS)
    if MODEL_NAME == "lgbm":
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="rmse",
            callbacks=[lgb.early_stopping(200, verbose=False), lgb.log_evaluation(0)]
        )
        pred_va = model.predict(X_va, num_iteration=model.best_iteration_)
        best_iters.append(model.best_iteration_)
    else:
        model.fit(X_tr, y_tr)
        pred_va = model.predict(X_va)

    oof_pred[va_idx] = pred_va

    mae_   = mean_absolute_error(y_va, pred_va)
    mse_   = mean_squared_error(y_va, pred_va)  # MSE per fold
    rmse_  = rmse(y_va, pred_va)
    mape_  = mape(y_va, pred_va)
    smape_ = smape(y_va, pred_va)

    fold_metrics.append({
        "fold": fold,
        "MAE": mae_,
        "MSE": mse_,
        "RMSE": rmse_,
        "MAPE": mape_,
        "SMAPE": smape_
    })
    models.append(model)

    # === Print MSE per fold ===
    print(f"Fold {fold}: MSE = {mse_:.6f}")

# === Rata-rata per metrik (mean of folds) ===
fm_df = pd.DataFrame(fold_metrics).sort_values("fold")
avg_metrics = fm_df[["MAE","MSE","RMSE","MAPE","SMAPE"]].mean().to_dict()
print("\nRata-rata per metrik (mean of 5 folds)")
print(json.dumps({k: round(v, 6) for k, v in avg_metrics.items()}, indent=2))

# (Opsional) OOF metrics—tetap kita hitung kalau kamu mau bandingkan dengan mean-of-folds
oof_mae  = mean_absolute_error(y, oof_pred)
oof_mse  = mean_squared_error(y, oof_pred)
oof_rmse = rmse(y, oof_pred)
oof_mape = mape(y, oof_pred)
oof_smape= smape(y, oof_pred)
print("\nOOF Metrics (gabungan seluruh fold)")
print(json.dumps({
    "MAE": round(oof_mae, 6),
    "MSE": round(oof_mse, 6),
    "RMSE": round(oof_rmse, 6),
    "MAPE": round(oof_mape, 6),
    "SMAPE": round(oof_smape, 6)
}, indent=2))

# Simpan output evaluasi
fm_df.to_csv(WORK_DIR / "cv_fold_metrics.csv", index=False)

# Feature importance
try:
    if MODEL_NAME == "lgbm":
        fi_mat = [m.booster_.feature_importance(importance_type="gain") for m in models]
        fi = pd.DataFrame({"feature": X.columns, "importance": np.mean(fi_mat, axis=0)})
    elif hasattr(models[0], "feature_importances_"):
        fi_mat = [m.feature_importances_ for m in models]
        fi = pd.DataFrame({"feature": X.columns, "importance": np.mean(fi_mat, axis=0)})
    else:
        fi = pd.DataFrame({"feature": X.columns, "importance": np.nan})
    fi = fi.sort_values("importance", ascending=False)
    fi.to_csv(WORK_DIR / "cv_feature_importance.csv", index=False)
except Exception:
    pass


k = 5: "MSE": 13.889316,
k = 10, "MSE": 6.153683,
k = 15, "MSE": 5.952698,
k = 20, "MSE": 5.884146,
k = 25, "MSE": 

In [None]:
# Blok 10 — RETRAIN 100% & simpan model
import joblib

if MODEL_NAME == "lgbm" and len(best_iters):
    avg_best_iter = int(np.mean(best_iters))
    final_n_estimators = max(200, int(avg_best_iter * 1.05))
    final_params = {**MODEL_PARAMS, "n_estimators": final_n_estimators}
else:
    final_params = {**MODEL_PARAMS}
    final_n_estimators = final_params.get("n_estimators", 1000)

final_model = make_model(MODEL_NAME, final_params)
final_model.fit(X, y)

if MODEL_NAME == "lgbm":
    MODEL_TXT_PATH = WORK_DIR / f"{MODEL_NAME}_turbidity_model.txt"
    final_model.booster_.save_model(str(MODEL_TXT_PATH))
    print("Model saved to:", MODEL_TXT_PATH)
else:
    MODEL_PKL_PATH = WORK_DIR / f"{MODEL_NAME}_turbidity_model.pkl"
    joblib.dump(final_model, MODEL_PKL_PATH)
    print("Model saved to:", MODEL_PKL_PATH)


In [None]:
# Blok 11 — Prediksi TEST & Submission (Record number,Turbidity)
test_pred = final_model.predict(X_test)

sub = sample_sub[[ID_COL]].copy()
pred_df = pd.DataFrame({ID_COL: df_test[ID_COL], TARGET_COL: test_pred})
sub = sub.merge(pred_df, on=ID_COL, how="left")

SUB_PATH = WORK_DIR / "submission.csv"
sub.to_csv(SUB_PATH, index=False)
print("Submission saved to:", SUB_PATH)
print(sub.head())


In [None]:
# Blok 12 — Artefak preprocessing (repro-friendly)
artifacts = {
    "selected_features": list(X.columns),
    "feature_methods": [int(m) for m in FEATURE_METHODS],
    "top_k": TOP_K,
    "always_keep": ALWAYS_KEEP,
    "knn_k": KNN_K,
    "winsor_q_low": WINSOR_Q_LOW,
    "winsor_q_high": WINSOR_Q_HIGH,
    "model_name": MODEL_NAME,
    "model_params": MODEL_PARAMS
}
with open(WORK_DIR / "preprocessing_artifacts.json", "w") as f:
    json.dump(artifacts, f, indent=2)

pd.DataFrame({"mean": scaler_base.mean_, "scale": scaler_base.scale_},
             index=["Average Water Speed","Average Water Direction","Chlorophyll","Temperature",
                    "Dissolved Oxygen","Dissolved Oxygen (%Saturation)","pH","Salinity","Specific Conductance"])\
  .to_csv(WORK_DIR / "scaler_base_stats.csv")

print("Artifacts saved.")
