In [1]:
# === Cell 1: imports + load train_log/test_log ===
import os, gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

np.random.seed(42)

DATA_DIR = "/kaggle/input/data01"

train_log = pd.read_csv(f"{DATA_DIR}/train_log.csv")
test_log  = pd.read_csv(f"{DATA_DIR}/test_log.csv")

print("Cell 1 OK — Data Loaded.")
print("train_log columns:", train_log.columns.tolist())


Cell 1 OK — Data Loaded.
train_log columns: ['object_id', 'Z', 'Z_err', 'EBV', 'SpecType', 'English Translation', 'split', 'target']


In [2]:
# === Cell 2: FE4.0 特徵 + split 載入 ===

def build_features_from_lightcurves(df):
    """
    FE4.0:
      - 保留原本 FE3.2 的 Flux / Flux_err / Time(MJD) 統計
      - 加上 peak/trough、rise/fall、energy、skew/kurt、正負比例
    """
    # 欄位假設：object_id, Filter, Time (MJD), Flux, Flux_err
    df = df.sort_values(["object_id", "Filter", "Time (MJD)"])

    # 權重 + 加權 flux
    df["w"] = 1.0 / (df["Flux_err"]**2 + 1e-9)
    df["flux_w"] = df["Flux"] * df["w"]

    # 基本聚合
    aggs = {
        "Flux": ["min","max","mean","median","std"],
        "Flux_err": ["mean"],
        "Time (MJD)": ["min","max","count"],
        "w": ["sum"],
        "flux_w": ["sum"]
    }

    agg = df.groupby(["object_id","Filter"]).agg(aggs)
    agg.columns = [f"{a}_{b}" for a,b in agg.columns]
    agg = agg.reset_index()

    # === FE3.x: 振幅、相對振幅、時間範圍、加權平均、斜率、形狀 ===
    agg["flux_amp"] = agg["Flux_max"] - agg["Flux_min"]
    agg["flux_rel_amp"] = agg["flux_amp"]/(agg["Flux_mean"]+1e-9)
    agg["time_span"] = agg["Time (MJD)_max"] - agg["Time (MJD)_min"]
    agg["flux_w_mean"] = agg["flux_w_sum"]/(agg["w_sum"]+1e-9)

    agg["slope"] = (agg["Flux_max"] - agg["Flux_min"]) / (
        agg["Time (MJD)_max"] - agg["Time (MJD)_min"] + 1e-9
    )
    agg["asymmetry"] = (agg["Flux_mean"] - agg["Flux_median"]) / (agg["Flux_std"] + 1e-9)
    agg["peak_sharpness"] = (agg["Flux_max"] - agg["Flux_mean"]) / (agg["Flux_std"] + 1e-9)

    # === FE4.0: shape 特徵（逐個 object_id, Filter 迴圈計算）===
    rows = []
    for (oid, flt), g in df.groupby(["object_id", "Filter"]):
        x = g["Time (MJD)"].values
        y = g["Flux"].values
        n = len(y)
        if n == 0:
            continue

        t_span = x[-1] - x[0] if n > 1 else 0.0
        mean = y.mean()
        std = y.std() + 1e-9

        peak_idx = np.argmax(y)
        trough_idx = np.argmin(y)

        t_peak = x[peak_idx]
        t_trough = x[trough_idx]

        rise_time = t_peak - x[0]
        fall_time = x[-1] - t_peak
        peak_time_frac = (t_peak - x[0]) / (t_span + 1e-9)
        trough_time_frac = (t_trough - x[0]) / (t_span + 1e-9)

        energy = np.sum(y**2)
        abs_energy = np.sum(np.abs(y))

        # 中心化後的 skew / kurt
        z = (y - mean) / std
        skew = np.mean(z**3)
        kurt = np.mean(z**4) - 3.0

        pos_ratio = np.mean(y > mean)
        neg_ratio = np.mean(y < mean)

        rows.append({
            "object_id": oid,
            "Filter": flt,
            "rise_time": rise_time,
            "fall_time": fall_time,
            "peak_time_frac": peak_time_frac,
            "trough_time_frac": trough_time_frac,
            "flux_energy": energy,
            "flux_abs_energy": abs_energy,
            "flux_skew": skew,
            "flux_kurt": kurt,
            "flux_pos_ratio": pos_ratio,
            "flux_neg_ratio": neg_ratio,
        })

    extra = pd.DataFrame(rows)
    agg = agg.merge(extra, on=["object_id","Filter"], how="left")

    # 拿掉不再需要的欄位
    agg.drop(columns=["flux_w_sum","w_sum"], inplace=True)

    # 每個 Filter 是否出現
    agg["has_filter"] = 1
    filt = agg.pivot(index="object_id", columns="Filter", values="has_filter").fillna(0)
    filt.columns = [f"has_{c}" for c in filt.columns]

    # 所有 numeric 特徵 wide 展開
    numeric_cols = [c for c in agg.columns if c not in ["object_id","Filter","has_filter"]]
    wide = agg.pivot(index="object_id", columns="Filter", values=numeric_cols)
    wide.columns = [f"{c[0]}_{c[1]}" for c in wide.columns]
    wide = wide.reset_index().fillna(0)

    # 加上 has_filter
    wide = wide.merge(filt.reset_index(), on="object_id", how="left").fillna(0)
    return wide


def load_splits(split_ids, mode="train"):
    all_feats = []
    for i in split_ids:
        fname = f"{DATA_DIR}/split_{i:02d}/{mode}_full_lightcurves.csv"
        if os.path.exists(fname):
            print("  reading", fname)
            df = pd.read_csv(fname)
            feats = build_features_from_lightcurves(df)
            all_feats.append(feats)
            del df
            gc.collect()
    full = pd.concat(all_feats, ignore_index=True)
    return full.groupby("object_id",as_index=False).mean()


In [3]:
# === Cell 3: build train feats + 3-model ensemble + OOF threshold ===

print("Loading TRAIN splits...")
train_feats = load_splits(range(1,21), mode="train")

train = train_feats.merge(
    train_log[["object_id","Z","Z_err","EBV","target"]],
    on="object_id", how="left"
).fillna(0)

# 交互特徵：flux_w_mean * Z
for c in train.columns:
    if "flux_w_mean" in c:
        train[c+"_x_Z"] = train[c]*train["Z"]

feature_cols = [c for c in train.columns if c not in ["object_id","target"]]
X = train[feature_cols].values
y = train["target"].values

print("Train shape:", train.shape)
print("target value counts:")
print(train["target"].value_counts())

N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

oof_preds = np.zeros(X.shape[0])

models_lgb = []
models_xgb = []
models_cat = []

pos = (y==1).sum()
neg = (y==0).sum()
scale_pos_weight = neg/(pos+1)

params_lgb = {
    "objective":"binary",
    "metric":"binary_logloss",
    "learning_rate":0.03,
    "num_leaves":96,
    "feature_fraction":0.7,
    "bagging_fraction":0.7,
    "bagging_freq":1,
    "scale_pos_weight":scale_pos_weight,
    "verbosity":-1,
}

params_xgb = {
    "max_depth":7,
    "eta":0.03,
    "subsample":0.7,
    "colsample_bytree":0.7,
    "objective":"binary:logistic",
    "eval_metric":"logloss",
    "scale_pos_weight":scale_pos_weight,
    "tree_method":"hist",
}

print("Start 3-model Ensemble Training...")

for fold,(tr_idx,va_idx) in enumerate(skf.split(X,y)):
    print(f"\n=== FOLD {fold+1}/{N_FOLDS} ===")

    Xtr, ytr = X[tr_idx], y[tr_idx]
    Xva, yva = X[va_idx], y[va_idx]

    # LightGBM
    dtr_lgb = lgb.Dataset(Xtr,label=ytr)
    dva_lgb = lgb.Dataset(Xva,label=yva)
    m_lgb = lgb.train(
        params_lgb, dtr_lgb, num_boost_round=1000,
        valid_sets=[dtr_lgb,dva_lgb],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    models_lgb.append(m_lgb)
    pred_lgb = m_lgb.predict(Xva)

    # XGBoost
    dtr_xgb = xgb.DMatrix(Xtr,label=ytr)
    dva_xgb = xgb.DMatrix(Xva,label=yva)
    m_xgb = xgb.train(
        params_xgb, dtr_xgb, num_boost_round=1200,
        evals=[(dva_xgb,"eval")],
        early_stopping_rounds=80,
        verbose_eval=False
    )
    models_xgb.append(m_xgb)
    pred_xgb = m_xgb.predict(dva_xxb:=dva_xgb)

    # CatBoost
    m_cat = CatBoostClassifier(
        depth=8,
        learning_rate=0.03,
        iterations=1500,
        loss_function="Logloss",
        eval_metric="Logloss",
        random_seed=42,
        verbose=False
    )
    m_cat.fit(Xtr, ytr, eval_set=(Xva,yva))
    models_cat.append(m_cat)
    pred_cat = m_cat.predict_proba(Xva)[:,1]

    # Ensemble → OOF
    oof_preds[va_idx] = (pred_lgb + pred_xgb + pred_cat) / 3

# ===== 兩階段 OOF threshold 搜尋（收窄）=====

best_thr = 0.0
best_f1 = 0.0

# 粗掃：0.10 ~ 0.25
coarse_ths = np.linspace(0.10, 0.25, 151)
for t in coarse_ths:
    f1 = f1_score(y, (oof_preds >= t).astype(int))
    if f1 > best_f1:
        best_f1 = f1
        best_thr = t

print("Coarse best_thr =", best_thr, "F1 =", best_f1)

# 以 coarse best_thr 為中心 ±0.01 再細掃
fine_start = max(0.0, best_thr - 0.01)
fine_end   = min(1.0, best_thr + 0.01)

best_thr_fine = best_thr
best_f1_fine  = best_f1

fine_ths = np.linspace(fine_start, fine_end, 201)
for t in fine_ths:
    f1 = f1_score(y, (oof_preds >= t).astype(int))
    if f1 > best_f1_fine:
        best_f1_fine = f1
        best_thr_fine = t

best_thr = best_thr_fine
best_f1 = best_f1_fine

print("Best OOF threshold =", best_thr)
print("Best OOF F1 =", best_f1)


Loading TRAIN splits...
  reading /kaggle/input/data01/split_01/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_02/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_03/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_04/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_05/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_06/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_07/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_08/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_09/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_10/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_11/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_12/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_13/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_14/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_15/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_16/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_17/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_18/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_19/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_20/train_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


Train shape: (3043, 173)
target value counts:
target
0    2895
1     148
Name: count, dtype: int64
Start 3-model Ensemble Training...

=== FOLD 1/5 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[103]	training's binary_logloss: 0.0267445	valid_1's binary_logloss: 0.125134

=== FOLD 2/5 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[69]	training's binary_logloss: 0.0427406	valid_1's binary_logloss: 0.151495

=== FOLD 3/5 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[113]	training's binary_logloss: 0.02124	valid_1's binary_logloss: 0.130737

=== FOLD 4/5 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[99]	training's binary_logloss: 0.0273269	valid_1's binary_logloss: 0.143057

=== FOLD 5/5 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best it

In [4]:
# === Cell 4: build test feats + ensemble 預測 + submissions ===

print("Loading TEST splits...")
test_feats = load_splits(range(1,21), mode="test")

test = test_feats.merge(
    test_log[["object_id","Z","Z_err","EBV"]],
    on="object_id", how="left"
).fillna(0)

for c in test.columns:
    if "flux_w_mean" in c:
        test[c+"_x_Z"] = test[c]*test["Z"]

X_test = test[feature_cols].values
dtest  = xgb.DMatrix(X_test)

sub_preds = np.zeros(X_test.shape[0])

for i in range(N_FOLDS):
    p_lgb = models_lgb[i].predict(X_test)
    p_xgb = models_xgb[i].predict(dtest)
    p_cat = models_cat[i].predict_proba(X_test)[:,1]
    sub_preds += (p_lgb + p_xgb + p_cat)/3

sub_preds /= N_FOLDS

print("Test prediction OK.")

# 以 best_thr 為中心 ±0.006，步距 0.003 → 最多 5 個點
threshold_list = [
    best_thr - 0.006,
    best_thr - 0.003,
    best_thr,
    best_thr + 0.003,
    best_thr + 0.006,
]

threshold_list = [max(0.0, min(1.0, t)) for t in threshold_list]
threshold_list = sorted(list(set(threshold_list)))

print("Use thresholds:", threshold_list)

for idx, thr in enumerate(threshold_list,1):
    pred = (sub_preds >= thr).astype(int)

    sub_df = pd.DataFrame({
        "object_id": test["object_id"],
        "prediction": pred
    })

    fname = f"submission_FE40_thr{idx}_{thr:.3f}.csv"
    sub_df.to_csv(fname,index=False)

    print(f"Saved {fname} | positives = {pred.sum()}")


Loading TEST splits...
  reading /kaggle/input/data01/split_01/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_02/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_03/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_04/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_05/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_06/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_07/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_08/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_09/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_10/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_11/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_12/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_13/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_14/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_15/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_16/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_17/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_18/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_19/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


  reading /kaggle/input/data01/split_20/test_full_lightcurves.csv


  pos_ratio = np.mean(y > mean)
  neg_ratio = np.mean(y < mean)


Test prediction OK.
Use thresholds: [0.1313, 0.1343, 0.1373, 0.1403, 0.1433]
Saved submission_FE40_thr1_0.131.csv | positives = 540
Saved submission_FE40_thr2_0.134.csv | positives = 535
Saved submission_FE40_thr3_0.137.csv | positives = 526
Saved submission_FE40_thr4_0.140.csv | positives = 518
Saved submission_FE40_thr5_0.143.csv | positives = 504
