In [9]:
import os, gc
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier

np.random.seed(42)

DATA_DIR = "/kaggle/input/123456"

train_log = pd.read_csv(f"{DATA_DIR}/train_log.csv")
test_log  = pd.read_csv(f"{DATA_DIR}/test_log.csv")

print("Cell 1 OK — Data Loaded.")


Cell 1 OK — Data Loaded.


In [10]:
def build_features_from_lightcurves(df):
    df = df.sort_values(["object_id", "Filter", "Time (MJD)"])

    df["w"] = 1.0 / (df["Flux_err"]**2 + 1e-9)
    df["flux_w"] = df["Flux"] * df["w"]

    aggs = {
        "Flux": ["min","max","mean","median","std"],
        "Flux_err": ["mean"],
        "Time (MJD)": ["min","max","count"],
        "w": ["sum"],
        "flux_w": ["sum"]
    }

    agg = df.groupby(["object_id","Filter"]).agg(aggs)
    agg.columns = [f"{a}_{b}" for a,b in agg.columns]
    agg = agg.reset_index()

    agg["flux_amp"] = agg["Flux_max"] - agg["Flux_min"]
    agg["flux_rel_amp"] = agg["flux_amp"]/(agg["Flux_mean"]+1e-9)
    agg["time_span"] = agg["Time (MJD)_max"] - agg["Time (MJD)_min"]
    agg["flux_w_mean"] = agg["flux_w_sum"]/(agg["w_sum"]+1e-9)

    agg["slope"] = (agg["Flux_max"] - agg["Flux_min"]) / (agg["Time (MJD)_max"] - agg["Time (MJD)_min"] + 1e-9)
    agg["asymmetry"] = (agg["Flux_mean"] - agg["Flux_median"]) / (agg["Flux_std"] + 1e-9)
    agg["peak_sharpness"] = (agg["Flux_max"] - agg["Flux_mean"]) / (agg["Flux_std"] + 1e-9)

    agg.drop(columns=["flux_w_sum","w_sum"], inplace=True)

    agg["has_filter"] = 1
    filt = agg.pivot(index="object_id", columns="Filter", values="has_filter").fillna(0)
    filt.columns = [f"has_{c}" for c in filt.columns]

    numeric_cols = [c for c in agg.columns if c not in ["object_id","Filter","has_filter"]]
    wide = agg.pivot(index="object_id", columns="Filter", values=numeric_cols)
    wide.columns = [f"{c[0]}_{c[1]}" for c in wide.columns]
    wide = wide.reset_index().fillna(0)

    return wide.merge(filt.reset_index(), on="object_id", how="left")


In [11]:
def load_splits(split_ids, mode="train"):
    all_feats = []
    for i in split_ids:
        fname = f"{DATA_DIR}/split_{i:02d}/{mode}_full_lightcurves.csv"
        if os.path.exists(fname):
            df = pd.read_csv(fname)
            feats = build_features_from_lightcurves(df)
            all_feats.append(feats)
            del df
            gc.collect()
    full = pd.concat(all_feats, ignore_index=True)
    return full.groupby("object_id",as_index=False).mean()

print("Loading TRAIN splits...")
train_feats = load_splits(range(1,21), mode="train")

train = train_feats.merge(
    train_log[["object_id","Z","Z_err","EBV","target"]],
    on="object_id", how="left"
).fillna(0)

for c in train.columns:
    if "flux_w_mean" in c:
        train[c+"_x_Z"] = train[c]*train["Z"]

feature_cols = [c for c in train.columns if c not in ["object_id","target"]]
X = train[feature_cols].values
y = train["target"].values

print("Train shape:", train.shape)


Loading TRAIN splits...
Train shape: (3043, 113)


In [12]:
N_FOLDS = 5
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

oof_preds = np.zeros(X.shape[0])

models_lgb = []
models_xgb = []
models_cat = []

pos = (y==1).sum()
neg = (y==0).sum()
scale_pos_weight = neg/(pos+1)

params_lgb = {
    "objective":"binary",
    "metric":"binary_logloss",
    "learning_rate":0.03,
    "num_leaves":96,
    "feature_fraction":0.7,
    "bagging_fraction":0.7,
    "bagging_freq":1,
    "scale_pos_weight":scale_pos_weight,
    "verbosity":-1,
}

params_xgb = {
    "max_depth":7,
    "eta":0.03,
    "subsample":0.7,
    "colsample_bytree":0.7,
    "objective":"binary:logistic",
    "eval_metric":"logloss",
    "scale_pos_weight":scale_pos_weight,
    "tree_method":"hist",
}

print("Start 3-model Ensemble Training...")

for fold,(tr,va) in enumerate(skf.split(X,y)):
    print(f"\n=== FOLD {fold+1}/{N_FOLDS} ===")

    Xtr, ytr = X[tr], y[tr]
    Xva, yva = X[va], y[va]

    # LightGBM
    dtr_lgb = lgb.Dataset(Xtr,label=ytr)
    dva_lgb = lgb.Dataset(Xva,label=yva)
    m_lgb = lgb.train(
        params_lgb, dtr_lgb, num_boost_round=1000,
        valid_sets=[dtr_lgb,dva_lgb],
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
    )
    models_lgb.append(m_lgb)
    pred_lgb = m_lgb.predict(Xva)

    # XGBoost
    dtr_xgb = xgb.DMatrix(Xtr,label=ytr)
    dva_xgb = xgb.DMatrix(Xva,label=yva)
    m_xgb = xgb.train(
        params_xgb, dtr_xgb, num_boost_round=1200,
        evals=[(dva_xgb,"eval")],
        early_stopping_rounds=80,
        verbose_eval=False
    )
    models_xgb.append(m_xgb)
    pred_xgb = m_xgb.predict(dva_xgb)

    # CatBoost
    m_cat = CatBoostClassifier(
        depth=8,
        learning_rate=0.03,
        iterations=1500,
        loss_function="Logloss",
        eval_metric="Logloss",
        random_seed=42,
        verbose=False
    )
    m_cat.fit(Xtr, ytr, eval_set=(Xva,yva))
    models_cat.append(m_cat)
    pred_cat = m_cat.predict_proba(Xva)[:,1]

    # Ensemble → OOF
    oof_preds[va] = (pred_lgb + pred_xgb + pred_cat) / 3


Start 3-model Ensemble Training...

=== FOLD 1/5 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[107]	training's binary_logloss: 0.0271958	valid_1's binary_logloss: 0.129881

=== FOLD 2/5 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[71]	training's binary_logloss: 0.0423343	valid_1's binary_logloss: 0.145148

=== FOLD 3/5 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[97]	training's binary_logloss: 0.0294382	valid_1's binary_logloss: 0.131961

=== FOLD 4/5 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[88]	training's binary_logloss: 0.0348331	valid_1's binary_logloss: 0.14398

=== FOLD 5/5 ===
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[82]	training's binary_logloss: 0.0377144	valid_1's binary_logloss: 0.149949


In [13]:
best_thr = 0
best_f1 = 0

ths = np.linspace(0.05,0.40,200)
for t in ths:
    f1 = f1_score(y, (oof_preds>=t).astype(int))
    if f1>best_f1:
        best_f1 = f1
        best_thr = t

print("Best OOF threshold =", best_thr)
print("Best OOF F1 =", best_f1)


Best OOF threshold = 0.1801507537688442
Best OOF F1 = 0.4125


In [14]:
print("Loading TEST splits...")
test_feats = load_splits(range(1,21), mode="test")

test = test_feats.merge(
    test_log[["object_id","Z","Z_err","EBV"]],
    on="object_id", how="left"
).fillna(0)

for c in test.columns:
    if "flux_w_mean" in c:
        test[c+"_x_Z"] = test[c]*test["Z"]

X_test = test[feature_cols].values
dtest  = xgb.DMatrix(X_test)

sub_preds = np.zeros(X_test.shape[0])

for i in range(N_FOLDS):
    p_lgb = models_lgb[i].predict(X_test)
    p_xgb = models_xgb[i].predict(dtest)
    p_cat = models_cat[i].predict_proba(X_test)[:,1]
    sub_preds += (p_lgb + p_xgb + p_cat)/3

sub_preds /= N_FOLDS

print("Test prediction OK.")


Loading TEST splits...
Test prediction OK.


In [15]:
threshold_list = [best_thr, 
                  best_thr-0.01,
                  best_thr+0.01,
                  best_thr-0.02,
                  best_thr+0.02]

print("Use thresholds:", threshold_list)

for idx, thr in enumerate(threshold_list,1):
    pred = (sub_preds >= thr).astype(int)

    sub_df = pd.DataFrame({
        "object_id": test["object_id"],
        "prediction": pred
    })

    fname = f"submission_FE32_thr{idx}_{thr:.3f}.csv"
    sub_df.to_csv(fname,index=False)

    print(f"Saved {fname} | positives = {pred.sum()}")


Use thresholds: [0.1801507537688442, 0.1701507537688442, 0.19015075376884422, 0.16015075376884422, 0.2001507537688442]
Saved submission_FE32_thr1_0.180.csv | positives = 388
Saved submission_FE32_thr2_0.170.csv | positives = 410
Saved submission_FE32_thr3_0.190.csv | positives = 365
Saved submission_FE32_thr4_0.160.csv | positives = 441
Saved submission_FE32_thr5_0.200.csv | positives = 342
