In [None]:
import numpy as np
import polars as pl
import polars.selectors as cs
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

In [None]:
# Scan the Kepler cumulative KOI table lazily and drop the four identifier
# columns before the frame is materialised.
df = (
    pl.scan_csv("/kaggle/input/kepler-exoplanet-search-results/cumulative.csv")
    .drop("rowid", "kepid", "kepoi_name", "kepler_name")
    .collect()
)

# Column holding the label that the models below predict.
target_col = "koi_disposition"

# Distinct label values, kept in the order they first appear in the column;
# reused later as the class names in the classification reports.
target_names = df[target_col].unique(maintain_order=True)
target_names

In [None]:
# Turn every string column into integer codes so the frame is fully numeric.
#
# The label is encoded through an Enum built from `target_names`, so class
# code k is by construction the position of target_names[k]: the codes are
# contiguous 0..n_classes-1 and line up with the names handed to
# classification_report. Going through Categorical for the label would leave
# that mapping at the mercy of how category codes happen to be assigned
# (e.g. a string cache shared with other columns).
df = (
    df
    .with_columns(pl.col(target_col).cast(pl.Enum(target_names)).to_physical())
    # The remaining string columns are plain features; they only need *some*
    # integer code, so the generic Categorical encoding is sufficient.
    .cast({cs.string(): pl.Categorical})
    .with_columns(cs.categorical().to_physical())
)
df

In [None]:
# Separate the feature columns from the label column.
X = df.drop(target_col)
y = df[target_col]

# Hold out 30% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Per-fold artefacts, collected for inspection after the loop:
#   models        -> (name, fitted XGBClassifier) pairs, later used as the
#                    prefit base estimators of the stacking model
#   losses        -> evals_result() dict of each fold
#   models_preds  -> predictions on each fold's held-out rows
#   clf_reports   -> text classification report of each fold
models = []
losses = []
models_preds = []
clf_reports = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Build the folds from the training partition only. Splitting the full X / y
# would let every base model train on rows that belong to X_test, and the
# stacked model assembled from these prefit models would then be scored on
# data its base learners have already seen (test-set leakage).
for i, (train, test) in enumerate(skf.split(X_train, y_train)):
    print("================================================================\nFold", i)

    # `train` / `test` are positional row indices into X_train / y_train;
    # `test` is this fold's held-out validation slice, not the final X_test.
    X_fold_train, y_fold_train = X_train[train], y_train[train]
    X_fold_valid, y_fold_valid = X_train[test], y_train[test]

    model = xgb.XGBClassifier(random_state=42)
    model.fit(
        X_fold_train,
        y_fold_train,
        # validation_0 -> fold training rows, validation_1 -> held-out rows
        eval_set=[(X_fold_train, y_fold_train), (X_fold_valid, y_fold_valid)],
        verbose=False
    )

    models.append((f"Model {i}", model))

    # Learning curves: per-boosting-round log loss on both eval sets.
    evals_result = model.evals_result()
    losses.append(evals_result)
    plt.plot(evals_result['validation_0']['mlogloss'], label='train')
    plt.plot(evals_result['validation_1']['mlogloss'], label='test')
    plt.title("Multi-Class Log Loss")
    plt.legend()
    plt.show()

    # Score the fold model on the rows it was not trained on.
    y_preds = model.predict(X_fold_valid)
    models_preds.append(y_preds)

    clf_report = classification_report(y_fold_valid, y_preds, target_names=target_names)
    print(clf_report)
    clf_reports.append(clf_report)

In [None]:
# Combine the per-fold XGBoost models under a logistic-regression meta-learner.
# cv="prefit" tells StackingClassifier to use the base estimators as already
# fitted, so fit() below only trains the final estimator on their predictions.
model = StackingClassifier(
    estimators=models,
    final_estimator=LogisticRegression(random_state=42),
    cv="prefit",
)
model.fit(X_train, y_train)

In [None]:
# Evaluate the stacked model on the held-out 30% split.
y_preds = model.predict(X_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_preds,
        target_names=target_names,
    )
)