In [None]:
from pathlib import Path
from imblearn.over_sampling import SMOTE
from metrics.loss import balanced_log_loss
import pandas as pd
from fastai.tabular.core import cont_cat_split
from sklearn import model_selection, metrics, inspection
from visualize.results import plot_results
from tabpfn import TabPFNClassifier

path = Path("./data")
output_path = Path("./submission")


In [2]:
def resample(X, y):
    sampler = SMOTE()
    X_res, y_res = sampler.fit_resample(X, y)
    return X_res, y_res

In [3]:
train_df = pd.read_csv(path / "train.csv", index_col="Id")

drop_cols = ["EJ"]
dep_vars = ["Class"]

untrainable_cols = drop_cols + dep_vars

# Drops the dep_vars before splitting categorical and continuous variables
cont_names, cat_names = cont_cat_split(train_df, dep_var=untrainable_cols)

train_df, test_df = model_selection.train_test_split(train_df, test_size=0.4, random_state=33)
train_df.shape, test_df.shape

((370, 57), (247, 57))

In [4]:
preprocessor = get_preprocess_pipeline(train_df, cont_names, cat_names, untrainable_cols)
X_pre = preprocessor.fit_transform(train_df.drop(columns=untrainable_cols))
train_df = pd.merge(X_pre, train_df[untrainable_cols], left_index=True, right_index=True)

In [5]:
X_test = preprocessor.transform(test_df.drop(columns=untrainable_cols, errors="ignore"))
y_test = test_df[dep_vars]

In [6]:
X = train_df.drop(columns=untrainable_cols, errors="ignore")
y = train_df[dep_vars]

# Ensembe - Voting

In [7]:
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [8]:
xgm_clf = XGBClassifier(
    objective="binary:logistic",
    colsample_bylevel=0.3,
    colsample_bynode=0.7,
    colsample_bytree=1.0,
    gamma=0.6,
    learning_rate=0.0344,
    max_depth=3,
    min_child_weight=0.5,
    n_estimators=650,
    reg_alpha=0.0,
    reg_lambda=0.0,
    scale_pos_weight=5.5,
    subsample=0.6,
    tree_method="hist",
    eval_metric=balanced_log_loss,
)

lgbm_clf = LGBMClassifier(
    objective="binary",
    verbosity=-1,
    boosting_type="gbdt",
    learning_rate=0.046,
    n_estimators=300,
    max_depth=12,
    scale_pos_weight=10.0,
    subsample=0.9,
    reg_alpha=3.5,
    reg_lambda=2.5,
    colsample_bytree=1.0,
    colsample_bynode=0.6,
    data_sample_strategy="bagging",
    num_leaves=6329,
    max_bin=401,
    n_jobs=-1,
)

# cat_clf = CatBoostClassifier(
#     bagging_temperature=0.5,
#     border_count=29,
#     depth=3,
#     grow_policy="SymmetricTree",
#     iterations=550,
#     l2_leaf_reg=6.0,
#     verbose=0,
#     task_type="GPU",
#     devices="0",
# )

tabpfn_clf = TabPFNClassifier(device='cuda', N_ensemble_configurations=64, only_inference=False, no_preprocess_mode=True)
    

ensemble_clf = VotingClassifier(
    estimators=[
        ("xgm", xgm_clf),
        ("lgbm", lgbm_clf),
        ("tabpfn", tabpfn_clf),
    ],
    voting="soft",
)


Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters


In [9]:
kfold = model_selection.RepeatedStratifiedKFold(n_splits=7, n_repeats=4)

for idx in kfold.split(X, y):
    train_idx, _ = idx
    X_train = X.iloc[train_idx]
    y_train = y.iloc[train_idx]

    X_res, y_res = resample(X_train, y_train.values.ravel())

    ensemble_clf.fit(X_res, y_res)

Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Using style prior: True
Using cuda device
Using a Transformer with 25.82 M parameters
Using style prior: True
Using cuda device
Using a Tran

In [10]:
pred_probs = ensemble_clf.predict_proba(X)
y_true = y.values.ravel()

y_pred = pred_probs.argmax(axis=1)

log_loss_val = metrics.log_loss(y_true, pred_probs)
balanced_log_loss_val = balanced_log_loss(y_true, pred_probs)

accuracy_val = metrics.accuracy_score(y_true, y_pred)
kappa_val = metrics.cohen_kappa_score(y_true, y_pred)
f1_val = metrics.f1_score(y_true, y_pred)

print(f"Log loss: {log_loss_val:.4f}")
print(f"Balanced log loss: {balanced_log_loss_val:.4f}")
print(f"Accuracy: {accuracy_val:.4f}")
print(f"Kappa: {kappa_val:.4f}")
print(f"F1: {f1_val:.4f}")

Log loss: 0.0472
Balanced log loss: 0.0635
Accuracy: 0.9919
Kappa: 0.9682
F1: 0.9730


In [26]:
pred_probs = ensemble_clf.predict_proba(X_test)
y_true = y_test.values.ravel()

y_pred = pred_probs.argmax(axis=1)

log_loss_val = metrics.log_loss(y_true, pred_probs)
balanced_log_loss_val = balanced_log_loss(y_true, pred_probs)

accuracy_val = metrics.accuracy_score(y_true, y_pred)
kappa_val = metrics.cohen_kappa_score(y_true, y_pred)
f1_val = metrics.f1_score(y_true, y_pred)

print(f"Log loss: {log_loss_val:.4f}")
print(f"Balanced log loss: {balanced_log_loss_val:.4f}")
print(f"Accuracy: {accuracy_val:.4f}")
print(f"Kappa: {kappa_val:.4f}")
print(f"F1: {f1_val:.4f}")


np.find_common_type is deprecated.  Please use `np.result_type` or `np.promote_types`.
See https://numpy.org/devdocs/release/1.25.0-notes.html and the docs for more information.  (Deprecated NumPy 1.25)



Log loss: 0.2009
Balanced log loss: 0.2639
Accuracy: 0.9231
Kappa: 0.7702
F1: 0.8190
