In [None]:
!pip install lightgbm==3.3.5

In [1]:
import os

import torch

from tabular_prediction.methods import lightgbm_predict
from tabular_prediction.metrics import accuracy_metric, balanced_accuracy_metric, cross_entropy_metric, auc_metric

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Time budget handed to lightgbm_predict(max_time=...) for each dataset; it is also
# embedded in the results CSV file name ("...-time{max_time}.csv").
# NOTE(review): presumably seconds (run_time is reported in seconds) - confirm.
max_time = 30

In [3]:
# single dataset
# Run lightgbm_predict on the OpenML-CC18 "adult" dataset and print test metrics.
data = torch.load("../datasets/classification/OpenML-CC18/adult.pt", map_location='cpu')
x_train, y_train, x_test, y_test = data["data"]
# torch.where on a single tensor returns a tuple of index tensors; keep the first.
# NOTE(review): presumably data["cat_features"] is a boolean mask over feature columns - confirm.
cat_features = torch.where(data["cat_features"])[0]

test_y, pred, run_time = lightgbm_predict(
    x_train,
    y_train,
    x_test,
    y_test,
    cat_features=cat_features,
    metric_used=cross_entropy_metric,
    max_time=max_time,
)

print("\n")
# Metrics are evaluated and printed one at a time, in this order.
for label, metric_fn in (
    ("accuracy", accuracy_metric),
    ("balanced accuracy", balanced_accuracy_metric),
    ("cross entropy", cross_entropy_metric),
    ("roc auc", auc_metric),
):
    print(f"{label}: {metric_fn(test_y, pred):5.4f} \n")
print(f"run time: {run_time:5.4f} seconds \n")

  1%|          | 6/1000 [00:38<1:47:02,  6.46s/trial, best loss: 0.2785500410025945]


accuracy: 0.8890 

balanced accuracy: 0.8090 

cross entropy: 0.2310 

roc auc: 0.9518 

run time: 40.7763 seconds 



In [3]:
# all datasets
# For every OpenML-CC18 split (1-6), run lightgbm_predict on each dataset file in
# the split directory and append one CSV row per dataset (accuracy, balanced
# accuracy, cross entropy, ROC AUC, run time) to
# ../results/lightgbm-classification-{split}-time{max_time}.csv.

# Dataset files excluded from the benchmark (the reason is not recorded here).
skipped_datasets = {'mnist_784.pt', 'CIFAR_10.pt', 'Devnagari-Script.pt', 'Fashion-MNIST.pt'}
csv_header = ["dataset", "acc", "bacc", "ce", "auc", "time"]

for split in range(1, 7):
    data_dir = f"../datasets/classification/OpenML-CC18-{split}"
    # Keep only files with the ".pt" extension; sort them because os.listdir
    # returns entries in arbitrary order and the CSV rows should be reproducible.
    datasets = sorted(name for name in os.listdir(data_dir) if name.endswith(".pt"))

    results_path = f"../results/lightgbm-classification-{split}-time{max_time}.csv"
    # The file is opened in append mode, so rows from earlier runs are kept.
    # Write the header only when the file is new or empty; writing it
    # unconditionally would insert another header line on every re-run.
    needs_header = not os.path.exists(results_path) or os.path.getsize(results_path) == 0

    with open(results_path, "a") as f:
        if needs_header:
            f.write(','.join(csv_header))
            f.write('\n')
            f.flush()

        for dataset in datasets:
            if dataset in skipped_datasets:
                continue

            data = torch.load(os.path.join(data_dir, dataset), map_location='cpu')
            x_train, y_train, x_test, y_test = data["data"]
            # torch.where on a single tensor returns a tuple of index tensors; keep the first.
            cat_features = torch.where(data["cat_features"])[0]

            test_y, pred, run_time = lightgbm_predict(
                x_train,
                y_train,
                x_test,
                y_test,
                cat_features=cat_features,
                metric_used=cross_entropy_metric,
                max_time=max_time,
            )

            # Column order must match csv_header (after the dataset name).
            row_values = [
                accuracy_metric(test_y, pred),
                balanced_accuracy_metric(test_y, pred),
                cross_entropy_metric(test_y, pred),
                auc_metric(test_y, pred),
                run_time,
            ]
            f.write(','.join([dataset] + [f'{val:5.4f}' for val in row_values]))
            f.write('\n')
            # Flush after every row so finished results reach disk even if the
            # loop is interrupted part-way through.
            f.flush()

  0%|          | 1/1000 [00:08<2:26:50,  8.82s/trial, best loss: 0.4773301048197184]


KeyboardInterrupt: 