In [1]:
import os

import torch

from tabular_prediction.methods import svm_predict
from tabular_prediction.metrics import accuracy_metric, balanced_accuracy_metric, cross_entropy_metric, auc_metric

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# single dataset
# Run svm_predict once on the adult dataset and print the metrics of the
# prediction stored under the last time budget.
max_time = [30, 120]
dataset_path = "../datasets/classification/OpenML-CC18/adult.pt"
data = torch.load(dataset_path, map_location='cpu')
x_train, y_train, x_test, y_test = data["data"]
# Indices at which data["cat_features"] is non-zero
# (assumes a 1-D mask over the feature columns — TODO confirm).
cat_features = torch.where(data["cat_features"])[0]

test_y, summary, run_time = svm_predict(
    x_train,
    y_train,
    x_test,
    y_test,
    cat_features=cat_features,
    metric_used=cross_entropy_metric,
    max_time=max_time,
)
# summary is looked up by an entry of max_time; report the last budget only.
pred = summary[max_time[-1]]['pred']

print("\n")
# (label, metric function) pairs, evaluated and printed one at a time, in order.
metric_report = (
    ("accuracy", accuracy_metric),
    ("balanced accuracy", balanced_accuracy_metric),
    ("cross entropy", cross_entropy_metric),
    ("roc auc", auc_metric),
)
for label, metric in metric_report:
    print(f"{label}: {metric(test_y, pred):5.4f} \n")
print(f"run time: {run_time:5.4f} seconds \n")

FileNotFoundError: [Errno 2] No such file or directory: '../datasets/classification/OpenML-CC18/adult.pt'

In [None]:
# all datasets
# For each of the six OpenML-CC18 splits, run svm_predict on every ".pt" dataset
# in the split directory and append one CSV row per entry of `summary`
# (dataset, acc, bacc, ce, auc, time) to a per-split results file.
max_time = [1, 5, 10, 30, 60, 120, 300, 600, 3600]
# Datasets that are never evaluated.
# NOTE(review): the reason for skipping these is not recorded here — confirm.
skipped_datasets = {'mnist_784.pt', 'CIFAR_10.pt', 'Devnagari-Script.pt', 'Fashion-MNIST.pt'}
csv_header = ["dataset", "acc", "bacc", "ce", "auc", "time"]

for split in range(1, 7):
    data_dir = f"../datasets/classification/OpenML-CC18-{split}"
    # endswith() rejects names that merely contain ".pt" (e.g. "foo.pt.bak");
    # sorted() makes the row order reproducible, since os.listdir order is arbitrary.
    datasets = sorted(name for name in os.listdir(data_dir) if name.endswith(".pt"))

    # This cell benchmarks svm_predict, so the results go to an svm-* file.
    # The previous "xgboost-classification-*" name appended SVM rows to the
    # XGBoost result files.
    results_path = f"../results/svm-classification-{split}.csv"
    # The file is opened in append mode so earlier results survive a re-run;
    # only write the header when the file is new or empty, otherwise every
    # re-run would insert a duplicate header row in the middle of the data.
    needs_header = not os.path.exists(results_path) or os.path.getsize(results_path) == 0

    with open(results_path, "a") as f:
        if needs_header:
            f.write(','.join(csv_header))
            f.write('\n')
            f.flush()
        for dataset in datasets:
            if dataset in skipped_datasets:
                continue
            # NOTE: torch.load unpickles the file, which can execute arbitrary
            # code — only load dataset files from a trusted source.
            data = torch.load(os.path.join(data_dir, dataset), map_location='cpu')
            x_train, y_train, x_test, y_test = data["data"]
            # Indices at which data["cat_features"] is non-zero
            # (assumes a 1-D mask over the feature columns — TODO confirm).
            cat_features = torch.where(data["cat_features"])[0]

            test_y, summary, _ = svm_predict(x_train, y_train, x_test, y_test, cat_features=cat_features, metric_used=cross_entropy_metric, max_time=max_time)
            # NOTE(review): the row does not record stop_time, so rows of the
            # same dataset are told apart only by their order and "time" value.
            for stop_time in summary:
                result = summary[stop_time]
                pred = result['pred']
                # Reported time is the sum of the tuning, training and prediction times.
                run_time = result['tune_time'] + result['train_time'] + result['predict_time']
                values = [
                    accuracy_metric(test_y, pred),
                    balanced_accuracy_metric(test_y, pred),
                    cross_entropy_metric(test_y, pred),
                    auc_metric(test_y, pred),
                    run_time,
                ]
                f.write(','.join([dataset] + [f'{val:5.4f}' for val in values]))
                f.write('\n')
                # Flush after every row so partial results are on disk during long runs.
                f.flush()