In [1]:
import math
import openml
import numpy as np

import torch

In [2]:
def _encode_feature_columns(X, categorical_indicator):
    """Convert an OpenML feature frame into a float32 matrix.

    Columns flagged in ``categorical_indicator`` are replaced by their integer
    category codes. Columns that are not flagged but cannot be cast to float32
    are treated the same way, and their flag is switched to ``True``.

    Args:
        X: pandas DataFrame of features; its columns are overwritten in place.
        categorical_indicator: list of bools, one per column; updated in place.

    Returns:
        ``X.values`` cast to float32.
    """
    for i, col in enumerate(X.columns):
        if not categorical_indicator[i]:
            try:
                # Probe only: the result is discarded, the real cast happens
                # once for the whole frame at the end.
                X[col].astype(np.float32)
                continue
            except (TypeError, ValueError):
                # Not numeric after all -> fall through and encode as category.
                # (A bare ``except`` here would also swallow KeyboardInterrupt.)
                categorical_indicator[i] = True
        X[col] = X[col].astype('category').cat.codes
    return X.values.astype('float32')


def _split_train_test(X, y, split_min, split_max, max_test_size=1000):
    """Cut the trailing rows of ``X``/``y`` off as a test set.

    The train fraction is drawn uniformly from ``[split_min, split_max)`` using
    NumPy's global RNG; the test set is capped at ``max_test_size`` rows.

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    n_rows = X.shape[0]
    test_size = n_rows - int(n_rows * np.random.uniform(split_min, split_max))
    test_size = min(test_size, max_test_size)
    return X[:-test_size, :], y[:-test_size], X[-test_size:, :], y[-test_size:]


def download_openml_suite(suite_id=99, max_features=500, shuffle=True,
                          split_min=0.88, split_max=0.92, seed=None):
    """Download every dataset of an OpenML suite and save it as ``<name>.pt``.

    For each dataset the features are converted to a float32 matrix
    (categorical columns become integer codes), the target is converted to
    integer class codes, rows are optionally shuffled, and the trailing rows
    are split off as a test set. The result is written with ``torch.save`` to
    ``f'{name}.pt'`` in the current working directory as a dict with keys
    ``"data"`` (``(X_train, y_train, X_test, y_test)`` tensors),
    ``"cat_features"`` (long tensor of 0/1 flags) and ``"attribute_names"``.

    Args:
        suite_id: OpenML suite id passed to ``openml.study.get_suite``.
        max_features: accepted for backward compatibility; the feature-count
            filter is currently disabled, so this value is ignored.
        shuffle: if true, permute the rows before splitting.
        split_min: lower bound of the randomly drawn train fraction.
        split_max: upper bound of the randomly drawn train fraction.
        seed: if not ``None``, seeds NumPy's *global* RNG before any work.

    Returns:
        List with the number of target classes of every dataset that was saved,
        in download order.
    """
    if seed is not None:
        np.random.seed(seed)

    benchmark_suite = openml.study.get_suite(suite_id=suite_id)
    datalist = openml.datasets.list_datasets(data_id=benchmark_suite.data, output_format='dataframe')

    n_classes = []
    for ds in datalist.index:
        entry = datalist.loc[ds]
        name = entry['name']
        did = entry['did']
        print('Downloading', name, did, '..')

        openml_dataset = openml.datasets.get_dataset(int(did))
        X, y, categorical_indicator, attribute_names = openml_dataset.get_data(
            target=openml_dataset.default_target_attribute)

        # Nothing to save without features or without a target column.
        if X is None or y is None:
            continue

        X = _encode_feature_columns(X, categorical_indicator)
        # NOTE: `max_features` is intentionally not enforced here (see docstring).

        y_categorical = y.astype('category')
        n_classes.append(y_categorical.cat.categories.size)
        y = y_categorical.cat.codes.values

        if shuffle:
            perm = np.random.permutation(X.shape[0])
            X = X[perm, :]
            y = y[perm]

        splits = _split_train_test(X, y, split_min, split_max)
        X_train, y_train, X_test, y_test = map(torch.tensor, splits)
        payload = {
            "data": (X_train, y_train, X_test, y_test),
            "cat_features": torch.tensor(categorical_indicator, dtype=torch.long),
            "attribute_names": attribute_names
        }
        torch.save(payload, f'{name}.pt')

    return n_classes


In [None]:
# Download the default suite (suite_id=99) with NumPy's global RNG seeded to 44,
# so the row shuffle and the randomly drawn split sizes are repeatable.
# The call writes one `<name>.pt` file per dataset into the working directory
# and returns the list of per-dataset class counts.
download_openml_suite(seed=44)