In [1]:
import openml
import numpy as np

import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def _encode_non_numeric_columns(X, categorical_indicator):
    """Replace categorical / non-numeric columns of ``X`` by integer category codes.

    A column is encoded when OpenML flags it as categorical, or when it cannot
    be cast to ``float32``. ``X`` is modified in place.

    Args:
        X: pandas DataFrame of features.
        categorical_indicator: per-column booleans, in the order of ``X.columns``.

    Returns:
        A new list of booleans marking every column that was encoded. The
        caller's ``categorical_indicator`` is not modified.
    """
    is_categorical = list(categorical_indicator)
    for i, col in enumerate(X.columns):
        if not is_categorical[i]:
            try:
                # Probe only: the actual float32 conversion happens later on
                # the whole frame at once.
                X[col].astype(np.float32)
            except Exception:
                # Best effort: anything that cannot be cast is treated as
                # categorical. `Exception` (rather than a bare `except`) keeps
                # KeyboardInterrupt / SystemExit working during long downloads.
                is_categorical[i] = True
        if is_categorical[i]:
            # pandas assigns code -1 to missing values.
            X[col] = X[col].astype('category').cat.codes
    return is_categorical


def _target_to_float32(y):
    """Convert the target Series ``y`` to a ``float32`` NumPy array.

    Numeric targets are cast directly. Targets that cannot be cast (e.g. string
    class labels) are encoded as category codes first, mirroring the handling
    of feature columns; missing labels then become -1.
    """
    try:
        return y.values.astype('float32')
    except (TypeError, ValueError):
        return y.astype('category').cat.codes.values.astype('float32')


def download_openml_suite(suite_id=353, max_features=120, shuffle=True,
                          split_min=0.88, split_max=0.92, seed=None):
    """Download every dataset of an OpenML suite and save it as a ``.pt`` file.

    For each dataset the features are converted to ``float32`` (categorical and
    non-numeric columns become integer category codes), the rows are optionally
    shuffled, and the data is split into a train and a test part. The result is
    written with ``torch.save`` to ``'<dataset name>.pt'`` (a path relative to
    the current working directory) as a dict with the keys:

    * ``"data"``: tuple ``(X_train, y_train, X_test, y_test)`` of tensors,
    * ``"cat_features"``: long tensor, 1 for every column that was encoded,
    * ``"attribute_names"``: the names returned by OpenML's ``get_data``.

    Args:
        suite_id: OpenML suite (study) id to download.
        max_features: currently unused -- the feature-count filter is disabled,
            so datasets are saved regardless of their number of features. The
            parameter is kept for interface compatibility.
        shuffle: if true, permute the rows before splitting.
        split_min: lower bound of the train fraction.
        split_max: upper bound of the train fraction. The actual fraction is
            drawn uniformly between ``split_min`` and ``split_max`` for each
            dataset.
        seed: if not ``None``, seeds NumPy's *global* random generator so that
            the shuffles and split points are reproducible.

    Returns:
        None. Datasets for which OpenML returns no features or no target are
        skipped.
    """
    if seed is not None:
        np.random.seed(seed)

    benchmark_suite = openml.study.get_suite(suite_id=suite_id)
    datalist = openml.datasets.list_datasets(data_id=benchmark_suite.data, output_format='dataframe')

    for ds in datalist.index:
        entry = datalist.loc[ds]
        name = entry['name']
        did = entry['did']
        print('Downloading', name, did, '..')

        dataset = openml.datasets.get_dataset(int(did))
        X, y, categorical_indicator, attribute_names = dataset.get_data(target=dataset.default_target_attribute)

        # Without features or a target there is nothing to split and save.
        if X is None or y is None:
            print('Skipping', name, did, '(no features or no target)')
            continue

        categorical_indicator = _encode_non_numeric_columns(X, categorical_indicator)

        X = X.values.astype('float32')
        y = _target_to_float32(y)

        N = X.shape[0]

        # Keep the order of the random draws (permutation first, then split
        # fraction) so that a given seed keeps producing the same files.
        if shuffle:
            perm = np.random.permutation(N)
            X = X[perm, :]
            y = y[perm]

        split = int(N*np.random.uniform(split_min, split_max))

        X_train, X_test = X[:split, :], X[split:, :]
        y_train, y_test = y[:split], y[split:]
        X_train, y_train, X_test, y_test = map(torch.tensor, (X_train, y_train, X_test, y_test))
        payload = {
            "data": (X_train, y_train, X_test, y_test),
            "cat_features": torch.tensor(categorical_indicator, dtype=torch.long),
            "attribute_names": attribute_names
        }
        torch.save(payload, f'{name}.pt')


In [None]:
# Download the default suite (id 353) with a fixed seed so the shuffles and
# train/test split points are reproducible; writes one '<name>.pt' file per dataset.
download_openml_suite(seed=40)