# Import packages

In [1]:
import os, pandas as pd, sklearn, arff, pprint, numpy as np, seaborn as sns
from sklearn import neighbors
from tqdm import tqdm
# import missingno as msno
from sklearn.model_selection import StratifiedShuffleSplit
import warnings

import openml
from openml import datasets
from openml.datasets import get_dataset # edit_dataset, fork_dataset
from openml.tasks import TaskType
# --- OpenML client configuration -------------------------------------------
# The API key is read from the OPENML_APIKEY environment variable instead of
# being hard-coded, so the credential is never stored in the notebook.
# NOTE: a key was previously committed on this line; it should be treated as
# leaked and rotated in the OpenML account settings.
# When the variable is unset, openml keeps whatever key it already has.
_openml_apikey = os.environ.get('OPENML_APIKEY')
if _openml_apikey:
    openml.config.apikey = _openml_apikey
openml.config.server = 'https://www.openml.org/api/v1' # https://test.openml.org/api/v1/xml
# Expand '~' once and reuse the same absolute path for both cache settings;
# the original assigned the unexpanded '~/openml/cache' string to `cachedir`.
_openml_cache_dir = os.path.expanduser('~/openml/cache')
openml.config.set_cache_directory(_openml_cache_dir)
openml.config.cachedir = _openml_cache_dir

# 1. OpenML-CC18 (SCARF) 
- Out of 72 datasets,
    - Exclude image datasets: mnist_784, Fashion-MNIST, CIFAR_10 (69 left)
    - Exclude the ones without categorical variables (23 left)


In [123]:
# Fetch the OpenML-CC18 benchmark suite by its alias. The original cell also
# fetched suite_id=99 first and immediately overwrote the result; the output
# below shows the alias resolves to ID 99, so one request is enough.
benchmark_suite = openml.study.get_suite(suite_id='OpenML-CC18')
benchmark_suite

OpenML Benchmark Suite
ID..............: 99
Name............: OpenML-CC18 Curated Classification benchmark
Status..........: active
Main Entity Type: task
Study URL.......: https://www.openml.org/s/99
# of Data.......: 72
# of Tasks......: 72
Creator.........: https://www.openml.org/u/1
Upload Time.....: 2019-02-21 18:47:13

In [124]:
# Summarise the suite: how many tasks / datasets it holds and their ids.
suite_task_ids = benchmark_suite.tasks
suite_data_ids = benchmark_suite.data

print('number of tasks:', len(suite_task_ids))
print('task ids:', suite_task_ids)
print('')
print('number of data:', len(suite_data_ids))
print('data ids:', suite_data_ids)


number of tasks: 72
task ids: [3, 6, 11, 12, 14, 15, 16, 18, 22, 23, 28, 29, 31, 32, 37, 43, 45, 49, 53, 219, 2074, 2079, 3021, 3022, 3481, 3549, 3560, 3573, 3902, 3903, 3904, 3913, 3917, 3918, 7592, 9910, 9946, 9952, 9957, 9960, 9964, 9971, 9976, 9977, 9978, 9981, 9985, 10093, 10101, 14952, 14954, 14965, 14969, 14970, 125920, 125922, 146195, 146800, 146817, 146819, 146820, 146821, 146822, 146824, 146825, 167119, 167120, 167121, 167124, 167125, 167140, 167141]

number of data: 72
data ids: [3, 6, 11, 12, 14, 15, 16, 18, 22, 23, 28, 29, 31, 32, 37, 44, 46, 50, 54, 151, 182, 188, 38, 307, 300, 458, 469, 554, 1049, 1050, 1053, 1063, 1067, 1068, 1590, 4134, 1510, 1489, 1494, 1497, 1501, 1480, 1485, 1486, 1487, 1468, 1475, 1462, 1464, 4534, 6332, 1461, 4538, 1478, 23381, 40499, 40668, 40966, 40982, 40994, 40983, 40975, 40984, 40979, 40996, 41027, 23517, 40923, 40927, 40978, 40670, 40701]


In [125]:
# Keep the dataset ids of the current suite under their own name, because
# `benchmark_suite` is reassigned by later sections.
data_ids_1 = benchmark_suite.data

In [107]:
# Get datasets: metadata table (one row per dataset) for the suite's data ids.
dlist = openml.datasets.list_datasets(data_id=benchmark_suite.data, output_format='dataframe')

# Drop image datasets: any name containing 'mnist' or 'CIFAR'.
# Matching is case-insensitive, so 'mnist' and 'MNIST' select the same rows.
for image_keyword in ('mnist', 'MNIST', 'CIFAR'):
    dlist = dlist[~dlist['name'].str.contains(image_keyword, case=False)]

dlist

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
11,11,balance-scale,1,1,active,ARFF,288.0,3.0,49.0,3.0,5.0,625.0,0.0,0.0,4.0,1.0
12,12,mfeat-factors,1,1,active,ARFF,200.0,10.0,200.0,10.0,217.0,2000.0,0.0,0.0,216.0,1.0
14,14,mfeat-fourier,1,1,active,ARFF,200.0,10.0,200.0,10.0,77.0,2000.0,0.0,0.0,76.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40982,40982,steel-plates-fault,3,4265,active,ARFF,673.0,7.0,55.0,7.0,28.0,1941.0,0.0,0.0,27.0,1.0
40983,40983,wilt,2,4265,active,ARFF,4578.0,2.0,261.0,2.0,6.0,4839.0,0.0,0.0,5.0,1.0
40984,40984,segment,3,4265,active,ARFF,330.0,7.0,330.0,7.0,20.0,2310.0,0.0,0.0,19.0,1.0
40994,40994,climate-model-simulation-crashes,4,4265,active,ARFF,494.0,2.0,46.0,2.0,21.0,540.0,0.0,0.0,20.0,1.0


In [108]:
# Tally datasets by feature-type composition using the OpenML metadata columns.
numeric_counts = dlist['NumberOfNumericFeatures']
symbolic_counts = dlist['NumberOfSymbolicFeatures']

# Sanity check: numeric + symbolic should add up to the total feature count.
# Any False in the value counts flags a dataset where it does not.
temp = dlist['NumberOfFeatures'] == numeric_counts + symbolic_counts
print(temp.value_counts())

# Datasets with no numeric feature at all.
print('number of rows where NumberOfNumericFeatures is zero:', int((numeric_counts == 0).sum()))
# NumberOfSymbolicFeatures includes the class, so a value of 1 means there is
# no categorical input variable.
print('number of rows where NumberOfSymbolicFeatures is one:', int((symbolic_counts == 1).sum()))

True    69
dtype: int64
number of rows where NumberOfNumericFeatures is zero: 8
number of rows where NumberOfSymbolicFeatures is one: 45


In [109]:
# Get tasks: metadata table for exactly the task ids of the current suite.
# The original cell first listed tasks by tag="OpenML-CC18" and then
# overwrote that result with this call; the discarded request is removed.
tlist = openml.tasks.list_tasks(task_id=benchmark_suite.tasks, output_format="dataframe")
tlist

Unnamed: 0,tid,ttid,did,name,task_type,status,estimation_procedure,source_data,target_feature,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
3,3,TaskType.SUPERVISED_CLASSIFICATION,3,kr-vs-kp,Supervised Classification,active,10-fold Crossvalidation,3,class,1669,3.0,1527,2,37,3196,0,0,0,37
6,6,TaskType.SUPERVISED_CLASSIFICATION,6,letter,Supervised Classification,active,10-fold Crossvalidation,6,class,813,26.0,734,26,17,20000,0,0,16,1
11,11,TaskType.SUPERVISED_CLASSIFICATION,11,balance-scale,Supervised Classification,active,10-fold Crossvalidation,11,class,288,3.0,49,3,5,625,0,0,4,1
12,12,TaskType.SUPERVISED_CLASSIFICATION,12,mfeat-factors,Supervised Classification,active,10-fold Crossvalidation,12,class,200,10.0,200,10,217,2000,0,0,216,1
14,14,TaskType.SUPERVISED_CLASSIFICATION,14,mfeat-fourier,Supervised Classification,active,10-fold Crossvalidation,14,class,200,10.0,200,10,77,2000,0,0,76,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167121,167121,TaskType.SUPERVISED_CLASSIFICATION,40923,Devnagari-Script,Supervised Classification,active,10-fold Crossvalidation,40923,character,2000,46.0,2000,46,1025,92000,0,0,1024,1
167124,167124,TaskType.SUPERVISED_CLASSIFICATION,40927,CIFAR_10,Supervised Classification,active,10-fold Crossvalidation,40927,class,6000,10.0,6000,10,3073,60000,0,0,3072,1
167125,167125,TaskType.SUPERVISED_CLASSIFICATION,40978,Internet-Advertisements,Supervised Classification,active,10-fold Crossvalidation,40978,class,2820,2.0,459,2,1559,3279,0,0,3,1556
167140,167140,TaskType.SUPERVISED_CLASSIFICATION,40670,dna,Supervised Classification,active,10-fold Crossvalidation,40670,class,1654,3.0,765,3,181,3186,0,0,0,181


## Collect

In [110]:
# Output root for section 1, relative to the notebook's working directory.
# The trailing slash matters: later cells append '<did>-<name>' directly.
current_dir = os.getcwd()
save_path = f'{current_dir}/data/data_1/'

print(save_path)

c:\Users\lesga\OneDrive\문서\JupyterLab\연구_SSLT\Dataset_OpenML/data/data_1/


In [111]:
dataset_dict = {}
did_SCARF = []  # ids of datasets that have at least one categorical variable

# For every dataset in `dlist`: download it, keep it only if it mixes
# categorical and continuous variables, apply the task's predefined split and
# write train.csv / test.csv / cat_feat_names.csv under save_path/<did>-<name>/.
for did, name in tqdm(zip(dlist['did'], dlist['name']), total=len(dlist)):
    # Download the dataset; the row-id and "ignore" attributes are requested
    # as well (see flags), and identifier-like columns are removed below.
    odata = openml.datasets.get_dataset(did)
    X, y, categorical_indicator, attribute_names = odata.get_data(
        target=odata.default_target_attribute,
        include_row_id=True,
        include_ignore_attribute=True,
    )
    y.name = 'class' # the name of the 'y' variable

    # Only data with categorical variables.
    if not any(categorical_indicator):
        continue
    did_SCARF.append(did)

    # Split attribute names into numeric and categorical.
    num_feat_names = [col for col, is_cat in zip(attribute_names, categorical_indicator) if not is_cat]
    cat_feat_names = [col for col, is_cat in zip(attribute_names, categorical_indicator) if is_cat]

    # Only data with continuous variables.
    if not num_feat_names:
        continue

    # Order columns: numeric first, then categorical.
    X = X[num_feat_names + cat_feat_names]
    # Drop constant columns (cardinality 1) and columns whose cardinality
    # equals the number of rows (treated as ids).
    n_unique = X.nunique()
    X = X.loc[:, (n_unique != 1) & (n_unique != X.shape[0])]
    # Fix: list only categorical columns that survived the filters above, so
    # cat_feat_names.csv never names a column missing from train/test.csv.
    kept_columns = set(X.columns)
    cat_feat_names = [col for col in cat_feat_names if col in kept_columns]

    # Look up the task for this dataset and use its first repeat/fold/sample.
    # NOTE(review): cast to a plain int because the lookup yields a NumPy
    # integer, which some openml versions are believed to reject - confirm.
    task_id = int(tlist.loc[tlist['did'] == did, 'tid'].iloc[0])
    task = openml.tasks.get_task(task_id)
    train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    train = pd.concat([X_train, y_train], axis=1)
    test = pd.concat([X_test, y_test], axis=1)

    # One folder per dataset.
    out_dir = save_path + f'{did}-{name}'
    os.makedirs(out_dir, exist_ok=True)
    train.to_csv(out_dir + '/train.csv', index=False)
    test.to_csv(out_dir + '/test.csv', index=False)
    pd.DataFrame(cat_feat_names).to_csv(out_dir + '/cat_feat_names.csv', index=False)


100%|██████████| 69/69 [08:22<00:00,  7.29s/it]

15





# 2. Why do tree-based models still outperform deep learning on tabular data?  

In [14]:
# Categorical classification, OpenML benchmark
# https://www.openml.org/search?type=benchmark&sort=tasks_included&study_type=task&id=300

In [133]:
# Suite 300 (see the URL in the cell above).
CATEGORICAL_CLASSIFICATION_SUITE_ID = 300
benchmark_suite = openml.study.get_suite(suite_id=CATEGORICAL_CLASSIFICATION_SUITE_ID)
benchmark_suite

OpenML Benchmark Suite
ID..............: 300
Name............: Tabular benchmark categorical classification
Status..........: in_preparation
Main Entity Type: task
Study URL.......: https://www.openml.org/s/300
# of Data.......: 7
# of Tasks......: 7
Creator.........: https://www.openml.org/u/26324
Upload Time.....: 2022-07-12 18:58:05

In [134]:
# Summarise the suite: how many tasks / datasets it holds and their ids.
suite_task_ids = benchmark_suite.tasks
suite_data_ids = benchmark_suite.data

print('number of tasks:', len(suite_task_ids))
print('task ids:', suite_task_ids)
print('')
print('number of data:', len(suite_data_ids))
print('data ids:', suite_data_ids)


number of tasks: 7
task ids: [361110, 361111, 361112, 361113, 361114, 361115, 361116]

number of data: 7
data ids: [44156, 44157, 44158, 44159, 44160, 44161, 44162]


In [135]:
# Keep the dataset ids of the current suite under their own name, because
# `benchmark_suite` is reassigned by later sections.
data_ids_2 = benchmark_suite.data

In [136]:
"""
Get datasets
"""

# Metadata table (one row per dataset) restricted to the current suite's
# data ids; this rebinds `dlist` from the previous section.
dlist = datasets.list_datasets(data_id=benchmark_suite.data, output_format='dataframe')
dlist

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
44156,44156,electricity,13,26324,active,arff,19237.0,19237.0,2.0,9.0,38474.0,0.0,0.0,7.0,2.0
44157,44157,eye_movements,8,26324,active,arff,3804.0,3804.0,2.0,24.0,7608.0,0.0,0.0,20.0,4.0
44158,44158,KDDCup09_upselling,4,26324,active,arff,2516.0,2516.0,2.0,46.0,5032.0,0.0,0.0,34.0,12.0
44159,44159,covertype,13,26324,active,arff,211840.0,211840.0,2.0,55.0,423680.0,0.0,0.0,10.0,45.0
44160,44160,rl,4,26324,active,arff,2485.0,2485.0,2.0,13.0,4970.0,0.0,0.0,5.0,8.0
44161,44161,road-safety,6,26324,active,arff,55881.0,55881.0,2.0,33.0,111762.0,0.0,0.0,29.0,4.0
44162,44162,compass,3,26324,active,arff,8322.0,8322.0,2.0,18.0,16644.0,0.0,0.0,8.0,10.0


In [13]:
# Tally datasets by feature-type composition using the OpenML metadata columns.
numeric_counts = dlist['NumberOfNumericFeatures']
symbolic_counts = dlist['NumberOfSymbolicFeatures']

# Sanity check: numeric + symbolic should add up to the total feature count.
# Any False in the value counts flags a dataset where it does not.
temp = dlist['NumberOfFeatures'] == numeric_counts + symbolic_counts
print(temp.value_counts())

# Datasets with no numeric feature at all.
print('number of rows where NumberOfNumericFeatures is zero:', int((numeric_counts == 0).sum()))
# NumberOfSymbolicFeatures includes the class, so a value of 1 means there is
# no categorical input variable.
print('number of rows where NumberOfSymbolicFeatures is one:', int((symbolic_counts == 1).sum()))

True    7
dtype: int64
number of rows where NumberOfNumericFeatures is zero: 0
number of rows where NumberOfSymbolicFeatures is one: 0


In [15]:
# Get tasks: metadata table for the task ids of the current suite.
# The collect loop below uses it to map a dataset id ('did') to its task id ('tid').

tlist = openml.tasks.list_tasks(task_id=benchmark_suite.tasks, output_format="dataframe")
tlist

Unnamed: 0,tid,ttid,did,name,task_type,status,estimation_procedure,evaluation_measures,source_data,target_feature,MajorityClassSize,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
361110,361110,TaskType.SUPERVISED_CLASSIFICATION,44156,electricity,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,44156,class,19237,19237,2,9,38474,0,0,7,2
361111,361111,TaskType.SUPERVISED_CLASSIFICATION,44157,eye_movements,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,44157,label,3804,3804,2,24,7608,0,0,20,4
361112,361112,TaskType.SUPERVISED_CLASSIFICATION,44158,KDDCup09_upselling,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,44158,UPSELLING,2516,2516,2,46,5032,0,0,34,12
361113,361113,TaskType.SUPERVISED_CLASSIFICATION,44159,covertype,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,44159,class,211840,211840,2,55,423680,0,0,10,45
361114,361114,TaskType.SUPERVISED_CLASSIFICATION,44160,rl,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,44160,class,2485,2485,2,13,4970,0,0,5,8
361115,361115,TaskType.SUPERVISED_CLASSIFICATION,44161,road-safety,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,44161,Sex_of_Driver,55881,55881,2,33,111762,0,0,29,4
361116,361116,TaskType.SUPERVISED_CLASSIFICATION,44162,compass,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,44162,is_recid,8322,8322,2,18,16644,0,0,8,10


## Collect

In [16]:
# Output root for section 2, relative to the notebook's working directory.
# The trailing slash matters: later cells append '<did>-<name>' directly.
current_dir = os.getcwd()
save_path = f'{current_dir}/data/data_2/'

print(save_path)


/home/ubuntu/User/LEE/연구_신한/Dataset_OpenML/data/data_2/


In [17]:
dataset_dict = {}
did_trees = []  # ids of datasets that have at least one categorical variable

# For every dataset in `dlist`: download it, keep it only if it has
# categorical variables, apply the task's predefined split and write
# train.csv / test.csv / cat_feat_names.csv under save_path/<did>-<name>/.
for did, name in tqdm(zip(dlist['did'], dlist['name']), total=len(dlist)):
    # Download the dataset; the row-id and "ignore" attributes are requested
    # as well (see flags), and identifier-like columns are removed below.
    odata = openml.datasets.get_dataset(did)
    X, y, categorical_indicator, attribute_names = odata.get_data(
        target=odata.default_target_attribute,
        include_row_id=True,
        include_ignore_attribute=True,
    )
    y.name = 'class' # the name of the 'y' variable

    # Only data with categorical variables.
    if not any(categorical_indicator):
        continue
    did_trees.append(did)

    # Split attribute names into numeric and categorical.
    num_feat_names = [col for col, is_cat in zip(attribute_names, categorical_indicator) if not is_cat]
    cat_feat_names = [col for col, is_cat in zip(attribute_names, categorical_indicator) if is_cat]

    # Order columns: numeric first, then categorical.
    X = X[num_feat_names + cat_feat_names]
    # Drop constant columns (cardinality 1) and columns whose cardinality
    # equals the number of rows (treated as ids).
    n_unique = X.nunique()
    X = X.loc[:, (n_unique != 1) & (n_unique != X.shape[0])]
    # Fix: list only categorical columns that survived the filters above, so
    # cat_feat_names.csv never names a column missing from train/test.csv.
    kept_columns = set(X.columns)
    cat_feat_names = [col for col in cat_feat_names if col in kept_columns]

    # Look up the task for this dataset and use its first repeat/fold/sample.
    # NOTE(review): cast to a plain int because the lookup yields a NumPy
    # integer, which some openml versions are believed to reject - confirm.
    task_id = int(tlist.loc[tlist['did'] == did, 'tid'].iloc[0])
    task = openml.tasks.get_task(task_id)
    train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    train = pd.concat([X_train, y_train], axis=1)
    test = pd.concat([X_test, y_test], axis=1)

    # One folder per dataset.
    out_dir = save_path + f'{did}-{name}'
    os.makedirs(out_dir, exist_ok=True)
    train.to_csv(out_dir + '/train.csv', index=False)
    test.to_csv(out_dir + '/test.csv', index=False)
    pd.DataFrame(cat_feat_names).to_csv(out_dir + '/cat_feat_names.csv', index=False)


100%|██████████| 7/7 [00:33<00:00,  4.72s/it]


In [15]:
# Numerical classification, OpenML benchmark
# https://www.openml.org/search?type=benchmark&study_type=task&sort=tasks_included&id=298

In [48]:
# Suite 298 (see the URL in the cell above).
NUMERICAL_CLASSIFICATION_SUITE_ID = 298
benchmark_suite = openml.study.get_suite(suite_id=NUMERICAL_CLASSIFICATION_SUITE_ID)
benchmark_suite

OpenML Benchmark Suite
ID..............: 298
Name............: Tabular benchmark numerical classification
Status..........: in_preparation
Main Entity Type: task
Study URL.......: https://www.openml.org/s/298
# of Data.......: 15
# of Tasks......: 15
Creator.........: https://www.openml.org/u/26324
Upload Time.....: 2022-07-10 11:00:56

In [49]:
# Summarise the suite: how many tasks / datasets it holds and their ids.
suite_task_ids = benchmark_suite.tasks
suite_data_ids = benchmark_suite.data

print('number of tasks:', len(suite_task_ids))
print('task ids:', suite_task_ids)
print('')
print('number of data:', len(suite_data_ids))
print('data ids:', suite_data_ids)


number of tasks: 15
task ids: [361055, 361056, 361057, 361060, 361061, 361062, 361063, 361064, 361065, 361066, 361067, 361068, 361069, 361070, 361071]

number of data: 15
data ids: [44089, 44090, 44091, 44120, 44121, 44122, 44123, 44124, 44125, 44126, 44127, 44128, 44129, 44130, 44131]


In [50]:
# Keep the dataset ids of the current suite under their own name, because
# `benchmark_suite` is reassigned by later sections.
data_ids_3 = benchmark_suite.data

In [18]:
"""
Get datasets
"""

# Metadata table (one row per dataset) restricted to the current suite's
# data ids; this rebinds `dlist` from the previous section.
dlist = datasets.list_datasets(data_id=benchmark_suite.data, output_format='dataframe')
dlist

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
44089,44089,credit,2,26324,active,arff,8357.0,8357.0,2.0,11.0,16714.0,0.0,0.0,10.0,1.0
44090,44090,california,5,26324,active,arff,10317.0,10317.0,2.0,9.0,20634.0,0.0,0.0,8.0,1.0
44091,44091,wine,8,26324,active,arff,1277.0,1277.0,2.0,12.0,2554.0,0.0,0.0,11.0,1.0
44120,44120,electricity,12,26324,active,arff,19237.0,19237.0,2.0,8.0,38474.0,0.0,0.0,7.0,1.0
44121,44121,covertype,12,26324,active,arff,283301.0,283301.0,2.0,11.0,566602.0,0.0,0.0,10.0,1.0
44122,44122,pol,10,26324,active,arff,5041.0,5041.0,2.0,27.0,10082.0,0.0,0.0,26.0,1.0
44123,44123,house_16H,10,26324,active,arff,6744.0,6744.0,2.0,17.0,13488.0,0.0,0.0,16.0,1.0
44124,44124,kdd_ipums_la_97-small,6,26324,active,arff,2594.0,2594.0,2.0,21.0,5188.0,0.0,0.0,20.0,1.0
44125,44125,MagicTelescope,7,26324,active,arff,6688.0,6688.0,2.0,11.0,13376.0,0.0,0.0,10.0,1.0
44126,44126,bank-marketing,8,26324,active,arff,5289.0,5289.0,2.0,8.0,10578.0,0.0,0.0,7.0,1.0


In [19]:
# Tally datasets by feature-type composition using the OpenML metadata columns.
numeric_counts = dlist['NumberOfNumericFeatures']
symbolic_counts = dlist['NumberOfSymbolicFeatures']

# Sanity check: numeric + symbolic should add up to the total feature count.
# Any False in the value counts flags a dataset where it does not.
temp = dlist['NumberOfFeatures'] == numeric_counts + symbolic_counts
print(temp.value_counts())

# Datasets with no numeric feature at all.
print('number of rows where NumberOfNumericFeatures is zero:', int((numeric_counts == 0).sum()))
# NumberOfSymbolicFeatures includes the class, so a value of 1 means there is
# no categorical input variable.
print('number of rows where NumberOfSymbolicFeatures is one:', int((symbolic_counts == 1).sum()))

True    15
dtype: int64
number of rows where NumberOfNumericFeatures is zero: 0
number of rows where NumberOfSymbolicFeatures is one: 15


# 3. AutoML

In [2]:
# Suite 271 is used for the AutoML section.
AUTOML_CLASSIFICATION_SUITE_ID = 271
benchmark_suite = openml.study.get_suite(suite_id=AUTOML_CLASSIFICATION_SUITE_ID)
benchmark_suite

OpenML Benchmark Suite
ID..............: 271
Name............: AutoML Benchmark All Classification
Status..........: in_preparation
Main Entity Type: task
Study URL.......: https://www.openml.org/s/271
# of Data.......: 71
# of Tasks......: 71
Creator.........: https://www.openml.org/u/869
Upload Time.....: 2020-11-19 20:52:19

In [3]:
# Summarise the suite: how many tasks / datasets it holds and their ids.
suite_task_ids = benchmark_suite.tasks
suite_data_ids = benchmark_suite.data

print('number of tasks:', len(suite_task_ids))
print('task ids:', suite_task_ids)
print('')
print('number of data:', len(suite_data_ids))
print('data ids:', suite_data_ids)


number of tasks: 71
task ids: [2073, 3945, 7593, 10090, 146818, 146820, 167120, 168350, 168757, 168784, 168868, 168909, 168910, 168911, 189354, 189355, 189356, 189922, 190137, 190146, 190392, 190410, 190411, 190412, 211979, 211986, 359953, 359954, 359955, 359956, 359957, 359958, 359959, 359960, 359961, 359962, 359963, 359964, 359965, 359966, 359967, 359968, 359969, 359970, 359971, 359972, 359973, 359974, 359975, 359976, 359977, 359979, 359980, 359981, 359982, 359983, 359984, 359985, 359986, 359987, 359988, 359989, 359990, 359991, 359992, 359993, 359994, 360112, 360113, 360114, 360975]

number of data: 71
data ids: [181, 1111, 1596, 1457, 40981, 40983, 23517, 1489, 31, 40982, 41138, 41163, 41164, 41143, 1169, 41167, 41147, 41158, 1487, 54, 41144, 41145, 41156, 41157, 41168, 4541, 1515, 188, 1464, 1494, 1468, 1049, 23, 40975, 12, 1067, 40984, 40670, 3, 40978, 4134, 40701, 1475, 4538, 4534, 41146, 41142, 40498, 40900, 40996, 40668, 4135, 1486, 41027, 1461, 1590, 41169, 41166, 41165, 40685

In [4]:
# Keep the dataset ids of the current suite under their own name, because
# `benchmark_suite` is reassigned by later sections.
data_ids_4 = benchmark_suite.data

In [5]:
"""
Get datasets
"""

# Metadata table (one row per dataset) restricted to the current suite's
# data ids; this rebinds `dlist` from the previous section.
dlist = datasets.list_datasets(data_id=benchmark_suite.data, output_format='dataframe')
dlist

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
12,12,mfeat-factors,1,1,active,ARFF,200.0,10.0,200.0,10.0,217.0,2000.0,0.0,0.0,216.0,1.0
23,23,cmc,1,1,active,ARFF,629.0,4.0,333.0,3.0,10.0,1473.0,0.0,0.0,2.0,8.0
31,31,credit-g,1,1,active,ARFF,700.0,10.0,300.0,2.0,21.0,1000.0,0.0,0.0,7.0,14.0
54,54,vehicle,1,1,active,ARFF,218.0,4.0,199.0,4.0,19.0,846.0,0.0,0.0,18.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42734,42734,okcupid-stem,3,11601,active,arff,36350.0,,4882.0,3.0,20.0,50789.0,48622.0,154107.0,2.0,18.0
42742,42742,porto-seguro,3,11601,active,arff,573518.0,,21694.0,2.0,58.0,595212.0,470281.0,846458.0,26.0,32.0
42746,42746,KDDCup99,5,11601,active,arff,2807886.0,,2.0,23.0,42.0,4898431.0,0.0,0.0,32.0,10.0
42769,42769,Higgs,3,11601,active,arff,529920.0,,470080.0,2.0,29.0,1000000.0,0.0,0.0,28.0,1.0


In [6]:
# Drop datasets whose name contains 'mnist' in any letter case.
# Unlike section 1, CIFAR names are not filtered in this cell.
is_mnist_name = dlist['name'].str.contains('mnist', case=False)
dlist = dlist[~is_mnist_name]
dlist

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
12,12,mfeat-factors,1,1,active,ARFF,200.0,10.0,200.0,10.0,217.0,2000.0,0.0,0.0,216.0,1.0
23,23,cmc,1,1,active,ARFF,629.0,4.0,333.0,3.0,10.0,1473.0,0.0,0.0,2.0,8.0
31,31,credit-g,1,1,active,ARFF,700.0,10.0,300.0,2.0,21.0,1000.0,0.0,0.0,7.0,14.0
54,54,vehicle,1,1,active,ARFF,218.0,4.0,199.0,4.0,19.0,846.0,0.0,0.0,18.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42734,42734,okcupid-stem,3,11601,active,arff,36350.0,,4882.0,3.0,20.0,50789.0,48622.0,154107.0,2.0,18.0
42742,42742,porto-seguro,3,11601,active,arff,573518.0,,21694.0,2.0,58.0,595212.0,470281.0,846458.0,26.0,32.0
42746,42746,KDDCup99,5,11601,active,arff,2807886.0,,2.0,23.0,42.0,4898431.0,0.0,0.0,32.0,10.0
42769,42769,Higgs,3,11601,active,arff,529920.0,,470080.0,2.0,29.0,1000000.0,0.0,0.0,28.0,1.0


In [7]:
# Tally datasets by feature-type composition using the OpenML metadata columns.
numeric_counts = dlist['NumberOfNumericFeatures']
symbolic_counts = dlist['NumberOfSymbolicFeatures']

# Sanity check: numeric + symbolic should add up to the total feature count.
# Any False in the value counts flags a dataset where it does not.
temp = dlist['NumberOfFeatures'] == numeric_counts + symbolic_counts
print(temp.value_counts())

# Datasets with no numeric feature at all.
print('number of rows where NumberOfNumericFeatures is zero:', int((numeric_counts == 0).sum()))
# NumberOfSymbolicFeatures includes the class, so a value of 1 means there is
# no categorical input variable.
print('number of rows where NumberOfSymbolicFeatures is one:', int((symbolic_counts == 1).sum()))

True    70
dtype: int64
number of rows where NumberOfNumericFeatures is zero: 6
number of rows where NumberOfSymbolicFeatures is one: 41


In [8]:
# Get tasks: metadata table for the task ids of the current suite.
# The collect loop below uses it to map a dataset id ('did') to its task id ('tid').

tlist = openml.tasks.list_tasks(task_id=benchmark_suite.tasks, output_format="dataframe")
tlist

Unnamed: 0,tid,ttid,did,name,task_type,status,estimation_procedure,evaluation_measures,source_data,target_feature,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2073,2073,TaskType.SUPERVISED_CLASSIFICATION,181,yeast,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,181,class_protein_localization,463,10.0,5,10,9,1484,0,0,8,1
3945,3945,TaskType.SUPERVISED_CLASSIFICATION,1111,KDDCup09_appetency,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,1111,APPETENCY,49110,15415.0,890,2,231,50000,50000,8024152,192,39
7593,7593,TaskType.SUPERVISED_CLASSIFICATION,1596,covertype,Supervised Classification,active,10-fold Crossvalidation,,1596,class,283301,7.0,2747,7,55,581012,0,0,10,45
10090,10090,TaskType.SUPERVISED_CLASSIFICATION,1457,amazon-commerce-reviews,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,1457,Class,30,50.0,30,50,10001,1500,0,0,10000,1
146818,146818,TaskType.SUPERVISED_CLASSIFICATION,40981,Australian,Supervised Classification,active,10-fold Crossvalidation,,40981,A15,383,14.0,307,2,15,690,0,0,6,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359994,359994,TaskType.SUPERVISED_CLASSIFICATION,42732,sf-police-incidents,Supervised Classification,active,10-fold Crossvalidation,predictive_accuracy,42732,ViolentCrime,1945704,,269319,2,9,2215023,0,0,3,6
360112,360112,TaskType.SUPERVISED_CLASSIFICATION,42746,KDDCup99,Supervised Classification,active,10-fold Crossvalidation,,42746,target,2807886,,2,23,42,4898431,0,0,32,10
360113,360113,TaskType.SUPERVISED_CLASSIFICATION,42742,porto-seguro,Supervised Classification,active,10-fold Crossvalidation,,42742,target,573518,,21694,2,58,595212,470281,846458,26,32
360114,360114,TaskType.SUPERVISED_CLASSIFICATION,42769,Higgs,Supervised Classification,active,10-fold Crossvalidation,area_under_roc_curve,42769,target,529920,,470080,2,29,1000000,0,0,28,1


## Collect

In [9]:
# Output root for section 3, relative to the notebook's working directory.
# The trailing slash matters: later cells append '<did>-<name>' directly.
current_dir = os.getcwd()
save_path = f'{current_dir}/data/data_3/'

print(save_path)

c:\Users\lesga\OneDrive - UNIST\lab바탕화면\연구_STCC\_code\Dataset_OpenML/data/data_3/


In [10]:
dataset_dict = {}
did_automl = []  # ids of datasets that have at least one categorical variable

# For every dataset in `dlist`: download it, keep it only if it mixes
# categorical and continuous variables, apply the task's predefined split and
# write train.csv / test.csv / cat_feat_names.csv under save_path/<did>-<name>/.
for did, name in tqdm(zip(dlist['did'], dlist['name']), total=len(dlist)):
    # Download the dataset; the row-id and "ignore" attributes are requested
    # as well (see flags), and identifier-like columns are removed below.
    odata = openml.datasets.get_dataset(did)
    X, y, categorical_indicator, attribute_names = odata.get_data(
        target=odata.default_target_attribute,
        include_row_id=True,
        include_ignore_attribute=True,
    )
    y.name = 'class' # the name of the 'y' variable

    # Only data with categorical variables.
    if not any(categorical_indicator):
        continue
    did_automl.append(did)

    # Split attribute names into numeric and categorical.
    num_feat_names = [col for col, is_cat in zip(attribute_names, categorical_indicator) if not is_cat]
    cat_feat_names = [col for col, is_cat in zip(attribute_names, categorical_indicator) if is_cat]

    # Only data with continuous variables.
    if not num_feat_names:
        continue

    # Order columns: numeric first, then categorical.
    X = X[num_feat_names + cat_feat_names]
    # Drop constant columns (cardinality 1) and columns whose cardinality
    # equals the number of rows (treated as ids).
    n_unique = X.nunique()
    X = X.loc[:, (n_unique != 1) & (n_unique != X.shape[0])]
    # Fix: list only categorical columns that survived the filters above, so
    # cat_feat_names.csv never names a column missing from train/test.csv.
    kept_columns = set(X.columns)
    cat_feat_names = [col for col in cat_feat_names if col in kept_columns]

    # Look up the task for this dataset and use its first repeat/fold/sample.
    # NOTE(review): cast to a plain int because the lookup yields a NumPy
    # integer, which some openml versions are believed to reject - confirm.
    task_id = int(tlist.loc[tlist['did'] == did, 'tid'].iloc[0])
    task = openml.tasks.get_task(task_id)
    train_indices, test_indices = task.get_train_test_split_indices(repeat=0, fold=0, sample=0)
    X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
    y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
    train = pd.concat([X_train, y_train], axis=1)
    test = pd.concat([X_test, y_test], axis=1)

    # One folder per dataset.
    out_dir = save_path + f'{did}-{name}'
    os.makedirs(out_dir, exist_ok=True)
    train.to_csv(out_dir + '/train.csv', index=False)
    test.to_csv(out_dir + '/test.csv', index=False)
    pd.DataFrame(cat_feat_names).to_csv(out_dir + '/cat_feat_names.csv', index=False)


100%|██████████| 70/70 [18:24<00:00, 15.78s/it] 


# 4. OpenML

In [2]:
"""
All data present in OpenML
"""

# No data_id filter here: this requests the metadata table for every dataset
# the server lists, not just one benchmark suite.
dlist = datasets.list_datasets(output_format='dataframe')
dlist


Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45047,45047,Airlines_DepDelay_1M,6,26324,active,arff,,,,0.0,6.0,1000000.0,0.0,0.0,6.0,0.0
45048,45048,medical_charges,12,26324,active,arff,,,,0.0,4.0,163065.0,0.0,0.0,4.0,0.0
45049,45049,MD_MIX_Mini_Copy,1,9186,active,arff,40.0,,40.0,706.0,69.0,28240.0,28240.0,665053.0,45.0,2.0
45050,45050,Amazon_Electronics_Dataset,1,31955,active,arff,,,,,3.0,10000.0,0.0,0.0,1.0,2.0


In [3]:
"""
keep only latest version of datasets
"""

# Lower-case the names so differently capitalised uploads of the same dataset
# share one key. Note: this overwrites dlist['name'] in place.
dlist['name'] = dlist['name'].str.lower()

# Keep, for every name, the row with the highest version. The sort is requested
# as stable so rows that tie on 'version' keep their incoming order; with the
# default (unstable) algorithm the row kept by keep='last' is not well defined
# when two rows share both name and version.
dlist_f = (
    dlist
    .sort_values('version', kind='stable')
    .drop_duplicates('name', keep='last')
)
dlist_f.shape


(3957, 16)

In [4]:
# Display the current contents of dlist_f for inspection.
dlist_f

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
42178,42178,telco-customer-churn,1,1140,active,arff,5174.0,,1869.0,2.0,20.0,7043.0,0.0,0.0,3.0,0.0
42182,42182,lorenz_attractor_regime_changes,1,10283,active,ARFF,,,,,4.0,4942.0,0.0,0.0,4.0,0.0
42183,42183,dataset_sales,1,10333,active,ARFF,,,,0.0,15.0,10738.0,0.0,0.0,15.0,0.0
42186,42186,juanfeldmaniris,1,10443,active,ARFF,50.0,3.0,50.0,3.0,5.0,150.0,0.0,0.0,4.0,1.0
42188,42188,premier_league_with_tda,1,10283,active,ARFF,,,,,20.0,2565.0,0.0,0.0,20.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45012,45012,fifa,19,30127,active,arff,,,,0.0,29.0,19178.0,0.0,0.0,28.0,1.0
44195,44195,weather,19,31897,active,arff,9.0,,5.0,2.0,5.0,14.0,0.0,0.0,2.0,3.0
42626,42626,train,24,11598,active,arff,,,,,12.0,891.0,708.0,866.0,7.0,0.0
43035,43035,dgf_96f4164d-956d-4c1c-b161-68724eb0ccdc,27,26214,active,arff,398.0,,10.0,5.0,57.0,699.0,699.0,7889.0,12.0,45.0


In [None]:
"""
Exclude image and time-series data: names containing 'mnist', 'cifar', 'forex', 'volcanoes' or 'meta_album'
"""

# Remove every dataset whose name contains one of the excluded keywords,
# applying the keywords one after another.
for excluded_keyword in ('mnist', 'cifar', 'forex', 'volcanoes', 'meta_album'):
    keep_row = ~dlist_f['name'].str.contains(excluded_keyword)
    dlist_f = dlist_f.loc[keep_row, :]
dlist_f.shape


(3644, 16)

In [None]:
"""
Only data with numerical variables.
"""

# Keep the rows that report at least one numeric feature.
has_numeric = dlist_f['NumberOfNumericFeatures'].gt(0)
dlist_f = dlist_f.loc[has_numeric, :]
dlist_f.shape

(24, 16)


(3375, 16)

In [None]:
"""
Only data with at least 5000 samples.
"""

# Keep the rows with NumberOfInstances >= 5000 (the bound is inclusive).
is_large_enough = dlist_f['NumberOfInstances'].ge(5000)
dlist_f = dlist_f.loc[is_large_enough, :]
dlist_f.shape


(752, 16)

In [None]:
"""
Only data with classification tasks
"""

# Keep the rows whose NumberOfClasses is strictly positive; rows where it is
# 0 or missing fail the comparison and are dropped.
has_classes = dlist_f['NumberOfClasses'].gt(0)
dlist_f = dlist_f.loc[has_classes, :]
dlist_f.shape


(340, 16)


(250, 16)

In [None]:
"""
Exclude the ones with too many NumberOfFeatures to eliminate images, text, composite data, etc
"""

# Split on the feature count at a threshold of 150:
#   dlist_f_f -> rows with NumberOfFeatures >= 150 (set aside)
#   dlist_f   -> rows with NumberOfFeatures <  150 (kept for the next steps)
# Both comparisons are evaluated on the same column taken before dlist_f is
# reassigned; they are kept separate (not one mask and its negation) so rows
# with a missing feature count fall into neither frame, as before.
feature_counts = dlist_f['NumberOfFeatures']
dlist_f_f = dlist_f.loc[feature_counts.ge(150), :]
dlist_f = dlist_f.loc[feature_counts.lt(150), :]
dlist_f.shape

(214, 16)

In [None]:
# Keep the rows that have strictly more numeric than symbolic features.
mostly_numeric = dlist_f["NumberOfNumericFeatures"].gt(dlist_f["NumberOfSymbolicFeatures"])
dlist_f = dlist_f.loc[mostly_numeric]

In [None]:
# Keep the rows whose majority/minority class-size ratio is at most 5.
imbalance_ratio = dlist_f["MajorityClassSize"].div(dlist_f["MinorityClassSize"])
dlist_f = dlist_f.loc[imbalance_ratio.le(5)]

In [None]:
# Keep the rows with at most 5 classes.
few_classes = dlist_f["NumberOfClasses"].le(5)
dlist_f = dlist_f.loc[few_classes]

In [None]:
# Dataset ids of the rows currently in dlist_f, as an array.
data_ids_5 = dlist_f["did"].values

## Collect

In [None]:
# Output root for the exported datasets: <current working directory>/data/data_4_2/
current_dir = os.getcwd()
save_path = f'{current_dir}/data/data_4_2/'

print(save_path)

/home/ubuntu/User/LEE/연구_신한/Dataset_OpenML/data/data_4_2/


In [None]:
# Rebind dlist to an independent (deep) copy of dlist_f.
dlist = dlist_f.copy(deep=True)

In [None]:
# Download every dataset listed in dlist and, for those that have at least one
# categorical attribute, write a stratified 80/20 train/test split plus the
# list of categorical feature names under save_path/<did>-<name>/.
dataset_dict = {}  # not written to in this cell
did_dataset = []   # ids of the datasets that were exported

for did in tqdm(dlist['did']):

    # Every warning raised while handling this dataset is promoted to an
    # exception for the duration of the block.
    with warnings.catch_warnings():
        warnings.simplefilter("error")

        # get dataset name from dlist
        name = dlist.loc[dlist['did'] == did, 'name'].values[0]

        # fetch the dataset, using its default target attribute as y
        odata = openml.datasets.get_dataset(did)
        X, y, categorical_indicator, attribute_names = odata.get_data(
            target=odata.default_target_attribute,
            include_row_id=True,
            include_ignore_attribute=True,
        )
        y.name = 'class'  # the name of the 'y' variable

        # Only data with categorical variables.
        if not any(categorical_indicator):
            continue

        # collect did
        did_dataset.append(did)

        # get num, cat feature names
        num_feat_names = [
            feat for feat, is_cat in zip(attribute_names, categorical_indicator) if not is_cat
        ]
        cat_feat_names = [
            feat for feat, is_cat in zip(attribute_names, categorical_indicator) if is_cat
        ]

        # order the columns: numerical features first, then categorical ones
        X = X[num_feat_names + cat_feat_names]
        # remove columns if cardinality is one
        X = X.loc[:, X.nunique() != 1]
        # remove columns if cardinality is num of rows (id)
        X = X.loc[:, X.nunique() != X.shape[0]]

        # The two steps above may have removed categorical columns; keep only
        # the names that are still present so the saved list matches the
        # columns of train.csv / test.csv.
        cat_feat_names = [feat for feat in cat_feat_names if feat in X.columns]

        # get split
        data = pd.concat([X, y], axis=1)
        split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=2022)
        train_indices, test_indices = next(split.split(data, data['class']))
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
        train = pd.concat([X_train, y_train], axis=1)
        test = pd.concat([X_test, y_test], axis=1)

        # create a folder path
        out_dir = save_path + f'{did}-{name}'
        os.makedirs(out_dir, exist_ok=True)
        # save train and test data
        train.to_csv(out_dir + '/train.csv', index=False)
        test.to_csv(out_dir + '/test.csv', index=False)

        # save cat feat names; the explicit column label keeps the same "0"
        # header as before and still yields a header-only file when the list
        # is empty
        pd.DataFrame(cat_feat_names, columns=[0]).to_csv(out_dir + '/cat_feat_names.csv', index=False)

100%|██████████| 73/73 [2:40:34<00:00, 131.98s/it]
