# 0 - Imports and installs

In [5]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# category_encoders and tqdm are imported below, so they are installed explicitly too.
%pip install --upgrade --no-cache-dir nnetsauce openml joblib category_encoders tqdm



In [6]:
import category_encoders as ce
import joblib
import nnetsauce as ns
import numpy as np
import openml
import pandas as pd
import warnings

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from time import time
from functools import lru_cache
from tqdm import tqdm

# 1 - Utility functions and global variables

In [7]:
# Upper bounds applied to every dataset by select_NROWS_NCOLS:
# at most NROWS observations and at most NCOLS features are kept.
NROWS = 1000
NCOLS = 10

In [8]:
def select_NROWS_NCOLS(X, y, n_rows=None, n_cols=None):
    """Encode a dataset numerically and shrink it to at most n_rows x n_cols.

    Steps, in order:
      1. Features go through ``ce.HashingEncoder`` and are cast to float32;
         the response goes through ``LabelEncoder`` and is cast to uint8.
      2. NaNs in the features are replaced by the column median.
      3. If there are more than ``n_cols`` columns, a 50-tree random forest is
         fitted and only the ``n_cols`` most important columns are kept
         (ordered by decreasing importance).
      4. If there are more than ``n_rows`` rows, ``ns.SubSampler`` picks the
         rows to keep (presumably stratified on y -- confirm in nnetsauce docs).
         The sampler may return slightly fewer than ``n_rows`` rows.

    Progress is reported with ``print``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Raw features, as accepted by ``ce.HashingEncoder.fit_transform``.
    y : array-like of shape (n_samples,)
        Raw class labels.
    n_rows : int, optional
        Maximum number of rows to keep. Defaults to the global ``NROWS``.
    n_cols : int, optional
        Maximum number of columns to keep. Defaults to the global ``NCOLS``.

    Returns
    -------
    (X_out, y_out) : tuple of np.ndarray
        float32 feature matrix and uint8 label vector.
    """
    # Resolve defaults at call time so that later changes to the globals are honoured.
    n_rows = NROWS if n_rows is None else n_rows
    n_cols = NCOLS if n_cols is None else n_cols

    def replace_nan_with_median(arr):
        """Fill NaNs of a 2-D float array, in place, with the column medians."""
        with warnings.catch_warnings():
            # nanmedian emits a RuntimeWarning for all-NaN columns; handled just below.
            warnings.simplefilter("ignore", category=RuntimeWarning)
            median_vals = np.nanmedian(arr, axis=0)
        # A column made only of NaNs has no median: fall back to 0 so that no
        # NaN survives the imputation (the estimator fitted below needs finite values).
        median_vals = np.where(np.isnan(median_vals), 0.0, median_vals)
        nan_rows, nan_cols = np.nonzero(np.isnan(arr))
        arr[nan_rows, nan_cols] = median_vals[nan_cols]
        return arr

    print(f"X.shape (initial): {X.shape}")
    print(f"y.shape (initial): {y.shape}")
    print("Encoding features and response...")
    le = LabelEncoder()
    encoder = ce.HashingEncoder(return_df=False)
    X = np.asarray(encoder.fit_transform(X, y)).astype(np.float32)
    X = replace_nan_with_median(X)
    # NOTE(review): uint8 silently wraps around above 256 distinct classes.
    y = np.asarray(le.fit_transform(y)).astype(np.uint8)
    print("...Done.")

    print(f"Finding top {n_cols} features if necessary...")
    if X.shape[1] > n_cols:
        rf = RandomForestClassifier(n_estimators=50, random_state=42)
        rf.fit(X, y)
        # Column indices sorted by decreasing importance, truncated to n_cols.
        top_cols = np.argsort(rf.feature_importances_)[::-1][:n_cols]
        print(f"  top {n_cols} indices: {top_cols}")
        X = X[:, top_cols]
        print(f"  X reduced shape: {X.shape}")
    print("...Done.")

    if X.shape[0] <= n_rows:
        return X, y

    print(f"Subsampling to {n_rows} if necessary...")
    start = time()
    sub = ns.SubSampler(y=y, n_samples=n_rows, seed=123, n_jobs=-1)
    idx_rows = sub.subsample().ravel()
    print(f"... Elapsed time for subsampling: {time() - start}")
    print("Number of rows in the subsample: ", len(idx_rows))
    # X holds no NaN at this point, so no second imputation pass is needed;
    # integer-array indexing already returns fresh copies.
    return_X = X[idx_rows, :]
    return_y = y[idx_rows]
    print("Done.")
    return return_X, return_y

# 2 - Get OpenML-CC18 data

In [9]:
# Download every task of the OpenML-CC18 suite, reduce it with
# select_NROWS_NCOLS and collect the results in Xys, keyed by dataset name:
#   Xys[name] = {"task_id": <OpenML task id>, "dataset": (X, y)}
warnings.filterwarnings("ignore")

benchmark_suite = openml.study.get_suite('OpenML-CC18') # obtain the benchmark suite

n_tasks = len(benchmark_suite.tasks)

Xys = {}

start = time()  # wall-clock origin for the whole loop; per-task timing uses task_start
# tqdm wraps the list itself (not enumerate) so the bar knows the total.
for idx, task_id in enumerate(tqdm(benchmark_suite.tasks)):  # iterate over all tasks
  print(f"\n\n task #{idx}/{n_tasks} --------------------")
  warnings.filterwarnings("ignore")
  task_start = time()
  try:
    # Get the task information
    task = openml.tasks.get_task(task_id)  # download the OpenML task
    # get the data from openml
    task_name = task.get_dataset().name
    X_temp, y_temp = task.get_X_and_y()
    Xys[task_name] = {"task_id": task_id,
                      "dataset": select_NROWS_NCOLS(X_temp, y_temp)}
  except Exception as err:
    # Best effort: keep going, but say which task was dropped and why.
    # (Exception, not a bare except, so Ctrl-C still interrupts the loop.)
    print(f"Skipping task #{idx} (task_id={task_id}): {err!r}")
    continue
  print(f"Elapsed time for task #{idx}: {time()-task_start}")
print(f"Total elapsed time: {time()-start}")

0it [00:00, ?it/s]



 task #0/72 --------------------
X.shape (initial): (3196, 36)
y.shape (initial): (3196,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [20 32  9 14 34  5 31  7  0  6]
  X reduced shape: (3196, 10)
...Done.
Subsampling to 1000 if necessary...


1it [00:01,  1.72s/it]

... Elapsed time for subsampling: 0.3489799499511719
Number of rows in the subsample:  999
Done.
Elapsed time for task #0: 1.716198205947876


 task #1/72 --------------------
X.shape (initial): (20000, 16)
y.shape (initial): (20000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


2it [00:04,  2.56s/it]

  top 10 indices: [12 14  8 11  7 10 13  9  6 15]
  X reduced shape: (20000, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.04304075241088867
Number of rows in the subsample:  987
Done.
Elapsed time for task #1: 3.1403844356536865


 task #2/72 --------------------


3it [00:05,  1.90s/it]

X.shape (initial): (625, 4)
y.shape (initial): (625,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Elapsed time for task #2: 1.1272993087768555


 task #3/72 --------------------
X.shape (initial): (2000, 216)
y.shape (initial): (2000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


4it [00:09,  2.35s/it]

  top 10 indices: [185 211 180  96   7 184 197  28   0 145]
  X reduced shape: (2000, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.023698806762695312
Number of rows in the subsample:  1000
Done.
Elapsed time for task #3: 3.0318500995635986


 task #4/72 --------------------
X.shape (initial): (2000, 76)
y.shape (initial): (2000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


5it [00:10,  2.15s/it]

  top 10 indices: [ 1  6 72 75  5 73  0  4  2  8]
  X reduced shape: (2000, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.03317546844482422
Number of rows in the subsample:  1000
Done.
Elapsed time for task #4: 1.7827410697937012


 task #5/72 --------------------


6it [00:11,  1.68s/it]

X.shape (initial): (699, 9)
y.shape (initial): (699,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Elapsed time for task #5: 0.7847929000854492


 task #6/72 --------------------
X.shape (initial): (2000, 64)
y.shape (initial): (2000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


7it [00:13,  1.77s/it]

  top 10 indices: [ 0  2  1  3  4  9 10 20  8  6]
  X reduced shape: (2000, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.033721208572387695
Number of rows in the subsample:  1000
Done.
Elapsed time for task #6: 1.9545674324035645


 task #7/72 --------------------


8it [00:15,  1.96s/it]

X.shape (initial): (2000, 6)
y.shape (initial): (2000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.02405834197998047
Number of rows in the subsample:  1000
Done.
Elapsed time for task #7: 2.3591530323028564


 task #8/72 --------------------
X.shape (initial): (2000, 47)
y.shape (initial): (2000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


9it [00:17,  1.92s/it]

  top 10 indices: [18 28 42  5 44 39 45 41 36 27]
  X reduced shape: (2000, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.023369550704956055
Number of rows in the subsample:  1000
Done.
Elapsed time for task #8: 1.832855463027954


 task #9/72 --------------------


10it [00:19,  1.76s/it]

X.shape (initial): (1473, 9)
y.shape (initial): (1473,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.022966623306274414
Number of rows in the subsample:  1000
Done.
Elapsed time for task #9: 1.4137580394744873


 task #10/72 --------------------
X.shape (initial): (5620, 64)
y.shape (initial): (5620,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


11it [00:21,  1.95s/it]

  top 10 indices: [21 42 36 43 26 19 20 28 30 18]
  X reduced shape: (5620, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.03519797325134277
Number of rows in the subsample:  996
Done.
Elapsed time for task #10: 2.3790433406829834


 task #11/72 --------------------


12it [00:22,  1.59s/it]

X.shape (initial): (690, 15)
y.shape (initial): (690,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [ 8  7 14 10  2  1 13  9  5  6]
  X reduced shape: (690, 10)
...Done.
Elapsed time for task #11: 0.7600455284118652


 task #12/72 --------------------


13it [00:23,  1.36s/it]

X.shape (initial): (1000, 20)
y.shape (initial): (1000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [ 4  0 12  1  3  2  6  5  7 11]
  X reduced shape: (1000, 10)
...Done.
Elapsed time for task #12: 0.8299944400787354


 task #13/72 --------------------
X.shape (initial): (10992, 16)
y.shape (initial): (10992,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


14it [00:25,  1.60s/it]

  top 10 indices: [15 13 10  4  9 14  1  8  7  0]
  X reduced shape: (10992, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.03728747367858887
Number of rows in the subsample:  993
Done.
Elapsed time for task #13: 2.162907361984253


 task #14/72 --------------------


15it [00:25,  1.33s/it]

X.shape (initial): (768, 8)
y.shape (initial): (768,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Elapsed time for task #14: 0.6871469020843506


 task #15/72 --------------------
X.shape (initial): (4601, 57)
y.shape (initial): (4601,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


16it [00:27,  1.34s/it]

  top 10 indices: [51 52  6 55 15 56 54 24 20 18]
  X reduced shape: (4601, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.023447036743164062
Number of rows in the subsample:  999
Done.
Elapsed time for task #15: 1.3772311210632324


 task #16/72 --------------------
X.shape (initial): (3190, 60)
y.shape (initial): (3190,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [29 28 31 30 34 27 32 25 33 18]
  X reduced shape: (3190, 10)
...Done.
Subsampling to 1000 if necessary...


17it [00:30,  1.77s/it]

... Elapsed time for subsampling: 0.023273944854736328
Number of rows in the subsample:  999
Done.
Elapsed time for task #16: 2.7652134895324707


 task #17/72 --------------------


18it [00:30,  1.45s/it]

X.shape (initial): (958, 9)
y.shape (initial): (958,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Elapsed time for task #17: 0.7096312046051025


 task #18/72 --------------------


19it [00:31,  1.25s/it]

X.shape (initial): (846, 18)
y.shape (initial): (846,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [ 5 11  9  7 10  6  2  4 17  0]
  X reduced shape: (846, 10)
...Done.
Elapsed time for task #18: 0.7935891151428223


 task #19/72 --------------------


20it [00:34,  1.80s/it]

X.shape (initial): (45312, 8)
y.shape (initial): (45312,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.03491473197937012
Number of rows in the subsample:  1000
Done.
Elapsed time for task #19: 3.0857653617858887


 task #20/72 --------------------
X.shape (initial): (6430, 36)
y.shape (initial): (6430,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


21it [00:36,  1.86s/it]

  top 10 indices: [17 16 19 20 21 28 12 13 27 32]
  X reduced shape: (6430, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.02463054656982422
Number of rows in the subsample:  997
Done.
Elapsed time for task #20: 1.9748656749725342


 task #21/72 --------------------


22it [00:37,  1.60s/it]

X.shape (initial): (736, 19)
y.shape (initial): (736,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [14 12 11 13 10 17 16  0 15  9]
  X reduced shape: (736, 10)
...Done.
Elapsed time for task #21: 1.00333571434021


 task #22/72 --------------------


23it [00:38,  1.42s/it]

X.shape (initial): (3772, 29)
y.shape (initial): (3772,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [19 25 21 23 17 28  0 18  5  2]
  X reduced shape: (3772, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.02335524559020996
Number of rows in the subsample:  999
Done.
Elapsed time for task #22: 1.0075037479400635


 task #23/72 --------------------


24it [00:39,  1.33s/it]

X.shape (initial): (990, 12)
y.shape (initial): (990,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [ 2  3  6  5  4  7  9 11 10  8]
  X reduced shape: (990, 10)
...Done.
Elapsed time for task #23: 1.1107244491577148


 task #24/72 --------------------
X.shape (initial): (7797, 617)
y.shape (initial): (7797,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


25it [00:56,  5.83s/it]

  top 10 indices: [393 394 412 461 583 411 418 396 361 395]
  X reduced shape: (7797, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.03521561622619629
Number of rows in the subsample:  988
Done.
Elapsed time for task #24: 16.31846857070923


 task #25/72 --------------------


26it [00:57,  4.42s/it]

X.shape (initial): (841, 70)
y.shape (initial): (841,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [ 9 59 50 23 56 36 20  5 34 28]
  X reduced shape: (841, 10)
...Done.
Elapsed time for task #25: 1.1220920085906982


 task #26/72 --------------------


27it [00:58,  3.37s/it]

X.shape (initial): (797, 4)
y.shape (initial): (797,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Elapsed time for task #26: 0.9328291416168213


 task #27/72 --------------------
X.shape (initial): (70000, 784)
y.shape (initial): (70000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


28it [01:31, 12.28s/it]

  top 10 indices: [378 433 437 350 409 489 543 155 318 405]
  X reduced shape: (70000, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.05359172821044922
Number of rows in the subsample:  996
Done.
Elapsed time for task #27: 33.056612730026245


 task #28/72 --------------------


29it [01:32,  8.92s/it]

X.shape (initial): (1458, 37)
y.shape (initial): (1458,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [ 3 35  7  4  0 17  5 18 34 29]
  X reduced shape: (1458, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.022443294525146484
Number of rows in the subsample:  999
Done.
Elapsed time for task #28: 1.0998406410217285


 task #29/72 --------------------


30it [01:33,  6.62s/it]

X.shape (initial): (1563, 37)
y.shape (initial): (1563,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [ 0 34 17 35 32 19 24 21  4 18]
  X reduced shape: (1563, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.022283315658569336
Number of rows in the subsample:  999
Done.
Elapsed time for task #29: 1.2473511695861816


 task #30/72 --------------------
X.shape (initial): (10885, 21)
y.shape (initial): (10885,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


31it [01:36,  5.45s/it]

  top 10 indices: [ 0  8  5  3  9 11 12 20  7 18]
  X reduced shape: (10885, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.025152921676635742
Number of rows in the subsample:  999
Done.
Elapsed time for task #30: 2.7316460609436035


 task #31/72 --------------------


32it [01:37,  4.09s/it]

X.shape (initial): (522, 21)
y.shape (initial): (522,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [ 0 19 18  5 17  4 14  8 12  7]
  X reduced shape: (522, 10)
...Done.
Elapsed time for task #31: 0.9059920310974121


 task #32/72 --------------------


33it [01:38,  3.19s/it]

X.shape (initial): (2109, 21)
y.shape (initial): (2109,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [ 0  5  4  7  9  8 18 19 12 11]
  X reduced shape: (2109, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.022785186767578125
Number of rows in the subsample:  999
Done.
Elapsed time for task #32: 1.0794663429260254


 task #33/72 --------------------


34it [01:39,  2.52s/it]

X.shape (initial): (1109, 21)
y.shape (initial): (1109,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [13  8 15 14  5 17 12  4 11  0]
  X reduced shape: (1109, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.022216796875
Number of rows in the subsample:  999
Done.
Elapsed time for task #33: 0.9475643634796143


 task #34/72 --------------------
X.shape (initial): (48842, 14)
y.shape (initial): (48842,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


35it [01:46,  3.85s/it]

  top 10 indices: [ 2  0  5 10  4 12  6  7 11  3]
  X reduced shape: (48842, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.035808563232421875
Number of rows in the subsample:  999
Done.
Elapsed time for task #34: 6.947858095169067


 task #35/72 --------------------
X.shape (initial): (3751, 1776)
y.shape (initial): (3751,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


36it [01:52,  4.44s/it]

  top 10 indices: [ 26 105 468  19 106   6   4  13  16   8]
  X reduced shape: (3751, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.02300882339477539
Number of rows in the subsample:  999
Done.
Elapsed time for task #35: 5.817915916442871


 task #36/72 --------------------


37it [01:54,  3.73s/it]

X.shape (initial): (569, 30)
y.shape (initial): (569,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [27  7 20 22  6  2 23  3 26  0]
  X reduced shape: (569, 10)
...Done.
Elapsed time for task #36: 2.0765492916107178


 task #37/72 --------------------


38it [01:55,  3.01s/it]

X.shape (initial): (5404, 5)
y.shape (initial): (5404,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.023593902587890625
Number of rows in the subsample:  999
Done.
Elapsed time for task #37: 1.3433442115783691


 task #38/72 --------------------


39it [02:03,  4.44s/it]

X.shape (initial): (1055, 41)
y.shape (initial): (1055,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [35  0 21 26 38 14 12 11  1 17]
  X reduced shape: (1055, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.022336244583129883
Number of rows in the subsample:  999
Done.
Elapsed time for task #38: 7.757649660110474


 task #39/72 --------------------
X.shape (initial): (5456, 24)
y.shape (initial): (5456,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


40it [02:14,  6.52s/it]

  top 10 indices: [14 19 18 13 17 12 11 16 23 10]
  X reduced shape: (5456, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.023965835571289062
Number of rows in the subsample:  999
Done.
Elapsed time for task #39: 11.372346639633179


 task #40/72 --------------------


41it [02:31,  9.67s/it]

X.shape (initial): (1593, 256)
y.shape (initial): (1593,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [177 161  62 127  46 145  78  81 176 128]
  X reduced shape: (1593, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.022874116897583008
Number of rows in the subsample:  996
Done.
Elapsed time for task #40: 17.01414918899536


 task #41/72 --------------------


42it [02:36,  8.13s/it]

X.shape (initial): (583, 10)
y.shape (initial): (583,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Elapsed time for task #41: 4.5540666580200195


 task #42/72 --------------------
X.shape (initial): (2600, 500)
y.shape (initial): (2600,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


43it [02:55, 11.53s/it]

  top 10 indices: [338 475 241 105 472 442 128 336 453  48]
  X reduced shape: (2600, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.023008346557617188
Number of rows in the subsample:  1000
Done.
Elapsed time for task #42: 19.453449487686157


 task #43/72 --------------------
X.shape (initial): (34465, 118)
y.shape (initial): (34465,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


44it [03:07, 11.57s/it]

  top 10 indices: [ 0  3  1 97  5  2 88 99 90  4]
  X reduced shape: (34465, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.03161334991455078
Number of rows in the subsample:  999
Done.
Elapsed time for task #43: 11.652350425720215


 task #44/72 --------------------
X.shape (initial): (2534, 72)
y.shape (initial): (2534,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


45it [03:11,  9.45s/it]

  top 10 indices: [41 43 29 40 55 42 30 67 39 60]
  X reduced shape: (2534, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.022888660430908203
Number of rows in the subsample:  999
Done.
Elapsed time for task #44: 4.525864839553833


 task #45/72 --------------------


46it [03:21,  9.40s/it]

X.shape (initial): (1080, 856)
y.shape (initial): (1080,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [545 190 420 704 210 518 206 386 606 201]
  X reduced shape: (1080, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.022815227508544922
Number of rows in the subsample:  999
Done.
Elapsed time for task #45: 9.257000207901001


 task #46/72 --------------------
X.shape (initial): (6118, 51)
y.shape (initial): (6118,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


47it [03:31,  9.68s/it]

  top 10 indices: [14 12 28 26 24 10 22 18 37 16]
  X reduced shape: (6118, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.023974895477294922
Number of rows in the subsample:  996
Done.
Elapsed time for task #46: 10.35729169845581


 task #47/72 --------------------


48it [03:33,  7.44s/it]

X.shape (initial): (1372, 4)
y.shape (initial): (1372,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.022424936294555664
Number of rows in the subsample:  999
Done.
Elapsed time for task #47: 2.2118728160858154


 task #48/72 --------------------


49it [03:34,  5.41s/it]

X.shape (initial): (748, 4)
y.shape (initial): (748,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Elapsed time for task #48: 0.668339729309082


 task #49/72 --------------------
X.shape (initial): (11055, 30)
y.shape (initial): (11055,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


50it [03:36,  4.37s/it]

  top 10 indices: [13  7 25  6  5 14 15 28 12  8]
  X reduced shape: (11055, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.025200843811035156
Number of rows in the subsample:  999
Done.
Elapsed time for task #49: 1.9336228370666504


 task #50/72 --------------------


51it [03:47,  6.36s/it]

X.shape (initial): (540, 37)
y.shape (initial): (540,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [27 26 33 22 28 19  0  1 25 21]
  X reduced shape: (540, 10)
...Done.
Elapsed time for task #50: 10.99889874458313


 task #51/72 --------------------
X.shape (initial): (45211, 16)
y.shape (initial): (45211,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


52it [04:17, 13.60s/it]

  top 10 indices: [11  5  0  9 10 15  1 13 12  3]
  X reduced shape: (45211, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.04041457176208496
Number of rows in the subsample:  999
Done.
Elapsed time for task #51: 30.501537561416626


 task #52/72 --------------------
X.shape (initial): (9873, 32)
y.shape (initial): (9873,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


53it [05:14, 26.63s/it]

  top 10 indices: [25  4 10 27 24  7  1 29  2  8]
  X reduced shape: (9873, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.025085926055908203
Number of rows in the subsample:  998
Done.
Elapsed time for task #52: 57.04700422286987


 task #53/72 --------------------
X.shape (initial): (10299, 561)
y.shape (initial): (10299,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


54it [06:04, 33.62s/it]

  top 10 indices: [ 40 558  41  52  49 559  53  56  50  57]
  X reduced shape: (10299, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.02557516098022461
Number of rows in the subsample:  997
Done.
Elapsed time for task #53: 49.9165575504303


 task #54/72 --------------------


55it [06:05, 23.82s/it]

X.shape (initial): (500, 12)
y.shape (initial): (500,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [ 2  8  0  4  1  6 10  3  5 11]
  X reduced shape: (500, 10)
...Done.
Elapsed time for task #54: 0.9584224224090576


 task #55/72 --------------------
X.shape (initial): (5500, 40)
y.shape (initial): (5500,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


56it [06:08, 17.41s/it]

  top 10 indices: [22 29 39 19  9  5  2  4 23 28]
  X reduced shape: (5500, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.03547859191894531
Number of rows in the subsample:  990
Done.
Elapsed time for task #55: 2.446132183074951


 task #56/72 --------------------
X.shape (initial): (67557, 42)
y.shape (initial): (67557,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


57it [06:16, 14.79s/it]

  top 10 indices: [30  6 18 12 13  0 24 36 19 14]
  X reduced shape: (67557, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.041795969009399414
Number of rows in the subsample:  999
Done.
Elapsed time for task #56: 8.678444147109985


 task #57/72 --------------------
X.shape (initial): (1080, 77)
y.shape (initial): (1080,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


58it [06:18, 10.79s/it]

  top 10 indices: [32 46 10 76 30 65 53 17  0 50]
  X reduced shape: (1080, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.02282571792602539
Number of rows in the subsample:  998
Done.
Elapsed time for task #57: 1.4576756954193115


 task #58/72 --------------------
X.shape (initial): (1941, 27)
y.shape (initial): (1941,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


59it [06:19,  7.98s/it]

  top 10 indices: [10 21  4  7 22 13 17  0  1 24]
  X reduced shape: (1941, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.023145675659179688
Number of rows in the subsample:  997
Done.
Elapsed time for task #58: 1.4122834205627441


 task #59/72 --------------------


60it [06:20,  5.86s/it]

X.shape (initial): (540, 18)
y.shape (initial): (540,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
  top 10 indices: [ 1  0 12 13  5  4 14 17 10 15]
  X reduced shape: (540, 10)
...Done.
Elapsed time for task #59: 0.909733772277832


 task #60/72 --------------------


61it [06:21,  4.45s/it]

X.shape (initial): (4839, 5)
y.shape (initial): (4839,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.02387380599975586
Number of rows in the subsample:  999
Done.
Elapsed time for task #60: 1.168506145477295


 task #61/72 --------------------


62it [06:22,  3.42s/it]

X.shape (initial): (1728, 6)
y.shape (initial): (1728,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.02283620834350586
Number of rows in the subsample:  998
Done.
Elapsed time for task #61: 1.0068128108978271


 task #62/72 --------------------
X.shape (initial): (2310, 16)
y.shape (initial): (2310,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


63it [06:24,  2.81s/it]

  top 10 indices: [15 12  8  7  6  9 14 10 13 11]
  X reduced shape: (2310, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.02314615249633789
Number of rows in the subsample:  994
Done.
Elapsed time for task #62: 1.3799090385437012


 task #63/72 --------------------
X.shape (initial): (2000, 240)
y.shape (initial): (2000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


64it [06:25,  2.33s/it]

  top 10 indices: [151 113  96  57  71  56  72 152  86 214]
  X reduced shape: (2000, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.023044109344482422
Number of rows in the subsample:  1000
Done.
Elapsed time for task #63: 1.2165896892547607


 task #64/72 --------------------
X.shape (initial): (70000, 784)
y.shape (initial): (70000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


65it [07:39, 23.83s/it]

  top 10 indices: [246 434 574 602 546 262 234 406 490 658]
  X reduced shape: (70000, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.05324673652648926
Number of rows in the subsample:  1000
Done.
Elapsed time for task #64: 73.98587989807129


 task #65/72 --------------------


66it [07:58, 22.50s/it]

X.shape (initial): (44819, 6)
y.shape (initial): (44819,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.03522205352783203
Number of rows in the subsample:  998
Done.
Elapsed time for task #65: 19.40232276916504


 task #66/72 --------------------
X.shape (initial): (96320, 21)
y.shape (initial): (96320,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


67it [08:41, 28.51s/it]

  top 10 indices: [15  1 19  5 12  4 14 18 20  9]
  X reduced shape: (96320, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.04986572265625
Number of rows in the subsample:  999
Done.
Elapsed time for task #66: 42.52905035018921


 task #67/72 --------------------
X.shape (initial): (92000, 1024)
y.shape (initial): (92000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


68it [10:32, 53.21s/it]

  top 10 indices: [619 611 431 910 908 579 643 675 712 594]
  X reduced shape: (92000, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.10752558708190918
Number of rows in the subsample:  966
Done.
Elapsed time for task #67: 110.83830332756042


 task #68/72 --------------------
X.shape (initial): (60000, 3072)
y.shape (initial): (60000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


69it [15:20, 123.89s/it]

  top 10 indices: [2085 2074 2057 2076 2083 2055 2103 2054 2050 2956]
  X reduced shape: (60000, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.051497459411621094
Number of rows in the subsample:  1000
Done.
Elapsed time for task #68: 288.81143021583557


 task #69/72 --------------------
X.shape (initial): (3279, 1558)
y.shape (initial): (3279,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


70it [18:02, 135.26s/it]

  top 10 indices: [   1 1399    2  351    0 1229 1243  398 1483 1455]
  X reduced shape: (3279, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.023302793502807617
Number of rows in the subsample:  999
Done.
Elapsed time for task #69: 161.80345225334167


 task #70/72 --------------------
X.shape (initial): (3186, 180)
y.shape (initial): (3186,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


71it [19:18, 117.42s/it]

  top 10 indices: [ 84  89  92 104  99  82  88  93  83  87]
  X reduced shape: (3186, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.02329397201538086
Number of rows in the subsample:  999
Done.
Elapsed time for task #70: 75.77686166763306


 task #71/72 --------------------
X.shape (initial): (5000, 20)
y.shape (initial): (5000,)
Encoding features and response...
...Done.
Finding top 10 features if necessary...


72it [19:29, 16.24s/it] 

  top 10 indices: [ 7  9 19  4 12 10 17 16 15 18]
  X reduced shape: (5000, 10)
...Done.
Subsampling to 1000 if necessary...
... Elapsed time for subsampling: 0.023724079132080078
Number of rows in the subsample:  999
Done.
Elapsed time for task #71: 11.020227193832397
Total elapsed time: {time()-start}





In [None]:
# Number of datasets actually collected (tasks that failed to download are absent).
n_tasks = len(Xys)
print(f"Number of data sets: {n_tasks}")

In [15]:
# Sanity check of the reduced datasets: fit a random forest on an 80/20 split
# of each one and report hold-out accuracy and fitting time.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

n_tasks = len(Xys)
# Seeded so that the reported accuracies are reproducible across runs.
clf = RandomForestClassifier(random_state=42)

for i, value in enumerate(Xys.values(), start=1):
  print(f"\n\n task #{i}/{n_tasks} --------------------")
  X, y = value["dataset"]
  print(X.shape)
  print(y.shape)
  print(X.dtype)
  print(y.dtype)
  X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                      test_size=0.2,
                                                      random_state=42)
  start = time()
  clf.fit(X_train, y_train)
  fit_seconds = time() - start  # fitting only; scoring is excluded from the timing
  print(f"accuracy: {clf.score(X_test, y_test)}")
  print(f"Elapsed fitting time: {fit_seconds} seconds")
# n_tasks rather than the loop index: correct count, and defined even if Xys is empty.
print(f"Total tasks: {n_tasks}")



 task #1/72 --------------------
(999, 10)
(999,)
float32
uint8
accuracy: 0.98
Elapsed fitting time: 0.11992907524108887 seconds


 task #2/72 --------------------
(987, 10)
(987,)
float32
uint8
accuracy: 0.803030303030303
Elapsed fitting time: 0.1830589771270752 seconds


 task #3/72 --------------------
(625, 4)
(625,)
float32
uint8
accuracy: 0.8
Elapsed fitting time: 0.11958980560302734 seconds


 task #4/72 --------------------
(1000, 10)
(1000,)
float32
uint8
accuracy: 0.915
Elapsed fitting time: 0.20526337623596191 seconds


 task #5/72 --------------------
(1000, 10)
(1000,)
float32
uint8
accuracy: 0.89
Elapsed fitting time: 0.22701525688171387 seconds


 task #6/72 --------------------
(699, 9)
(699,)
float32
uint8
accuracy: 0.9642857142857143
Elapsed fitting time: 0.12316179275512695 seconds


 task #7/72 --------------------
(1000, 10)
(1000,)
float32
uint8
accuracy: 0.935
Elapsed fitting time: 0.22740483283996582 seconds


 task #8/72 --------------------
(1000, 6)
(1000,)

# 3 - Save data

In [13]:
# Report how many datasets ended up in Xys before saving
# (tasks that raised during the download loop were skipped).
print(f"Number of data sets: {len(Xys)}")

Number of data sets: 72


In [14]:
# Persist the whole Xys dict to disk with joblib; the date in the file name is
# hard-coded. The file is pickle-based, so only load it back from a trusted source.
joblib.dump(Xys, "openml-cc18-Xys-2024-05-20.pkl")

['openml-cc18-Xys-2024-05-20.pkl']