In [None]:
import openml

In [None]:
# Fetch OpenML task 31 (with download_splits=False) and the dataset attached to it.
task = openml.tasks.get_task(31, download_splits=False)
dataset = task.get_dataset()

# Print a short summary of the task and its dataset; the instance and feature
# counts are read from the dataset's `qualities` mapping.
print(f"Task type: {task.task_type}")
print(f"Dataset name: {dataset.name}")
print(f"Description: {dataset.description}")
print(f"Number of instances: {dataset.qualities['NumberOfInstances']}")
print(f"Number of features: {dataset.qualities['NumberOfFeatures']}")
print(f"Target feature: {dataset.default_target_attribute}")

In [1]:
import os

def find_child_index(parent_dir, child_name):
    """Return the position of ``child_name`` in ``os.listdir(parent_dir)``.

    Args:
        parent_dir: Path of the directory whose entries are listed.
        child_name: Entry name (not a full path) to look for.

    Returns:
        The zero-based index of ``child_name`` in the listing, or ``-1`` when
        the entry is absent or ``parent_dir`` does not exist (in the latter
        case a message is also printed).

    Note:
        ``os.listdir`` returns entries in arbitrary order, so the index is
        only meaningful relative to the listing produced by this call.
    """
    try:
        entries = os.listdir(parent_dir)
    except FileNotFoundError:
        # Missing parent directory is reported, not raised.
        print(f"Parent directory {parent_dir} not found")
        return -1

    try:
        return entries.index(child_name)
    except ValueError:
        # list.index raises ValueError when the name is not in the listing.
        return -1

In [3]:
# Directory whose listing is searched, and the entry name to look up in it.
parent_dir = "results-category3"
child_name = "cv_early_stop_strategy=current_average_worse_than_mean_best-fold=0-metric=roc_auc_ovr-n_splits=10-optimizer=random_search-pipeline=mlp_classifier-task=146818"

# Prints the entry's position in os.listdir(parent_dir), or -1 if it is not found.
print(find_child_index(parent_dir, child_name))

348


# Quick commands
1. Submit one experiment
    ```bash
    python e1.py submit --expname "category3-nsplits-10" --job-array-limit 1000 --mail-type ALL --mail-user avakhutinskiy1@sheffield.ac.uk
    ```

    Output:
    ```text
    pending: 1080
    Submitted batch job 5860009

    Due to the job array of the experiment "category3-nsplits-10" (1080) exceeding job array limit (1000):
    Submitted chunk 0-999 of 1079

    To submit the next chunk, run the following command when 80 jobs terminate (succed or fail):

        python e1.py submit --expname category3-nsplits-10 --job-array-limit 1000 --chunk-start-idx 1000
    ```
2. Count `.flag.submitted` marker files
    ```bash
    find results-category3/ -name ".flag.submitted" | wc -l
    ```

    Output: `1000`

# Analysis

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the two result tables that the following cells compare:
# `df_paper` from ../data-paper and `df` from ../data.
df_paper  = pd.read_parquet("../data-paper/mlp-nsplits-10.parquet.gzip")
df = pd.read_parquet("../data/mlp-nsplits-10.parquet")

In [3]:
# Compare the (rows, columns) shapes of the two frames.
print(f'df_paper shape: {df_paper.shape}')
print(f'df shape:    {df.shape}')

# One-directional difference: columns present in `df` but not in `df_paper`.
# Columns that exist only in `df_paper` would NOT show up here.
columns_diff = set(df.columns).difference(df_paper.columns)
print(f'\n{columns_diff}')

df_paper shape: (2732865, 56)
df shape:    (3213287, 57)

{'setting:seeded_inner_cv'}


In [4]:
# Remove the columns that exist only in `df` (collected in `columns_diff`).
# Rebinding `df` with errors="ignore" instead of dropping in place keeps this
# cell re-runnable: `columns_diff` is computed in an earlier cell, so a second
# run of an in-place drop would raise KeyError for already-removed columns.
df = df.drop(columns=list(columns_diff), errors="ignore")

In [5]:
# Print the column labels and the row index of both frames for side-by-side inspection.
print("df_paper columns:", df_paper.columns)
print("df columns:", df.columns)
print("df_paper index:", df_paper.index)
print("df index:", df.index)

df_paper columns: Index(['created_at', 'reported_at', 'status',
       'metric:roc_auc_ovr [0.0, 1.0] (maximize)',
       'summary:val_mean_roc_auc_ovr', 'summary:val_std_roc_auc_ovr',
       'summary:test_mean_roc_auc_ovr', 'summary:test_std_roc_auc_ovr',
       'summary:test_bagged_roc_auc_ovr', 'summary:split_0:val_roc_auc_ovr',
       'summary:split_1:val_roc_auc_ovr', 'summary:split_2:val_roc_auc_ovr',
       'summary:split_3:val_roc_auc_ovr', 'summary:split_4:val_roc_auc_ovr',
       'summary:split_5:val_roc_auc_ovr', 'summary:split_6:val_roc_auc_ovr',
       'summary:split_7:val_roc_auc_ovr', 'summary:split_8:val_roc_auc_ovr',
       'summary:split_9:val_roc_auc_ovr', 'summary:split_0:test_roc_auc_ovr',
       'summary:split_1:test_roc_auc_ovr', 'summary:split_2:test_roc_auc_ovr',
       'summary:split_3:test_roc_auc_ovr', 'summary:split_4:test_roc_auc_ovr',
       'summary:split_5:test_roc_auc_ovr', 'summary:split_6:test_roc_auc_ovr',
       'summary:split_7:test_roc_auc_ovr', 