In [2]:
import openml

In [3]:
task = openml.tasks.get_task(31, download_splits=False)
dataset = task.get_dataset()

print(f"Task type: {task.task_type}")
print(f"Dataset name: {dataset.name}")
print(f"Description: {dataset.description}")
print(f"Number of instances: {dataset.qualities['NumberOfInstances']}")
print(f"Number of features: {dataset.qualities['NumberOfFeatures']}")
print(f"Target feature: {dataset.default_target_attribute}")

Task type: Supervised Classification
Dataset name: credit-g
Description: **Author**: Dr. Hans Hofmann  
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994    
**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)

**German Credit dataset**  
This dataset classifies people described by a set of attributes as good or bad credit risks.

This dataset comes with a cost matrix: 
``` 
Good  Bad (predicted)  
Good   0    1   (actual)  
Bad    5    0  
```

It is worse to class a customer as good when they are bad (5), than it is to class a customer as bad when they are good (1).  

### Attribute description  

1. Status of existing checking account, in Deutsche Mark.  
2. Duration in months  
3. Credit history (credits taken, paid back duly, delays, critical accounts)  
4. Purpose of the credit (car, television,...)  
5. Credit amount  
6. Status of savings account/bonds, in Deutsche Mark.  
7. Present employment, in number of y

  dataset = get_dataset(task.dataset_id, *dataset_args, **get_dataset_kwargs)
  return datasets.get_dataset(self.dataset_id)


In [4]:
import os

def find_child_index(parent_dir, child_name):
    try:
        directory_contents = os.listdir(parent_dir)
        if child_name in directory_contents:
            return directory_contents.index(child_name)
        return -1
    except FileNotFoundError:
        print(f"Parent directory {parent_dir} not found")
        return -1

In [5]:
parent_dir = "results-category3"
child_name = "cv_early_stop_strategy=current_average_worse_than_mean_best-fold=0-metric=roc_auc_ovr-n_splits=10-optimizer=random_search-pipeline=mlp_classifier-task=146818"

print(find_child_index(parent_dir, child_name))

Parent directory results-category3 not found
-1


# Quick commands
1. One exp
    ```bash
    python e1.py submit --expname "category3-nsplits-10" --job-array-limit 1000 --mail-type ALL --mail-user avakhutinskiy1@sheffield.ac.uk
    ```

    Output:
    ```bash
    pending: 1080
    Submitted batch job 5860009

    Due to the job array of the experiment "category3-nsplits-10" (1080) exceeding job array limit (1000):
    Submitted chunk 0-999 of 1079

    To submit the next chunk, run the following command when 80 jobs terminate (succed or fail):

        python e1.py submit --expname category3-nsplits-10 --job-array-limit 1000 --chunk-start-idx 1000
    ```
2. Count files
    ```bash
    find results-category3/ -name ".flag.submitted" | wc -l
    ```

    Output: `1000`

# Analysis df_paper

In [1]:
import json

import pandas as pd
import numpy as np

from utils.column_stats import generate_column_stats

In [2]:
exp = "mlp-nsplits-10"
df_paper  = pd.read_parquet(f"data-paper/{exp}.parquet.gzip")

In [8]:
stats_dict = generate_column_stats(df=df_paper)
print(type(stats_dict))

<class 'dict'>


Results from the paper

| Aggressive Average Speedup % | Aggressive Datasets Failed | Forgiving Average Speedup % | Forgiving Datasets Failed |
| ----------------------------- | ------------------------- | ---------------------------- | ------------------------ |
| 301% ± 187%                  | 20/36                     | 174% ± 64%                  | 0/36                    |


In [None]:
# Step 1: Compute the Incumbent Trace per Dataset/Fold
df_paper["created_at"] = pd.to_datetime(df_paper["created_at"])
df_sorted = df_paper.sort_values("created_at")

groups = df_sorted.groupby(["setting:task", "setting:fold"])

def add_incumbent_trace(grp):
    grp = grp.copy()
    grp["incumbent_val"] = grp["summary:val_mean_roc_auc_ovr"].cummax()
    return grp

df_with_trace = groups.apply(add_incumbent_trace).reset_index(drop=True)

  df_with_trace = groups.apply(add_incumbent_trace).reset_index(drop=True)


In [None]:
# Step 2: Aggregate Across Folds per Dataset


# Analysis df vs df_paper

In [6]:
import pandas as pd
import numpy as np

In [8]:
# df_paper  = pd.read_parquet("../data-paper/mlp-nsplits-10.parquet.gzip")
df = pd.read_parquet("data/mlp-nsplits-10.parquet")

In [9]:
print(f'df_paper shape: {df_paper.shape}')
print(f'df shape:    {df.shape}')

columns_diff = set(df.columns).difference(df_paper.columns)
print(f'\n{columns_diff}')

df_paper shape: (2732865, 56)
df shape:    (3213287, 57)

{'setting:seeded_inner_cv'}


In [10]:
df.drop(columns=columns_diff, inplace=True)

In [11]:
print("df_paper columns:", df_paper.columns)
print("df columns:", df.columns)
print("df_paper index:", df_paper.index)
print("df index:", df.index)

df_paper columns: Index(['created_at', 'reported_at', 'status',
       'metric:roc_auc_ovr [0.0, 1.0] (maximize)',
       'summary:val_mean_roc_auc_ovr', 'summary:val_std_roc_auc_ovr',
       'summary:test_mean_roc_auc_ovr', 'summary:test_std_roc_auc_ovr',
       'summary:test_bagged_roc_auc_ovr', 'summary:split_0:val_roc_auc_ovr',
       'summary:split_1:val_roc_auc_ovr', 'summary:split_2:val_roc_auc_ovr',
       'summary:split_3:val_roc_auc_ovr', 'summary:split_4:val_roc_auc_ovr',
       'summary:split_5:val_roc_auc_ovr', 'summary:split_6:val_roc_auc_ovr',
       'summary:split_7:val_roc_auc_ovr', 'summary:split_8:val_roc_auc_ovr',
       'summary:split_9:val_roc_auc_ovr', 'summary:split_0:test_roc_auc_ovr',
       'summary:split_1:test_roc_auc_ovr', 'summary:split_2:test_roc_auc_ovr',
       'summary:split_3:test_roc_auc_ovr', 'summary:split_4:test_roc_auc_ovr',
       'summary:split_5:test_roc_auc_ovr', 'summary:split_6:test_roc_auc_ovr',
       'summary:split_7:test_roc_auc_ovr', 

In [14]:
idx_df_paper = set(df_paper.index)
idx_df = set(df.index)

extra_trials = idx_df - idx_df_paper
print(f"Number of extra trials in df: {len(extra_trials)}")
print("Some extra trials in df:", list(extra_trials)[:10])

missing_in_df = idx_df_paper - idx_df
print(f"Number of trials in df_paper not in df: {len(missing_in_df)}")

duplicates_df = df.index.duplicated().sum()
print("Number of duplicate trial-ids in df:", duplicates_df)

duplicates_df_paper = df_paper.index.duplicated().sum()
print("Number of duplicate trial-ids in df_paper:", duplicates_df_paper)

Number of extra trials in df: 7667
Some extra trials in df: ['trial-34242', 'trial-34959', 'trial-35022', 'trial-36874', 'trial-33443', 'trial-30433', 'trial-35092', 'trial-34349', 'trial-33891', 'trial-35648']
Number of trials in df_paper not in df: 0
Number of duplicate trial-ids in df: 3175865
Number of duplicate trial-ids in df_paper: 2703110


Start      - 2025-04-07T20:26:38

End        - 2025-04-08T18:53:41

Time Delta - 80823 seconds (22.45 hours)