In [None]:
from TabArenaIterator import TabArenaIterator
import pandas as pd
import numpy as np
import plotly.express as px


# Result table: one row per dataset, appended by the loop further down.
final_table = pd.DataFrame(
    columns=[
        # Dataset identity
        'dataset_id',
        'dataset_name',
        # Shape before / after the feature-dropping process
        'original_number_features',
        'final_number_features',
        'original_number_rows',
        'final_number_rows',
        # Density before / after the feature-dropping process
        'original_density',
        'final_density',
        # Model metrics (not filled by the density loop; they stay empty there)
        'initial_r2',
        'accuracy_initial',
        'r2',
        'accuracy',
    ]
)

# Metadata CSV listing the TabArena datasets; the iterator built from it is
# consumed below as (metadata row, dataset frame) pairs.
tabArenaURL = 'https://raw.githubusercontent.com/TabArena/tabarena_dataset_curation/refs/heads/main/dataset_creation_scripts/metadata/tabarena_dataset_metadata.csv'
iterator = TabArenaIterator(tabArenaURL)


def getDensitiesPlot(df, target_feature, density_threshold=0.1, show_fig=False):
    """Drop high-cardinality features until the dataset is dense enough.

    The density of a frame is defined here as

        number of distinct rows / product of the per-column cardinalities

    i.e. the fraction of the feature-value grid that is actually observed.
    Features are removed one at a time, highest cardinality first (ranking
    taken once, from the de-duplicated input), and duplicated rows are
    dropped after every removal. The process stops as soon as the density
    is strictly greater than ``density_threshold`` or no feature is left.

    Parameters
    ----------
    df : pandas.DataFrame
        Input dataset; it is not modified.
    target_feature : label or list of labels
        Column(s) removed before the density is computed.
    density_threshold : float, default 0.1
        Density that has to be exceeded for the dropping to stop.
    show_fig : bool, default False
        If True, show a plotly line chart of the density after each drop.

    Returns
    -------
    (pandas.DataFrame, dict)
        The reduced, de-duplicated feature frame (original index labels are
        kept, target column not included) and a dict with the keys
        ``original_number_features``, ``original_number_rows``,
        ``original_density``, ``final_number_features``,
        ``final_number_rows`` and ``final_density``.
    """
    import math  # local import: keeps this cell independent of the import cell

    def _density(frame):
        # Python ints give arbitrary precision, so the product of hundreds of
        # cardinalities cannot overflow. A column without any observed value
        # (nunique == 0) counts as a single state so the product is never 0.
        grid_size = math.prod(max(int(c), 1) for c in frame.nunique().values)
        return len(frame) / grid_size

    # Drop the target column and duplicated rows
    df_copy = df.copy().drop(columns=target_feature).drop_duplicates()

    # Record the original dataset information
    original_number_features = len(df_copy.columns)
    original_number_rows = len(df_copy)
    print(
        "Total features:", original_number_features, "Total rows:", original_number_rows
    )

    # Calculate the original density
    original_density = _density(df_copy)
    print("Original density:", original_density)
    print()
    densities = [original_density]

    # Start from the original density so the result is defined even when no
    # feature is dropped (no features at all, or already dense enough).
    density = original_density

    # Drop columns one by one, highest cardinality first, until the
    # threshold is reached
    for feature in df_copy.nunique().sort_values(ascending=False).index:

        # Check the stop requirement *before* dropping, so an already dense
        # dataset does not lose a feature needlessly
        if density > density_threshold:
            break

        # Drop the corresponding column and the rows that became duplicates
        df_copy = df_copy.drop(columns=feature).drop_duplicates()

        # Calculate the density again
        density = _density(df_copy)
        densities.append(density)

    # Record the final dataset information
    final_number_features = len(df_copy.columns)
    final_number_rows = len(df_copy)
    print("Features left:", final_number_features, "Rows left:", final_number_rows)
    print("Final density:", density)
    print()

    # Plot the column dropping process
    if show_fig:
        fig = px.line(
            x=range(0, len(densities), 1),
            y=densities,
            labels={"x": "Number features dropped", "y": "Density"},
        )
        fig.show()

    # Return the final dataset and the change information
    return df_copy, {
        "original_number_features": original_number_features,
        "original_number_rows": original_number_rows,
        "original_density": original_density,
        "final_number_features": final_number_features,
        "final_number_rows": final_number_rows,
        "final_density": density,
    }


for row, df in iterator:
    print(row['dataset_id'])

    # Reduce the dataset and collect the before/after statistics
    dense_df, infos = getDensitiesPlot(
        df,
        row['target_feature'],
        density_threshold=0.1,
        show_fig=True,
    )

    # Tag the statistics with the dataset identity
    infos.update(dataset_id=row['dataset_id'], dataset_name=row['dataset_name'])

    # Add the statistics as a new last row of the result table
    final_table.loc[len(final_table)] = infos

46904
Total features: 5 Total rows: 1503
Original density: 0.0010519022423784328

Features left: 4 Rows left: 1503
Final density: 0.11044973544973545



46905
Total features: 9 Total rows: 32769
Original density: 5.4331464888837344e-21

Features left: 1 Rows left: 67
Final density: 1.0



46906
Total features: 38 Total rows: 886
Original density: 6.763894527119904e-16

Features left: 13 Rows left: 8
Final density: 0.125



46907
Total features: 7 Total rows: 1515
Original density: 5.646737052785582e-10

Features left: 3 Rows left: 34
Final density: 0.3541666666666667



46908
Total features: 170 Total rows: 76000
Original density: 0.0

Features left: 3 Rows left: 52
Final density: 0.8666666666666667



46910
Total features: 13 Total rows: 44905
Original density: 5.349772965830006e-12

Features left: 8 Rows left: 2533
Final density: 0.18323206018518517



46911
Total features: 10 Total rows: 10000
Original density: 4.6085769342056656e-12

Features left: 6 Rows left: 723
Final density: 0.6846590909090909



46912
Total features: 1776 Total rows: 3751
Original density: 0.0



In [4]:
# Display the per-dataset summary table collected by the loop above.
final_table

Unnamed: 0,dataset_id,dataset_name,original_number_features,final_number_features,original_number_rows,final_number_rows,original_density,final_density,initial_r2,accuracy_initial,r2,accuracy
0,46904,airfoil_self_noise,5,4,1503,1503,0.001051902,0.11045,,,,
1,46905,Amazon_employee_access,9,1,32769,67,5.4331459999999996e-21,1.0,,,,
2,46906,anneal,38,13,886,8,6.763895e-16,0.125,,,,
3,46907,Another-Dataset-on-used-Fiat-500,7,3,1515,34,5.646737e-10,0.354167,,,,
4,46908,APSFailure,170,3,76000,52,0.0,0.866667,,,,
5,46910,bank-marketing,13,8,44905,2533,5.349773e-12,0.183232,,,,
6,46911,Bank_Customer_Churn,10,6,10000,723,4.608577e-12,0.684659,,,,
7,46912,Bioresponse,1776,11,3751,255,0.0,0.124512,,,,
8,46913,blood-transfusion-service-center,4,2,502,188,0.0001906424,0.183773,,,,
9,46915,churn,19,5,5000,625,2.77498e-36,0.248016,,,,


In [46]:
# Inspect the raw original-density values as a NumPy array.
np.array(final_table['original_density'])

array([1.05190224e-003, 5.43314649e-021, 6.76389453e-016, 5.64673705e-010,
       0.00000000e+000, 5.34977297e-012, 4.60857693e-012, 0.00000000e+000,
       1.90642419e-004, 2.77498011e-036, 4.18838624e-067, 3.03855133e-015,
       1.40335277e-013, 4.92838397e-057, 1.86124145e-018, 1.11749823e-013,
       7.57843832e-037, 2.61638676e-013, 9.03257841e-009, 1.05114945e-005,
       5.40739138e-016, 1.06647219e-019, 4.28129354e-090, 5.39922736e-004,
       1.48325730e-035, 0.00000000e+000, 5.23424119e-022, 5.30889048e-010,
       1.44559699e-011, 1.33565265e-010, 0.00000000e+000, 3.46008068e-032,
       7.37295826e-006, 1.01141724e-044, 1.03833218e-068, 9.55781348e-023,
       1.30937265e-029, 4.14238883e-035, 9.50357310e-236, 2.44726371e-063,
       2.92541586e-305, 7.63391402e-010, 2.07543556e-042, 9.40029535e-018,
       6.83804444e-038, 1.29051926e-040, 3.87228427e-287, 0.00000000e+000,
       7.72748057e-002, 4.50609348e-022, 3.67905845e-050])

In [62]:
# The original densities range over many orders of magnitude, so the
# histogram is drawn on their base-10 logarithm.
values = np.array(final_table['original_density'])

# Only strictly positive densities are kept (log10 of 0 is not finite)
log_values = np.log10(values[values > 0])

# Wrap the log-values in a frame so plotly can address them by column name
df = pd.DataFrame({"log10(value)": log_values})

# Histogram of the log-values, with a small gap between the bars
fig = px.histogram(
    df,
    x="log10(value)",
    labels={'log10(value)':'log10(original_density)'},
    nbins=50,
    text_auto=True,
    title="Original density of the datasets from TabArena.",
).update_layout(bargap=0.2)

fig.show()

In [None]:
# Histogram of the raw (non-logarithmic) original densities
fig = px.histogram(
    x=final_table['original_density'],
    labels={'x':'Original_density'},
    text_auto=True,
    title='Original density of datasets from TabArena',
).update_layout(bargap=0.2)
fig.show()

In [21]:
import plotly.express as px

# Histogram of the densities reached after the feature-dropping process
fig = px.histogram(
    x=final_table['final_density'],
    title='Final density of datasets from TabArena after features dropping process.',
    labels={'x':'final_density'},
    text_auto=True,
).update_layout(bargap=0.2)
fig.show()

In [None]:
from TabArenaIterator import TabArenaIterator
from autogluon.tabular import TabularPredictor
import os
import pandas as pd
import numpy as np

# Model keys benchmarked on every dataset (each one is passed on its own as
# {model: {}} to the training helper in the loop below)
models = ['XGB', 'NN_TORCH']

# Result table: dataset identity plus shape and density before / after
# the feature-dropping process
final_table = pd.DataFrame(
    columns=[
        'dataset_id',
        'dataset_name',
        'original_number_features',
        'final_number_features',
        'original_number_rows',
        'final_number_rows',
        'original_density',
        'final_density',
    ]
)

# Four metric columns per model: r2 and accuracy, each for the initial and
# for the final (reduced) dataset
for model in models:
    final_table[f'initial_{model}_r2'] = None
    final_table[f'initial_{model}_accuracy'] = None
    final_table[f'final_{model}_r2'] = None
    final_table[f'final_{model}_accuracy'] = None

# Fresh iterator over the TabArena datasets described by the metadata CSV
tabArenaURL = 'https://raw.githubusercontent.com/TabArena/tabarena_dataset_curation/refs/heads/main/dataset_creation_scripts/metadata/tabarena_dataset_metadata.csv'
iterator = TabArenaIterator(tabArenaURL)
# For every dataset: compute the dense version, evaluate each model on the
# initial and on the dense data, and store everything as one table row.
for row, df in iterator:
    print(row['dataset_id'])

    # Obtain the dense df and the information about the changes
    dense_df, infos = getDensitiesPlot(df, row['target_feature'], density_threshold=0.1, show_fig=False)

    # Train the initial models and get the metrics.
    # trainAutogluonModels is not defined in this cell; from its use here it
    # returns a (metric name, metric value) pair -- TODO confirm its contract.
    for model in models:
        metric, result = trainAutogluonModels(df, 'AutoGluonModels/' + 'initial_' + model + '_' + row['dataset_name'],
                                              row['problem_type'], row['target_feature'], {model:{}})
        infos['initial_' + model + '_' + metric] = result

    # If the dense df does not fulfill the requirement, skip to the next dataset.
    # NOTE(review): the skipped dataset is not added to final_table, so the
    # initial metrics computed above are discarded -- confirm this is intended.
    if infos['final_number_features'] < 3 or infos['final_number_rows'] < 30:
        print('Feature or obaservations not fulfill requirements.')
        continue

    # Re-attach the target to the dense rows. dense_df keeps the index
    # *labels* of df, so the lookup has to be label-based (.loc); positional
    # .iloc is only equivalent when df has a default RangeIndex.
    dense_df[row['target_feature']] = df.loc[dense_df.index, row['target_feature']]
    dense_df = dense_df.reset_index(drop=True)

    # Train the models on the dense dataset and get the metrics
    for model in models:
        metric, result = trainAutogluonModels(dense_df, 'AutoGluonModels/' + 'final_' + model + '_' + row['dataset_name'],
                                              row['problem_type'], row['target_feature'], {model:{}})
        infos['final_' + model + '_' + metric] = result

    infos['dataset_id'] = row['dataset_id']
    infos['dataset_name'] = row['dataset_name']

    # Append the row to the final table
    final_table.loc[len(final_table)] = infos
    