### Comparing results across all datasets and experiment types

In [11]:
import os
import pandas as pd

datasets = ('lung', 'prostate', 'toxicity', 'cll', 'smk')
experiment_types = ('svd', 'nmf', 'dnn')

# Summary-table column name -> metrics.csv column it is read from.
METRIC_COLUMNS = {
    'Train Balanced Accuracy': 'bestmodel_train/balanced_accuracy',
    'Validation Balanced Accuracy': 'bestmodel_valid/balanced_accuracy',
    'Test Balanced Accuracy': 'bestmodel_test/balanced_accuracy',
}


def collect_experiment_results(datasets, experiment_types, logs_dir='logs'):
    """Gather the best balanced accuracies of every run into one DataFrame.

    For each (dataset, experiment type) pair the runs are expected under
    ``<logs_dir>/<experiment_type>_<dataset>/<run>/metrics.csv``.

    Args:
        datasets: iterable of dataset names.
        experiment_types: iterable of experiment-type names.
        logs_dir: root directory that holds the experiment folders.

    Returns:
        DataFrame with one row per run and the columns 'Dataset',
        'Experiment' plus the keys of ``METRIC_COLUMNS``. The frame is empty
        (but still has these columns) when no metrics.csv file is found.

    Raises:
        KeyError: if a metrics.csv lacks one of the expected metric columns.
    """
    records = []
    for dataset in datasets:
        for experiment_type in experiment_types:
            folder_path = os.path.join(logs_dir, f"{experiment_type}_{dataset}")

            # One missing experiment must not abort the whole comparison;
            # report it so the gap in the table is not silent.
            if not os.path.isdir(folder_path):
                print(f"Skipping missing experiment folder: {folder_path}")
                continue

            # Sorted so the row order does not depend on the filesystem.
            for folder in sorted(os.listdir(folder_path)):
                metrics_path = os.path.join(folder_path, folder, 'metrics.csv')
                # Stray files and runs that never wrote metrics are ignored.
                if not os.path.exists(metrics_path):
                    continue
                metrics = pd.read_csv(metrics_path)

                record = {'Dataset': dataset, 'Experiment': experiment_type}
                for label, column in METRIC_COLUMNS.items():
                    # Series.max() skips NaN, so rows where the metric was
                    # not logged do not affect the result.
                    record[label] = metrics[column].max()
                records.append(record)

    # Passing the columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(records, columns=['Dataset', 'Experiment', *METRIC_COLUMNS])


results_df = collect_experiment_results(datasets, experiment_types)

if results_df.empty:
    # Nothing to average; avoid grouping an empty frame.
    grouped_results = results_df
    print("No metrics.csv files found for the requested experiments")
else:
    # Mean over all runs of each (dataset, experiment type) pair.
    grouped_results = results_df.groupby(['Dataset', 'Experiment']).mean()
    grouped_results = grouped_results.reset_index()
    print(grouped_results)


     Dataset Experiment  Train Balanced Accuracy  \
0        cll        dnn                 0.908107   
1        cll        nmf                 0.889562   
2        cll        svd                 0.464659   
3       lung        dnn                 0.702771   
4       lung        nmf                 0.901653   
5       lung        svd                 0.475263   
6   prostate        dnn                 0.878824   
7   prostate        nmf                 0.938039   
8   prostate        svd                 0.685882   
9        smk        dnn                 0.631334   
10       smk        nmf                 0.722183   
11       smk        svd                 0.505000   
12  toxicity        dnn                 0.611777   
13  toxicity        nmf                 0.773102   
14  toxicity        svd                 0.417312   

    Validation Balanced Accuracy  Test Balanced Accuracy  
0                       0.833333                0.722896  
1                       0.733333                0

Demo script to analyze experiment results

In [1]:
import os
import pandas as pd

In [4]:
def load_best_balanced_accs(experiment_name, logs_dir='logs'):
    """Read the best-model balanced accuracies of every run of an experiment.

    Iterates over all run folders in ``<logs_dir>/<experiment_name>`` and
    reads ``metrics.csv`` from each of them.

    Args:
        experiment_name: name of the experiment folder inside ``logs_dir``.
        logs_dir: root directory that holds the experiment folders.

    Returns:
        Tuple ``(train, valid, test)`` of lists with one value per run.
        All three lists are empty when the experiment folder is missing or
        contains no metrics.csv file.

    Raises:
        KeyError: if a metrics.csv lacks one of the expected metric columns.
    """
    train_accs, valid_accs, test_accs = [], [], []
    experiment_dir = os.path.join(logs_dir, experiment_name)

    if not os.path.isdir(experiment_dir):
        print(f"Skipping missing experiment folder: {experiment_dir}")
        return train_accs, valid_accs, test_accs

    # Sorted so the order of the values does not depend on the filesystem.
    for folder in sorted(os.listdir(experiment_dir)):
        metrics_path = os.path.join(experiment_dir, folder, 'metrics.csv')
        # Stray files and runs that never wrote metrics are ignored.
        if not os.path.isfile(metrics_path):
            continue
        metrics = pd.read_csv(metrics_path)

        # Per the original notebook note, each 'bestmodel_*' column holds a
        # single value computed at the end of the experiment, so max()
        # (which skips NaN) picks out that value.
        train_accs.append(metrics['bestmodel_train/balanced_accuracy'].max())
        valid_accs.append(metrics['bestmodel_valid/balanced_accuracy'].max())
        test_accs.append(metrics['bestmodel_test/balanced_accuracy'].max())

    return train_accs, valid_accs, test_accs


def report_mean_balanced_accs(train_balanced_accs, valid_balanced_acc, test_balanced_acc):
    """Print the mean of each list of balanced accuracies as a percentage.

    Args:
        train_balanced_accs: per-run train balanced accuracies.
        valid_balanced_acc: per-run validation balanced accuracies.
        test_balanced_acc: per-run test balanced accuracies.
    """
    print(f"Average across all {len(train_balanced_accs)} experiments:")
    if not train_balanced_accs:
        # Nothing to average; avoids a ZeroDivisionError below.
        print("no runs with a metrics.csv were found")
        return

    labelled_scores = (
        ("train_balanced_accs", train_balanced_accs),
        ("valid_balanced_acc", valid_balanced_acc),
        ("test_balanced_acc", test_balanced_acc),
    )
    for label, scores in labelled_scores:
        print("%s mean: %.2f" % (label, (sum(scores) / len(scores)) * 100))


# Summarize every experiment with the same code path instead of a pasted copy.
for experiment_name in ("samplerun1", "samplerun2"):
    train_balanced_accs, valid_balanced_acc, test_balanced_acc = load_best_balanced_accs(experiment_name)
    report_mean_balanced_accs(train_balanced_accs, valid_balanced_acc, test_balanced_acc)

Average across all 25 experiments:
train_balanced_accs mean: 56.55
valid_balanced_acc mean: 48.04
test_balanced_acc mean: 54.37
Average across all 25 experiments:
train_balanced_accs mean: 58.97
valid_balanced_acc mean: 53.39
test_balanced_acc mean: 56.19
