In [1]:
import tqdm
import itertools

import datamol as dm
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from mood.constants import DOWNSTREAM_RESULTS_DIR
from mood.dataset import MOOD_DATASETS
from mood.representations import MOOD_REPRESENTATIONS
from mood.baselines import SUPPORTED_BASELINES
from mood.utils import get_outlier_bounds
from mood.metrics import Metric
from mood.visualize import axes_grid_iterator, plot_performance_over_distance

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Name of the results sub-directory to read from; used as the last component of the
# input directory path. NOTE(review): looks like a YYYYMMDD run date — confirm.
SUBDIR = "20221213"

In [3]:
# Directory holding the per-run CSV exports for the selected SUBDIR.
in_dir = dm.fs.join(
    DOWNSTREAM_RESULTS_DIR,
    "dataframes",
    "compare_performance",
    SUBDIR,
)

# Collect every "perf_over_distance_*.csv" file in that directory.
pattern = dm.fs.join(in_dir, "perf_over_distance_*.csv")
paths = dm.fs.glob(pattern)

In [4]:
def find_missing(paths, all_triplets):
    """Build a report of the expected triplets that have no matching file.

    Each path's basename is stripped of its final extension and split on "_";
    everything from the fourth token onwards is taken as that file's triplet.

    Args:
        paths: Iterable of file paths to inspect.
        all_triplets: Iterable of tuples that are expected to be present.

    Returns:
        A string starting with "Missing <count>:" followed by one line per
        missing triplet, in sorted order.
    """
    found = set()
    for file_path in paths:
        basename = dm.fs.get_basename(file_path)
        # Drop only the last extension; any earlier dots stay part of the stem.
        stem = ".".join(basename.split(".")[:-1])
        # Skip the first three "_"-separated tokens (the fixed file-name prefix).
        found.add(tuple(stem.split("_")[3:]))

    absent = sorted(set(all_triplets) - found)

    header = f"Missing {len(absent)}:\n"
    return header + "".join(f"{m}\n" for m in absent)
                        

# Every (dataset, baseline, representation) combination that should have a results file.
all_triplets = list(itertools.product(MOOD_DATASETS, SUPPORTED_BASELINES, MOOD_REPRESENTATIONS))

# Compare the *number* of expected triplets with the number of files found.
# A list never equals an int, so `list(all_triplets) == len(paths)` would fail
# unconditionally, even when every file is present.
assert len(all_triplets) == len(paths), find_missing(paths, all_triplets)

AssertionError: Missing 77:
('BBB', 'GP', 'ChemBERTa')
('BBB', 'GP', 'Desc2D')
('BBB', 'GP', 'ECFP6')
('BBB', 'GP', 'Graphormer')
('BBB', 'GP', 'MACCS')
('BBB', 'GP', 'WHIM')
('BBB', 'MLP', 'ChemBERTa')
('BBB', 'MLP', 'Desc2D')
('BBB', 'MLP', 'ECFP6')
('BBB', 'MLP', 'Graphormer')
('BBB', 'MLP', 'MACCS')
('BBB', 'MLP', 'WHIM')
('BBB', 'RF', 'ChemBERTa')
('BBB', 'RF', 'Desc2D')
('BBB', 'RF', 'ECFP6')
('BBB', 'RF', 'Graphormer')
('BBB', 'RF', 'MACCS')
('BBB', 'RF', 'WHIM')
('CYPP4502C9', 'GP', 'ChemBERTa')
('CYPP4502C9', 'GP', 'Desc2D')
('CYPP4502C9', 'GP', 'ECFP6')
('CYPP4502C9', 'GP', 'Graphormer')
('CYPP4502C9', 'GP', 'MACCS')
('CYPP4502C9', 'GP', 'WHIM')
('CYPP4502C9', 'MLP', 'ChemBERTa')
('CYPP4502C9', 'MLP', 'Desc2D')
('CYPP4502C9', 'MLP', 'ECFP6')
('CYPP4502C9', 'MLP', 'Graphormer')
('CYPP4502C9', 'MLP', 'MACCS')
('CYPP4502C9', 'MLP', 'WHIM')
('CYPP4502C9', 'RF', 'ChemBERTa')
('CYPP4502C9', 'RF', 'Desc2D')
('CYPP4502C9', 'RF', 'ECFP6')
('CYPP4502C9', 'RF', 'Graphormer')
('CYPP4502C9', 'RF', 'MACCS')
('CYPP4502C9', 'RF', 'WHIM')
('Caco-2', 'GP', 'ChemBERTa')
('Caco-2', 'GP', 'Desc2D')
('Caco-2', 'GP', 'ECFP6')
('Caco-2', 'GP', 'Graphormer')
('Caco-2', 'GP', 'MACCS')
('Caco-2', 'GP', 'WHIM')
('Caco-2', 'MLP', 'ChemBERTa')
('Caco-2', 'MLP', 'Desc2D')
('Caco-2', 'MLP', 'ECFP6')
('Caco-2', 'MLP', 'Graphormer')
('Caco-2', 'MLP', 'MACCS')
('Caco-2', 'MLP', 'WHIM')
('Caco-2', 'RF', 'ChemBERTa')
('Caco-2', 'RF', 'Desc2D')
('Caco-2', 'RF', 'ECFP6')
('Caco-2', 'RF', 'Graphormer')
('Caco-2', 'RF', 'MACCS')
('Caco-2', 'RF', 'WHIM')
('Lipophilicity', 'GP', 'ChemBERTa')
('Lipophilicity', 'GP', 'Desc2D')
('Lipophilicity', 'GP', 'ECFP6')
('Lipophilicity', 'GP', 'Graphormer')
('Lipophilicity', 'GP', 'MACCS')
('Lipophilicity', 'GP', 'WHIM')
('Lipophilicity', 'MLP', 'ChemBERTa')
('Lipophilicity', 'MLP', 'Desc2D')
('Lipophilicity', 'MLP', 'ECFP6')
('Lipophilicity', 'MLP', 'Graphormer')
('Lipophilicity', 'MLP', 'MACCS')
('Lipophilicity', 'MLP', 'WHIM')
('Lipophilicity', 'RF', 'ChemBERTa')
('Lipophilicity', 'RF', 'Desc2D')
('Lipophilicity', 'RF', 'ECFP6')
('Lipophilicity', 'RF', 'Graphormer')
('Lipophilicity', 'RF', 'MACCS')
('Lipophilicity', 'RF', 'WHIM')
('PPBR', 'GP', 'ChemBERTa')
('PPBR', 'GP', 'Graphormer')
('PPBR', 'MLP', 'ChemBERTa')
('PPBR', 'RF', 'ChemBERTa')
('PPBR', 'RF', 'Graphormer')


In [5]:
# Load each discovered CSV (with a progress bar) and stack them into a single
# frame with a fresh 0..n-1 index.
df_original = pd.concat(
    map(pd.read_csv, tqdm.tqdm(paths)),
    ignore_index=True,
)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 121/121 [00:27<00:00,  4.34it/s]


In [None]:
# Draw one grid of plots per value of the "algorithm" column.
for baseline, group in df_original.groupby("algorithm"):

    # NOTE(review): presumably one axis per (representation, dataset) pair with
    # datasets as columns and representations as rows — confirm against
    # mood.visualize.axes_grid_iterator.
    grid = axes_grid_iterator(col_labels=MOOD_DATASETS, row_labels=MOOD_REPRESENTATIONS)

    for ax, representation, dataset in grid:
        # Rows of this algorithm's results that belong to the current grid cell.
        in_cell = (group["dataset"] == dataset) & (group["representation"] == representation)
        subset = group[in_cell]

        # The "type" column separates performance rows from calibration rows.
        plot_performance_over_distance(
            subset[subset["type"] == "performance"],
            subset[subset["type"] == "calibration"],
            dataset,
            ax=ax,
            show_legend=False,
        )

The End. 