In [8]:
import os
import yaml
import fsspec
import datamol as dm
import pandas as pd

from mood.rct import get_experimental_configurations
from mood.dataset import MOOD_DATASETS

In [2]:
# Sub-directory under BASE_PATH whose result files are inspected
# (looks like a run date in YYYYMMDD form — TODO confirm).
SUBDIR = "20230106"
# Root location of the RCT result YAML files; joined with SUBDIR when globbing.
BASE_PATH = "gs://experiments-output/mood-v2/results/YAML/RCT/"
# Number of RCT trials expected per dataset; default cap for `find_missing`
# and the denominator shown in the missing-results report.
N_RCT_TRIALS = 250

In [3]:
def get_result_paths_for_dataset(dataset):
    """Return the paths of all result YAML files whose name contains `dataset`.

    The search is a glob for `*{dataset}*.yaml` inside `BASE_PATH/SUBDIR`.

    Args:
        dataset: Dataset name to look for in the result filenames.

    Returns:
        Whatever `dm.fs.glob` yields for the pattern (the matching paths).
    """
    return dm.fs.glob(dm.fs.join(BASE_PATH, SUBDIR, f"*{dataset}*.yaml"))


def find_missing(dataset, n: int = N_RCT_TRIALS):
    """List the expected RCT configurations that have no result file yet.

    The first `n` configurations returned by `get_experimental_configurations`
    are taken as the expected set. Every result file found for `dataset` is
    mapped back to a configuration through its filename and crossed off.

    Args:
        dataset: Dataset name, as used in the result filenames.
        n: Number of leading configurations to consider as expected.

    Returns:
        The sorted list of expected configurations without a result file.

    Raises:
        ValueError: If the trailing filename token of a result file is not
            an integer.
    """
    paths = get_result_paths_for_dataset(dataset)
    missing_configs = get_experimental_configurations(dataset)[:n]

    for path in paths:
        stem = os.path.splitext(dm.fs.get_basename(path))[0]

        # The first four "_"-separated tokens are a fixed prefix (e.g.
        # "rct_selected_model_<dataset>"); the remainder encodes the
        # configuration, ending with an integer token.
        # NOTE(review): assumes the dataset name itself contains no "_" and
        # that configurations are tuples of this shape — confirm against
        # `get_experimental_configurations`.
        config = stem.split("_")[4:]
        config[-1] = int(config[-1])
        config = tuple(config)

        # A result can exist for a configuration outside the first `n`
        # (e.g. when `n` is smaller than the number of trials that were run)
        # or appear more than once; an unconditional `remove` would then raise
        # ValueError and abort the whole report.
        if config in missing_configs:
            missing_configs.remove(config)

    return sorted(missing_configs)
        

# Report, per dataset, how many of the expected N_RCT_TRIALS configurations
# still have no result file (find_missing is called with its default `n`).
for dataset in MOOD_DATASETS:
    print(f"Missing {len(find_missing(dataset))}/{N_RCT_TRIALS} of results for {dataset}")

Missing 17/250 of results for DILI
Missing 25/250 of results for HIA
Missing 21/250 of results for hERG
Missing 39/250 of results for HalfLife
Missing 49/250 of results for Caco-2
Missing 30/250 of results for Clearance
Missing 220/250 of results for Pgp
Missing 250/250 of results for PPBR
Missing 242/250 of results for BBB
Missing 250/250 of results for Lipophilicity
Missing 250/250 of results for CYP2C9


In [6]:
# Display the first matching result path for DILI as an example of the
# filename layout (raises IndexError if no result file matches).
get_result_paths_for_dataset("DILI")[0]

'gcs://experiments-output/mood-v2/results/YAML/RCT/20230106/rct_selected_model_DILI_CORAL_ChemBERTa_Perimeter_Performance_2.yaml'

In [21]:
def load_results(path):
    """Load one result YAML file into a single-row DataFrame.

    The "hparams" entry is removed from the loaded mapping, and the "seed"
    entry is overwritten with the value encoded in the filename.

    Args:
        path: Path of the YAML file; anything `fsspec.open` accepts. The file
            stem is expected to end in "_<seed>".

    Returns:
        A `pandas.DataFrame` with one row (index 0) built from the remaining
        YAML entries plus the corrected "seed".

    Raises:
        KeyError: If the YAML file has no "hparams" entry.
        ValueError: If the trailing filename token is not an integer.
    """
    with fsspec.open(path, "r") as fd:
        data = yaml.safe_load(fd)

    data.pop("hparams")

    # NOTE: Due to a bug, the seed in the YAML was faulty.
    # Luckily, we can still extract the right seed from the filename: it is
    # the last "_"-separated token of the file stem. Parse the whole token
    # rather than a single character so multi-digit seeds are not truncated
    # to their last digit.
    stem = os.path.splitext(path)[0]
    data["seed"] = int(stem.rsplit("_", 1)[-1])
    return pd.DataFrame(data, index=[0])


# Collect the result paths of every dataset into one flat list.
paths = [p for dataset in MOOD_DATASETS for p in get_result_paths_for_dataset(dataset)]
# Load each file with `load_results` (one single-row frame per path) and stack
# the rows into one frame with a fresh 0..N-1 index.
# NOTE(review): `dm.utils.parallelized` presumably maps the function over
# `paths` in parallel and returns the results as a list — confirm.
df = pd.concat(dm.utils.parallelized(load_results, paths, progress=True), ignore_index=True)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1378/1378 [00:34<00:00, 39.58it/s]


The End.