In [1]:
import datamol as dm
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime

from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.gaussian_process import GaussianProcessRegressor, GaussianProcessClassifier
from sklearn.gaussian_process.kernels import PairwiseKernel, Sum, WhiteKernel

from mood.dataset import dataset_iterator, load_data_from_tdc, MOOD_REGR_DATASETS, MOOD_CLSF_DATASETS
from mood.model_space import ModelSpaceTransformer
from mood.preprocessing import standardize_smiles
from mood.distance import compute_knn_distance
from mood.visualize import plot_distance_distributions
from mood.representations import representation_iterator, featurize
from mood.constants import DOWNSTREAM_APPS_DATA_DIR
from mood.utils import load_representation_for_downstream_application, save_figure_with_fsspec, get_outlier_bounds

In [3]:
# Root location under which this notebook writes its figures and tables.
BASE_SAVE_DIR = "gs://experiments-output/mood-v2/results/figures/"
# Forwarded as `exist_ok` to `save_figure_with_fsspec` when figures are saved.
OVERWRITE = True

# Stamp the output directory with the run date (YYYYMMDD) so that runs on
# different days end up in different directories.
today = format(datetime.now(), "%Y%m%d")
save_dir = dm.fs.join(BASE_SAVE_DIR, f"{today}_NB01")

In [None]:
# For every (dataset, representation) pair: train MLP / RF / GP models, compare
# the k-NN distance distributions of the training set and the two downstream
# application sets in input space and in each model space, save one figure per
# space, and tabulate how the input-space and model-space distances correlate.
#
# NOTE(review): `train_model`, `get_model_space_distances` and
# `compute_correlations` are neither imported nor defined in the cells visible
# here -- presumably they come from a helper cell; confirm that cell runs
# before this one on a fresh kernel (Restart & Run All).

# Legend labels, in the same order as the query sets passed to the distance helpers.
labels = ["Train", "Optimization", "Virtual Screening"]

# Per-(dataset, representation, model) correlation frames. They are concatenated
# once after the loops: calling `pd.concat` on every iteration recopies all
# accumulated rows each time, and seeding the accumulator with an empty
# DataFrame triggers a FutureWarning in recent pandas versions.
corr_frames = []

for dataset, (smiles, y) in dataset_iterator(standardize_smiles, progress=True):

    # The task type depends on the dataset only, so decide it once per dataset.
    is_regression = dataset in MOOD_REGR_DATASETS

    for representation, (X, mask) in representation_iterator(smiles, n_jobs=-1, progress=True):

        # Keep only the targets selected by the representation's mask
        # (presumably the molecules that were featurized successfully -- TODO confirm).
        y_repr = y[mask]

        virtual_screening = load_representation_for_downstream_application("virtual_screening", representation)
        optimization = load_representation_for_downstream_application("optimization", representation)

        mlp_model = train_model(X, y_repr, "mlp", is_regression)
        rf_model = train_model(X, y_repr, "rf", is_regression)
        gp_model = train_model(X, y_repr, "gp", is_regression)

        # Distances in input spaces
        input_distances = compute_knn_distance(X, [X, optimization, virtual_screening], n_jobs=-1)

        ax = plot_distance_distributions(input_distances, labels=labels)
        ax.set_title(f"Input space ({representation}, {dataset})")
        save_figure_with_fsspec(dm.fs.join(save_dir, f"{dataset}_{representation}_input_space.png"), exist_ok=OVERWRITE)
        # To prevent the notebook from showing 100s of figures
        plt.close()

        # Distances in different model spaces
        for name, model in {"MLP": mlp_model, "RF": rf_model, "GP": gp_model}.items():

            # A `None` model is skipped: no figure and no correlation rows for it.
            if model is None:
                continue

            model_distances = get_model_space_distances(model, X, [X, optimization, virtual_screening])

            ax = plot_distance_distributions(model_distances, labels=labels)
            ax.set_title(f"{name} space ({representation}, {dataset})")

            # Save figure
            save_figure_with_fsspec(
                dm.fs.join(save_dir, f"{dataset}_{representation}_{name}_space.png"),
                exist_ok=OVERWRITE
            )
            # To prevent the notebook from showing 100s of figures
            plt.close()

            # Compute correlation
            df = compute_correlations(
                input_distances,
                model_distances,
                labels,
            )
            # Tag the rows so they can be grouped after concatenation.
            df["model"] = name
            df["dataset"] = dataset
            df["representation"] = representation
            corr_frames.append(df)

# `pd.concat` raises on an empty list, so fall back to an empty frame when no
# model produced any correlations.
df_corr = pd.concat(corr_frames, ignore_index=True) if corr_frames else pd.DataFrame()

path = dm.fs.join(save_dir, "correlations.csv")
df_corr.to_csv(path, index=False)
df_corr.head()

Found local copy...
Loading...
Done!


Preprocess BBB:   0%|          | 0/2030 [00:00<?, ?it/s]

MACCS:   0%|          | 0/2030 [00:00<?, ?it/s]

ECFP6:   0%|          | 0/2030 [00:00<?, ?it/s]

Desc2D:   0%|          | 0/2030 [00:00<?, ?it/s]

WHIM:   0%|          | 0/2030 [00:00<?, ?it/s]

Found local copy...
Loading...
Done!


Preprocess CYPP4502C9:   0%|          | 0/12092 [00:00<?, ?it/s]

MACCS:   0%|          | 0/12092 [00:00<?, ?it/s]

ECFP6:   0%|          | 0/12092 [00:00<?, ?it/s]

In [None]:
# Summarize the correlation table: one box per model over all rows of `df_corr`.
# Drawn on an explicit Axes and given a title and axis labels so that the
# figure stands on its own when the notebook is skimmed.
# NOTE(review): assumes the "spearman" column holds the Spearman correlation
# between input-space and model-space distances, as produced by
# `compute_correlations` -- confirm against that helper.
fig, ax = plt.subplots()
sns.boxplot(data=df_corr, x="model", y="spearman", ax=ax)
ax.set_title("Correlation between input-space and model-space distances")
ax.set_xlabel("Model")
ax.set_ylabel("Spearman correlation")
plt.show()

The End.