In [1]:
# %pip install deepchem scikit-learn matplotlib pandas sentence_transformers

## Note: These fine-tuning notebooks do not reproduce the exact results reported in the paper. To reproduce those results, please follow the settings described in the paper.

In [25]:
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.utils import class_weight
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight
from sklearn.metrics import roc_curve, auc, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
import pickle
import matplotlib.pyplot as plt
import warnings
from deepchem.molnet import load_clintox, load_tox21, load_bace_classification, load_bbbp
import pandas as pd
from data_reader import DataReader
from sentence_transformers import SentenceTransformer
import numpy as np

# Suppress all warnings so the captured cell output only contains the progress
# bars and the printed scores. NOTE(review): this also hides warnings that may
# be worth seeing (e.g. from scikit-learn during the grid search).
warnings.filterwarnings("ignore")

In [26]:
# Location of the domain-adapted SBERT model. Change this path so that it points
# to the model produced for the domain-adaptation dataset you want to use.
sbert_model_dir = "../2_domain_adaptation/models/sbert_domain_adapted_muv/"

# Shared encoder used by load_dataset() to embed SMILES strings.
sbert_encoder = SentenceTransformer(sbert_model_dir)

In [27]:
def load_dataset(dataset_name):
    """Load the train/valid/test splits of a dataset and embed their SMILES.

    The splits are obtained from ``DataReader(dataset_name)``; the SMILES of
    each split are encoded with the module-level ``sbert_encoder``.

    Args:
        dataset_name: Dataset identifier passed straight to ``DataReader``.

    Returns:
        A 6-tuple ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``.
        Each ``X_*`` is whatever ``sbert_encoder.encode`` returns for that
        split's ``smiles``; each ``y_*`` is that split's ``y`` attribute,
        returned unchanged.
    """
    dataset = DataReader(dataset_name)
    # Order matters: it fixes both the encoding order and the layout of the
    # returned tuple.
    splits = (dataset.train_dataset, dataset.valid_dataset, dataset.test_dataset)

    print(f"Loading and embedding SMILES for dataset {dataset_name}")
    embedded = []
    for split in splits:
        embedded.append(sbert_encoder.encode(split.smiles, show_progress_bar=True))
        embedded.append(split.y)
    return tuple(embedded)

In [41]:
def train_and_evaluate_model(X_train, y_train, X_valid, y_valid, X_test, y_test):
    """Grid-search a one-vs-rest SVC on the training split and score the test split.

    Args:
        X_train: Training features, passed to ``GridSearchCV.fit``.
        y_train: Training labels, passed to ``GridSearchCV.fit``.
        X_valid: Unused. Kept so the signature matches the tuple returned by
            ``load_dataset``.
        y_valid: Unused, see ``X_valid``.
        X_test: Test features for which probabilities are predicted.
        y_test: Test labels the predictions are scored against.

    Returns:
        The value of ``roc_auc_score`` on the test split. When ``y_test`` is
        one-dimensional or has a single column, column 1 of the predicted
        probabilities is scored; otherwise the full probability matrix is
        scored against ``y_test``.
    """
    print("Training Classifier")
    param_grid = {
        'estimator__class_weight': ['balanced'],
        'estimator__kernel': ['rbf', 'sigmoid'],
        'estimator__C': [1, 0.5, 0.25],
        'estimator__gamma': ['auto', 'scale'],
    }
    base_classifier = OneVsRestClassifier(SVC(probability=True, random_state=23))
    # Hyper-parameters are selected with 3-fold cross-validation on the training
    # split only; the validation split does not take part in the search.
    grid_search = GridSearchCV(base_classifier, param_grid, cv=3,
                               scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    test_proba = grid_search.predict_proba(X_test)

    label_shape = np.asarray(y_test).shape
    if len(label_shape) == 1 or label_shape[-1] == 1:
        # Single-task labels: score column 1 only (presumably the probability
        # of the positive class -- confirm against the label encoding).
        return roc_auc_score(y_test, test_proba[:, 1])
    # Multi-task labels: score every column at once.
    return roc_auc_score(y_test, test_proba)

In [42]:
def evaluate_dataset(dataset_name):
    """Embed a dataset, train the classifier and print its test AUROC.

    Args:
        dataset_name: Dataset identifier forwarded to ``load_dataset``.

    Returns:
        None. The score is only printed.
    """
    splits = load_dataset(dataset_name=dataset_name)
    roc_score = train_and_evaluate_model(*splits)

    # ".6f" states the six-decimal precision explicitly. The previous "2f" spec
    # was a minimum width of 2, which never takes effect, so the printed text
    # is unchanged.
    print(f"The AUROC score for dataset {dataset_name} is {roc_score:.6f}")

## Evaluate MoleculeNet Datasets

In [44]:
# Datasets are evaluated in this order; a row of asterisks is printed between
# consecutive datasets (not before the first one and not after the last one).
dataset_names = ["clintox", "bace", "bbbp", "tox21"]
for position, current_name in enumerate(dataset_names):
    if position > 0:
        print(f"\n{'*'*100}\n")
    evaluate_dataset(dataset_name=current_name)

Loading and embedding SMILES for dataset clintox


Batches:   0%|          | 0/37 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Training Classifier
The AUROC score for dataset clintox is 0.951994

****************************************************************************************************

Loading and embedding SMILES for dataset bace


Batches:   0%|          | 0/38 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Training Classifier
The AUROC score for dataset bace is 0.676721

****************************************************************************************************

Loading and embedding SMILES for dataset bbbp


Batches:   0%|          | 0/51 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

Training Classifier
The AUROC score for dataset bbbp is 0.674198

****************************************************************************************************

Loading and embedding SMILES for dataset tox21


Batches:   0%|          | 0/196 [00:00<?, ?it/s]

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Training Classifier
The AUROC score for dataset tox21 is 0.659198
