In [1]:
# !pip install deepchem scikit-learn matplotlib pandas sentence_transformers

## Note: These fine-tuning notebooks do not reproduce the exact results reported in the paper. Please follow the settings described in the paper to reproduce them.

In [17]:
from sklearn.decomposition import KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.utils import class_weight
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight
from sklearn.metrics import roc_curve, auc, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import PredefinedSplit
import pickle
import matplotlib.pyplot as plt
import warnings
from deepchem.molnet import load_clintox, load_tox21, load_bace_classification, load_bbbp
import pandas as pd
from data_reader import DataReader
import numpy as np
from typing import List, Union
from transformers import BartModel, BartTokenizer
from torch import Tensor
from numpy import ndarray
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [27]:
class BartFeaturizer:
    """Turn SMILES strings into fixed-size vectors with a pretrained BART model.

    Each string is tokenized, passed through the model, and the chosen stack's
    last hidden states are averaged over the sequence axis to give one vector
    per input string.

    NOTE(review): batches are tokenized with ``padding=True`` and the mean is
    taken over every position, padded ones included, so a molecule's vector
    may depend on which other molecules share its batch. This is kept as-is so
    existing results do not shift; consider an attention-mask-weighted mean.
    """

    def __init__(self, model_name_or_path: str):
        """Load the tokenizer and the model through ``from_pretrained``.

        Args:
            model_name_or_path: Passed unchanged to
                ``BartTokenizer.from_pretrained`` and
                ``BartModel.from_pretrained``.
        """
        self.tokenizer = BartTokenizer.from_pretrained(model_name_or_path)
        self.model = BartModel.from_pretrained(model_name_or_path)
        # This class only runs inference, so make sure dropout stays disabled.
        self.model.eval()

    def encode(
        self, smiles: Union[str, List[str]], embedder: str = "encoder", batch_size=8
    ) -> Union[List[Tensor], ndarray, Tensor]:
        """Embed one SMILES string or a sequence of them.

        Args:
            smiles: A single SMILES string or a non-empty sequence of them.
                Sequence items are converted with ``str()`` before tokenizing.
            embedder: ``"encoder"`` pools ``encoder_last_hidden_state``;
                ``"decoder"`` pools ``last_hidden_state``.
            batch_size: Number of strings tokenized and forwarded together.

        Returns:
            A 2-D ``numpy`` array with one row per input string, in input
            order. A single string yields an array with exactly one row.

        Raises:
            AssertionError: If ``smiles`` is empty.
            NotImplementedError: If ``embedder`` is not one of the two
                supported values. Raised before any model work is done.
        """
        # Local import: the notebook's import cell only brings in `torch.Tensor`.
        import torch

        assert len(smiles) > 0, "SMILES can not be empty!"
        if embedder not in ("encoder", "decoder"):
            raise NotImplementedError(
                f"Unknown embedder {embedder!r}; expected 'encoder' or 'decoder'"
            )
        # A bare string would otherwise be iterated character by character and
        # every character embedded as if it were its own molecule.
        if isinstance(smiles, str):
            smiles = [smiles]
        smiles = [str(smile) for smile in smiles]

        embeddings = []
        # Iterating a range (rather than a generator) lets tqdm show a total.
        batch_starts = range(0, len(smiles), batch_size)
        # Gradients are never needed for feature extraction; skipping autograd
        # bookkeeping keeps memory use flat across batches.
        with torch.no_grad():
            for start in tqdm(batch_starts):
                smiles_batch = smiles[start:start + batch_size]
                inputs = self.tokenizer(smiles_batch, return_tensors="pt", padding=True)
                outputs = self.model(**inputs)
                if embedder == "encoder":
                    hidden_states = outputs.encoder_last_hidden_state
                else:
                    hidden_states = outputs.last_hidden_state
                # Mean over the sequence axis -> one vector per string.
                embeddings += hidden_states.mean(dim=1).tolist()

        return np.array(embeddings)

In [28]:
# Checkpoint name or path handed to `from_pretrained` by BartFeaturizer.
# Point this at the domain-adapted encoder that matches the dataset used for
# domain adaptation.
model_name_or_path = "emtrl/smole-bart"
# Shared featurizer instance; `load_dataset` reads this module-level name.
encoder = BartFeaturizer(model_name_or_path)

In [29]:
def load_dataset(dataset_name):
    """Read a dataset's three splits and embed their SMILES strings.

    Builds a ``DataReader`` for ``dataset_name``, takes ``smiles`` and ``y``
    from its train, valid and test splits, and embeds each split's SMILES with
    the module-level ``encoder``.

    Args:
        dataset_name: Name passed to ``DataReader``.

    Returns:
        A 6-tuple ``(X_train, y_train, X_valid, y_valid, X_test, y_test)``
        where each ``X_*`` is the result of ``encoder.encode`` and each
        ``y_*`` is the split's ``y`` attribute, returned unchanged.
    """
    reader = DataReader(dataset_name)
    train_split = reader.train_dataset
    valid_split = reader.valid_dataset
    test_split = reader.test_dataset

    train_smiles, train_labels = train_split.smiles, train_split.y
    valid_smiles, valid_labels = valid_split.smiles, valid_split.y
    test_smiles, test_labels = test_split.smiles, test_split.y

    print(f"Loading and embedding SMILES for dataset {dataset_name}")

    # Embed in train -> valid -> test order, one progress bar per split.
    train_features = encoder.encode(train_smiles)
    valid_features = encoder.encode(valid_smiles)
    test_features = encoder.encode(test_smiles)

    return (
        train_features, train_labels,
        valid_features, valid_labels,
        test_features, test_labels,
    )

In [30]:
def train_and_evaluate_model(X_train, y_train, X_valid, y_valid, X_test, y_test):
    """Grid-search a one-vs-rest SVM on the training split and score the test split.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_valid: Validation features. Currently unused; the grid search does
            its own 3-fold cross-validation on the training split.
        y_valid: Validation labels. Currently unused.
        X_test: Test features.
        y_test: Test labels.

    Returns:
        The value of ``roc_auc_score`` on the test split. When ``y_test`` is
        1-D or has a single column, only column 1 of the predicted
        probabilities is scored; otherwise the full probability matrix is
        scored against ``y_test``.
    """
    print("Training Classifier")
    # Keys carry the `estimator__` prefix because the SVC is wrapped in
    # OneVsRestClassifier.
    param_grid = {
        'estimator__class_weight': ['balanced'],
        'estimator__kernel': ['rbf', 'sigmoid'],
        'estimator__C': [1, 0.5, 0.25],
        'estimator__gamma': ['auto', 'scale'],
    }
    base_classifier = OneVsRestClassifier(SVC(probability=True, random_state=23))
    search = GridSearchCV(base_classifier, param_grid, cv=3,
                          scoring='roc_auc', n_jobs=-1)
    search.fit(X_train, y_train)
    test_scores = search.predict_proba(X_test)

    # Inspect the label layout once to choose between the two scoring paths.
    label_shape = np.array(y_test).shape
    if len(label_shape) == 1 or label_shape[-1] == 1:
        return roc_auc_score(y_test, test_scores[:, 1])
    return roc_auc_score(y_test, test_scores)

In [31]:
def evaluate_dataset(dataset_name):
    """Embed one dataset, fit the classifier, and print its test AUROC.

    Args:
        dataset_name: Name forwarded to ``load_dataset``.

    Returns:
        None. The score is only printed.
    """
    # load_dataset returns the six arrays in exactly the positional order
    # train_and_evaluate_model expects.
    splits = load_dataset(dataset_name=dataset_name)
    roc_score = train_and_evaluate_model(*splits)

    # `2f` is a minimum field width of 2 with the default six decimals.
    print(f"The AUROC score for dataset {dataset_name} is {roc_score:2f}")

## Evaluate MoleculeNet Datasets

In [None]:
# Evaluate each benchmark in turn, printing a row of asterisks between
# consecutive datasets (not before the first or after the last).
for position, name in enumerate(("clintox", "bace", "bbbp", "tox21")):
    if position > 0:
        print(f"\n{'*'*100}\n")
    evaluate_dataset(dataset_name=name)

Loading and embedding SMILES for dataset clintox


148it [00:03, 37.21it/s]
19it [00:00, 31.46it/s]
19it [00:00, 41.05it/s]


Training Classifier
The AUROC score for dataset clintox is 0.985660

****************************************************************************************************

Loading and embedding SMILES for dataset bace


152it [00:03, 49.14it/s]
19it [00:00, 46.97it/s]
19it [00:00, 38.15it/s]


Training Classifier
The AUROC score for dataset bace is 0.746920

****************************************************************************************************

Loading and embedding SMILES for dataset bbbp


204it [00:03, 53.07it/s]
26it [00:00, 32.13it/s]
26it [00:00, 39.64it/s]


Training Classifier
The AUROC score for dataset bbbp is 0.689758

****************************************************************************************************



