<a href="https://colab.research.google.com/github/yingzibu/JAK_ML/blob/main/examples/experiments/ADMET_M_MLP_AT_GIN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive (Colab-only; `google.colab` is the Colab runtime package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages used by this notebook (quiet output).
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
! pip install rdkit --quiet
! pip install PyTDC --quiet
! pip install mycolorpy --quiet

# DGL / DGL-LifeSci stack; the dgl wheel index below is the CUDA 11.8 (cu118) one.
! pip install dgllife --quiet
! pip install molvs --quiet
! pip install dgl -f https://data.dgl.ai/wheels/cu118/repo.html --quiet
! pip install dglgo -f https://data.dgl.ai/wheels-test/repo.html --quiet

# DeepPurpose and its companions (descriptastorus is installed straight from GitHub).
! pip install DeepPurpose --quiet
! pip install git+https://github.com/bp-kelley/descriptastorus --quiet
! pip install pandas-flavor --quiet

In [3]:
# Switch to the project folder on the mounted Drive
# (presumably where the `scripts` package imported below lives — confirm).
cd /content/drive/MyDrive/ADMET

/content/drive/MyDrive/ADMET


# Evaluation Functions
#### Classification metrics are already written in `scripts.eval_utils`

#### Regression metrics are defined here:

In [5]:
from scripts.eval_utils import *
from scripts.preprocess_mols import *
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score

device = 'cuda'
import sklearn.metrics as metrics

import numpy as np

def reg_evaluate(label_clean, preds_clean):
    """Print regression metrics for one task and return display strings.

    Args:
        label_clean: Ground-truth target values.
        preds_clean: Predicted values, aligned with ``label_clean``.

    Returns:
        Tuple ``(r2_text, mae_text, rmse_text)`` with each metric formatted
        to three decimals, ready to be drawn on a plot.
    """
    scores = {
        'mae': metrics.mean_absolute_error(label_clean, preds_clean),
        'mse': metrics.mean_squared_error(label_clean, preds_clean),
        'r2': metrics.r2_score(label_clean, preds_clean),
    }
    # RMSE is derived from MSE rather than recomputed from the data.
    scores['rmse'] = np.sqrt(scores['mse'])

    # LaTeX-table friendly row: cells are separated by '&'.
    print('MAE,   MSE,   RMSE,   R2')
    trailing_cells = [" &%5.3f" % scores[key] for key in ('mse', 'rmse', 'r2')]
    print("& %5.3f" % scores['mae'], *trailing_cells)

    r2_text = f"R2:     {scores['r2']:.3f}"
    mae_text = f"MAE:   {scores['mae']:.3f}"
    rmse_text = f"RMSE: {scores['rmse']:.3f}"
    return r2_text, mae_text, rmse_text

from mycolorpy import colorlist as mcp
import matplotlib.pyplot as plt

def eval_dict(y_probs:dict, y_label:dict, names:list, IS_R=False, draw_fig=False):
    """Report per-task metrics for a dictionary of predictions.

    Args:
        y_probs: Maps task name -> model outputs for that task.
        y_label: Maps task name -> ground-truth values for that task.
        names: Task names to report, in order.
        IS_R: Either one flag applied to every task or a list with one flag
            per task; a flag equal to ``False`` selects the classification
            report, anything else the regression report.
        draw_fig: When true, regression tasks also get a true-vs-predicted
            scatter plot annotated with R2 / MAE / RMSE.
    """
    per_task_flags = IS_R if isinstance(IS_R, list) else [IS_R] * len(names)

    for pos, name in enumerate(names):
        task_flag = per_task_flags[pos]
        print('*'*15, name, '*'*15)
        probs, label = y_probs[name], y_label[name]
        assert len(probs) == len(label)

        if task_flag == False:
            # Classification: threshold at 0.5, then defer to eval_utils.
            evaluate(label, get_preds(0.5, probs), probs)
            print()
            continue

        r2, mae, rmse = reg_evaluate(label, probs)
        if draw_fig:
            point_colors = mcp.gen_color_normalized(cmap='viridis',
                                                    data_arr=label)
            plt.scatter(label, probs, cmap='viridis', marker='.',
                        s=10, alpha=0.5, edgecolors='none', c=point_colors)
            plt.xlabel(f'True {name}')
            plt.ylabel(f'Predicted {name}')
            plt.title(f'{name} prediction on test set')

            # Place the three metric strings in the upper-left area of the
            # axes, using the limits matplotlib picked for the scatter.
            x0, xmax = plt.xlim()
            y0, ymax = plt.ylim()
            data_width = xmax - x0
            data_height = ymax - y0
            text_x = x0 + 0.1*data_width
            plt.text(text_x, y0 + data_height * 0.8/0.95, r2)
            plt.text(text_x, y0 + data_height * 0.8,  mae)
            plt.text(text_x, y0 + data_height * 0.8*0.95, rmse)

            plt.show()
            # Fully reset pyplot state so the next task starts on a clean figure.
            plt.cla()
            plt.clf()
            plt.close()
        print()


# Models architecture

#### Classifier: MLP

#### AttentiveFP


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Classifier(nn.Module):
    """Fully connected multi-task head: ReLU hidden layers + linear output.

    Expected ``config`` keys:
        in_dim (int): input feature size.
        hid_dims (sequence of int): hidden layer widths; may be empty, in
            which case the model is a single linear layer.
        out_dim (int): number of outputs (one per task).
        dropout (float): dropout probability.

    Extra keys in ``config`` are ignored.
    """

    def __init__(self, **config):
        super(Classifier, self).__init__()
        self.dims = [config['in_dim'], config['hid_dims'], config['out_dim']]
        neurons = [config['in_dim'], *config['hid_dims']]
        linear_layers = [nn.Linear(n_in, n_out)
                         for n_in, n_out in zip(neurons[:-1], neurons[1:])]
        self.hidden = nn.ModuleList(linear_layers)
        # Size the output layer from the last entry of `neurons` so an empty
        # `hid_dims` falls back to in_dim instead of raising IndexError.
        self.final = nn.Linear(neurons[-1], config['out_dim'])
        self.dropout = nn.Dropout(config['dropout'])

    def forward(self, x):
        """Return raw outputs (logits / regression values) for a batch ``x``."""
        for layer in self.hidden:
            x = F.relu(layer(x))
        x = self.final(x)
        # NOTE(review): dropout is applied to the output logits rather than to
        # hidden activations; kept as-is so training behaviour is unchanged.
        return self.dropout(x)

    def get_dim(self):
        """Return ``[in_dim, hid_dims, out_dim]`` as given in the config."""
        return self.dims


In [58]:
import time
import pandas as pd
import dgl
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from dgllife.model import model_zoo
from dgllife.utils import smiles_to_bigraph
from dgllife.utils import EarlyStopping, Meter
from dgllife.utils import AttentiveFPAtomFeaturizer
from dgllife.utils import AttentiveFPBondFeaturizer
from dgllife.data import MoleculeCSVDataset



def get_model_AT_10_17(names, n_layers, graph_feat_size, dropout):
    """Build an AttentiveFP predictor with one output per task in ``names``.

    Node and edge input sizes are taken from the AttentiveFP atom/bond
    featurizers (fields 'hv' and 'he'); ``num_timesteps`` is fixed to 1.
    """
    node_featurizer = AttentiveFPAtomFeaturizer(atom_data_field='hv')
    edge_featurizer = AttentiveFPBondFeaturizer(bond_data_field='he')

    predictor_kwargs = dict(
        node_feat_size=node_featurizer.feat_size('hv'),
        edge_feat_size=edge_featurizer.feat_size('he'),
        num_layers=n_layers,
        num_timesteps=1,
        graph_feat_size=graph_feat_size,
        n_tasks=len(names),
        dropout=dropout,
    )
    return model_zoo.AttentiveFPPredictor(**predictor_kwargs)

In [61]:

# config_AttentiveFP = {'model_type': 'AttentiveFP',
#           'in_dim': graph_feat_size,
#           'n_layers': n_layers,
#           'out_dim': len(names),
#           'prop_names': names,
#           'dropout': dropout,
#           'IS_R': IS_R,
#           'lr': lr,
#           'wd': wd,
#           'patience': patience,
#           'model_path': 'ckpt.pt'}

def AttentiveFP(**config):
    """Config-driven wrapper around ``get_model_AT_10_17``.

    Reads ``prop_names``, ``n_layers``, ``in_dim`` (used as the graph feature
    size) and ``dropout`` from ``config``; other keys are ignored.
    """
    task_names = config['prop_names']
    num_layers = config['n_layers']
    graph_feat_size = config['in_dim']
    drop_prob = config['dropout']
    return get_model_AT_10_17(task_names, num_layers, graph_feat_size, drop_prob)

In [None]:
# AttentiveFP(**config_AttentiveFP)

GIN

In [102]:

class GIN_MOD(nn.Module):
    """Pretrained GIN encoder followed by an MLP prediction head.

    Reference: https://github.com/kexinhuang12345/DeepPurpose/blob/master/DeepPurpose/encoders.py#L392
    Adapted from https://github.com/awslabs/dgl-lifesci/blob/2fbf5fd6aca92675b709b6f1c3bc3c6ad5434e96/examples/property_prediction/moleculenet/utils.py#L76

    Expected ``config`` keys:
        in_dim (int): width the 300-d pooled graph embedding is projected to.
        hid_dims (sequence of int): hidden widths of the head; may be empty.
        out_dim (int): number of outputs (one per task).
        dropout (float): dropout probability.
    """

    def __init__(self, **config):
        super(GIN_MOD, self).__init__()
        self.gnn = load_pretrained('gin_supervised_contextpred')
        self.readout = AvgPooling()
        # 300 is the input width of the projection, i.e. the size of the
        # pooled node features coming out of the pretrained GIN.
        self.transform = nn.Linear(300, config['in_dim'])
        self.dropout = nn.Dropout(config['dropout'])
        self.hidden_dims = config['hid_dims']
        self.out_dim = config['out_dim']
        neurons = [config['in_dim'], *self.hidden_dims]
        linear_layers = [nn.Linear(n_in, n_out)
                         for n_in, n_out in zip(neurons[:-1], neurons[1:])]
        self.hidden = nn.ModuleList(linear_layers)
        # neurons[-1] equals hid_dims[-1] when hidden layers exist and falls
        # back to in_dim when `hid_dims` is empty (previously an IndexError).
        self.final = nn.Linear(neurons[-1], self.out_dim)

    def forward(self, bg):
        """Return raw outputs for a batched graph ``bg``.

        Note: the four pretraining features are popped from ``bg``, so the
        same batched graph object cannot be passed through ``forward`` twice.
        """
        node_feats = [
            bg.ndata.pop('atomic_number'),
            bg.ndata.pop('chirality_type')
        ]
        edge_feats = [
            bg.edata.pop('bond_type'),
            bg.edata.pop('bond_direction_type')
        ]

        node_feats = self.gnn(bg, node_feats, edge_feats)
        x = self.readout(bg, node_feats)
        x = self.transform(x)
        for layer in self.hidden:
            x = F.leaky_relu(layer(x))
        x = self.final(x)
        # NOTE(review): dropout acts on the output logits; kept unchanged.
        return self.dropout(x)


# Train eval test functions

In [103]:
def train_epoch_MLP(model, loader, IS_R, names, device, epoch=None,
                    optimizer=None, MASK=-100):
    """Run one pass over ``loader`` in train, validation or test mode.

    The mode is inferred from the arguments:
        * ``optimizer`` given            -> 'Train' (weights are updated)
        * no optimizer, ``epoch`` given  -> 'Valid'
        * neither                        -> 'Test' (also calls ``eval_dict``)

    Batches may be 2-tuples ``(inputs, labels)`` (MLP / GIN), where labels
    equal to ``MASK`` are ignored, or 4-tuples ``(smiles, graph, labels,
    masks)`` (AttentiveFP), where entries with ``masks < 1`` are ignored.

    Args:
        model: Network to train or evaluate.
        loader: Iterable of batches exposing ``loader.dataset`` (its length
            normalises the summed loss).
        IS_R: One flag for all tasks or a list with one flag per task;
            truthy means regression (MSE), falsy means classification (BCE).
        names: Task names, one per model output column.
        device: Device the batches are moved to.
        epoch: Epoch number used for logging; ``None`` selects test mode.
        optimizer: Optimizer for training; ``None`` selects evaluation.
        MASK: Sentinel label value marking missing targets in 2-tuple batches.

    Returns:
        ``(total_loss, y_probs, y_label)`` in every mode. The two dicts map
        task name -> list of outputs / labels and are only filled when
        evaluating (they stay empty in training mode).

    Raises:
        ValueError: If ``names`` is empty or a batch has an unsupported length.
    """
    if len(names) == 0:
        raise ValueError('`names` must contain at least one task')

    training = optimizer is not None
    if training:
        model.train()
        train_type = 'Train'
    else:
        model.eval()
        # An epoch number means we are validating during training.
        train_type = 'Valid' if epoch is not None else 'Test'

    IS_R_list = IS_R if isinstance(IS_R, list) else [IS_R] * len(names)
    losses = 0
    y_probs = {}
    y_label = {}
    for batch_data in loader:
        # Skip autograd bookkeeping when no backward pass will follow.
        with torch.set_grad_enabled(training):
            if len(batch_data) == 2:  # MLP or GIN
                fp, labels = batch_data
                fp, labels = fp.to(device), labels.to(device)
                mask = labels == MASK
                pred = model(fp)
            elif len(batch_data) == 4:  # AttentiveFP
                smiles, bg, labels, masks = batch_data
                bg, labels, masks = bg.to(device), labels.to(device), masks.to(device)
                n_feats = bg.ndata.pop('hv').to(device)
                e_feats = bg.edata.pop('he').to(device)
                pred = model(bg, n_feats, e_feats)
                mask = masks < 1
            else:
                raise ValueError(
                    f'Expected batches of length 2 or 4, got {len(batch_data)}')

            loss = None
            # `is_reg` deliberately does not reuse the name IS_R: shadowing
            # the parameter made the final eval_dict call see only the last
            # task's flag instead of the per-task list.
            for j, (name, is_reg) in enumerate(zip(names, IS_R_list)):
                keep = ~mask[:, j]
                outputs = pred[:, j][keep]
                label = labels[:, j][keep]
                task_loss = get_loss_fn(is_reg)(outputs, label)
                loss = task_loss if loss is None else loss + task_loss

                if not training:
                    # Classification outputs are logits; report probabilities.
                    probs = outputs if is_reg else torch.sigmoid(outputs)
                    y_probs.setdefault(name, []).extend(probs.detach().cpu().tolist())
                    y_label.setdefault(name, []).extend(label.detach().cpu().tolist())

        losses += loss.item()
        if training:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Losses use reduction='sum', so divide by the number of samples.
    total_loss = losses / len(loader.dataset)
    if epoch is not None:
        print(f'Epoch:{epoch}, [{train_type}] Loss: {total_loss:.3f}')
    else:
        print(f'[{train_type}] Loss: {total_loss:.3f}')
        eval_dict(y_probs, y_label, names, IS_R_list, True)

    # The original `train_type == 'train'` branch could never match 'Train',
    # so callers have always received this triple; keep that contract.
    return total_loss, y_probs, y_label



In [None]:
def train_epoch_AT(model, loader, IS_R, names, device, epoch=None,
                    optimizer=None, MASK=-100):
    """Run one pass over an AttentiveFP loader in train, valid or test mode.

    The mode is inferred from the arguments:
        * ``optimizer`` given            -> 'Train' (weights are updated)
        * no optimizer, ``epoch`` given  -> 'Valid'
        * neither                        -> 'Test' (also calls ``eval_dict``)

    Every batch must be ``(smiles, graph, labels, masks)``; label entries
    whose mask is below 1 are excluded from the loss and from the outputs.

    Args:
        model: Network called as ``model(graph, node_feats, edge_feats)``.
        loader: Iterable of batches exposing ``loader.dataset`` (its length
            normalises the summed loss).
        IS_R: One flag for all tasks or a list with one flag per task;
            truthy means regression (MSE), falsy means classification (BCE).
        names: Task names, one per model output column.
        device: Device the batches are moved to.
        epoch: Epoch number used for logging; ``None`` selects test mode.
        optimizer: Optimizer for training; ``None`` selects evaluation.
        MASK: Unused here (missing labels come from ``masks``); kept so the
            signature matches ``train_epoch_MLP``.

    Returns:
        ``(total_loss, y_probs, y_label)`` in every mode. The two dicts map
        task name -> list of outputs / labels and are only filled when
        evaluating (they stay empty in training mode).

    Raises:
        ValueError: If ``names`` is empty.
    """
    if len(names) == 0:
        raise ValueError('`names` must contain at least one task')

    training = optimizer is not None
    if training:
        model.train()
        train_type = 'Train'
    else:
        model.eval()
        # An epoch number means we are validating during training.
        train_type = 'Valid' if epoch is not None else 'Test'

    IS_R_list = IS_R if isinstance(IS_R, list) else [IS_R] * len(names)
    losses = 0
    y_probs = {}
    y_label = {}
    for batch_data in loader:
        # Skip autograd bookkeeping when no backward pass will follow.
        with torch.set_grad_enabled(training):
            smiles, bg, labels, masks = batch_data
            bg, labels, masks = bg.to(device), labels.to(device), masks.to(device)
            n_feats = bg.ndata.pop('hv').to(device)
            e_feats = bg.edata.pop('he').to(device)

            pred = model(bg, n_feats, e_feats)
            mask = masks < 1

            loss = None
            # `is_reg` deliberately does not reuse the name IS_R: shadowing
            # the parameter made the final eval_dict call see only the last
            # task's flag instead of the per-task list.
            for j, (name, is_reg) in enumerate(zip(names, IS_R_list)):
                keep = ~mask[:, j]
                outputs = pred[:, j][keep]
                label = labels[:, j][keep]
                task_loss = get_loss_fn(is_reg)(outputs, label)
                loss = task_loss if loss is None else loss + task_loss

                if not training:
                    # Classification outputs are logits; report probabilities.
                    probs = outputs if is_reg else torch.sigmoid(outputs)
                    y_probs.setdefault(name, []).extend(probs.detach().cpu().tolist())
                    y_label.setdefault(name, []).extend(label.detach().cpu().tolist())

        losses += loss.item()
        if training:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Losses use reduction='sum', so divide by the number of samples.
    total_loss = losses / len(loader.dataset)
    if epoch is not None:
        print(f'Epoch:{epoch}, [{train_type}] Loss: {total_loss:.3f}')
    else:
        print(f'[{train_type}] Loss: {total_loss:.3f}')
        eval_dict(y_probs, y_label, names, IS_R_list, True)

    # The original `train_type == 'train'` branch could never match 'Train',
    # so callers have always received this triple; keep that contract.
    return total_loss, y_probs, y_label



# dataset and dataloader functions

In [104]:
from torch.utils.data import DataLoader, Dataset

import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem.MACCSkeys import GenMACCSKeys
import torch.nn.functional as F

# Short alias for RDKit's SMILES parser.
m = Chem.MolFromSmiles
# Column names for the 167 fingerprint bits: 'bit0' ... 'bit166'.
header = ['bit' + str(i) for i in range(167)]

def smile_list_to_MACCS(smi_list:list):
    """Convert SMILES strings to MACCS-key fingerprints.

    Args:
        smi_list: SMILES strings.

    Returns:
        One list of floats (0.0 / 1.0) per input SMILES, in input order.
    """
    def _bits(smi):
        # The bit string is a sequence of '0'/'1' characters.
        bit_string = GenMACCSKeys(m(smi)).ToBitString()
        return [float(ch) for ch in bit_string]

    return [_bits(smi) for smi in smi_list]

import torch
def process(data):
    """Add MACCS fingerprint columns to ``data`` in place and return it.

    Args:
        data: DataFrame with a 'Drug' column of SMILES strings.

    Returns:
        The same DataFrame object, with one column per name in ``header``.
    """
    # data = convert_with_qed_sa(data)
    print('---> converting SMILES to MACCS...')
    MACCS_list = smile_list_to_MACCS(data['Drug'].tolist())
    # Give the fingerprint frame the caller's index: DataFrame-to-DataFrame
    # assignment aligns on index, so a fresh 0..n-1 index would silently
    # produce NaN bits whenever `data` has been filtered or re-indexed.
    data[header] = pd.DataFrame(MACCS_list, index=data.index, columns=header)
    print('---> FINISHED')
    return data

# Sentinel written in place of missing labels; the training loop skips it.
MASK = -100

class nn_dataset(Dataset):
    """Fingerprint dataset for the MLP model.

    Args:
        df: DataFrame with a 'Drug' SMILES column and the property columns.
        prop_names: Property column name or list of names used as labels.
        mask: Value substituted for missing entries (NaN) in ``df``.
    """

    def __init__(self, df, prop_names, mask=MASK):
        super(nn_dataset, self).__init__()
        df = process(df)
        df = df.fillna(mask)
        self.df = df
        self.len = len(df)
        self.fp = self.df[header]
        if isinstance(prop_names, str): prop_names = [prop_names]
        self.props = self.df[prop_names]

    def __getitem__(self, idx):
        """Return ``(fingerprint, labels)`` as float32 tensors for row ``idx``."""
        # Convert the row to a NumPy array first instead of handing torch a
        # label-indexed pandas Series; this keeps the lookup purely positional.
        fp = torch.tensor(self.fp.iloc[idx].to_numpy(dtype='float32'))
        label = torch.tensor(self.props.iloc[idx].to_numpy(dtype='float32'))
        return fp, label

    def __len__(self): return self.len

    def get_df(self):
        """Return the processed DataFrame (fingerprints added, NaNs masked)."""
        return self.df

AttentiveFP

In [105]:
def collate_molgraphs(data):
    """Collate dataset items into one AttentiveFP batch.

    Args:
        data: List of ``(smiles, graph, labels)`` or
            ``(smiles, graph, labels, masks)`` tuples.

    Returns:
        ``(smiles, batched_graph, labels, masks)``; when the items carry no
        masks, an all-ones mask with the shape of ``labels`` is returned.
    """
    item_width = len(data[0])
    assert item_width in [3, 4], \
        'Expect the tuple to be of length 3 or 4, got {:d}'.format(item_width)

    columns = [list(column) for column in zip(*data)]
    if item_width == 4:
        smiles, graphs, labels, masks = columns
    else:
        smiles, graphs, labels = columns
        masks = None

    bg = dgl.batch(graphs)
    # Features missing on some nodes/edges default to zeros.
    bg.set_n_initializer(dgl.init.zero_initializer)
    bg.set_e_initializer(dgl.init.zero_initializer)
    labels = torch.stack(labels, dim=0)

    masks = torch.ones(labels.shape) if masks is None else torch.stack(masks, dim=0)
    return smiles, bg, labels, masks

def get_AttentiveFP_dataset(df, name):
    """Build a ``MoleculeCSVDataset`` with AttentiveFP atom/bond features.

    Args:
        df: DataFrame with a 'Drug' SMILES column and the task columns.
        name: Task column name, or list of task column names.

    Returns:
        The constructed ``MoleculeCSVDataset``.

    Note:
        Every call writes a new ``*.bin`` cache file in the working directory.
    """
    import uuid  # local import: only needed to build a unique cache name

    if isinstance(name, str): name = [name]
    atom_featurizer = AttentiveFPAtomFeaturizer(atom_data_field='hv')
    bond_featurizer = AttentiveFPBondFeaturizer(bond_data_field='he')
    # The cache name must be unique per call: with `load=True`, two calls that
    # shared a second-resolution timestamp (e.g. train then valid on small
    # frames) would reuse one file, and the second would load the first's
    # cached graphs. Colons are avoided since not every filesystem allows them.
    time_string = time.strftime("%m_%d_%Y_%H-%M-%S", time.localtime())
    cache_file_path = f'{time_string}_{uuid.uuid4().hex}.bin'

    params = {'smiles_to_graph': smiles_to_bigraph,
            'node_featurizer': atom_featurizer,
            'edge_featurizer': bond_featurizer,
            'smiles_column': 'Drug',
            'cache_file_path': cache_file_path,
            'task_names': name, 'load': True, 'n_jobs': len(name)*2}
    graph_dataset = MoleculeCSVDataset(df, **params)
    return graph_dataset

def get_AttentiveFP_loader(df, name, **loader_params):
    """Create a ``DataLoader`` over the AttentiveFP dataset built from ``df``.

    ``collate_molgraphs`` is always used as the collate function, overriding
    any ``collate_fn`` passed in ``loader_params``.
    """
    graph_dataset = get_AttentiveFP_dataset(df, name)
    loader_kwargs = {**loader_params, 'collate_fn': collate_molgraphs}
    return DataLoader(graph_dataset, **loader_kwargs)

GIN data set and data loader

In [106]:
from dgllife.model import load_pretrained
from dgl.nn.pytorch.glob import AvgPooling
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from functools import partial
import torch
from dgllife.utils import smiles_to_bigraph, PretrainAtomFeaturizer, PretrainBondFeaturizer

# Sentinel written in place of missing labels; the training loop skips it.
MASK = -100

class GIN_dataset(Dataset):
    """Graph dataset for the GIN model; graphs are built on access.

    Args:
        df: DataFrame with a 'Drug' SMILES column and the task columns.
        names: Task column names used as labels.
        mask: Value substituted for missing entries (NaN) in ``df``.
    """

    def __init__(self, df, names, mask=MASK):
        df = df.fillna(mask)
        self.names = names
        self.df = df
        self.len = len(df)
        self.props = self.df[names]
        self.node_featurizer = PretrainAtomFeaturizer()
        self.edge_featurizer = PretrainBondFeaturizer()
        self.fc = partial(smiles_to_bigraph, add_self_loop=True)

    def __len__(self): return self.len

    def __getitem__(self, idx):
        """Return ``(graph, labels)`` for row ``idx``; labels are float32."""
        v_d = self.df.iloc[idx]['Drug']
        v_d = self.fc(smiles=v_d, node_featurizer = self.node_featurizer,
                      edge_featurizer = self.edge_featurizer)
        # Convert the row to a NumPy array first instead of handing torch a
        # label-indexed pandas Series; this keeps the lookup purely positional.
        label = torch.tensor(self.props.iloc[idx].to_numpy(dtype='float32'))
        return v_d, label

import dgl
def get_GIN_dataloader(datasets, **loader_params):
    """Wrap a dataset of ``(graph, labels)`` items in a ``DataLoader``.

    The loader always uses a collate function that batches the graphs with
    DGL and stacks the label tensors, overriding any ``collate_fn`` given in
    ``loader_params``.
    """
    def _collate_graph_batch(items):
        graphs, targets = zip(*items)
        batched_graph = dgl.batch(list(graphs))
        stacked_targets = torch.stack(list(targets), dim=0)
        # Features missing on some nodes/edges default to zeros.
        batched_graph.set_n_initializer(dgl.init.zero_initializer)
        batched_graph.set_e_initializer(dgl.init.zero_initializer)
        return batched_graph, stacked_targets

    loader_params['collate_fn'] = _collate_graph_batch
    return DataLoader(datasets, **loader_params)

In [107]:
def init_model(**config):
    """Instantiate the model selected by ``config['model_type']``.

    Supported types: 'MLP', 'GIN', 'AttentiveFP'. The full ``config`` is
    forwarded to the model constructor.

    Raises:
        NotImplementedError: For 'RNN', which is reserved but not implemented.
        ValueError: For any other unrecognised model type.
    """
    model_type = config['model_type']
    if model_type == 'MLP':
        return Classifier(**config)
    if model_type == 'GIN':
        return GIN_MOD(**config)  # need work config GIN out_dim
    if model_type == 'AttentiveFP':
        return AttentiveFP(**config)
    # Fail with a clear message instead of the UnboundLocalError the old
    # fall-through produced.
    if model_type == 'RNN':
        raise NotImplementedError("model_type 'RNN' is not implemented yet")
    raise ValueError(f'Unknown model_type: {model_type!r}')

def get_loss_fn(IS_R):
    """Return the summed loss for a task: MSE if ``IS_R`` is truthy, else BCE-with-logits."""
    loss_cls = nn.MSELoss if IS_R else nn.BCEWithLogitsLoss
    return loss_cls(reduction='sum')

def get_train_fn(model_type):
    """Return the training-epoch function for ``model_type``.

    'MLP', 'GIN' and 'AttentiveFP' all share ``train_epoch_MLP``; 'RNN' and
    unrecognised types yield ``None``.
    """
    shared_loop_models = ('MLP', 'GIN', 'AttentiveFP')
    return train_epoch_MLP if model_type in shared_loop_models else None

def get_eval_fn(model_type):
    """Return the evaluation-epoch function for ``model_type``.

    'MLP', 'GIN' and 'AttentiveFP' all share ``train_epoch_MLP`` (which
    evaluates when called without an optimizer); 'RNN' and unrecognised
    types yield ``None``.
    """
    shared_loop_models = ('MLP', 'GIN', 'AttentiveFP')
    return train_epoch_MLP if model_type in shared_loop_models else None


def get_loader(df, names, params, model_type):
    """Build the data loader matching ``model_type``.

    Args:
        df: DataFrame with a 'Drug' column and the task columns.
        names: Task column names.
        params: Keyword arguments forwarded to the loader (e.g. batch size).
        model_type: 'MLP', 'GIN' or 'AttentiveFP'.

    Returns:
        The loader, or ``None`` for 'RNN' and unrecognised model types.
    """
    print('--> preparing data loader for model type ', model_type)
    loader = None
    if model_type == 'MLP':
        loader = DataLoader(nn_dataset(df, names), **params)
    elif model_type == 'GIN':
        loader = get_GIN_dataloader(GIN_dataset(df, names), **params)
    elif model_type == 'AttentiveFP':
        loader = get_AttentiveFP_loader(df, names, **params)
    return loader


In [108]:
from scripts.preprocess_mols import preprocess, rename_cols, clean_mol
from tdc.single_pred import ADME
def collect_data_10_17(names:list, type_tdc='ADME', clean_mol_=False):
    """Download TDC datasets and merge them into multi-task splits.

    For every dataset in ``names`` the default split is fetched, the 'Y'
    column is renamed via ``rename_cols``, and the per-task frames are
    outer-merged so each split has one row per 'Drug' and one column per
    task (missing labels become NaN).

    Args:
        names: TDC dataset names; must not be empty.
        type_tdc: TDC group to load from; only 'ADME' is supported.
        clean_mol_: When true, each split is passed through ``clean_mol``.

    Returns:
        ``(trains, valids, tests)`` DataFrames.

    Raises:
        ValueError: If ``type_tdc`` is unsupported or ``names`` is empty
            (previously these surfaced as NameError / UnboundLocalError).
    """
    if type_tdc != 'ADME':
        raise ValueError(f"Unsupported type_tdc {type_tdc!r}; only 'ADME' is supported")
    if len(names) == 0:
        raise ValueError('`names` must contain at least one dataset name')

    trains = valids = tests = None
    for name in names:
        print('*'*15, name, '*'*15)
        data = ADME(name=name)
        # data.label_distribution()
        split = data.get_split()

        parts = []
        for key in ('train', 'valid', 'test'):
            part = split[key]
            if clean_mol_:
                part = clean_mol(part)
            parts.append(rename_cols(part[['Drug', 'Y']], name))
        train, valid, test = parts

        # if IS_R and SCALE: train, valid, test = scal(train), scal(valid), scal(test)

        if trains is None:
            trains, valids, tests = train, valid, test
        else:
            trains = trains.merge(train, how='outer')
            valids = valids.merge(valid, how='outer')
            tests = tests.merge(test, how='outer')
    return trains, valids, tests



In [109]:
from dgllife.utils import EarlyStopping, Meter
from tqdm import tqdm


class PRED:
    """Training / evaluation wrapper around one of the notebook's models.

    Expected ``config`` keys:
        model_type, prop_names, IS_R, lr, wd, patience, model_path, plus the
        architecture keys required by the selected model.
        epochs (optional): maximum number of training epochs (default 500).
    """

    def __init__(self, **config):
        cuda = torch.cuda.is_available()
        if cuda: self.device = 'cuda'
        else:    self.device = 'cpu'
        self.prop_names = config['prop_names']
        self.config = config
        self.model_type = config['model_type']
        print('model type: ', self.model_type)
        self.model_path = config['model_path']

        self.eval_fn = get_eval_fn(self.model_type)
        self.train_fn = get_train_fn(self.model_type)

        self.model = init_model(**config).to(self.device)

        self.IS_R = config['IS_R'] # could be list, could be true/false
        self.optimizer = torch.optim.AdamW(self.model.parameters(),
                        lr=config['lr'], weight_decay=config['wd'])
        self.stopper = EarlyStopping(mode='lower', patience=config['patience'])
        self.epochs = config.get('epochs', 500)

        self.min_loss = np.inf
        self.best_epoch = 0

    def load_model(self, path):
        """Load weights from ``path`` into the current model.

        The state dict is loaded into the existing ``self.model`` rather than
        into a freshly built one: ``self.optimizer`` holds references to the
        existing parameters, so swapping the model object would make any
        later training update parameters that are no longer used. Dropout
        needs no special handling because evaluation puts the model in eval
        mode.
        """
        print('load pretrained model from ', path)
        state_dict = torch.load(path, map_location=self.device)
        self.model.load_state_dict(state_dict)

    def eval(self, loader, path=None):
        """Evaluate on ``loader`` in test mode, optionally loading ``path`` first."""
        if path is not None: self.load_model(path)
        self.eval_fn(self.model, loader, self.IS_R, self.prop_names,
                     self.device, epoch=None, optimizer=None, MASK=-100)

    def train(self, data_loader, val_loader, test_loader=None):
        """Train with early stopping on validation loss.

        The checkpoint at ``self.model_path`` is overwritten whenever the
        validation loss improves. When ``test_loader`` is given, the best
        checkpoint is evaluated on it after training.
        """
        # Resume from the best checkpoint of a previous train() call.
        if self.best_epoch != 0: self.load_model(self.model_path)

        for epoch in range(self.epochs):
            self.train_fn(self.model, data_loader, self.IS_R,
                          self.prop_names, self.device, epoch,
                          self.optimizer)
            # No optimizer is passed, so this runs in validation mode.
            val_score, probs, labels = \
                    self.eval_fn(self.model, val_loader, self.IS_R,
                                 self.prop_names, self.device, epoch)

            early_stop = self.stopper.step(val_score, self.model)
            if val_score < self.min_loss:
                print(f'prev min loss {self.min_loss:.3f}, '
                      f'now loss {val_score:.3f} |',
                      f'save model at epoch: {epoch}')
                self.min_loss = val_score
                torch.save(self.model.state_dict(), self.model_path)
                self.best_epoch = epoch
                eval_dict(probs, labels, self.prop_names, IS_R=self.IS_R)
            if early_stop: print('early stop'); break

        print(f"best epoch: {self.best_epoch}, min loss: {self.min_loss:.4f}")
        print()
        if test_loader is not None: self.eval(test_loader, self.model_path)



# TEST MLP MO

In [None]:
# Five CYP datasets trained jointly as one multi-output model.
names = ['CYP2C19_Veith', 'CYP2D6_Veith', 'CYP3A4_Veith',
         'CYP1A2_Veith', 'CYP2C9_Veith']

# Every task is treated as classification (IS_R = "is regression").
IS_R = [False] * len(names)

# Fetch and merge the train / valid / test splits for all tasks.
trains, valids, tests = collect_data_10_17(names)

In [114]:
import pandas as pd
def count_(df:pd.DataFrame):
    """Print how many 0 and 1 labels each non-'Drug' column contains.

    Missing values are not counted. A class that does not occur is reported
    as 0 (previously both counts were printed as 'Nan' in that case, and any
    other error was silently swallowed by a bare ``except``).
    """
    for col in df.columns:
        if col == 'Drug':
            continue
        # Element-wise comparison is False for NaN, so missing labels are skipped.
        zero = int((df[col] == 0).sum())
        ones = int((df[col] == 1).sum())
        print(col, f'\t 0: {zero} | 1: {ones}')
    print()

# Print label counts for each split. A loop (rather than a tuple expression)
# keeps the cell from echoing a spurious `(None, None, None)` result.
for split_df in (trains, valids, tests):
    count_(split_df)

CYP2C19_Veith 	 0: 4803 | 1: 4063
CYP2D6_Veith 	 0: 7425 | 1: 1766
CYP3A4_Veith 	 0: 5052 | 1: 3577
CYP1A2_Veith 	 0: 4745 | 1: 4060
CYP2C9_Veith 	 0: 5652 | 1: 2813

CYP2C19_Veith 	 0: 673 | 1: 593
CYP2D6_Veith 	 0: 1053 | 1: 260
CYP3A4_Veith 	 0: 717 | 1: 516
CYP1A2_Veith 	 0: 677 | 1: 581
CYP2C9_Veith 	 0: 796 | 1: 413

CYP2C19_Veith 	 0: 1370 | 1: 1163
CYP2D6_Veith 	 0: 2138 | 1: 488
CYP3A4_Veith 	 0: 1449 | 1: 1017
CYP1A2_Veith 	 0: 1328 | 1: 1188
CYP2C9_Veith 	 0: 1599 | 1: 819



(None, None, None)

In [None]:
# Build the MLP (fingerprint) loaders. The train and validation loaders
# shuffle; the test loader keeps the original row order.
batch_size = 64
loader_params = {'batch_size': batch_size, 'shuffle': True}
train_loader = get_loader(trains, names, loader_params, 'MLP')
valid_loader = get_loader(valids, names, loader_params, 'MLP')

test_params = {'batch_size': batch_size, 'shuffle': False}
test_loader  = get_loader(tests,  names,  test_params,  'MLP')

In [46]:
# Hyper-parameters for the MLP run; in_dim matches the 167 fingerprint columns.
in_dim = 167
hid_dims = [256]
dropout = 0.1
lr = 3e-4
wd = 1e-5
patience = 10

# Configuration consumed by PRED / init_model; the best weights go to 'ckpt.pt'.
config = {'model_type': 'MLP',
          'in_dim': in_dim,
          'hid_dims': hid_dims,
          'out_dim': len(names),
          'prop_names': names,
          'dropout': dropout,
          'IS_R': IS_R,
          'lr': lr,
          'wd': wd,
          'patience': patience,
          'model_path': 'ckpt.pt'}
print(config)
models = PRED(**config)
models.train(train_loader, valid_loader)

{'model_type': 'MLP', 'in_dim': 167, 'hid_dims': [256], 'out_dim': 5, 'prop_names': ['CYP2C19_Veith', 'CYP2D6_Veith', 'CYP3A4_Veith', 'CYP1A2_Veith', 'CYP2C9_Veith'], 'dropout': 0.1, 'IS_R': [False, False, False, False, False], 'lr': 0.0003, 'wd': 1e-05, 'patience': 10, 'model_path': 'ckpt.pt'}


In [85]:
# Reload the saved MLP checkpoint and report test-set metrics.
models.eval(test_loader, config['model_path'])

load pretrained model from  ckpt.pt
[Test] Loss: 0.550
*************** CYP2C19_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     F1,     AUC,     MCC,     AP
& 0.803  &  0.802  &          0.782  &     0.791  &0.812  &0.786 &0.879 &   0.603 &   0.850

*************** CYP2D6_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     F1,     AUC,     MCC,     AP
& 0.869  &  0.722  &          0.715  &     0.488  &0.956  &0.580 &0.852 &   0.518 &   0.659

*************** CYP3A4_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     F1,     AUC,     MCC,     AP
& 0.777  &  0.761  &          0.764  &     0.667  &0.855  &0.712 &0.860 &   0.535 &   0.806

*************** CYP1A2_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     F1,     AUC,     MCC,     AP
& 0.836  &  0.837  &          0.812  &     0.848  &0.825  &0.830 &0.914 &   0.672 &   0.903

*************** CYP2C9_Veith ***********

# TEST ATTENTIVEFP MO

In [None]:
# Rebuild the loaders for AttentiveFP (graph inputs); this rebinds
# train_loader / valid_loader / test_loader from the MLP section.
# Relies on batch_size, trains/valids/tests and names from earlier cells.
loader_params = {'batch_size': batch_size, 'shuffle': True}
train_loader = get_loader(trains, names, loader_params, 'AttentiveFP')
valid_loader = get_loader(valids, names, loader_params, 'AttentiveFP')

test_params = {'batch_size': batch_size, 'shuffle': False}
test_loader  = get_loader(tests,  names,  test_params,  'AttentiveFP')

In [None]:
# AttentiveFP configuration. 'in_dim' is passed on as the graph feature size;
# dropout, lr, wd and patience are reused from the MLP hyper-parameter cell.
n_layers = 5
graph_feat_size = 300
config_AT = {'model_type': 'AttentiveFP',
          'in_dim': graph_feat_size,
          'n_layers': n_layers,
          'out_dim': len(names),
          'prop_names': names,
          'dropout': dropout,
          'IS_R': IS_R,
          'lr': lr,
          'wd': wd,
          'patience': patience,
          'model_path': 'ckpt_AT.pt'}

# Note: `models` is rebound here, replacing the MLP wrapper.
print(config_AT)
models = PRED(**config_AT)
models.train(train_loader, valid_loader)

In [89]:
# Reload the saved AttentiveFP checkpoint and report test-set metrics.
models.eval(test_loader, config_AT['model_path'])

load pretrained model from  ckpt_AT.pt
[Test] Loss: 0.480
*************** CYP2C19_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     F1,     AUC,     MCC,     AP
& 0.835  &  0.837  &          0.801  &     0.853  &0.820  &0.826 &0.909 &   0.671 &   0.889

*************** CYP2D6_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     F1,     AUC,     MCC,     AP
& 0.887  &  0.771  &          0.753  &     0.586  &0.956  &0.659 &0.887 &   0.599 &   0.726

*************** CYP3A4_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     F1,     AUC,     MCC,     AP
& 0.822  &  0.814  &          0.794  &     0.767  &0.861  &0.780 &0.909 &   0.631 &   0.867

*************** CYP1A2_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     F1,     AUC,     MCC,     AP
& 0.878  &  0.876  &          0.895  &     0.840  &0.912  &0.867 &0.944 &   0.756 &   0.941

*************** CYP2C9_Veith ********

# Train GIN

In [93]:
# Rebuild the loaders for GIN (graphs are featurized on access); this rebinds
# train_loader / valid_loader / test_loader from the previous section.
loader_params = {'batch_size': batch_size, 'shuffle': True}
train_loader = get_loader(trains, names, loader_params, 'GIN')
valid_loader = get_loader(valids, names, loader_params, 'GIN')

test_params = {'batch_size': batch_size, 'shuffle': False}
test_loader  = get_loader(tests,  names,  test_params,  'GIN')

--> preparing data loader for model type  GIN
--> preparing data loader for model type  GIN
--> preparing data loader for model type  GIN


In [110]:
# GIN configuration. Here 'in_dim' is the width GIN_MOD projects its 300-d
# pooled graph embedding to (it reuses the value set for the MLP above), and
# 'hid_dims' sizes the head that follows.
config_GIN = {'model_type': 'GIN',
          'in_dim': in_dim,
          'hid_dims': hid_dims,
          'out_dim': len(names),
          'prop_names': names,
          'dropout': dropout,
          'IS_R': IS_R,
          'lr': lr,
          'wd': wd,
          'patience': patience,
          'model_path': 'ckpt_GIN.pt'}

# Note: `models` is rebound here, replacing the AttentiveFP wrapper.
print(config_GIN)
models = PRED(**config_GIN)
models.train(train_loader, valid_loader)

Epoch:27, [Train] Loss: 1.079
Epoch:27, [Valid] Loss: 0.363
EarlyStopping counter: 8 out of 10
Epoch:28, [Train] Loss: 1.083
Epoch:28, [Valid] Loss: 0.366
EarlyStopping counter: 9 out of 10
Epoch:29, [Train] Loss: 1.075
Epoch:29, [Valid] Loss: 0.364
EarlyStopping counter: 10 out of 10
early stop
best epoch: 19, min loss: 0.3622



In [115]:
# Reload the saved GIN checkpoint and report test-set metrics.
models.eval(test_loader, config_GIN['model_path'])

Downloading gin_supervised_contextpred_pre_trained.pth from https://data.dgl.ai/dgllife/pre_trained/gin_supervised_contextpred.pth...
Pretrained model loaded
load pretrained model from  ckpt_GIN.pt
[Test] Loss: 0.427
*************** CYP2C19_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     F1,     AUC,     MCC,     AP
& 0.856  &  0.856  &          0.836  &     0.854  &0.858  &0.845 &0.929 &   0.710 &   0.915

*************** CYP2D6_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     F1,     AUC,     MCC,     AP
& 0.888  &  0.783  &          0.741  &     0.615  &0.951  &0.672 &0.904 &   0.609 &   0.760

*************** CYP3A4_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     F1,     AUC,     MCC,     AP
& 0.837  &  0.833  &          0.801  &     0.806  &0.859  &0.804 &0.920 &   0.665 &   0.886

*************** CYP1A2_Veith ***************
Accuracy, weighted accuracy, precision, recall/SE, SP,     

In [21]:
# Scratch configuration for a deeper MLP (four hidden layers).
# NOTE(review): `names` and `IS_R` come from earlier cells; this cell also
# overwrites the global `config`, `hid_dims` and `patience` used above.
# names = ['k', 'b', 'd', 'f', 'a']
# IS_R = True
patience = 30
dropout = 0.1
in_dim = 167 # len of fingerprint
out_dim = len(names)
hid_dims = [128, 64, 32, 16] # hidden dims changeable
lr = 3e-4
wd = 1e-5

# Same layout as Classifier.get_dim(): [in_dim, hid_dims, out_dim].
dims = [in_dim, hid_dims, out_dim]
print(dims)

config = {'model_type': 'MLP',
          'in_dim': in_dim,
          'hid_dims': hid_dims,
          'out_dim': len(names),
          'prop_names': names,
          'dropout': dropout,
          'IS_R': IS_R,
          'lr': lr,
          'wd': wd,
          'patience': patience,
          'model_path': 'ckpt.pt'}

[167, [128, 64, 32, 16], 5]


In [22]:
# Instantiate the MLP from `config` and print a layer-by-layer summary.
# NOTE(review): an identical cell follows this one, and `torchsummary` is not
# among the packages installed at the top of the notebook — confirm it is
# available in the runtime.
model = Classifier(**config)
cuda = torch.cuda.is_available()

# Move the model to the GPU when one is available; this also rebinds the
# global `device`.
if cuda: model = model.cuda(); device = 'cuda'
else: device = 'cpu'
from torchsummary import summary
batch_size = 128
summary(model, (batch_size, config['in_dim']))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1             [-1, 128, 128]          21,504
            Linear-2              [-1, 128, 64]           8,256
            Linear-3              [-1, 128, 32]           2,080
            Linear-4              [-1, 128, 16]             528
            Linear-5               [-1, 128, 5]              85
           Dropout-6               [-1, 128, 5]               0
Total params: 32,453
Trainable params: 32,453
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.08
Forward/backward pass size (MB): 0.24
Params size (MB): 0.12
Estimated Total Size (MB): 0.45
----------------------------------------------------------------


In [18]:
# Instantiate the MLP from `config` and print a layer-by-layer summary.
# NOTE(review): this duplicates the preceding cell; its recorded output shows
# a single output unit, so it was presumably run with a different `config`.
model = Classifier(**config)
cuda = torch.cuda.is_available()

# Move the model to the GPU when one is available; this also rebinds the
# global `device`.
if cuda: model = model.cuda(); device = 'cuda'
else: device = 'cpu'
from torchsummary import summary
batch_size = 128
summary(model, (batch_size, config['in_dim']))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1             [-1, 128, 128]          21,504
            Linear-2              [-1, 128, 64]           8,256
            Linear-3              [-1, 128, 32]           2,080
            Linear-4              [-1, 128, 16]             528
            Linear-5               [-1, 128, 1]              17
           Dropout-6               [-1, 128, 1]               0
Total params: 32,385
Trainable params: 32,385
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.08
Forward/backward pass size (MB): 0.24
Params size (MB): 0.12
Estimated Total Size (MB): 0.44
----------------------------------------------------------------


In [16]:


# Set-up for a hand-written training loop; no loop using these names is
# visible in this part of the notebook.
# NOTE(review): relies on `model`, `lr`, `wd`, `batch_size` and `config` left
# over from earlier cells.
train_loss_dict = {}
valid_loss_dict = {}

epochs = 1000
best_epoch = 0

optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr, weight_decay=wd)


# Print the layer summary again for the current `model`.
from torchsummary import summary
summary(model, (batch_size, config['in_dim']))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1             [-1, 128, 128]          21,504
            Linear-2              [-1, 128, 64]           8,256
            Linear-3              [-1, 128, 32]           2,080
            Linear-4              [-1, 128, 16]             528
            Linear-5               [-1, 128, 1]              17
           Dropout-6               [-1, 128, 1]               0
Total params: 32,385
Trainable params: 32,385
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.08
Forward/backward pass size (MB): 0.24
Params size (MB): 0.12
Estimated Total Size (MB): 0.44
----------------------------------------------------------------
