<a href="https://colab.research.google.com/github/yingzibu/MOL2ADMET/blob/main/examples/RF/RF_no_cuda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
! pip install rdkit --quiet
! pip install selfies --quiet
! pip install PyTDC --quiet
! pip install mycolorpy --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for PyTDC (setup.py) ... [?25l[?25hdone


# Random Forest as Baseline

In [1]:
cd /content/drive/MyDrive/ADMET

/content/drive/MyDrive/ADMET


In [45]:
import plotly.figure_factory as ff
import numpy as np
import matplotlib.pyplot as plt
from scripts.preprocess_mols import *
from scripts.CONSTANT import *
from scripts.eval_utils import *
from rdkit import Chem
from rdkit.Chem.MACCSkeys import GenMACCSKeys

# Short alias for RDKit's SMILES parser; used by the fingerprint helpers below.
m = Chem.MolFromSmiles
# Column labels for the MACCS fingerprint bits: 'bit0' ... 'bit166'.
header = [f'bit{idx}' for idx in range(167)]
# NOTE(review): MASK is not used in the visible part of this notebook; presumably a
# sentinel for missing labels — confirm against the scripts that consume it.
MASK = -100

def smile_list_to_MACCS(smi_list:list):
    """Encode SMILES strings as MACCS fingerprints.

    Each SMILES is parsed with ``m`` (``Chem.MolFromSmiles``), converted with
    ``GenMACCSKeys``, and the resulting bit string is expanded into a list of floats,
    one entry per character of the bit string.

    Args:
        smi_list: SMILES strings to encode.

    Returns:
        A list holding one list of floats per input SMILES, in input order.
    """
    def _encode(smiles):
        # ToBitString() yields a string of '0'/'1' characters; cast each one to float.
        bit_string = GenMACCSKeys(m(smiles)).ToBitString()
        return [float(bit) for bit in bit_string]

    return [_encode(smiles) for smiles in smi_list]

def process(data_, ver=True):
    """Return a copy of ``data_`` with MACCS fingerprint columns added.

    Args:
        data_: DataFrame with a 'Drug' column of SMILES strings. It is copied, not
            modified.
        ver: When truthy, print progress messages.

    Returns:
        A copy of ``data_`` with one float column per label in the module-level
        ``header`` list, holding the MACCS bits of each row's SMILES.
    """
    data = data_.copy()
    # data = convert_with_qed_sa(data)
    if ver: print('---> converting SMILES to MACCS...')
    MACCS_list = smile_list_to_MACCS(data['Drug'].tolist())
    # Build the fingerprint frame on data's own index. Column assignment aligns on index
    # labels, so a frame with a fresh 0..n-1 index would leave NaN / shifted rows whenever
    # `data` carries any other index (e.g. a slice that was not reset).
    data[header] = pd.DataFrame(MACCS_list, index=data.index, columns=header)
    if ver: print('---> FINISHED')
    return data

# radius = 2 : morgan ECFP4
# radius = 3 : ECFP6
def smile_list_to_MORGAN(smi_list, morgan_fp_len=MORGAN_LEN, radius=RADIUS):
    """Encode SMILES strings as Morgan bit-vector fingerprints.

    Args:
        smi_list: SMILES strings to encode.
        morgan_fp_len: Number of bits in each fingerprint (passed as ``nBits``).
        radius: Morgan radius (passed as ``radius``).

    Returns:
        A list holding one list of floats per input SMILES, in input order. Chirality
        is taken into account (``useChirality=True``).
    """
    from tqdm import tqdm
    from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect as MorganFP
    MORGAN_list = []
    for smi in tqdm(smi_list, total=len(smi_list)):
        # Parse each SMILES once and reuse the molecule (it used to be parsed twice).
        mol = m(smi)
        fingerprint = MorganFP(mol, useChirality=True,
                               radius=radius, nBits=morgan_fp_len)
        MORGAN_list.append([float(bit) for bit in list(fingerprint)])
    return MORGAN_list

def process_Morgan(data_, header=header_MORGAN, ver=True):
    """Return a copy of ``data_`` with Morgan fingerprint columns added.

    Args:
        data_: DataFrame with a 'Drug' column of SMILES strings. It is copied, not
            modified.
        header: Column labels for the fingerprint bits; its length must equal the
            fingerprint length produced by ``smile_list_to_MORGAN``.
        ver: When truthy, print progress messages.

    Returns:
        A copy of ``data_`` with one float column per label in ``header``.

    Raises:
        AssertionError: If the fingerprint length does not match ``len(header)``.
    """
    data = data_.copy()
    if ver: print('---> converting SMILES to Morgan FP...')
    l = smile_list_to_MORGAN(data['Drug'].tolist())
    if l:
        # Only check the width when there is at least one row; an empty input simply
        # yields empty fingerprint columns instead of failing on l[0].
        len_here = len(l[0])
        assert len_here == len(header)
    # Build the fingerprint frame on data's own index. Column assignment aligns on index
    # labels, so a frame with a fresh 0..n-1 index would leave NaN / shifted rows whenever
    # `data` carries any other index (e.g. a slice that was not reset).
    data[header] = pd.DataFrame(l, index=data.index, columns=header)
    if ver: print('---> FINISHED')

    return data

## Classification

In [46]:
from sklearn.ensemble import RandomForestClassifier as RFC
from scripts.yaml_utils import *

def train_RF(name, metric_num, ver=True, repeat_time=1, save_path=None):
    """Tune, retrain and evaluate a random-forest classifier for one task.

    The train/validation/test splits come from ``collect_data(name)`` and are featurised
    with ``process`` (MACCS columns listed in the module-level ``header``). The number of
    trees is swept over 50, 100, ..., 550; the value with the highest validation score
    ``evaluate(...)[metric_num]`` is kept (the smallest such value on ties). A fresh model
    with that setting is then trained and scored on the test split ``repeat_time`` times.

    Args:
        name: Task name; also the label column in the collected frames.
        metric_num: Index into the value returned by ``evaluate`` that selects the
            model-selection metric.
        ver: When truthy, print progress and plot the validation curve.
        repeat_time: Number of independent train/test repetitions with the best setting.
        save_path: Directory for the pickled results (``{save_path}/{name}.pkl``).
            Created if missing. Nothing is written when ``None``.

    Returns:
        A list with ``repeat_time`` entries, each ``{name: <evaluate(...) result>}``.
    """
    # Imported locally so the function does not rely on another cell having run first.
    import os
    import pickle

    trn, val, tst = collect_data(name)
    trn = process(trn, ver); val = process(val, ver); tst = process(tst, ver)

    trn_x = trn[header]; trn_y = trn[name]
    val_x = val[header]; val_y = val[name]
    # The test split does not change between repetitions, so slice it once.
    tst_x = tst[header]; tst_y = tst[name]

    AP_results = []
    estimators = []
    # Start below any attainable score. Starting from 0 would leave best_param at 0
    # whenever no validation score is strictly positive, and the final model would then
    # be built with n_estimators=0.
    best_param = None
    max_value = float('-inf')

    if ver: print(f'Finding best param (n_estimators) for RF on {name}: \n')
    for n in range(1, 12):
        n_estimators = 50 * n
        model = RFC(n_estimators=n_estimators)
        model.fit(trn_x, trn_y)
        val_pred = model.predict(val_x)
        val_prob = model.predict_proba(val_x)
        if ver: print(f'Estimators: {n_estimators}')
        results = evaluate(val_y, val_pred, val_prob, ver=ver)
        score = results[metric_num]
        estimators.append(n_estimators)
        AP_results.append(score)
        if score > max_value:
            best_param = n_estimators
            max_value = score
        if ver: print('\n')

    if best_param is None:
        # No score compared greater than -inf (e.g. every score was NaN); fall back to
        # the smallest forest so the final fit is still valid.
        best_param = estimators[0]

    if ver:
        plt.plot(estimators, AP_results)
        # plt.axhline(y = 0.5, color = 'r', linestyle = '-')
        plt.xlabel('n estimators')
        # NOTE(review): the label is fixed although metric_num selects the plotted metric.
        plt.ylabel('AUPRC')
        plt.title(f'{name}, RF on Validation set')
        plt.show(); plt.close()

    perfs = []
    for i in range(repeat_time):
        model = RFC(n_estimators=best_param)
        model.fit(trn_x, trn_y)
        tst_pred = model.predict(tst_x)
        tst_prob = model.predict_proba(tst_x)
        if ver: print(f'best Estimators: {best_param}')
        results = evaluate(tst_y, tst_pred, tst_prob, ver=ver)
        p = {name: results}
        perfs.append(p)

    if save_path is not None:
        # Make sure the target directory exists before writing into it.
        os.makedirs(save_path, exist_ok=True)
        with open(f'{save_path}/{name}.pkl', 'wb') as fp:
            pickle.dump(perfs, fp)
        if ver: print(f'RF on {name} saved: {save_path}/{name}.pkl')
    return perfs

In [11]:
# Display the classification task names iterated over below.
# NOTE(review): `names_cls` is not defined in this notebook; presumably it comes from one
# of the `scripts.*` star imports — confirm.
names_cls

['CYP2C19_Veith',
 'CYP2D6_Veith',
 'CYP3A4_Veith',
 'CYP1A2_Veith',
 'CYP2C9_Veith',
 'BBB_Martins',
 'Bioavailability_Ma',
 'Pgp_Broccatelli',
 'HIA_Hou',
 'PAMPA_NCATS',
 'hERG_Karim',
 'AMES',
 'CYP2C9_Substrate_CarbonMangels',
 'CYP2D6_Substrate_CarbonMangels',
 'CYP3A4_Substrate_CarbonMangels',
 'DILI',
 'Skin Reaction',
 'Carcinogens_Lagunin',
 'ClinTox']

In [None]:
# Settings shared by every task in the sweep below.
name = 'HIA_Hou'   # rebound by the loop as soon as it starts
ver = False
metric_num = -1 # AP
repeat_time = 3
save_path = 'RF'

# Train, evaluate and save a random-forest baseline for each classification task.
for name in names_cls:
    perfs = train_RF(
        name,
        metric_num,
        ver=ver,
        repeat_time=repeat_time,
        save_path=save_path,
    )

    # Summarise the in-memory results. The saved copy can be checked instead by
    # unpickling f'RF/{name}.pkl' and passing the loaded object to eval_perf_list.
    eval_perf_list(perfs, name)

Found local copy...
Loading...
Done!


collect data for:  ['CYP2C19_Veith']


Found local copy...
Loading...


******************** CYP2C19_Veith ******************** 
	|       acc      |       auc      |       ap      
	&0.780$\pm$0.003  &0.854$\pm$0.000  &0.808$\pm$0.001  collect data for:  ['CYP2D6_Veith']


Done!


In [37]:
import pickle



# Pickle the current `perfs` object to data.pkl in the working directory.
# NOTE(review): `perfs` is whatever the kernel last assigned; as returned by train_RF it
# is a list of single-key dicts, so "dictionary" in the message below is loose wording.
with open('data.pkl', 'wb') as fp:
    pickle.dump(perfs, fp)
    print('dictionary saved successfully to file')

dictionary saved successfully to file


******************** HIA_Hou ******************** 
	|       acc      |       auc      |       ap      
	&0.960$\pm$0.004  &0.977$\pm$0.008  &0.995$\pm$0.002  

0

******************** HIA_Hou ******************** 
	|       acc      |       auc      |       ap      
	&0.960$\pm$0.004  &0.977$\pm$0.008  &0.995$\pm$0.002  

0

Found local copy...
Loading...
Done!


collect data for:  ['HIA_Hou']
---> converting SMILES to MACCS...
---> FINISHED
---> converting SMILES to MACCS...
---> FINISHED
---> converting SMILES to MACCS...
---> FINISHED
******************** HIA_Hou ******************** 
	|       acc      |       w_acc      |       prec      |       recall      |       sp      |       f1      |       auc      |       mcc      |       ap      
	&0.960$\pm$0.004  &0.877$\pm$0.012  &0.954$\pm$0.004  &1.000$\pm$0.000  &0.754$\pm$0.025  &0.977$\pm$0.002  &0.977$\pm$0.008  &0.848$\pm$0.016  &0.995$\pm$0.002  

0

In [16]:
# Scratch cell. The four model statements mirror the body of RF_single and were pasted
# with a stray indent, which made the whole cell unparseable; they are dedented here.
# NOTE(review): n_estimator, X_train, y_train, X_test and trn_x are not defined at
# notebook level anywhere in view, so this cell still depends on leftover kernel state —
# confirm it is needed, otherwise delete it.
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(n_estimators=n_estimator)
RF.fit(X_train, y_train)
y_pred = RF.predict(X_test)
y_prob = RF.predict_proba(X_test)
trn_x

Unnamed: 0,bit0,bit1,bit2,bit3,bit4,bit5,bit6,bit7,bit8,bit9,...,bit157,bit158,bit159,bit160,bit161,bit162,bit163,bit164,bit165,bit166
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
8862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
8863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
8864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0


In [4]:
# from function import evaluate
from tkinter import Grid
# from function import evaluate, load_data, get_tpr_fpr, save_tpr_fpr, load_tpr_fpr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, average_precision_score, roc_auc_score
# from function import evaluate,get_preds,save_tpr_fpr
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import pickle
from sklearn import metrics

def RF_single(X, y, n_estimator, enzyme, test_size):
    """Train one random forest on a random split, evaluate it and save the model.

    The data is split with ``train_test_split`` (fixed ``random_state=42``), a
    ``RandomForestClassifier`` with ``n_estimator`` trees is fitted on the training part,
    and the held-out part is passed to ``evaluate``. TPR/FPR values from ``get_tpr_fpr``
    are stored via ``save_tpr_fpr('RF', enzyme, ...)`` and the fitted model is pickled to
    ``model/RF_<enzyme>.sav``.

    Args:
        X: Feature matrix accepted by scikit-learn.
        y: Labels aligned with ``X``.
        n_estimator: Number of trees in the forest.
        enzyme: Task identifier (a string), used in the saved file names.
        test_size: Forwarded to ``train_test_split``.

    Returns:
        None. Results are reported by ``evaluate`` and written to disk.
    """
    # Imported locally: the notebook only imports this class under the alias RFC, and
    # the function should not depend on another cell having defined the full name.
    import os
    from sklearn.ensemble import RandomForestClassifier

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = test_size, random_state=42)
    RF = RandomForestClassifier(n_estimators=n_estimator)
    RF.fit(X_train, y_train)
    y_pred = RF.predict(X_test)
    y_prob = RF.predict_proba(X_test)
    evaluate(y_test, y_pred, y_prob)
    tpr_values, fpr_values = get_tpr_fpr(y_test, y_prob)
    save_tpr_fpr('RF', enzyme, tpr_values, fpr_values)
    # metrics.plot_roc_curve(RF, X_test, y_test)

    filename = 'RF_' + enzyme +'.sav'
    model_path = 'model/'
    modelname = model_path + filename
    # Create the output directory if needed and close the file deterministically.
    os.makedirs(model_path, exist_ok=True)
    with open(modelname, 'wb') as model_file:
        pickle.dump(RF, model_file)