# Comparison with machine learning methods

In [11]:
import numpy as np
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score, auc
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
import math
import datetime
import multiprocessing as mp
from sklearn.model_selection import train_test_split

In [12]:
import pandas as pd

In [13]:
# Load the interaction table and the pre-computed target / molecule embeddings,
# then join them into a single frame (`data1`) used by the cells below.

# NOTE(review): machine-specific absolute path; change DATA_DIR to run elsewhere.
DATA_DIR = '/home/dqw_zyf/GraphDTI/data'
# The working directory is still switched so that relative paths keep
# resolving against the data directory, exactly as before.
os.chdir(DATA_DIR)

alldata = pd.read_csv('alldata.csv')  # author's note: 491718
tar_embeddings = pd.read_csv('corpus2_tar_embeddings.csv')
mol_embeddings = pd.read_csv('corpus2_mol_embeddings.csv')

# NOTE(review): the original followed each de-duplication with
# `df = df[df.notna()]`. Indexing with a boolean DataFrame masks individual
# cells (NaN stays NaN) and never removes rows, so it was a no-op and has been
# dropped. If rows with missing values must be excluded, use
# `dropna(subset=[...])` restricted to the label / feature columns.
alldata = alldata.drop_duplicates()  # author's note: 462361

# With no `on=` argument pd.merge joins on the columns the two frames share
# (inner join by default).
data = pd.merge(alldata, tar_embeddings)
print(len(data))
data1 = pd.merge(data, mol_embeddings)
print(len(data1))
data1 = data1.drop_duplicates()  # author's note: 462361
print(len(data1))

462361
462361
462361


In [14]:
# Remove the molecule identifier column. `errors='ignore'` keeps this cell
# re-runnable: the original raised KeyError on a second execution because
# `mol_id` had already been removed from `data1`.
data1 = data1.drop(columns='mol_id', errors='ignore')

# Split into training, validation and test sets according to the data source.
# Trailing numbers are the author's recorded counts for each subset.
train = data1[data1['Curation/DataSource'] == 'ChEMBL']  # 399553
val = data1[data1['Curation/DataSource'] == 'Curated from the literature by BindingDB']  # 27899
test = data1[data1['Curation/DataSource'] == 'PubChem']  # 34909


In [15]:
# Positional selection: column 7 is taken as the label and columns 9..208
# (200 columns) as the feature matrix.
# The ChEMBL subset is used for fitting; the BindingDB-curated subset (`val`)
# serves as the first independent evaluation set.
train_feature, train_label = train.iloc[:, 9:209], train.iloc[:, 7]
test_BD_feature, test_BD_label = val.iloc[:, 9:209], val.iloc[:, 7]

In [16]:

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def _print_binary_metrics(y_true, y_pred, y_prob):
    """Print accuracy, precision, recall, F1, ROC-AUC, PR-AUC and confusion counts.

    ``y_prob`` is the ``predict_proba`` output; its column 1 is used as the
    score of the positive class.
    """
    # Unpacking four values requires a 2x2 confusion matrix, i.e. two classes.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    accuracy = _safe_ratio(tp + tn, tp + fp + tn + fn)
    recall = _safe_ratio(tp, tp + fn)
    precision = _safe_ratio(tp, tp + fp)
    f1 = _safe_ratio(2 * precision * recall, precision + recall)

    positive_score = y_prob[:, 1]
    roc_auc = roc_auc_score(y_true, positive_score)
    prec, reca, _ = precision_recall_curve(y_true, positive_score)
    aupr = auc(reca, prec)  # area under the precision-recall curve

    print('acc={:.4f}|precision={:.4f}|recall={:.4f}|f1={:.4f}|auc={:.4f}|aupr={:.4f}'
          .format(accuracy, precision, recall, f1, roc_auc, aupr))
    print('tn = {}, fp = {}, fn = {}, tp = {}'.format(tn, fp, fn, tp))
    print('-----------------------------------------------------------------')


def ml_classifiers(x, y, test_x, test_y, classifier=None):
    """Evaluate a classifier on five random hold-out splits and an independent set.

    For each seed in 1..5 the data ``(x, y)`` is split 80/20 with
    ``train_test_split(random_state=seed)``, the classifier is fitted on the
    80% part, and metrics are printed first for the 20% hold-out part and then
    for ``(test_x, test_y)`` using the same fitted model.

    The printed banner labels each round as a cross-validation fold, but the
    rounds are independent random hold-out splits rather than the disjoint
    folds of a k-fold scheme.

    Parameters
    ----------
    x, y
        Features and binary labels used for fitting and for the internal
        hold-out evaluation.
    test_x, test_y
        Independent evaluation set, scored once per round.
    classifier : optional
        Estimator exposing ``fit``, ``predict`` and ``predict_proba``. When
        omitted, the notebook-level variable ``clf`` is used, which is the
        calling convention of the existing cells.

    Returns
    -------
    None
        All results are printed.
    """
    if classifier is None:
        # Legacy behaviour: the estimator is taken from the global `clf`
        # (typically the loop variable of the calling cell).
        classifier = clf

    # Five repeated random 80/20 hold-out rounds, one per seed.
    for seed in [1, 2, 3, 4, 5]:
        X_train, X_test, Y_train, Y_test = train_test_split(
            x, y, test_size=0.2, random_state=seed)

        fitted = classifier.fit(X_train, Y_train)

        # Internal hold-out evaluation.
        print('{}折交叉验证'.format(seed))
        _print_binary_metrics(Y_test, fitted.predict(X_test), fitted.predict_proba(X_test))

        # Independent test set evaluation with the model fitted in this round.
        _print_binary_metrics(test_y, fitted.predict(test_x), fitted.predict_proba(test_x))
        


In [21]:
# Random Forest baseline, evaluated on the BindingDB-curated set.
names = ['RF']
Classifier = [RandomForestClassifier(random_state=43)]

# The loop variable must be called `clf`: ml_classifiers picks the estimator
# up from that notebook-level name.
for name, clf in zip(names, Classifier):
    # Label the output so the metrics can be attributed to a model.
    print('===== {} ====='.format(name))
    ml_classifiers(train_feature, train_label, test_BD_feature, test_BD_label)

1折交叉验证
acc=0.9119|precision=0.9113|recall=0.9437|f1=0.9272|auc=0.9660|aupr=0.9745
tn = 28010, fp = 4366, fn = 2678, tp = 44857
-----------------------------------------------------------------
acc=0.7749|precision=0.8066|recall=0.7218|f1=0.7618|auc=0.8713|aupr=0.8761
tn = 11573, fp = 2409, fn = 3872, tp = 10045
-----------------------------------------------------------------
2折交叉验证
acc=0.9101|precision=0.9092|recall=0.9426|f1=0.9256|auc=0.9662|aupr=0.9743
tn = 28042, fp = 4465, fn = 2719, tp = 44685
-----------------------------------------------------------------
acc=0.7722|precision=0.8069|recall=0.7142|f1=0.7577|auc=0.8675|aupr=0.8733
tn = 11603, fp = 2379, fn = 3977, tp = 9940
-----------------------------------------------------------------
3折交叉验证
acc=0.9107|precision=0.9115|recall=0.9414|f1=0.9262|auc=0.9656|aupr=0.9739
tn = 27998, fp = 4350, fn = 2786, tp = 44777
-----------------------------------------------------------------
acc=0.7779|precision=0.8081|recall=0.7275|f1=0.765

In [9]:
# Gradient boosting and a default MLP, evaluated on the BindingDB-curated set.
names = ['GBDT', 'DNN']
Classifier = [GradientBoostingClassifier(random_state=44),
              MLPClassifier(random_state=45)]

# The loop variable must be called `clf`: ml_classifiers picks the estimator
# up from that notebook-level name.
for name, clf in zip(names, Classifier):
    # Without this header the two models' outputs are indistinguishable.
    print('===== {} ====='.format(name))
    ml_classifiers(train_feature, train_label, test_BD_feature, test_BD_label)

1折交叉验证
acc=0.7668|precision=0.7613|recall=0.8856|f1=0.8187|auc=0.8393|aupr=0.8730
tn = 19177, fp = 13199, fn = 5439, tp = 42096
-----------------------------------------------------------------
acc=0.6881|precision=0.6505|recall=0.8097|f1=0.7214|auc=0.7743|aupr=0.7716
tn = 7929, fp = 6053, fn = 2649, tp = 11268
-----------------------------------------------------------------
2折交叉验证
acc=0.7669|precision=0.7605|recall=0.8860|f1=0.8185|auc=0.8400|aupr=0.8723
tn = 19283, fp = 13224, fn = 5404, tp = 42000
-----------------------------------------------------------------
acc=0.6762|precision=0.6484|recall=0.7664|f1=0.7025|auc=0.7673|aupr=0.7712
tn = 8198, fp = 5784, fn = 3251, tp = 10666
-----------------------------------------------------------------
3折交叉验证
acc=0.7682|precision=0.7628|recall=0.8862|f1=0.8198|auc=0.8399|aupr=0.8728
tn = 19239, fp = 13109, fn = 5415, tp = 42148
-----------------------------------------------------------------
acc=0.6932|precision=0.6576|recall=0.8032|f1=0.7

In [10]:
# PubChem subset as the second independent evaluation set, using the same
# positional layout as the training data: columns 9..208 are features,
# column 7 is the label.
test_Pub_feature, test_Pub_label = test.iloc[:, 9:209], test.iloc[:, 7]

In [57]:
# Random Forest, gradient boosting and a default MLP, evaluated on the PubChem set.
names = ['RF', 'GBDT', 'DNN']
Classifier = [RandomForestClassifier(random_state=43),
              GradientBoostingClassifier(random_state=44),
              MLPClassifier(random_state=45)]

# The loop variable must be called `clf`: ml_classifiers picks the estimator
# up from that notebook-level name.
for name, clf in zip(names, Classifier):
    # Without this header the three models' outputs are indistinguishable.
    print('===== {} ====='.format(name))
    ml_classifiers(train_feature, train_label, test_Pub_feature, test_Pub_label)

1折交叉验证
acc=0.9119|precision=0.9113|recall=0.9437|f1=0.9272|auc=0.9660|aupr=0.9745
tn = 28010, fp = 4366, fn = 2678, tp = 44857
-----------------------------------------------------------------
acc=0.8201|precision=0.0435|recall=0.6275|f1=0.0813|auc=0.7565|aupr=0.1884
tn = 28351, fp = 6115, fn = 165, tp = 278
-----------------------------------------------------------------
2折交叉验证
acc=0.9101|precision=0.9092|recall=0.9426|f1=0.9256|auc=0.9662|aupr=0.9743
tn = 28042, fp = 4465, fn = 2719, tp = 44685
-----------------------------------------------------------------
acc=0.8207|precision=0.0455|recall=0.6569|f1=0.0851|auc=0.7619|aupr=0.1988
tn = 28360, fp = 6106, fn = 152, tp = 291
-----------------------------------------------------------------
3折交叉验证
acc=0.9107|precision=0.9115|recall=0.9414|f1=0.9262|auc=0.9656|aupr=0.9739
tn = 27998, fp = 4350, fn = 2786, tp = 44777
-----------------------------------------------------------------
acc=0.8305|precision=0.0477|recall=0.6524|f1=0.0890|auc

In [7]:
from sklearn.naive_bayes import GaussianNB


In [8]:
# Gaussian Naive Bayes baseline, evaluated on the BindingDB-curated set.
names = ['GNB']
GNB = [GaussianNB()]

# The loop variable must be called `clf`: ml_classifiers picks the estimator
# up from that notebook-level name.
for name, clf in zip(names, GNB):
    # Label the output so the metrics can be attributed to a model.
    print('===== {} ====='.format(name))
    ml_classifiers(train_feature, train_label, test_BD_feature, test_BD_label)

1折交叉验证
acc=0.6641|precision=0.7033|recall=0.7529|f1=0.7273|auc=0.7099|aupr=0.7735
tn = 17278, fp = 15098, fn = 11746, tp = 35789
-----------------------------------------------------------------
acc=0.6327|precision=0.6218|recall=0.6729|f1=0.6464|auc=0.6789|aupr=0.6583
tn = 8287, fp = 5695, fn = 4552, tp = 9365
-----------------------------------------------------------------
2折交叉验证
acc=0.6640|precision=0.7045|recall=0.7470|f1=0.7251|auc=0.7107|aupr=0.7723
tn = 17651, fp = 14856, fn = 11994, tp = 35410
-----------------------------------------------------------------
acc=0.6365|precision=0.6244|recall=0.6808|f1=0.6514|auc=0.6836|aupr=0.6631
tn = 8282, fp = 5700, fn = 4442, tp = 9475
-----------------------------------------------------------------
3折交叉验证
acc=0.6644|precision=0.7069|recall=0.7451|f1=0.7255|auc=0.7087|aupr=0.7712
tn = 17657, fp = 14691, fn = 12126, tp = 35437
-----------------------------------------------------------------
acc=0.6340|precision=0.6230|recall=0.6742|f1=0.

In [11]:
# Gaussian Naive Bayes on the PubChem set.
# ml_classifiers reads the estimator from the notebook-level `clf`; the original
# cell relied on whatever an earlier cell had left in that variable. The
# recorded output matches the Gaussian Naive Bayes run, so bind it explicitly.
clf = GaussianNB()
ml_classifiers(train_feature, train_label, test_Pub_feature, test_Pub_label)

1折交叉验证
acc=0.6641|precision=0.7033|recall=0.7529|f1=0.7273|auc=0.7099|aupr=0.7735
tn = 17278, fp = 15098, fn = 11746, tp = 35789
-----------------------------------------------------------------
acc=0.5767|precision=0.0163|recall=0.5440|f1=0.0316|auc=0.5834|aupr=0.0161
tn = 19891, fp = 14575, fn = 202, tp = 241
-----------------------------------------------------------------
2折交叉验证
acc=0.6640|precision=0.7045|recall=0.7470|f1=0.7251|auc=0.7107|aupr=0.7723
tn = 17651, fp = 14856, fn = 11994, tp = 35410
-----------------------------------------------------------------
acc=0.5808|precision=0.0159|recall=0.5260|f1=0.0309|auc=0.5814|aupr=0.0159
tn = 20041, fp = 14425, fn = 210, tp = 233
-----------------------------------------------------------------
3折交叉验证
acc=0.6644|precision=0.7069|recall=0.7451|f1=0.7255|auc=0.7087|aupr=0.7712
tn = 17657, fp = 14691, fn = 12126, tp = 35437
-----------------------------------------------------------------
acc=0.5800|precision=0.0159|recall=0.5282|f1=0.

In [15]:
# MLP with hidden layers (256, 512, 256), evaluated on the BindingDB-curated set.
names = ['DNN']
Classifier = [MLPClassifier((256, 512, 256), random_state=45)]

# The loop variable must be called `clf`: ml_classifiers picks the estimator
# up from that notebook-level name.
for name, clf in zip(names, Classifier):
    # Label the output so the metrics can be attributed to a model.
    print('===== {} ====='.format(name))
    ml_classifiers(train_feature, train_label, test_BD_feature, test_BD_label)

1折交叉验证
acc=0.8986|precision=0.9111|recall=0.9193|f1=0.9152|auc=0.9502|aupr=0.9592
tn = 28111, fp = 4265, fn = 3835, tp = 43700
-----------------------------------------------------------------
acc=0.7444|precision=0.7374|recall=0.7573|f1=0.7472|auc=0.8157|aupr=0.7974
tn = 10228, fp = 3754, fn = 3377, tp = 10540
-----------------------------------------------------------------
2折交叉验证
acc=0.8959|precision=0.9047|recall=0.9216|f1=0.9131|auc=0.9503|aupr=0.9597
tn = 27906, fp = 4601, fn = 3716, tp = 43688
-----------------------------------------------------------------
acc=0.7520|precision=0.7366|recall=0.7828|f1=0.7590|auc=0.8224|aupr=0.8009
tn = 10086, fp = 3896, fn = 3023, tp = 10894
-----------------------------------------------------------------
3折交叉验证
acc=0.8933|precision=0.9119|recall=0.9085|f1=0.9102|auc=0.9485|aupr=0.9587
tn = 28171, fp = 4177, fn = 4351, tp = 43212
-----------------------------------------------------------------
acc=0.7560|precision=0.7505|recall=0.7654|f1=0.75



5折交叉验证
acc=0.8972|precision=0.9111|recall=0.9165|f1=0.9138|auc=0.9507|aupr=0.9596
tn = 28123, fp = 4251, fn = 3967, tp = 43570
-----------------------------------------------------------------
acc=0.7447|precision=0.7413|recall=0.7499|f1=0.7456|auc=0.8141|aupr=0.7980
tn = 10340, fp = 3642, fn = 3480, tp = 10437
-----------------------------------------------------------------


In [16]:
# MLP with hidden layers (256, 512, 256) on the PubChem set.
# ml_classifiers reads the estimator from the notebook-level `clf`; the original
# cell relied on the loop variable left over from the previous cell. The
# recorded output matches the (256, 512, 256) MLP run, so bind it explicitly.
clf = MLPClassifier((256, 512, 256), random_state=45)
ml_classifiers(train_feature, train_label, test_Pub_feature, test_Pub_label)

1折交叉验证
acc=0.8986|precision=0.9111|recall=0.9193|f1=0.9152|auc=0.9502|aupr=0.9592
tn = 28111, fp = 4265, fn = 3835, tp = 43700
-----------------------------------------------------------------
acc=0.7150|precision=0.0265|recall=0.6005|f1=0.0508|auc=0.6990|aupr=0.0278
tn = 24695, fp = 9771, fn = 177, tp = 266
-----------------------------------------------------------------
2折交叉验证
acc=0.8959|precision=0.9047|recall=0.9216|f1=0.9131|auc=0.9503|aupr=0.9597
tn = 27906, fp = 4601, fn = 3716, tp = 43688
-----------------------------------------------------------------
acc=0.7051|precision=0.0256|recall=0.6005|f1=0.0491|auc=0.6973|aupr=0.0275
tn = 24348, fp = 10118, fn = 177, tp = 266
-----------------------------------------------------------------
3折交叉验证
acc=0.8933|precision=0.9119|recall=0.9085|f1=0.9102|auc=0.9485|aupr=0.9587
tn = 28171, fp = 4177, fn = 4351, tp = 43212
-----------------------------------------------------------------
acc=0.7212|precision=0.0247|recall=0.5440|f1=0.0472|au



5折交叉验证
acc=0.8972|precision=0.9111|recall=0.9165|f1=0.9138|auc=0.9507|aupr=0.9596
tn = 28123, fp = 4251, fn = 3967, tp = 43570
-----------------------------------------------------------------
acc=0.7111|precision=0.0238|recall=0.5440|f1=0.0456|auc=0.6583|aupr=0.0222
tn = 24583, fp = 9883, fn = 202, tp = 241
-----------------------------------------------------------------


In [13]:
# MLP with default architecture, evaluated on the PubChem set.
names = ['DNN']
Classifier = [MLPClassifier(random_state=45)]

# The loop variable must be called `clf`: ml_classifiers picks the estimator
# up from that notebook-level name.
for name, clf in zip(names, Classifier):
    # Label the output so the metrics can be attributed to a model.
    print('===== {} ====='.format(name))
    ml_classifiers(train_feature, train_label, test_Pub_feature, test_Pub_label)



1折交叉验证
acc=0.8611|precision=0.8634|recall=0.9106|f1=0.8864|auc=0.9259|aupr=0.9413
tn = 25529, fp = 6847, fn = 4251, tp = 43284
-----------------------------------------------------------------
acc=0.7223|precision=0.0259|recall=0.5711|f1=0.0496|auc=0.7037|aupr=0.0312
tn = 24963, fp = 9503, fn = 190, tp = 253
-----------------------------------------------------------------




2折交叉验证
acc=0.8444|precision=0.8515|recall=0.8934|f1=0.8720|auc=0.9128|aupr=0.9311
tn = 25122, fp = 7385, fn = 5053, tp = 42351
-----------------------------------------------------------------
acc=0.7126|precision=0.0266|recall=0.6072|f1=0.0509|auc=0.7078|aupr=0.0255
tn = 24607, fp = 9859, fn = 174, tp = 269
-----------------------------------------------------------------




3折交叉验证
acc=0.8200|precision=0.8524|recall=0.8437|f1=0.8481|auc=0.8907|aupr=0.9146
tn = 25401, fp = 6947, fn = 7433, tp = 40130
-----------------------------------------------------------------
acc=0.7279|precision=0.0240|recall=0.5147|f1=0.0458|auc=0.6783|aupr=0.0216
tn = 25181, fp = 9285, fn = 215, tp = 228
-----------------------------------------------------------------




4折交叉验证
acc=0.7819|precision=0.7850|recall=0.8712|f1=0.8259|auc=0.8513|aupr=0.8818
tn = 21140, fp = 11322, fn = 6110, tp = 41339
-----------------------------------------------------------------
acc=0.6187|precision=0.0219|recall=0.6659|f1=0.0424|auc=0.6638|aupr=0.0198
tn = 21303, fp = 13163, fn = 148, tp = 295
-----------------------------------------------------------------




5折交叉验证
acc=0.7642|precision=0.7701|recall=0.8606|f1=0.8128|auc=0.8300|aupr=0.8625
tn = 20161, fp = 12213, fn = 6627, tp = 40910
-----------------------------------------------------------------
acc=0.6003|precision=0.0202|recall=0.6411|f1=0.0391|auc=0.6374|aupr=0.0179
tn = 20672, fp = 13794, fn = 159, tp = 284
-----------------------------------------------------------------


In [None]:

# Fit a single Random Forest on the full training set and compute class
# probabilities.
# `clf` must be the estimator itself: the original wrapped it in a list, and a
# list has no `.fit()` method.
clf = RandomForestClassifier(random_state=43)

fit_clf = clf.fit(train_feature, train_label)

# NOTE(review): `X_test` is not defined at notebook level in the visible cells
# (it only exists as a local inside ml_classifiers). Confirm which feature
# frame should be scored here, e.g. test_BD_feature or test_Pub_feature.
y_prob = fit_clf.predict_proba(X_test)