In [201]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

rates = 2**np.arange(7)/80
print(rates)

[0.0125 0.025  0.05   0.1    0.2    0.4    0.8   ]


In [202]:
def get_inputs(sm):
    seq_len = 220
    sm = sm.split()
    if len(sm)>218:
        # print('SMILES is too long ({:d})'.format(len(sm)))
        sm = sm[:109]+sm[-109:]
    ids = [vocab.stoi.get(token, unk_index) for token in sm]
    ids = [sos_index] + ids + [eos_index]
    seg = [1]*len(ids)
    padding = [pad_index]*(seq_len - len(ids))
    ids.extend(padding), seg.extend(padding)
    return ids, seg

def get_array(smiles):
    x_id, x_seg = [], []
    for sm in smiles:
        a,b = get_inputs(sm)
        x_id.append(a)
        x_seg.append(b)
    return torch.tensor(x_id), torch.tensor(x_seg)

### ST

In [203]:
import torch
from pretrain_trfm import TrfmSeq2seq
from build_vocab import WordVocab
from utils import split
import torch.nn as nn

pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

vocab = WordVocab.load_vocab('../models/vocab.pkl')

trfm = TrfmSeq2seq(len(vocab), 256, len(vocab), 4)
# trfm = nn.DataParallel(trfm)
trfm.load_state_dict(torch.load('../models/trfm_12_23000.pkl'))
trfm.eval()

print('Total parameters:', sum(p.numel() for p in trfm.parameters()))
# print(trfm)

Total parameters: 4245037


### ST, RNN, BERT

## Drug

In [204]:
df = pd.read_csv('../data_drug/drug_ab_smiles/drug2058_abba_seq_class&synergy.csv')
print(df.shape)
df.head()

(7334, 3)


Unnamed: 0,smiles,class,synergy
0,C[C@]1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1O=c1[nH]c...,0,7.69353
1,C[C@@]1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1O=c1[nH]...,0,7.69353
2,C[C@]1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1C=CCn1c(=...,0,10.248808
3,C[C@@]1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1C=CCn1c(...,0,10.248808
4,C[C@]1(c2nc3c(C(N)=O)cccc3[nH]2)CCCN1CC(C)C[C@...,0,7.237948


In [205]:
train,test = train_test_split(df,random_state=10,train_size=0.8)

In [206]:
train.describe()

Unnamed: 0,class,synergy
count,5867.0,5867.0
mean,0.103801,6.289857
std,0.305028,21.148259
min,0.0,-78.528867
25%,0.0,-5.96728
50%,0.0,2.774369
75%,0.0,15.187697
max,1.0,135.288435


In [191]:
train.head()

Unnamed: 0,smiles,class,synergy
1090,COC1=C2C[C@@H](C)C[C@H](OC)[C@H](O)[C@H](C)C=C...,0,8.998418
6044,COc1cccc2c1C(=O)c1c(O)c3c(c(O)c1C2=O)C[C@@](O)...,0,12.392604
6066,Nc1ccn([C@H]2O[C@H](CO)[C@H](O)C2(F)F)c(=O)n1O...,0,8.201511
4398,COC1=C2C[C@@H](C)C[C@H](OC)[C@H](O)[C@H](C)C=C...,0,5.967876
5780,CCC1=C[C@H]2C[N@@](C1)Cc1c([nH]c3ccccc13)[C@](...,0,-3.395957


In [192]:
X_train = train['smiles']
y_train = train['synergy']
label_train = train['class']

X_test = test['smiles']
y_test = test['synergy']
label_test = test['class']

In [193]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [194]:
len(X_train)

5867

In [195]:
def get_class(x):
    if x >= 30:
        return 1
    else:
        return 0

In [196]:
def regression_acc(synergy,label):
    acc = 0
    sum = len(synergy)
    for i in range(sum):
        class_i = get_class(synergy[i])
        if class_i == label[i]:
            acc += 1
    acc = acc/sum
    return acc

In [197]:
def ablation_regression(X, X_test, y, n_repeats):
    
    ret = []
    for i in range(n_repeats):
        reg = MLPRegressor(max_iter=1000, random_state=1)
        reg.fit(X, y)
    ret.append(reg.predict(X_test))
    
    return ret

In [198]:
def ablation(X, X_test, y, y_test, rate, n_repeats):
    auc = np.empty(n_repeats)
    precision=[]
    for i in range(n_repeats):
        clf = MLPClassifier(max_iter=1000)
        if rate==1:
            X_train, y_train = X,y
        else:
            X_train, _, y_train, __ = train_test_split(X, y, test_size=1-rate, stratify=y)
        clf.fit(X_train, y_train)
        y_score = clf.predict_proba(X_test)
        auc[i] = roc_auc_score(y_test, y_score[:,1])
    print('training set:',accuracy_score(y_train, clf.predict(X_train))) 
    print('testing set accuracy_score:',accuracy_score(y_test, clf.predict(X_test)))
    print('testing set precision_score:',precision_score(y_test, clf.predict(X_test))) 
    print('testing set recall_score:',recall_score(y_test, clf.predict(X_test))) 
    print('testing set f1_score:',f1_score(y_test, clf.predict(X_test))) 
    ret = {}
    ret['auc mean'] = np.mean(auc)
    ret['auc std'] = np.std(auc)
    return ret

### ST

In [199]:
x_split = [split(sm) for sm in X_train.values]
xid, _ = get_array(x_split)
X = trfm.encode(torch.t(xid))
print(X.shape)
x_split = [split(sm) for sm in X_test.values]
xid, _ = get_array(x_split)
X_test = trfm.encode(torch.t(xid))
print(X_test.shape)
y, y_test = y_train.values, y_test.values

There are 5867 molecules. It will take a little time.
(5867, 1024)
There are 1467 molecules. It will take a little time.
(1467, 1024)


In [182]:
score_dic = ablation_regression(X, X_test, y, 3)
print(score_dic)

[array([ 4.066076 ,  6.7606597,  3.901361 , ..., -3.307425 , 22.167635 ,
        2.0586295], dtype=float32)]


In [183]:
regression_acc(score_dic[0],label_test.values)

0.8997955010224948

In [200]:
# .save_SmilesTransformer/trfm_B32_1_10000.pkl
score_dic = ablation(X, X_test, label_train , label_test, 0.8, 20)
print(score_dic)

training set: 0.9401235883230343
testing set accuracy_score: 0.9270620313565099
testing set precision_score: 0.7941176470588235
testing set recall_score: 0.48502994011976047
testing set f1_score: 0.6022304832713754
{'auc mean': 0.9435929295255642, 'auc std': 0.0071875694385787085}


In [37]:
# tfevent/321/.save/trfm_B8_1_48.pkl
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9396974216918815
testing set accuracy_score: 0.9038854805725971
testing set precision_score: 0.5855263157894737
testing set recall_score: 0.5329341317365269
testing set f1_score: 0.5579937304075235
{'auc mean': 0.8645444495624137, 'auc std': 0.015441251002189491}


In [25]:
# tfevent/321/.save/trfm_B8_1_16.pkl
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9343703388024718
testing set accuracy_score: 0.9045671438309475
testing set precision_score: 0.6173913043478261
testing set recall_score: 0.4251497005988024
testing set f1_score: 0.5035460992907802
{'auc mean': 0.8675191156149241, 'auc std': 0.013417317650156604}


In [200]:
# models/vocab_AH_20W_shuffle.pkl .save/trfm_B8_1_69.pkl
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9322395056467079
testing set accuracy_score: 0.8977505112474438
testing set precision_score: 0.5562913907284768
testing set recall_score: 0.5029940119760479
testing set f1_score: 0.5283018867924528
{'auc mean': 0.8740028788576693, 'auc std': 0.014486884712658812}


In [188]:
# .save/trfm_B32_1_17.pkl
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9132750905604091
testing set accuracy_score: 0.8957055214723927
testing set precision_score: 0.7058823529411765
testing set recall_score: 0.1437125748502994
testing set f1_score: 0.2388059701492537
{'auc mean': 0.8637443574389682, 'auc std': 0.012722206945148282}


In [176]:
# .save/trfm_B32_1_16.pkl
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9226507564457703
testing set accuracy_score: 0.9004771642808452
testing set precision_score: 0.647887323943662
testing set recall_score: 0.2754491017964072
testing set f1_score: 0.38655462184873957
{'auc mean': 0.8677441271303546, 'auc std': 0.016979602887987433}


In [164]:
# .save/trfm_B32_1_10.pkl
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9124227572981035
testing set accuracy_score: 0.8936605316973415
testing set precision_score: 0.5964912280701754
testing set recall_score: 0.20359281437125748
testing set f1_score: 0.3035714285714286
{'auc mean': 0.858103523721787, 'auc std': 0.017492929747309004}


In [152]:
# .save/trfm_B32_1_40000.pkl
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9456637545280204
testing set accuracy_score: 0.9359236537150648
testing set precision_score: 0.811965811965812
testing set recall_score: 0.5688622754491018
testing set f1_score: 0.6690140845070424
{'auc mean': 0.9360002303086133, 'auc std': 0.01341711317644627}


In [117]:
# .save/trfm_B128_1_2000.pkl smiles transformer
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9586618367781803
testing set accuracy_score: 0.9427402862985685
testing set precision_score: 0.8029197080291971
testing set recall_score: 0.6586826347305389
testing set f1_score: 0.7236842105263158
{'auc mean': 0.9526338093044681, 'auc std': 0.00818816387010075}


In [36]:
# .save/trfm_B32_1_20000.pkl
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9217984231834647
testing set accuracy_score: 0.9038854805725971
testing set precision_score: 0.7241379310344828
testing set recall_score: 0.25149700598802394
testing set f1_score: 0.37333333333333335
{'auc mean': 0.8689531321971442, 'auc std': 0.014988574694371945}


In [24]:
# models/trfm_12_23000.pkl
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9970168335819305
testing set accuracy_score: 0.9761417859577369
testing set precision_score: 0.9230769230769231
testing set recall_score: 0.8622754491017964
testing set f1_score: 0.8916408668730652
{'auc mean': 0.9940741593735606, 'auc std': 0.0012705126579486239}


In [12]:
# .save/trfm_B32_1_10000.pkl
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9343703388024718
testing set accuracy_score: 0.9086571233810498
testing set precision_score: 0.6633663366336634
testing set recall_score: 0.40119760479041916
testing set f1_score: 0.5
{'auc mean': 0.8693302625518193, 'auc std': 0.01570973752850061}


training set: 0.9249946729171106
testing set accuracy_score: 0.9032038173142468
testing set precision_score: 0.6373626373626373
testing set recall_score: 0.3473053892215569
testing set f1_score: 0.44961240310077516
{'auc mean': 0.8684799631506218, 'auc std': 0.017962056956294988}


In [60]:
# .save/trfm_B32_1_40000.pkl
score_dic = ablation(X, X_test, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9303217558065203
testing set accuracy_score: 0.9038854805725971
testing set precision_score: 0.6625
testing set recall_score: 0.31736526946107785
testing set f1_score: 0.42914979757085014
{'auc mean': 0.8661416397973285, 'auc std': 0.014636803320067903}


In [40]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set: 1.0
testing set accuracy_score: 0.8207225630538514
testing set precision_score: 0.24705882352941178
testing set recall_score: 0.23728813559322035
testing set f1_score: 0.2420749279538905
0.0125 {'auc mean': 0.6130743879472694, 'auc std': 0.06823235555099305}
training set: 0.9931506849315068
testing set accuracy_score: 0.8302658486707567
testing set precision_score: 0.1896551724137931
testing set recall_score: 0.12429378531073447
testing set f1_score: 0.1501706484641638
0.025 {'auc mean': 0.6400324092322516, 'auc std': 0.04917043689826123}
training set: 0.931740614334471
testing set accuracy_score: 0.8466257668711656
testing set precision_score: 0.25
testing set recall_score: 0.13559322033898305
testing set f1_score: 0.17582417582417584
0.05 {'auc mean': 0.7031605570884247, 'auc std': 0.024656231437098563}
training set: 0.8993174061433447
testing set accuracy_score: 0.8575323790047716
testing set precision_score: 0.3644067796610169
testing set recall_score: 0.2429378531073

## 标准化

In [20]:
from sklearn import preprocessing
scaler_X = preprocessing.StandardScaler().fit(X)
scaler_X_test = preprocessing.StandardScaler().fit(X_test)

In [23]:
X_nrml = scaler_X.transform(X)
X_test_nrml = scaler_X_test.transform(X_test)

In [24]:
score_dic = ablation(X_nrml, X_test_nrml, y, y_test, 0.8, 20)
print(score_dic)

training set: 0.9784785851267845
testing set accuracy_score: 0.8922972051806408
testing set precision_score: 0.5568862275449101
testing set recall_score: 0.5254237288135594
testing set f1_score: 0.5406976744186046
{'auc mean': 0.888143914509701, 'auc std': 0.006005828892987175}


In [17]:
tt = pd.DataFrame(columns=['auc mean','auc std'])

In [18]:
tt.append(score_dic,ignore_index=True)

  tt.append(score_dic,ignore_index=True)


Unnamed: 0,auc mean,auc std
0,0.993295,0.001289


In [13]:
score_dic

{'auc mean': 0.9932947926247099, 'auc std': 0.0012886099623777107}

In [19]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 1.0
testing set accuracy_score 0.8486707566462167
testing set precision_score 0.35294117647058826
testing set recall_score 0.3050847457627119
testing set f1_score 0.3272727272727273
0.0125 {'auc mean': 0.7755449349625543, 'auc std': 0.03249938289497124}
training set 1.0
testing set accuracy_score 0.8650306748466258
testing set precision_score 0.4186046511627907
testing set recall_score 0.3050847457627119
testing set f1_score 0.3529411764705882
0.025 {'auc mean': 0.808355450444532, 'auc std': 0.023537921110122945}
training set 1.0
testing set accuracy_score 0.89093387866394
testing set precision_score 0.5521472392638037
testing set recall_score 0.5084745762711864
testing set f1_score 0.5294117647058824
0.05 {'auc mean': 0.8571159068015592, 'auc std': 0.01586337916341544}
training set 1.0
testing set accuracy_score 0.9120654396728016
testing set precision_score 0.6538461538461539
testing set recall_score 0.576271186440678
testing set f1_score 0.6126126126126126
0.1 {'auc mea

In [14]:
# 原模型 A375
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 1.0
testing set 0.8616223585548739
0.0125 {'auc mean': 0.7704288748740857, 'auc std': 0.03159579655650447}
training set 1.0
testing set 0.8827539195637355
0.025 {'auc mean': 0.8075089782332588, 'auc std': 0.03201121647506722}
training set 1.0
testing set 0.89093387866394
0.05 {'auc mean': 0.8610167301712434, 'auc std': 0.016249241983318504}
training set 1.0
testing set 0.9216087252897068
0.1 {'auc mean': 0.902348026978496, 'auc std': 0.014260628865419078}
training set 0.9982949701619779
testing set 0.9318336741649625
0.2 {'auc mean': 0.9483274208382604, 'auc std': 0.009964148407719748}
training set 0.9880647911338448
testing set 0.9556918882072256
0.4 {'auc mean': 0.9780617965225771, 'auc std': 0.004422989085448162}
training set 0.9889196675900277
testing set 0.9686434901158828
0.8 {'auc mean': 0.9928929619410504, 'auc std': 0.0016434623811392099}
0.8943692556512817


In [38]:
# 原模型
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 1.0
testing set 0.8595773687798227
0.0125 {'auc mean': 0.6810278673422386, 'auc std': 0.05991598929775121}
training set 1.0
testing set 0.8895705521472392
0.025 {'auc mean': 0.76884914785813, 'auc std': 0.04369759902830329}
training set 1.0
testing set 0.8916155419222904
0.05 {'auc mean': 0.8473691847075081, 'auc std': 0.026869982003979064}
training set 1.0
testing set 0.9202453987730062
0.1 {'auc mean': 0.9059143251957623, 'auc std': 0.017334607558005605}
training set 0.9974424552429667
testing set 0.94546693933197
0.2 {'auc mean': 0.9519284891754951, 'auc std': 0.011179505116320737}
training set 0.9970161977834612
testing set 0.9577368779822768
0.4 {'auc mean': 0.979619760479042, 'auc std': 0.008483217930743317}
training set 0.9904112507990624
testing set 0.9713701431492843
0.8 {'auc mean': 0.9938203592814372, 'auc std': 0.0013645939084935119}
0.8755041620056591


In [48]:
#chem24_B8_n4_1_2000_good.pkl
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 1.0
testing set 0.7750511247443763
0.0125 {'auc mean': 0.5423682634730539, 'auc std': 0.04164323620546127}
training set 1.0
testing set 0.8629856850715747
0.025 {'auc mean': 0.5656102026715799, 'auc std': 0.0398809067032774}
training set 0.9863481228668942
testing set 0.8302658486707567
0.05 {'auc mean': 0.6148438507600186, 'auc std': 0.03275663123439158}
training set 0.9453924914675768
testing set 0.8725289706884799
0.1 {'auc mean': 0.6748076923076923, 'auc std': 0.02488327410437577}
training set 0.9232736572890026
testing set 0.8520790729379687
0.2 {'auc mean': 0.726570935052971, 'auc std': 0.03247922900789843}
training set 0.9156010230179028
testing set 0.8970688479890934
0.4 {'auc mean': 0.7893422385997236, 'auc std': 0.019110275491432353}
training set 0.9256339228638397
testing set 0.9025221540558964
0.8 {'auc mean': 0.839883809304468, 'auc std': 0.013466126996794045}
0.6790609988813582


## ab

In [15]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 1.0
testing set 0.8471074380165289
0.0125 {'auc mean': 0.5999144403701764, 'auc std': 0.09827209115839779}
training set 1.0
testing set 0.7975206611570248
0.025 {'auc mean': 0.6756521739130436, 'auc std': 0.07010837747785428}
training set 1.0
testing set 0.8574380165289256
0.05 {'auc mean': 0.7237558931377686, 'auc std': 0.057347923310083135}
training set 1.0
testing set 0.8429752066115702
0.1 {'auc mean': 0.7737471625632967, 'auc std': 0.040755992715846685}
training set 1.0
testing set 0.8842975206611571
0.2 {'auc mean': 0.8664536406495549, 'auc std': 0.03089002089360336}
training set 1.0
testing set 0.9049586776859504
0.4 {'auc mean': 0.9191286886677142, 'auc std': 0.015071255174312955}
training set 1.0
testing set 0.9545454545454546
0.8 {'auc mean': 0.9679465688842326, 'auc std': 0.006992332519298442}
0.7895140811693981


## abba

In [15]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 1.0
testing set 0.7603305785123967
0.0125 {'auc mean': 0.596327380952381, 'auc std': 0.062254179560351}
training set 1.0
testing set 0.8305785123966942
0.025 {'auc mean': 0.6541190476190476, 'auc std': 0.060432028307442036}
training set 1.0
testing set 0.8305785123966942
0.05 {'auc mean': 0.6937916666666667, 'auc std': 0.056954516938663886}
training set 1.0
testing set 0.8099173553719008
0.1 {'auc mean': 0.7314940476190477, 'auc std': 0.043360020481242705}
training set 1.0
testing set 0.8140495867768595
0.2 {'auc mean': 0.7832023809523809, 'auc std': 0.050128942298926164}
training set 1.0
testing set 0.8553719008264463
0.4 {'auc mean': 0.8459702380952381, 'auc std': 0.02607836597741803}
training set 0.9935233160621761
testing set 0.8925619834710744
0.8 {'auc mean': 0.8925119047619047, 'auc std': 0.015425691419910915}
0.7424880952380952


## 异构体 部分参数3m

In [12]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 1.0
testing set 0.8631406761177753
0.0125 {'auc mean': 0.5931571968971963, 'auc std': 0.08567331753717906}
training set 1.0
testing set 0.8827699018538713
0.025 {'auc mean': 0.6294569745226255, 'auc std': 0.0984025964808309}
training set 1.0
testing set 0.8696837513631407
0.05 {'auc mean': 0.7016110846241383, 'auc std': 0.05604483478615813}
training set 1.0
testing set 0.9056706652126499
0.1 {'auc mean': 0.781064753108546, 'auc std': 0.03217618999780552}
training set 0.9972677595628415
testing set 0.9143947655398037
0.2 {'auc mean': 0.8651728013975841, 'auc std': 0.022514302198750884}
training set 1.0
testing set 0.9242093784078517
0.4 {'auc mean': 0.9252048873684979, 'auc std': 0.01143792120080787}
training set 0.990450204638472
testing set 0.9449291166848419
0.8 {'auc mean': 0.9587510831625214, 'auc std': 0.007382435489095433}
0.7792026830115871


In [10]:
score_dic = ablation(X, X_test, y, y_test, 1, 20)
print(score_dic)

{'auc mean': 0.7293981481481482, 'auc std': 0.05145690962211546}


## 异构体 smiles-transformer全部参数4m

In [25]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 1.0
testing set 0.8010899182561307
0.0125 {'auc mean': 0.6504859976027025, 'auc std': 0.08215775891964465}
training set 1.0
testing set 0.8841961852861036
0.025 {'auc mean': 0.7025553012967201, 'auc std': 0.07886442289359047}
training set 1.0
testing set 0.885558583106267
0.05 {'auc mean': 0.7911408957175547, 'auc std': 0.03790122943690139}
training set 1.0
testing set 0.9032697547683923
0.1 {'auc mean': 0.8692622861501581, 'auc std': 0.022978743015923626}
training set 1.0
testing set 0.9264305177111717
0.2 {'auc mean': 0.9253100141658492, 'auc std': 0.015916858979249087}
training set 0.9982949701619779
testing set 0.9591280653950953
0.4 {'auc mean': 0.9666710253895608, 'auc std': 0.01082714158647372}
training set 0.9970161977834612
testing set 0.9604904632152589
0.8 {'auc mean': 0.9863528386182848, 'auc std': 0.0030272075772981755}
0.8416826227058329


## 异构体 smiles-transformer全部参数4m 测试集为0.8

In [57]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 1.0
testing set 0.8719346049046321
0.0125 {'auc mean': 0.6203160074098288, 'auc std': 0.08289079448661878}
training set 1.0
testing set 0.9019073569482289
0.025 {'auc mean': 0.7258058188950638, 'auc std': 0.040188726534540364}
training set 1.0
testing set 0.9073569482288828
0.05 {'auc mean': 0.7947412008281572, 'auc std': 0.0450251006818532}
training set 1.0
testing set 0.94141689373297
0.1 {'auc mean': 0.8860411899313501, 'auc std': 0.021766271394637198}
training set 0.9982935153583617
testing set 0.9305177111716622
0.2 {'auc mean': 0.9271875340525224, 'auc std': 0.023167510152352182}
training set 0.9982949701619779
testing set 0.9618528610354223
0.4 {'auc mean': 0.9734869783153537, 'auc std': 0.010263736169350246}
training set 0.9957374254049446
testing set 0.9795640326975477
0.8 {'auc mean': 0.987396752751444, 'auc std': 0.003164978364988497}
0.8449964974548171


## 异构体 

In [36]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 1.0
testing set 0.8778625954198473
0.0125 {'auc mean': 0.6214138185437422, 'auc std': 0.07820667264685029}
training set 1.0
testing set 0.9002181025081788
0.025 {'auc mean': 0.6588695785323276, 'auc std': 0.058637561495599094}
training set 1.0
testing set 0.8969465648854962
0.05 {'auc mean': 0.7330053975423956, 'auc std': 0.03601249116074987}
training set 0.994535519125683
testing set 0.9045801526717557
0.1 {'auc mean': 0.799248224309473, 'auc std': 0.031706709155339755}
training set 1.0
testing set 0.9231188658669575
0.2 {'auc mean': 0.8860741840175672, 'auc std': 0.02184860573397175}
training set 0.9986357435197817
testing set 0.9471101417666303
0.4 {'auc mean': 0.9360521727631171, 'auc std': 0.009552579365934809}
training set 0.9945429740791268
testing set 0.9547437295528899
0.8 {'auc mean': 0.9721313158380112, 'auc std': 0.005012680227976288}
0.8009706702209476


## 异构体 AH head=4

In [12]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 0.8636363636363636
testing set 0.9056706652126499
0.0125 {'auc mean': 0.5674804160736098, 'auc std': 0.11636057389968579}
training set 0.8888888888888888
testing set 0.9056706652126499
0.025 {'auc mean': 0.5488762080089645, 'auc std': 0.11546255930519383}
training set 0.8791208791208791
testing set 0.9056706652126499
0.05 {'auc mean': 0.6138194137524231, 'auc std': 0.08298961574917427}
training set 0.8852459016393442
testing set 0.9056706652126499
0.1 {'auc mean': 0.56953885986922, 'auc std': 0.1198735833605656}
training set 0.8825136612021858
testing set 0.9056706652126499
0.2 {'auc mean': 0.6029071386065222, 'auc std': 0.07201458172370492}
training set 0.8826739427012278
testing set 0.9056706652126499
0.4 {'auc mean': 0.6328553208075085, 'auc std': 0.015426635245275197}
training set 0.8826739427012278
testing set 0.9056706652126499
0.8 {'auc mean': 0.6389284260126047, 'auc std': 0.007510254754619374}
0.5963436833044076


## 异构体 AH head=8

In [47]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 0.8888888888888888
testing set 0.9032697547683923
0.0125 {'auc mean': 0.6275525770949113, 'auc std': 0.12732481488768096}
training set 0.8904109589041096
testing set 0.9059945504087193
0.025 {'auc mean': 0.6584755366677563, 'auc std': 0.09410084765237711}
training set 0.8904109589041096
testing set 0.9059945504087193
0.05 {'auc mean': 0.6841048272856053, 'auc std': 0.04789084127280928}
training set 0.8907849829351536
testing set 0.9059945504087193
0.1 {'auc mean': 0.7045417892557481, 'auc std': 0.023893679367947424}
training set 0.8907849829351536
testing set 0.9059945504087193
0.2 {'auc mean': 0.7280195052849515, 'auc std': 0.017112710256834526}
training set 0.8908780903665814
testing set 0.9059945504087193
0.4 {'auc mean': 0.7275602048599761, 'auc std': 0.012585136170698497}
training set 0.8913043478260869
testing set 0.9059945504087193
0.8 {'auc mean': 0.7344867603792089, 'auc std': 0.010473086497636687}
0.6949630286897369


In [73]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 0.8888888888888888
testing set 0.9059945504087193
0.0125 {'auc mean': 0.5981627983000981, 'auc std': 0.14962874803720835}
training set 0.8904109589041096
testing set 0.9059945504087193
0.025 {'auc mean': 0.6668453743053285, 'auc std': 0.10645086500893927}
training set 0.8904109589041096
testing set 0.9059945504087193
0.05 {'auc mean': 0.6851340307289965, 'auc std': 0.04707600568483058}
training set 0.8907849829351536
testing set 0.9059945504087193
0.1 {'auc mean': 0.6987043696197014, 'auc std': 0.04907458781001199}
training set 0.8907849829351536
testing set 0.9059945504087193
0.2 {'auc mean': 0.7181677018633541, 'auc std': 0.016180911881005816}
training set 0.8934356351236147
testing set 0.9019073569482289
0.4 {'auc mean': 0.7315713196033562, 'auc std': 0.01308843822499298}
training set 0.8913043478260869
testing set 0.9059945504087193
0.8 {'auc mean': 0.7340171079873598, 'auc std': 0.0027190097083513395}
0.6903718146297421


## chembl24 8h

In [13]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 0.8888888888888888
testing set 0.9059945504087193
0.0125 {'auc mean': 0.5994241037376049, 'auc std': 0.17471073971719772}
training set 0.8904109589041096
testing set 0.9059945504087193
0.025 {'auc mean': 0.6325602048599761, 'auc std': 0.14982370685101815}
training set 0.8904109589041096
testing set 0.9059945504087193
0.05 {'auc mean': 0.6717157022992264, 'auc std': 0.09601324731267054}
training set 0.8907849829351536
testing set 0.9059945504087193
0.1 {'auc mean': 0.6809921542987903, 'auc std': 0.03693464900256645}
training set 0.8907849829351536
testing set 0.9059945504087193
0.2 {'auc mean': 0.6969314590824889, 'auc std': 0.03244931946207068}
training set 0.8908780903665814
testing set 0.9059945504087193
0.4 {'auc mean': 0.7122202244742291, 'auc std': 0.00361210747398511}
training set 0.8913043478260869
testing set 0.9059945504087193
0.8 {'auc mean': 0.7087365152010461, 'auc std': 0.005963940043626651}
0.6717971948504803


## chembl24_half 8h

In [42]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))



training set 0.8888888888888888
testing set 0.9059945504087193
0.0125 {'auc mean': 0.5796262395118231, 'auc std': 0.13777587029711505}
training set 0.8904109589041096
testing set 0.9059945504087193
0.025 {'auc mean': 0.6408041843739783, 'auc std': 0.07976302504772169}
training set 0.8904109589041096
testing set 0.9059945504087193
0.05 {'auc mean': 0.6588520213577421, 'auc std': 0.0576116114572321}
training set 0.8907849829351536
testing set 0.9059945504087193
0.1 {'auc mean': 0.675530129672006, 'auc std': 0.0083822044851723}
training set 0.8907849829351536
testing set 0.9059945504087193
0.2 {'auc mean': 0.6775792742726382, 'auc std': 0.007198420859464451}
training set 0.8908780903665814
testing set 0.9059945504087193
0.4 {'auc mean': 0.6767543859649122, 'auc std': 0.00574297085060615}
training set 0.8913043478260869
testing set 0.9059945504087193
0.8 {'auc mean': 0.6761828484254113, 'auc std': 0.007470953952460843}
0.6550470119397874


## chembl24_17W

In [51]:
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

training set 0.8888888888888888
testing set 0.9059945504087193
0.0125 {'auc mean': 0.5357148305546474, 'auc std': 0.0973578326274796}
training set 0.8904109589041096
testing set 0.9059945504087193
0.025 {'auc mean': 0.5414852348261959, 'auc std': 0.0821974967797185}
training set 0.8904109589041096
testing set 0.9059945504087193
0.05 {'auc mean': 0.5252103083796447, 'auc std': 0.08779165288193447}
training set 0.8907849829351536
testing set 0.9059945504087193
0.1 {'auc mean': 0.5470229922632669, 'auc std': 0.06702537163572068}
training set 0.8907849829351536
testing set 0.9059945504087193
0.2 {'auc mean': 0.5935322000653809, 'auc std': 0.08788353288565419}
training set 0.8908780903665814
testing set 0.9059945504087193
0.4 {'auc mean': 0.6266170861937452, 'auc std': 0.04094289347768922}
training set 0.8913043478260869
testing set 0.9059945504087193
0.8 {'auc mean': 0.6552718753405252, 'auc std': 0.02099996805626154}
0.5749792182319152


### ECFP

In [11]:
x,X,y = extract_morgan(df_train['smiles'].values, df_train['HIV_active'].values)
print(len(X), len(y))
x,X_test,y_test = extract_morgan(df_test['smiles'].values, df_test['HIV_active'].values)
print(len(X_test), len(y_test))

41082 41082
45 45


In [12]:
scores = []
for rate in rates:
    score_dic = ablation_hiv(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

0.0125 {'auc mean': 0.5419753086419754, 'auc std': 0.04503846854450115}
0.025 {'auc mean': 0.5503086419753087, 'auc std': 0.0564072236675212}
0.05 {'auc mean': 0.5694444444444444, 'auc std': 0.08849201955412457}
0.1 {'auc mean': 0.5827160493827162, 'auc std': 0.11926366834883169}
0.2 {'auc mean': 0.6459876543209876, 'auc std': 0.13795329598523573}
0.4 {'auc mean': 0.755246913580247, 'auc std': 0.08677780763643579}
0.8 {'auc mean': 0.8388888888888889, 'auc std': 0.06304612034579853}
0.640652557319224


In [13]:
score_dic = ablation_hiv(X, X_test, y, y_test, 1, 20)
print(score_dic)

{'auc mean': 0.8560185185185183, 'auc std': 0.03140384702527043}


### RNN

In [14]:
x_split = [split(sm) for sm in df_train['smiles'].values]
xid, _ = get_array(x_split)
X = rnn.encode(torch.t(xid))
print(X.shape)
x_split = [split(sm) for sm in df_test['smiles'].values]
xid, _ = get_array(x_split)
X_test = rnn.encode(torch.t(xid))
print(X_test.shape)
y, y_test = df_train['HIV_active'].values, df_test['HIV_active'].values

There are 41082 molecules. It will take a little time.
(41082, 1024)
SMILES is too long (220)
SMILES is too long (274)
SMILES is too long (247)
SMILES is too long (226)
SMILES is too long (244)
SMILES is too long (243)
SMILES is too long (253)
SMILES is too long (266)
SMILES is too long (346)
SMILES is too long (232)
SMILES is too long (242)
SMILES is too long (247)
SMILES is too long (240)
SMILES is too long (370)
SMILES is too long (224)
SMILES is too long (283)
SMILES is too long (265)
SMILES is too long (240)
SMILES is too long (219)
SMILES is too long (246)
SMILES is too long (243)
SMILES is too long (284)
SMILES is too long (270)
SMILES is too long (232)
SMILES is too long (260)
SMILES is too long (284)
SMILES is too long (284)
SMILES is too long (439)
SMILES is too long (491)
SMILES is too long (439)
SMILES is too long (296)
SMILES is too long (341)
SMILES is too long (285)
SMILES is too long (327)
SMILES is too long (341)
SMILES is too long (400)
SMILES is too long (263)
SMILES

In [15]:
scores = []
for rate in rates:
    score_dic = ablation_hiv(X, X_test, y, y_test, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

0.0125 {'auc mean': 0.6413580246913579, 'auc std': 0.11069279967556718}
0.025 {'auc mean': 0.6655864197530865, 'auc std': 0.056627812535610146}
0.05 {'auc mean': 0.644753086419753, 'auc std': 0.07297865449282823}
0.1 {'auc mean': 0.6932098765432098, 'auc std': 0.06506711622922942}
0.2 {'auc mean': 0.7206018518518518, 'auc std': 0.08554099486619125}
0.4 {'auc mean': 0.6810185185185185, 'auc std': 0.06736796063488669}
0.8 {'auc mean': 0.7066358024691357, 'auc std': 0.05058786972816301}
0.6790233686067019


In [16]:
score_dic = ablation_hiv(X, X_test, y, y_test, 1, 20)
print(score_dic)

{'auc mean': 0.6912037037037038, 'auc std': 0.09106494426547503}


### GC

In [5]:
featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
      tasks=['HIV_active'],
      smiles_field='smiles',
      featurizer=featurizer)
dataset = loader.featurize('data/hiv.csv')

Loading raw samples now.
shard_size: 8192
About to start loading CSV from data/hiv.csv
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 0 took 34.156 s
Loading shard 2 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 1 took 35.162 s
Loading shard 3 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
Featurizing sample 8000
TIMING: featurizing shard 2 took 36.376 s
Loading shard 4 of size 8192.
Featurizing sample 0
Featuri

In [6]:
train_data = dataset.select(np.where(np.array(list(map(len, df['smiles'])))<=218)[0])
test_data = dataset.select(np.where(np.array(list(map(len, df['smiles'])))>218)[0])

scores = []
for rate in rates:
    score_dic = ablation_hiv_dc(train_data, test_data, rate, 20)
    print(rate, score_dic)
    scores.append(score_dic['auc mean'])
print(np.mean(scores))

TIMING: dataset construction took 62.290 s
Loading dataset from disk.
TIMING: dataset construction took 20.102 s
Loading dataset from disk.
TIMING: dataset construction took 0.422 s
Loading dataset from disk.
TIMING: dataset construction took 40.842 s
Loading dataset from disk.
TIMING: dataset construction took 40.622 s
Loading dataset from disk.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


computed_metrics: [0.7407407407407407]
TIMING: dataset construction took 0.252 s
Loading dataset from disk.
TIMING: dataset construction took 41.058 s
Loading dataset from disk.
TIMING: dataset construction took 40.823 s
Loading dataset from disk.
computed_metrics: [0.617283950617284]
TIMING: dataset construction took 0.437 s
Loading dataset from disk.
TIMING: dataset construction took 41.790 s
Loading dataset from disk.
TIMING: dataset construction took 41.036 s
Loading dataset from disk.
computed_metrics: [0.45987654320987653]
TIMING: dataset construction took 0.602 s
Loading dataset from disk.
TIMING: dataset construction took 42.796 s
Loading dataset from disk.
TIMING: dataset construction took 42.979 s
Loading dataset from disk.
computed_metrics: [0.6604938271604939]
TIMING: dataset construction took 0.290 s
Loading dataset from disk.
TIMING: dataset construction took 42.907 s
Loading dataset from disk.
TIMING: dataset construction took 41.147 s
Loading dataset from disk.
computed

computed_metrics: [0.6450617283950617]
TIMING: dataset construction took 0.845 s
Loading dataset from disk.
TIMING: dataset construction took 40.368 s
Loading dataset from disk.
TIMING: dataset construction took 39.981 s
Loading dataset from disk.
computed_metrics: [0.6604938271604939]
TIMING: dataset construction took 0.731 s
Loading dataset from disk.
TIMING: dataset construction took 41.865 s
Loading dataset from disk.
TIMING: dataset construction took 40.573 s
Loading dataset from disk.
computed_metrics: [0.5370370370370371]
TIMING: dataset construction took 1.214 s
Loading dataset from disk.
TIMING: dataset construction took 40.154 s
Loading dataset from disk.
TIMING: dataset construction took 39.725 s
Loading dataset from disk.
computed_metrics: [0.8024691358024691]
TIMING: dataset construction took 1.098 s
Loading dataset from disk.
TIMING: dataset construction took 42.479 s
Loading dataset from disk.
TIMING: dataset construction took 41.651 s
Loading dataset from disk.
computed

TIMING: dataset construction took 36.613 s
Loading dataset from disk.
computed_metrics: [0.5401234567901234]
TIMING: dataset construction took 4.005 s
Loading dataset from disk.
TIMING: dataset construction took 37.509 s
Loading dataset from disk.
TIMING: dataset construction took 37.128 s
Loading dataset from disk.
computed_metrics: [0.6820987654320987]
TIMING: dataset construction took 3.768 s
Loading dataset from disk.
TIMING: dataset construction took 37.242 s
Loading dataset from disk.
TIMING: dataset construction took 37.010 s
Loading dataset from disk.
computed_metrics: [0.521604938271605]
TIMING: dataset construction took 3.799 s
Loading dataset from disk.
TIMING: dataset construction took 37.627 s
Loading dataset from disk.
TIMING: dataset construction took 37.220 s
Loading dataset from disk.
computed_metrics: [0.7407407407407407]
TIMING: dataset construction took 4.253 s
Loading dataset from disk.
TIMING: dataset construction took 37.222 s
Loading dataset from disk.
TIMING: d

TIMING: dataset construction took 32.592 s
Loading dataset from disk.
computed_metrics: [0.617283950617284]
0.2 {'auc std': 0.07716913531857271, 'auc mean': 0.6438271604938273}
TIMING: dataset construction took 16.732 s
Loading dataset from disk.
TIMING: dataset construction took 24.272 s
Loading dataset from disk.
TIMING: dataset construction took 24.716 s
Loading dataset from disk.
computed_metrics: [0.4537037037037037]
TIMING: dataset construction took 15.886 s
Loading dataset from disk.
TIMING: dataset construction took 25.192 s
Loading dataset from disk.
TIMING: dataset construction took 25.132 s
Loading dataset from disk.
computed_metrics: [0.5740740740740741]
TIMING: dataset construction took 16.026 s
Loading dataset from disk.
TIMING: dataset construction took 24.997 s
Loading dataset from disk.
TIMING: dataset construction took 24.861 s
Loading dataset from disk.
computed_metrics: [0.7685185185185186]
TIMING: dataset construction took 17.079 s
Loading dataset from disk.
TIMING

TIMING: dataset construction took 8.555 s
Loading dataset from disk.
TIMING: dataset construction took 8.085 s
Loading dataset from disk.
computed_metrics: [0.5987654320987654]
TIMING: dataset construction took 35.332 s
Loading dataset from disk.
TIMING: dataset construction took 7.510 s
Loading dataset from disk.
TIMING: dataset construction took 7.368 s
Loading dataset from disk.
computed_metrics: [0.7314814814814814]
TIMING: dataset construction took 33.929 s
Loading dataset from disk.
TIMING: dataset construction took 8.334 s
Loading dataset from disk.
TIMING: dataset construction took 8.216 s
Loading dataset from disk.
computed_metrics: [0.6512345679012346]
TIMING: dataset construction took 33.538 s
Loading dataset from disk.
TIMING: dataset construction took 9.017 s
Loading dataset from disk.
TIMING: dataset construction took 8.748 s
Loading dataset from disk.
computed_metrics: [0.7345679012345678]
TIMING: dataset construction took 33.991 s
Loading dataset from disk.
TIMING: data

In [8]:
score_dic = ablation_hiv_dc(train_data, test_data, 1, 20)
print(rate, score_dic)

TIMING: dataset construction took 42.882 s
Loading dataset from disk.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


computed_metrics: [0.7407407407407407]
TIMING: dataset construction took 42.898 s
Loading dataset from disk.
computed_metrics: [0.7438271604938271]
TIMING: dataset construction took 43.568 s
Loading dataset from disk.
computed_metrics: [0.5987654320987654]
TIMING: dataset construction took 42.949 s
Loading dataset from disk.
computed_metrics: [0.478395061728395]
TIMING: dataset construction took 43.000 s
Loading dataset from disk.
computed_metrics: [0.8148148148148149]
TIMING: dataset construction took 45.167 s
Loading dataset from disk.
computed_metrics: [0.7067901234567902]
TIMING: dataset construction took 44.496 s
Loading dataset from disk.
computed_metrics: [0.808641975308642]
TIMING: dataset construction took 43.518 s
Loading dataset from disk.
computed_metrics: [0.7438271604938271]
TIMING: dataset construction took 43.286 s
Loading dataset from disk.
computed_metrics: [0.8333333333333333]
TIMING: dataset construction took 43.622 s
Loading dataset from disk.
computed_metrics: [0.