In [2]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# Training-set fractions used by the ablation sweep: 1/80, 2/80, 4/80, ..., 64/80.
exponents = np.arange(7)
rates = np.power(2, exponents) / 80
print(rates)

def get_inputs(sm):
    """Convert one whitespace-tokenised SMILES string into fixed-length model inputs.

    Relies on the module-level ``vocab`` and the ``pad_index`` / ``unk_index`` /
    ``sos_index`` / ``eos_index`` constants being defined before it is called.

    Args:
        sm: SMILES string whose tokens are separated by whitespace.

    Returns:
        ``(ids, seg)``: two lists of length 220. ``ids`` holds the token ids
        wrapped in start/end markers and right-padded with ``pad_index``;
        ``seg`` holds 1 for every real position and ``pad_index`` for padding.
    """
    seq_len = 220
    tokens = sm.split()
    if len(tokens) > 218:
        # Too long to fit with the two boundary markers: keep only the first
        # and the last 109 tokens.
        tokens = tokens[:109] + tokens[-109:]

    lookup = vocab.stoi.get
    ids = [sos_index]
    ids += [lookup(tok, unk_index) for tok in tokens]  # unknown tokens -> unk_index
    ids.append(eos_index)

    n_pad = seq_len - len(ids)
    seg = [1] * len(ids) + [pad_index] * n_pad
    ids = ids + [pad_index] * n_pad
    return ids, seg

def get_array(smiles):
    """Encode an iterable of tokenised SMILES strings into two integer tensors.

    Each string is passed through ``get_inputs``; the resulting id rows and
    segment rows are stacked in input order.

    Args:
        smiles: iterable of whitespace-tokenised SMILES strings.

    Returns:
        ``(ids, seg)`` as ``torch.Tensor`` objects, one row per input string.
    """
    encoded = [get_inputs(sm) for sm in smiles]
    id_rows = [pair[0] for pair in encoded]
    seg_rows = [pair[1] for pair in encoded]
    return torch.tensor(id_rows), torch.tensor(seg_rows)
def ablation(X, X_test, y, y_test, rate, n_repeats):
    """Train an MLP on a fraction of the training data several times and report test AUC.

    For each repeat a fresh ``MLPClassifier(max_iter=1000)`` is fitted on a
    stratified random subset of ``(X, y)`` (or on all of it when ``rate == 1``)
    and scored on ``(X_test, y_test)`` with ROC AUC, using the probability in
    column 1 of ``predict_proba``.

    Side effect: prints train accuracy and test accuracy / precision / recall /
    F1. These printed metrics describe the classifier from the LAST repeat
    only; they are not averaged over repeats.

    Args:
        X: training features.
        X_test: test features.
        y: training labels (assumed binary, since column 1 of the predicted
            probabilities is used as the positive-class score -- confirm).
        y_test: test labels.
        rate: fraction of the training set to keep; ``1`` means use everything.
        n_repeats: number of independent fits; must be at least 1.

    Returns:
        dict with keys ``'auc mean'`` and ``'auc std'`` computed over the repeats.

    Raises:
        ValueError: if ``n_repeats`` is smaller than 1.
    """
    if n_repeats < 1:
        # Without at least one fit there is no classifier to report on below.
        raise ValueError('n_repeats must be at least 1, got {}'.format(n_repeats))

    auc = np.empty(n_repeats)
    for i in range(n_repeats):
        clf = MLPClassifier(max_iter=1000)
        if rate == 1:
            X_train, y_train = X, y
        else:
            # No random_state is passed, so every repeat draws a different subset.
            X_train, _, y_train, _ = train_test_split(X, y, test_size=1-rate, stratify=y)
        clf.fit(X_train, y_train)
        y_score = clf.predict_proba(X_test)
        auc[i] = roc_auc_score(y_test, y_score[:, 1])

    # Predict once and reuse the result for all four test metrics.
    y_pred = clf.predict(X_test)
    print('training set:', accuracy_score(y_train, clf.predict(X_train)))
    print('testing set accuracy_score:', accuracy_score(y_test, y_pred))
    print('testing set precision_score:', precision_score(y_test, y_pred))
    print('testing set recall_score:', recall_score(y_test, y_pred))
    print('testing set f1_score:', f1_score(y_test, y_pred))

    ret = {}
    ret['auc mean'] = np.mean(auc)
    ret['auc std'] = np.std(auc)
    return ret

[0.0125 0.025  0.05   0.1    0.2    0.4    0.8   ]


In [5]:
# Checkpoint paths trfm_B8_1_16.pkl, trfm_B8_1_21.pkl, ..., trfm_B8_1_86.pkl
# (every fifth index from 16 up to, but not including, 88).
models = [
    '../data_drug/.save/trfm_B8_1_{:d}.pkl'.format(step)
    for step in range(16, 88, 5)
]


In [6]:
# Display the current list of checkpoint paths for a visual check.
models

['../data_drug/.save/trfm_B8_1_16.pkl',
 '../data_drug/.save/trfm_B8_1_21.pkl',
 '../data_drug/.save/trfm_B8_1_26.pkl',
 '../data_drug/.save/trfm_B8_1_31.pkl',
 '../data_drug/.save/trfm_B8_1_36.pkl',
 '../data_drug/.save/trfm_B8_1_41.pkl',
 '../data_drug/.save/trfm_B8_1_46.pkl',
 '../data_drug/.save/trfm_B8_1_51.pkl',
 '../data_drug/.save/trfm_B8_1_56.pkl',
 '../data_drug/.save/trfm_B8_1_61.pkl',
 '../data_drug/.save/trfm_B8_1_66.pkl',
 '../data_drug/.save/trfm_B8_1_71.pkl',
 '../data_drug/.save/trfm_B8_1_76.pkl',
 '../data_drug/.save/trfm_B8_1_81.pkl',
 '../data_drug/.save/trfm_B8_1_86.pkl']

In [1]:
# Hard-coded checkpoint list.
# NOTE(review): assigning `models` here replaces whatever list an earlier cell
# built; which list the evaluation loop sees depends on cell execution order.
models = [
    'models/zincAH_B6_n4_1_2000.pkl',
    'models/zincAH_B6_n4_1_3000.pkl',
    'models/zincAH_B8_n4_1_2000.pkl',
    'models/zincAH_B8_n4_1_3000.pkl',
]

In [12]:
import torch
from pretrain_trfm import TrfmSeq2seq
from build_vocab import WordVocab
from utils import split
import torch.nn as nn

# Integer ids of the special tokens, read by get_inputs():
# padding, unknown token, end-of-sequence, start-of-sequence, mask.
(pad_index,
 unk_index,
 eos_index,
 sos_index,
 mask_index) = 0, 1, 2, 3, 4

# Vocabulary read by get_inputs() to map SMILES tokens to integer ids.
vocab = WordVocab.load_vocab('../models/vocab_AH_20W_shuffle.pkl')

# ---------------------------------------------------------------------------
# Load, split and tokenise the dataset ONCE. None of these steps depends on
# the checkpoint, so they do not belong inside the per-model loop.
# ---------------------------------------------------------------------------
df = pd.read_csv('../data_drug/drug_abba_smiles/drug_abba_A2058.csv')
# Fixed random_state keeps the 80/20 split identical for every checkpoint.
train, test = train_test_split(df, random_state=10, train_size=0.8)
print('读取数据')  # progress message: "data loaded"
X_train = train['smiles']
y_train = train['class']
y, y_test = y_train.values, test['class'].values

# Token-id tensors, one row per molecule; the segment tensors are not needed.
xid_train, _ = get_array([split(sm) for sm in X_train.values])
xid_test, _ = get_array([split(sm) for sm in test['smiles'].values])

# NOTE(review): the positional arguments are presumably (input vocab size,
# hidden size, output vocab size, number of layers) -- confirm against
# pretrain_trfm.TrfmSeq2seq.
trfm = TrfmSeq2seq(len(vocab), 256, len(vocab), 4)
for model in models:
    # SECURITY: torch.load unpickles the file; only load checkpoints you trust.
    trfm.load_state_dict(torch.load(model))
    trfm.eval()
    print(model)
    # print(' Total parameters:', sum(p.numel() for p in trfm.parameters()))

    print('encoding')
    # encode() is fed the transposed id tensor, i.e. one column per molecule.
    X = trfm.encode(torch.t(xid_train))
    X_test = trfm.encode(torch.t(xid_test))

    print('testing')
    # Sweep over training-set fractions; 20 repeated fits per fraction.
    scores = []
    for rate in rates:
        score_dic = ablation(X, X_test, y, y_test, rate, 20)
        print(rate, score_dic)
        scores.append(score_dic['auc mean'])
    # Mean AUC across all fractions for this checkpoint.
    print(np.mean(scores))
    print()

../data_drug/.save/trfm_B8_1_60.pkl
读取数据
encoding
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
testing
training set: 1.0
testing set accuracy_score: 0.8329925017041582
testing set precision_score: 0.15789473684210525
testing set recall_score: 0.10778443113772455
testing set f1_score: 0.12811387900355872
0.0125 {'auc mean': 0.5388565177337632, 'auc std': 0.03293329652302322}
training set: 1.0
testing set accuracy_score: 0.8214042263122018
testing set precision_score: 0.13740458015267176
testing set recall_score: 0.10778443113772455
testing set f1_score: 0.12080536912751678
0.025 {'auc mean': 0.5782984799631505, 'auc std': 0.03511565941719179}
training set: 1.0
testing set accuracy_score: 0.8404907975460123
testing set precision_score: 0.22764227642276422
testing set recall_score: 0.16766467065868262
testing set f1_score: 0.19310344827586204
0.05 {'auc mean': 0.6083477660064486, 'auc std': 0.0357588279138841}
training set: 0.

In [None]:
# Re-run the training-fraction sweep on the embeddings currently in the kernel.
# NOTE(review): this cell depends on X, X_test, y and y_test left behind by the
# encoding cell (i.e. the values from the last checkpoint processed there).
scores = []
for rate in rates:
    score_dic = ablation(X, X_test, y, y_test, rate=rate, n_repeats=20)
    print(rate, score_dic)
    scores += [score_dic['auc mean']]
# Mean AUC across all training fractions.
print(np.mean(scores))