In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

rates = 2**np.arange(7)/80
print(rates)

[0.0125 0.025  0.05   0.1    0.2    0.4    0.8   ]


In [2]:
def get_inputs(sm):
    seq_len = 220
    sm = sm.split()
    if len(sm)>218:
        # print('SMILES is too long ({:d})'.format(len(sm)))
        sm = sm[:109]+sm[-109:]
    ids = [vocab.stoi.get(token, unk_index) for token in sm]
    ids = [sos_index] + ids + [eos_index]
    seg = [1]*len(ids)
    padding = [pad_index]*(seq_len - len(ids))
    ids.extend(padding), seg.extend(padding)
    return ids, seg

def get_array(smiles):
    x_id, x_seg = [], []
    for sm in smiles:
        a,b = get_inputs(sm)
        x_id.append(a)
        x_seg.append(b)
    return torch.tensor(x_id), torch.tensor(x_seg)

## 对一个模型进行测试所有细胞系

In [4]:
import torch
from pretrain_trfm import TrfmSeq2seq
from build_vocab import WordVocab
from utils import split
import torch.nn as nn

pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

vocab = WordVocab.load_vocab('./models/vocab.pkl')

trfm = TrfmSeq2seq(len(vocab), 256, len(vocab), 4)
# trfm = nn.DataParallel(trfm)
trfm.load_state_dict(torch.load('./models/trfm_12_23000.pkl'))
trfm.eval()
print('Total parameters:', sum(p.numel() for p in trfm.parameters()))

Total parameters: 4245037


In [65]:
smi_path = './drug_abba_synergy&class_canonical/'
drug_abba_name = os.listdir(smi_path) 

In [60]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [15]:
def ablation(X, X_test, y, y_test, rate, n_repeats):
    auc = np.empty(n_repeats)
    
    ret = {}
    for i in range(n_repeats):
        clf = MLPClassifier(max_iter=1000)
        if rate == 1:
            X_train, y_train = X, y
        else:
            X_train, _, y_train, __ = train_test_split(
                X, y, test_size=1-rate, stratify=y)
        clf.fit(X_train, y_train)
        y_score = clf.predict_proba(X_test)
        auc[i] = roc_auc_score(y_test, y_score[:, 1])

    print('training set:', accuracy_score(y_train, clf.predict(X_train)))
    print('testing set accuracy_score:',
          accuracy_score(y_test, clf.predict(X_test)))
            
    ret['accuracy'] = accuracy_score(y_test, clf.predict(X_test))
    ret['precision'] = precision_score(y_test, clf.predict(X_test))
    ret['recall'] = recall_score(y_test, clf.predict(X_test))
    ret['f1'] = f1_score(y_test, clf.predict(X_test))

    return ret
    


In [35]:
mtr = pd.DataFrame(columns=['accuracy','precision','recall','f1'])
rows = []
for i in drug_abba_name:
    df = pd.read_csv(smi_path+i)
    train,test = train_test_split(df,random_state=10,train_size=0.8)
    X_train = train['smiles']
    y_train = train['class']
    X_test = test['smiles']
    y_test = test['class']

    #ST 编码
    x_split = [split(sm) for sm in X_train.values]
    xid, _ = get_array(x_split)
    X = trfm.encode(torch.t(xid))
    x_split = [split(sm) for sm in X_test.values]
    xid, _ = get_array(x_split)
    X_test = trfm.encode(torch.t(xid))
    y, y_test = y_train.values, y_test.values

    
    score_dic = ablation(X, X_test, y, y_test,0.8, 20)
    rows.append(score_dic)

mtr = mtr.from_dict(rows, orient='columns')
mtr.to_csv('trfm_12_23000.pkl.csv',index=False)


There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9965906669507778
testing set accuracy_score: 0.9815950920245399
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 1.0
testing set accuracy_score: 0.9971969166082691
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9965906669507778
testing set accuracy_score: 0.9877300613496932
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9982953334753889
testing set accuracy_score: 0.9952283571915473
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 1.0
testing set accuracy_score: 0.9887876664330764
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will ta

## 对多个模型进行测试所有细胞系

In [14]:
model_para = ['trfm_12_23000.pkl']

## 先回归再分类

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [7]:
def get_class(x):
    if x >= 30:
        return 1
    else:
        return 0

In [8]:
def regression_acc(synergy,label):
    acc = 0
    sum = len(synergy)
    for i in range(sum):
        class_i = get_class(synergy[i])
        if class_i == label.values[i]:
            acc += 1
    acc = acc/sum
    return acc

In [16]:
acc = []

In [9]:
def ablation_regression(X, X_test, y, label_test, n_repeats):
    ret = {}
    for i in range(n_repeats):
        reg = MLPRegressor(max_iter=1000, random_state=1)
        reg.fit(X, y)
        
    res = []
    for i in reg.predict(X_test):
        res.append(get_class(i))
    
    ret['accuracy'] = accuracy_score(label_test,res)
    print(ret['accuracy'])
    ret['precision'] = precision_score(label_test,res)
    ret['recall'] = recall_score(label_test,res)
    ret['f1'] = f1_score(label_test,res)

    return ret

In [10]:
def ablation_result(cell_line,X, X_test, y, label_test):
    
    ret = pd.DataFrame(columns=['synergy','class'])

    
    reg = MLPRegressor(max_iter=1000, random_state=1)
    reg.fit(X, y)

    synergy = reg.predict(X_test)
    res = []
    for i in synergy:
        res.append(get_class(i))
    
    # ret['cell_line'] = cell_line
    ret['synergy'] = synergy
    ret['class'] = res
    acc_score = accuracy_score(label_test,res)
    print(acc_score)
    acc.append(acc_score)
    

    return ret

In [11]:
import torch
from pretrain_trfm import TrfmSeq2seq
from build_vocab import WordVocab
from utils import split
import torch.nn as nn

pad_index = 0
unk_index = 1
eos_index = 2
sos_index = 3
mask_index = 4

In [12]:
train_path = './canonical_train/'
test_path = './canonical_test/'
cell_line_name = os.listdir('./cell_line')

In [17]:
# 只保存测试结果到本地 train和test的数据分别存在两个目录当中
acc = []
for model in model_para:

    vocab = WordVocab.load_vocab('./models/vocab.pkl')
    # 加载模型
    trfm = TrfmSeq2seq(len(vocab), 256, len(vocab), 4)
    trfm.load_state_dict(torch.load('./models/'+model))
    trfm.eval()
    print('Total parameters:', sum(p.numel() for p in trfm.parameters()))
    # 获取模型的名字
    model_name = model.split(sep='/')[-1].split(sep='.')[0]
    

    for i in cell_line_name:
        
        cell_line = i.split(sep='.')[0]
        
        df_train = pd.read_csv(train_path+'train_drug_abba_synergy&class_canonical_'+i)
        df_test = pd.read_csv(test_path+'test_drug_abba_synergy&class_canonical_'+i)
        
        train = df_train
        test = df_test


        #train,_ = train_test_split(df_train,random_state=10,train_size=1)
        #test,_ = train_test_split(df_test,random_state=10,train_size=1)
        
        X_train = train['smiles']
        y_train = train['synergy']
        label_train = train['class']

        X_test = test['smiles']
        y_test = test['synergy']
        label_test = test['class']

        #ST 编码
        x_split = [split(sm) for sm in X_train.values]
        xid, _ = get_array(x_split)
        X = trfm.encode(torch.t(xid))
        x_split = [split(sm) for sm in X_test.values]
        xid, _ = get_array(x_split)
        X_test = trfm.encode(torch.t(xid))
        y, y_test = y_train.values, y_test.values

        
        res = ablation_result(cell_line,X, X_test, y, label_test)
        file_name = 'test_data_predict{}'
        res.to_csv('./canonical_predict/predict_synergy&class_canonical_{:s}.csv'.format(cell_line),index=False)

Total parameters: 4245037
There are 966 molecules. It will take a little time.
There are 242 molecules. It will take a little time.
0.9008264462809917
There are 933 molecules. It will take a little time.
There are 233 molecules. It will take a little time.
0.9227467811158798
There are 966 molecules. It will take a little time.
There are 242 molecules. It will take a little time.
0.8553719008264463
There are 966 molecules. It will take a little time.
There are 242 molecules. It will take a little time.
0.9256198347107438
There are 966 molecules. It will take a little time.
There are 242 molecules. It will take a little time.
0.9008264462809917
There are 966 molecules. It will take a little time.
There are 242 molecules. It will take a little time.
0.9504132231404959
There are 933 molecules. It will take a little time.
There are 233 molecules. It will take a little time.
0.9656652360515021
There are 966 molecules. It will take a little time.
There are 242 molecules. It will take a little



0.8927038626609443
There are 966 molecules. It will take a little time.
There are 242 molecules. It will take a little time.
0.9173553719008265
There are 933 molecules. It will take a little time.
There are 233 molecules. It will take a little time.
0.9356223175965666
There are 933 molecules. It will take a little time.
There are 233 molecules. It will take a little time.
0.8583690987124464
There are 966 molecules. It will take a little time.
There are 242 molecules. It will take a little time.
0.8925619834710744
There are 966 molecules. It will take a little time.
There are 242 molecules. It will take a little time.
0.9462809917355371
There are 966 molecules. It will take a little time.
There are 242 molecules. It will take a little time.
0.9462809917355371
There are 933 molecules. It will take a little time.
There are 233 molecules. It will take a little time.
0.8884120171673819
There are 933 molecules. It will take a little time.
There are 233 molecules. It will take a little time.


In [28]:
drug_abba_path = '../data_drug/drug_abba_synergy&class_mixed/'
cell_line_name = os.listdir('../data_drug/cell_line')
drug_abba_name = os.listdir(drug_abba_path)

In [30]:
# 只保存测试结果到本地 
acc = []
for model in model_para:

    vocab = WordVocab.load_vocab('../models/vocab.pkl')

    trfm = TrfmSeq2seq(len(vocab), 256, len(vocab), 4)
    trfm.load_state_dict(torch.load('../'+model))
    trfm.eval()
    print('Total parameters:', sum(p.numel() for p in trfm.parameters()))

    model_name = model.split(sep='/')[-1].split(sep='.')[0]
    
    

    for i in drug_abba_name:
        

        df = pd.read_csv(drug_abba_path+i).sample(frac=1)
        cell_line = i.split(sep='.')[0].split(sep='_')[-1]
        train,test = train_test_split(df,random_state=10,train_size=0.8)
        # 保留测试集的数据 供后期与 模型测试出来的结果进行性对比
        pd.DataFrame(test).to_csv('../experiments/res/test_data_data-mixed/test_data-mixed_{:s}.csv'.format(cell_line),index=False)
        X_train = train['smiles']
        y_train = train['synergy']
        label_train = train['class']

        X_test = test['smiles']
        y_test = test['synergy']
        label_test = test['class']

        #ST 编码
        x_split = [split(sm) for sm in X_train.values]
        xid, _ = get_array(x_split)
        X = trfm.encode(torch.t(xid))
        x_split = [split(sm) for sm in X_test.values]
        xid, _ = get_array(x_split)
        X_test = trfm.encode(torch.t(xid))
        y, y_test = y_train.values, y_test.values

        
        res = ablation_result(cell_line,X, X_test, y, label_test)
        
        #res.to_csv('../experiments/res/synergy&class/'+cell_line+'_synergy&class_predict.csv',index=False)
        res.to_csv('../experiments/res/synergy&class_data-mixed/predict_synergy&class_mixed_{:s}.csv'.format(cell_line),index=False)

print(acc)

Total parameters: 4245037
There are 6640 molecules. It will take a little time.
There are 1660 molecules. It will take a little time.
0.9632530120481928
There are 6833 molecules. It will take a little time.
There are 1709 molecules. It will take a little time.
0.9666471620830895
There are 6640 molecules. It will take a little time.
There are 1660 molecules. It will take a little time.
0.9626506024096385
There are 6640 molecules. It will take a little time.
There are 1660 molecules. It will take a little time.
0.963855421686747
There are 6833 molecules. It will take a little time.
There are 1709 molecules. It will take a little time.
0.9742539496781744
There are 6640 molecules. It will take a little time.
There are 1660 molecules. It will take a little time.
0.986144578313253
There are 6640 molecules. It will take a little time.
There are 1660 molecules. It will take a little time.
0.9722891566265061
There are 6640 molecules. It will take a little time.
There are 1660 molecules. It will

In [32]:
df_acc = pd.DataFrame(acc,columns=['acc'])

In [34]:
df_acc.to_csv('../experiments/res/synergy&class_data-mixed/acc.csv',index=False)

In [None]:
#测试用并查看精度
for model in model_para:

    vocab = WordVocab.load_vocab('../models/vocab.pkl')

    trfm = TrfmSeq2seq(len(vocab), 256, len(vocab), 4)
    trfm.load_state_dict(torch.load('../'+model))
    trfm.eval()
    print('Total parameters:', sum(p.numel() for p in trfm.parameters()))

    res = pd.DataFrame(columns=['accuracy','precision','recall','f1'])
    rows = []

    for i in drug_abba_name:
        df = pd.read_csv(smi_path+i)
        train,test = train_test_split(df,random_state=10,train_size=0.8)
        X_train = train['smiles']
        y_train = train['synergy']
        label_train = train['class']

        X_test = test['smiles']
        y_test = test['synergy']
        label_test = test['class']

        #ST 编码
        x_split = [split(sm) for sm in X_train.values]
        xid, _ = get_array(x_split)
        X = trfm.encode(torch.t(xid))
        x_split = [split(sm) for sm in X_test.values]
        xid, _ = get_array(x_split)
        X_test = trfm.encode(torch.t(xid))
        y, y_test = y_train.values, y_test.values

        
        score_dic = ablation_regression(X, X_test, y, label_test,3)
        rows.append(score_dic)

    res = res.from_dict(rows, orient='columns')
    res.to_csv(model.split(sep='/')[-1].split(sep='.')[0]+'.csv',index=False)

In [44]:
dscb = pd.read_csv('./trfm_B8_1_48.csv')

In [45]:
dscb.loc[dscb['accuracy']>90, : ]

Unnamed: 0,accuracy,precision,recall,f1


In [54]:
better = dscb[dscb['accuracy']>0.9]

In [55]:
better.describe()

Unnamed: 0,accuracy,precision,recall,f1
count,34.0,34.0,34.0,34.0
mean,0.939084,0.538631,0.2342,0.299469
std,0.021121,0.200523,0.159693,0.175532
min,0.901159,0.125,0.016129,0.031746
25%,0.92417,0.386806,0.102931,0.152183
50%,0.939383,0.559028,0.203448,0.295151
75%,0.95452,0.644145,0.344961,0.419406
max,0.976174,1.0,0.549784,0.688347


In [32]:
dscb.describe()

Unnamed: 0,accuracy,precision,recall,f1
count,39.0,39.0,39.0,39.0
mean,0.932236,0.547552,0.245088,0.313181
std,0.026836,0.189587,0.159681,0.174073
min,0.875263,0.125,0.016129,0.031746
25%,0.918427,0.427778,0.109903,0.163281
50%,0.934128,0.569892,0.206897,0.307692
75%,0.951997,0.639168,0.377808,0.426797
max,0.976174,1.0,0.549784,0.688347


## 直接分类

In [None]:
def ablation(X, X_test, y, y_test, rate, n_repeats):
    auc = np.empty(n_repeats)
    
    ret = {}
    for i in range(n_repeats):
        clf = MLPClassifier(max_iter=1000)
        if rate == 1:
            X_train, y_train = X, y
        else:
            X_train, _, y_train, __ = train_test_split(
                X, y, test_size=1-rate, stratify=y)
        clf.fit(X_train, y_train)
        y_score = clf.predict_proba(X_test)
        auc[i] = roc_auc_score(y_test, y_score[:, 1])

    print('training set:', accuracy_score(y_train, clf.predict(X_train)))
    print('testing set accuracy_score:',
          accuracy_score(y_test, clf.predict(X_test)))
            
    ret['accuracy'] = accuracy_score(y_test, clf.predict(X_test))
    ret['precision'] = precision_score(y_test, clf.predict(X_test))
    ret['recall'] = recall_score(y_test, clf.predict(X_test))
    ret['f1'] = f1_score(y_test, clf.predict(X_test))

    return ret

In [11]:
for model in model_para:

    vocab = WordVocab.load_vocab('../vocab_AH_20W_shuffle.pkl')

    trfm = TrfmSeq2seq(len(vocab), 256, len(vocab), 4)
    # trfm = nn.DataParallel(trfm)
    trfm.load_state_dict(torch.load('../'+model))
    trfm.eval()
    print('Total parameters:', sum(p.numel() for p in trfm.parameters()))

    res = pd.DataFrame(columns=['accuracy','precision','recall','f1'])
    rows = []

    for i in drug_abba_name:
        df = pd.read_csv(smi_path+i)
        train,test = train_test_split(df,random_state=10,train_size=0.8)
        X_train = train['smiles']
        y_train = train['class']
        X_test = test['smiles']
        y_test = test['class']

        #ST 编码
        x_split = [split(sm) for sm in X_train.values]
        xid, _ = get_array(x_split)
        X = trfm.encode(torch.t(xid))
        x_split = [split(sm) for sm in X_test.values]
        xid, _ = get_array(x_split)
        X_test = trfm.encode(torch.t(xid))
        y, y_test = y_train.values, y_test.values

        
        score_dic = ablation(X, X_test, y, y_test,0.8, 20)
        rows.append(score_dic)

    res = res.from_dict(rows, orient='columns')
    res.to_csv(model+'.csv',index=False)

Total parameters: 4240420
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9260600894949925
testing set accuracy_score: 0.9229720518064076
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9743702081051479
testing set accuracy_score: 0.9635599159074982
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.968889835925847
testing set accuracy_score: 0.9611451942740287
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9699552525037289
testing set accuracy_score: 0.9706884798909339


  _warn_prf(average, modifier, msg_start, len(result))


There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9518072289156626
testing set accuracy_score: 0.9404344779257183
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9791894852135816
testing set accuracy_score: 0.9747722494744219
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9665459194545067
testing set accuracy_score: 0.9570552147239264
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9577217962760132
testing set accuracy_score: 0.9621583742116327
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9494992542083955
testing set accuracy_score: 0.9461486025903204
There are 5707 molecules. It will take a little time.
There 

  _warn_prf(average, modifier, msg_start, len(result))


There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9369112814895948
testing set accuracy_score: 0.9418360196215837
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9024078414660132
testing set accuracy_score: 0.8875255623721882
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.8825848849945236
testing set accuracy_score: 0.8934828311142257


  _warn_prf(average, modifier, msg_start, len(result))


There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9499254208395482
testing set accuracy_score: 0.9529652351738241
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9414020882164926
testing set accuracy_score: 0.9420586230402181
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9697699890470974
testing set accuracy_score: 0.9719691660826909
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.8950711938663746
testing set accuracy_score: 0.866853538892782
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9482207543149371
testing set accuracy_score: 0.923653715064758
There are 5707 molecules. It will take a little time.
There ar

  _warn_prf(average, modifier, msg_start, len(result))


There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9437020810514787
testing set accuracy_score: 0.9411352487736511
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9750273822562979
testing set accuracy_score: 0.9789768745620182
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9642020029831664
testing set accuracy_score: 0.9570552147239264
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9588170865279299
testing set accuracy_score: 0.9593552908199019
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.947581504368208
testing set accuracy_score: 0.9406952965235174
There are 5707 molecules. It will take a little time.
There a

  _warn_prf(average, modifier, msg_start, len(result))


There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9393209200438116
testing set accuracy_score: 0.9320252277505255
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9247815896015342
testing set accuracy_score: 0.9079754601226994
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9217962760131435
testing set accuracy_score: 0.9243167484232656
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9490730875772427
testing set accuracy_score: 0.9495569188820723
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9151928404005967
testing set accuracy_score: 0.9154737559645535
There are 5707 molecules. It will take a little time.
There 

  _warn_prf(average, modifier, msg_start, len(result))


There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.952683461117196
testing set accuracy_score: 0.9586545199719692
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9445983379501385
testing set accuracy_score: 0.9434219495569189
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.8836801752464403
testing set accuracy_score: 0.8906797477224947
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9369112814895948
testing set accuracy_score: 0.9453398738612474
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9518072289156626
testing set accuracy_score: 0.9425367904695164
There are 5707 molecules. It will take a little time.
There a

  _warn_prf(average, modifier, msg_start, len(result))


There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9373493975903614
testing set accuracy_score: 0.9404344779257183
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.916471340294055
testing set accuracy_score: 0.9011588275391956
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.8913472070098576
testing set accuracy_score: 0.8934828311142257
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.946516087790326
testing set accuracy_score: 0.9468302658486708
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9333049222245898
testing set accuracy_score: 0.9222903885480572
There are 5707 molecules. It will take a little time.
There ar

  _warn_prf(average, modifier, msg_start, len(result))


There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9618838992332969
testing set accuracy_score: 0.9579537491240364
There are 5867 molecules. It will take a little time.
There are 1467 molecules. It will take a little time.
training set: 0.9488600042616663
testing set accuracy_score: 0.9420586230402181
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.8987951807228916
testing set accuracy_score: 0.8927820602662929
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9404162102957284
testing set accuracy_score: 0.9460406447091801
There are 5707 molecules. It will take a little time.
There are 1427 molecules. It will take a little time.
training set: 0.9496166484118291
testing set accuracy_score: 0.9397337070777856
There are 5707 molecules. It will take a little time.
There 

In [1]:
import pandas as pd

In [38]:
res_model = ['trfm_12_23000.pkl.csv','chem24_B32_n3_1_2000_good.pkl.csv','zincAH_B6_n4_1_2000.pkl.csv','zincAH_B6_n4_1_3000.pkl.csv','zincAH_B8_n4_1_2000.pkl.csv','zincAH_B8_n4_1_3000.pkl.csv']

In [48]:
for res_score in res_model:
    score = pd.read_csv(res_score)
    score.describe().to_csv('./res/describe/'+res_score.split(sep='.')[0]+'_describe.csv')
    print(res_score.split(sep='.')[0])
    print(score.describe())
    print()

trfm_12_23000
        accuracy  precision     recall         f1
count  39.000000  39.000000  39.000000  39.000000
mean    0.986169   0.922182   0.866202   0.892481
std     0.006874   0.063213   0.065845   0.059241
min     0.971370   0.750000   0.612903   0.678571
25%     0.981688   0.898718   0.834770   0.863917
50%     0.987386   0.935897   0.885057   0.909091
75%     0.992056   0.968719   0.906977   0.932491
max     0.997197   1.000000   0.969880   0.975758

chem24_B32_n3_1_2000_good
        accuracy  precision     recall         f1
count  39.000000  39.000000  39.000000  39.000000
mean    0.935258   0.604894   0.259943   0.333733
std     0.028913   0.218273   0.176997   0.182282
min     0.862649   0.000000   0.000000   0.000000
25%     0.921864   0.509259   0.117350   0.189888
50%     0.939332   0.612903   0.244186   0.341463
75%     0.951296   0.721273   0.396802   0.465127
max     0.979678   0.990291   0.644737   0.679128

zincAH_B6_n4_1_2000
        accuracy  precision     recall

In [41]:
'trfm_12_23000.pkl.csv'.split(sep='.')

['trfm_12_23000', 'pkl', 'csv']

In [25]:
B6_n4_1_2000 = pd.read_csv('chem24_B32_n3_1_2000_good.pkl.csv')

In [26]:
B6_n4_1_2000.describe()

Unnamed: 0,accuracy,precision,recall,f1
count,39.0,39.0,39.0,39.0
mean,0.935258,0.604894,0.259943,0.333733
std,0.028913,0.218273,0.176997,0.182282
min,0.862649,0.0,0.0,0.0
25%,0.921864,0.509259,0.11735,0.189888
50%,0.939332,0.612903,0.244186,0.341463
75%,0.951296,0.721273,0.396802,0.465127
max,0.979678,0.990291,0.644737,0.679128
