In [1]:
import pandas as pd

# Read the substrate table from the CSV file.
# Later cells read its 'name' and 'smile' columns.
df = pd.read_csv('Kcat_Km/subtract.csv')
# Bare expression so the notebook renders the loaded table.
df

Unnamed: 0,name,smile
0,乙酰基辅酶A,[C@@H]1(N2C3=C(C(=NC=N3)N)N=C2)O[C@H](COP(OP(O...
1,丁酰基辅酶A,[C@@H]1(N2C3=C(C(=NC=N3)N)N=C2)O[C@H](COP(OP(O...
2,己酰基辅酶A,[C@@H]1(N2C3=C(C(=NC=N3)N)N=C2)O[C@H](COP(OP(O...
3,辛酰基辅酶A,[C@@H]1(N2C3=C(C(=NC=N3)N)N=C2)O[C@H](COP(OP(O...
4,癸酰基辅酶A,[C@@H]1(N2C3=C(C(=NC=N3)N)N=C2)O[C@H](COP(OP(O...
5,烷酰基辅酶A,[C@@H]1(N2C3=C(C(=NC=N3)N)N=C2)O[C@H](COP(OP(O...
6,羟基乙酰基辅酶A,CC(C)(COP([O-])(=O)OP([O-])(=O)OC[C@H]1O[C@H](...
7,羟基丁酰基辅酶A,CC(C)(COP([O-])(=O)OP([O-])(=O)OC[C@H]1O[C@H](...
8,羟基己酰基辅酶A,CC(C)(COP([O-])(=O)OP([O-])(=O)OC[C@H]1O[C@H](...
9,羟基辛酰基辅酶A,CC(C)(COP([O-])(=O)OP([O-])(=O)OC[C@H]1O[C@H](...


In [2]:
import torch
from build_vocab import WordVocab
from pretrain_trfm import TrfmSeq2seq
from utils import split

def smiles_to_vec(Smiles, vocab_path='vocab.pkl', model_path='trfm_12_23000.pkl'):
    """Encode SMILES strings with the pretrained ``TrfmSeq2seq`` model.

    Each SMILES string is tokenised with ``split``, mapped to vocabulary ids,
    wrapped in start/end markers, padded to a fixed length and passed through
    ``TrfmSeq2seq.encode``.

    Parameters
    ----------
    Smiles : iterable of str
        SMILES strings to encode.
    vocab_path : str, optional
        Path of the pickled ``WordVocab``. Defaults to ``'vocab.pkl'``.
    model_path : str, optional
        Path of the saved model state dict. Defaults to
        ``'trfm_12_23000.pkl'``.

    Returns
    -------
    The value returned by ``TrfmSeq2seq.encode`` for the whole batch. In this
    notebook that is displayed as a float32 NumPy array with one row per
    input string.

    Notes
    -----
    Sequences with more than ``seq_len - 2`` tokens are reported on stdout and
    shortened by keeping an equal number of tokens from the head and the tail.
    """
    pad_index = 0
    unk_index = 1
    eos_index = 2
    sos_index = 3

    seq_len = 220
    # Two positions are reserved for the start and end markers.
    max_tokens = seq_len - 2
    half_tokens = max_tokens // 2

    vocab = WordVocab.load_vocab(vocab_path)

    def get_inputs(sm):
        """Turn one tokenised SMILES string into padded ids and a segment mask."""
        # assumes `split` returned whitespace-separated tokens -- TODO confirm
        tokens = sm.split()
        if len(tokens) > max_tokens:
            print('SMILES is too long ({:d})'.format(len(tokens)))
            tokens = tokens[:half_tokens] + tokens[-half_tokens:]
        ids = [vocab.stoi.get(token, unk_index) for token in tokens]
        ids = [sos_index] + ids + [eos_index]
        seg = [1] * len(ids)
        padding = [pad_index] * (seq_len - len(ids))
        ids.extend(padding)
        seg.extend(padding)
        return ids, seg

    def get_array(smiles):
        """Stack the per-string ids and segment masks into two tensors."""
        x_id, x_seg = [], []
        for sm in smiles:
            ids, seg = get_inputs(sm)
            x_id.append(ids)
            x_seg.append(seg)
        return torch.tensor(x_id), torch.tensor(x_seg)

    # NOTE(review): the constructor arguments must match the saved checkpoint;
    # confirm the meaning of 256 and 4 against pretrain_trfm.
    trfm = TrfmSeq2seq(len(vocab), 256, len(vocab), 4)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
    trfm.load_state_dict(torch.load(model_path, map_location='cpu'))
    trfm.eval()

    x_split = [split(sm) for sm in Smiles]
    # The segment tensor is not used by the encoder call below.
    xid, _ = get_array(x_split)

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        # Transpose so the tensor passed to encode is (seq_len, batch).
        X = trfm.encode(torch.t(xid))
    return X

  from .autonotebook import tqdm as notebook_tqdm


In [5]:


# Encode every SMILES string in the 'smile' column of df with smiles_to_vec.
smiles = df['smile']
smile_vec = smiles_to_vec(smiles)


In [6]:
# Inspect the substrate embeddings returned by smiles_to_vec.
smile_vec

array([[-0.3877925 , -0.5357646 , -0.2123122 , ...,  0.33084196,
         1.0447214 ,  0.9510764 ],
       [-0.37253132, -0.5077705 , -0.20135523, ...,  0.3890797 ,
         1.0488586 ,  0.8964245 ],
       [-0.35891196, -0.482783  , -0.19366233, ...,  0.42787895,
         1.0407933 ,  0.8760946 ],
       ...,
       [-0.41737   , -0.45078367,  0.01652827, ...,  0.5933931 ,
         0.9796915 ,  0.90317243],
       [-0.4182657 , -0.42662555,  0.02667085, ...,  0.6185897 ,
         0.97849876,  0.9018276 ],
       [-0.4198999 , -0.40217274,  0.03429233, ...,  0.6473827 ,
         0.96009356,  0.8951058 ]], dtype=float32)

In [7]:
# Load the protein representations from the .npy file.
# NOTE(review): the original comment said only the first 1000 rows are read,
# but np.load reads the whole array and no slicing is applied here -- confirm
# which behaviour is intended.
import numpy as np
pro_vec = np.load('Kcat_Km/modify_fabB1_fadb_nr_rep_0.7_0.7_rep_seq_protein_representations.npy')
# Bare expression so the notebook renders the loaded array.
pro_vec

array([[ 0.04616821,  0.208269  , -0.04209135, ..., -0.16772942,
        -0.04448099,  0.13726711],
       [-0.04570865,  0.16448402,  0.12490553, ...,  0.04520228,
         0.06402141, -0.06056025],
       [-0.02437901,  0.17447636, -0.05615647, ..., -0.06087153,
        -0.03509357,  0.13177548],
       ...,
       [ 0.09613732,  0.0862484 ,  0.03010838, ..., -0.06331904,
        -0.11094715,  0.19393542],
       [ 0.03684049,  0.05652959,  0.00385366, ..., -0.03095414,
        -0.02500127,  0.06140285],
       [ 0.07750722,  0.07704207,  0.11574969, ..., -0.0240751 ,
        -0.02355455,  0.18690695]], dtype=float32)

In [8]:
# Check the dimensions of the loaded protein representation array.
pro_vec.shape

(180811, 1280)

In [9]:
# Load the pickled model and predict a value for every (substrate, protein)
# pair, writing one CSV file per substrate.
import pickle
import math

# NOTE(security): pickle.load can execute arbitrary code from the file; only
# load model files that come from a trusted source.
with open('esm1b_for_kcat_Km.pkl', "rb") as f:
    pre_model = pickle.load(f)

# Pair each substrate name with its embedding by position, so the lookup does
# not depend on df having a default 0..n-1 index.
for sub, smile in zip(df['name'], smile_vec):
    print(sub)
    # Build the full feature matrix in one step: the substrate embedding is
    # repeated on every row and joined column-wise with the protein vectors.
    combined_vec = np.hstack(
        (np.broadcast_to(smile, (len(pro_vec), len(smile))), pro_vec)
    )
    Pre_label = pre_model.predict(combined_vec)
    # The predictions are raised as powers of 10 before being saved.
    Pre_label_pow = [math.pow(10, value) for value in Pre_label]
    res = pd.DataFrame({'ligand': sub, 'esm1b': Pre_label_pow})
    # NOTE(review): mode='a' appends, so re-running this cell adds duplicate
    # rows to an existing file -- confirm whether appending is intended.
    res.to_csv(f'Kcat_Km/pre_corason_{sub}.csv', mode='a', header=False)
    # Release the large per-substrate objects before the next iteration.
    res = pd.DataFrame()
    combined_vec = []




乙酰基辅酶A
丁酰基辅酶A
己酰基辅酶A
辛酰基辅酶A
癸酰基辅酶A
烷酰基辅酶A
羟基乙酰基辅酶A
羟基丁酰基辅酶A
羟基己酰基辅酶A
羟基辛酰基辅酶A
羟基癸酰基辅酶A
羟基烷酰基辅酶A
