## Main Code

#### Generate protein representation

In [2]:
from core.trans_prot import ProteinCsvLoader, generate_protein_representation
# Read the proteins from the task CSV and build a representation for each one.
# The second argument is the pickle file the result is saved to; the same data is
# also returned and kept in `result`.
# This is a long-running step: the recorded run below took about 44 minutes.
loader = ProteinCsvLoader('data/task/protein.csv')
result = generate_protein_representation(loader, 'task.pickle')

[2023-06-04 21:33:06.281897] Started.
[2023-06-04 21:37:04.180296] 50 proteins transformed.
[2023-06-04 21:40:26.856575] 100 proteins transformed.
[2023-06-04 21:43:37.312196] 150 proteins transformed.
[2023-06-04 21:46:30.172328] 200 proteins transformed.
[2023-06-04 21:50:33.435885] 250 proteins transformed.
[2023-06-04 21:54:38.778155] 300 proteins transformed.
[2023-06-04 21:58:15.602512] 350 proteins transformed.
[2023-06-04 22:01:53.059098] 400 proteins transformed.
[2023-06-04 22:05:06.546322] 450 proteins transformed.
[2023-06-04 22:08:22.237705] 500 proteins transformed.
[2023-06-04 22:11:54.091679] 550 proteins transformed.
[2023-06-04 22:15:57.984214] 600 proteins transformed.
[2023-06-04 22:17:14.768906] Saving to task.pickle ...
[2023-06-04 22:17:29.570090] Finished.


#### Predict DTA

In [None]:
from core.predictor import BatchGeneratorFromCsv, batch_predict

# Inputs: the compound CSV and the protein CSV whose pairs will be scored.
np_path = 'data/task/lotus_np.csv'
prot_path = 'data/task/protein.csv'
# Pickle produced by generate_protein_representation in the previous step.
pkl_path = 'task.pickle'
# Files handed to batch_predict; result_path is later decoded with read_result_bin.
# NOTE(review): log_path is presumably where progress is logged - confirm in core.predictor.
result_path = 'task_result.bin'
log_path = 'predict.log'

# Per the BatchGenerator signature, the two numbers are np_batch_size=10 and
# prot_batch_size=311 (assuming BatchGeneratorFromCsv keeps the inherited constructor).
# Each generated batch therefore covers 10 * 311 compound-protein pairs. This is the
# size of the saved batches, not the model's batch size, which is set through
# batch_predict's predict_batch_size parameter.
batgen = BatchGeneratorFromCsv(np_path, prot_path, 10, 311)
batch_predict(batgen, pkl_path, result_path, log_path = log_path)

#### Read predicted result

In [1]:
from core.dec import read_result_bin
# Decode the binary result file written by batch_predict and display it.
# The displayed table has one row per compound-protein pair with the columns
# lotus_id, pid and affinity.
df = read_result_bin('task_result.bin')
df

Unnamed: 0,lotus_id,pid,affinity
0,257199,140,5.659127
1,124597,140,5.672395
2,121510,140,5.402510
3,110032,140,5.138187
4,103990,140,5.371791
5,116501,140,5.024565
6,256634,140,5.219755
7,39360,140,5.907454
8,120344,140,5.757725
9,126799,140,5.093890


## [Example] Customize how the data is read, then run the prediction

#### Generate protein representation

In [6]:
from core.trans_prot import ProteinDataLoader, generate_protein_representation
# Show the base-class documentation: it explains that a custom input format is
# supported by subclassing ProteinDataLoader and overriding read_data_file.
help(ProteinDataLoader)

Help on class ProteinDataLoader in module core.trans_prot:

class ProteinDataLoader(builtins.object)
 |  ProteinDataLoader(path: str)
 |  
 |  请用户根据自己的数据输入格式，继承该类并重写read_data_file方法以
 |  自定义数据读取过程。该方法要求返回两个列表。第一个列表为为蛋白质id，
 |  该id可根据用户的后续数据分析流程自行定义数据类型与编号规则；第二个
 |  列表为包含蛋白质序列字符串的列表。
 |  项目中使用的ProteinCsvLoader类便是通过继承该类定义的：
 |  ```
 |  import pandas as pd
 |  class ProteinCsvLoader(ProteinDataLoader):
 |      def read_data_file(self, path:str):
 |          protein_df = pd.read_csv(path)
 |          return protein_df['id'], protein_df['seq']
 |  ```
 |  该类的对象可传入generate_protein_representation函数作为loader参
 |  数的值：
 |  ```
 |  # path为要读取的csv文件路径
 |  loader = ProteinCsvLoader(path)
 |  generate_protein_representation(loader, 'task.pickle')
 |  ```
 |  亦可作为迭代器使用：
 |  ```
 |  # path为要读取的csv文件路径
 |  for pid, token in ProteinCsvLoader(path):
 |      print(pid, token)
 |      break
 |  ```
 |  
 |  Methods defined here:
 |  
 |  __init__(self, path: str)
 |      Initialize self.  See help(type(sel

In [None]:
# Example: customize how the protein sequence file is read when generating
# protein representations.
import pandas as pd  # imported here so this cell also runs on a fresh kernel


class MyProteinCsvLoader(ProteinDataLoader):
    """Load protein ids and sequences from a CSV file.

    To support a custom input format, inherit from ProteinDataLoader and
    implement read_data_file; an instance can then be passed as the loader
    argument of generate_protein_representation.
    """

    def read_data_file(self, path: str):
        """Read the protein CSV at ``path``.

        The CSV used in this example has two columns: 'id' holds the protein
        id and 'seq' holds the protein sequence.

        Returns:
            A pair (ids, sequences): the 'id' column and the 'seq' column of
            the file, in file order.
        """
        protein_df = pd.read_csv(path)
        return protein_df['id'], protein_df['seq']

# Protein CSV with 'id' and 'seq' columns. This must be the protein file, not the
# compound file 'data/task/lotus_np.csv' (which holds 'lotus_id'/'smiles' columns and
# would make the loader fail with a KeyError).
prot_path = 'data/task/protein.csv'
# NOTE: the prediction cell further down reads 'task.pickle'; point its pkl_path at
# this file instead if you want to predict from the representations generated here.
pkl_path = 'protein.pickle'

# Instantiate the custom loader and generate the representations. The result is
# saved to pkl_path and also returned, so it stays available in `result`.
loader = MyProteinCsvLoader(prot_path)
result = generate_protein_representation(loader, pkl_path)

# generate_protein_representation is not very stable and may fail to save the
# pickle file. To guard against losing the work, it returns the dictionary before
# saving at the end of the run. If the save failed, uncomment and run the lines
# below to write `result` to disk manually:
#
# import pickle
# with open(pkl_path, 'wb') as f:
#     pickle.dump(result, f)

#### Predict DTA

In [7]:
from core.predictor import BatchGenerator, batch_predict
# Show the base-class documentation: it explains that a custom input format is
# supported by subclassing BatchGenerator and overriding read_np and read_prot,
# and how np_batch_size / prot_batch_size determine the size of each saved batch.
help(BatchGenerator)

Help on class BatchGenerator in module core.predictor:

class BatchGenerator(builtins.object)
 |  BatchGenerator(np_path: str, prot_path: str, np_batch_size: int, prot_batch_size=None)
 |  
 |  请用户根据自己的数据输入格式，继承该类并重写read_np与read_prot方
 |  法以自定义数据读取过程。
 |  其中read_np方法要求返回两个列表，第一个列表为化合物的id，该id可根
 |  据用户的后续数据分析流程自行定义数据类型与编号规则；第二个列表为包
 |  含无构型信息的SMILES字符串的列表。
 |  read_prot方法要求返回一个包含蛋白质id的列表，该id可根据用户的后续
 |  数据分析流程自行定义数据类型与编号规则。
 |  
 |  保存的批次大小由np_batch_size与prot_batch_size参数指定。每次生成
 |  的批次大小为np_batch_size * prot_batch_size。
 |  
 |  注意本类仅生成保存的批次，不指定模型预测时的批次大小。预测时模型使
 |  用的批次大小(即batch_size)由batch_predict函数的predict_batch_s
 |  ize参数指定。
 |  
 |  项目中使用的BatchGeneratorFromCsv类便是通过继承该类定义的：
 |  ```
 |  class BatchGeneratorFromCsv(BatchGenerator):
 |      def read_np(self, path):
 |          df_np = pd.read_csv(path)
 |          np_id = df_np['lotus_id'].to_list()
 |          np_smiles = df_np['smiles'].to_list()
 |          return np_id, np_smiles
 |  
 |      def read_prot(self, path):
 |        

In [None]:
# Inputs: the compound CSV and the protein CSV whose pairs will be scored.
np_path = 'data/task/lotus_np.csv'
prot_path = 'data/task/protein.csv'
# Pickle with the protein representations to predict from.
pkl_path = 'task.pickle'
# Files handed to batch_predict; result_path is later decoded with read_result_bin.
# NOTE(review): log_path is presumably where progress is logged - confirm in core.predictor.
result_path = 'task_result.bin'
log_path = 'predict.log'


# Example: customize how the drug file and the protein file are read when
# predicting DTA.
import pandas as pd  # imported here so this cell also runs on a fresh kernel

from core.predictor import BatchGenerator, batch_predict


class MyBatchGeneratorFromCsv(BatchGenerator):
    """Batch generator that reads drugs and proteins from CSV files.

    To support a custom input format, inherit from BatchGenerator and
    implement both read_np and read_prot.
    """

    def read_np(self, path):
        """Read the drug CSV at ``path``.

        The CSV used in this example has two columns: 'lotus_id' holds the
        drug id and 'smiles' holds the drug's SMILES string without
        configuration (stereo) information.

        Returns:
            A pair of lists (ids, smiles), in file order.
        """
        df_np = pd.read_csv(path)
        np_id = df_np['lotus_id'].to_list()
        np_smiles = df_np['smiles'].to_list()
        return np_id, np_smiles

    def read_prot(self, path):
        """Read the protein CSV at ``path``.

        The CSV used in this example has two columns: 'id' holds the protein
        id and 'seq' holds the protein sequence. Only the ids are needed here.

        Returns:
            The list of protein ids, in file order.
        """
        df_protein = pd.read_csv(path)
        return df_protein['id'].to_list()


# Per the BatchGenerator signature, the two numbers are np_batch_size=10 and
# prot_batch_size=311, so each generated batch covers 10 * 311 compound-protein pairs.
# This is the size of the saved batches; the model's own batch size is set through
# batch_predict's predict_batch_size parameter.
batgen = MyBatchGeneratorFromCsv(np_path, prot_path, 10, 311)
batch_predict(batgen, pkl_path,result_path,log_path = log_path)

#### Read predicted result

In [None]:
from core.dec import read_result_bin
# Decode the binary result file written by batch_predict and display it.
df = read_result_bin('task_result.bin')
df