# import

In [1]:
import numpy as np
from DeepPurpose.utils import *

## 데이터 불러오기

In [2]:
# The KIBA inputs are stored on disk as numbered shards
# (<stem>_part0.npy, <stem>_part1.npy, ...) that must be stitched back
# together in index order.
DATA_DIR = "./data"
N_TARGET_PARTS = 105  # number of KIBA_X_target_part*.npy shards
N_DRUG_PARTS = 18     # number of KIBA_X_drug_part*.npy shards


def _load_parts(stem, n_parts, data_dir=DATA_DIR):
    """Load ``<data_dir>/<stem>_part{i}.npy`` for i in [0, n_parts) and concatenate.

    Shards are read in ascending numeric order so that the row order of the
    result matches the order in which the shards were numbered.

    Args:
        stem: File-name prefix shared by every shard (e.g. "KIBA_X_drug").
        n_parts: Number of shards to read.
        data_dir: Directory that holds the shards.

    Returns:
        A single array made by concatenating all shards along the first axis.
    """
    shards = [np.load(f"{data_dir}/{stem}_part{i}.npy") for i in range(n_parts)]
    # `shards` is local, so the per-shard arrays can be freed once the
    # concatenated copy is returned.
    return np.concatenate(shards)


# Target protein sequences, drug strings and interaction labels.
X_target = _load_parts("KIBA_X_target", N_TARGET_PARTS)
X_drug = _load_parts("KIBA_X_drug", N_DRUG_PARTS)
y = np.load(f"{DATA_DIR}/KIBA_y.npy")

# The three arrays are consumed row-by-row as (drug, target, label) triples,
# so fail fast here if a shard is missing or the counts above are wrong.
if not (len(X_drug) == len(X_target) == len(y)):
    raise ValueError(
        "KIBA arrays are misaligned: "
        f"len(X_drug)={len(X_drug)}, len(X_target)={len(X_target)}, len(y)={len(y)}"
    )

In [3]:
def _show(label, value):
    """Print *label* and *value* on one line, followed by a blank line."""
    print(label, value, end="\n\n")


# Peek at the compound (drug) data.
_show("drug : ", X_drug[:3])

# Peek at the target protein data.
_show("target : ", X_target[0])

# Peek at the interaction labels (0 = no binding, 1 = binding).
_show("Interaction : ", y[:3])

drug :  ['COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl'
 'COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl'
 'COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl']

target :  MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNSYACKHPEVQSILKISQPQEPELMNANPSPPPSPSQQINLGPSSNPHAKPSDFHFLKVIGKGSFGKVLLARHKAEEVFYAVKVLQKKAILKKKEEKHIMSERNVLLKNVKHPFLVGLHFSFQTADKLYFVLDYINGGELFYHLQRERCFLEPRARFYAAEIASALGYLHSLNIVYRDLKPENILLDSQGHIVLTDFGLCKENIEHNSTTSTFCGTPEYLAPEVLHKQPYDRTVDWWCLGAVLYEMLYGLPPFYSRNTAEMYDNILNKPLQLKPNITNSARHLLEGLLQKDRTKRLGAKDDFMEIKSHVFFSLINWDDLINKKITPPFNPNVSGPNDLRHFDPEFTEEPVPNSIGKSPDSVLVTASVKEAAEAFLGFSYAPPTDSFL

Interaction :  [0 0 0]



## MLP 모델 입력에 맞도록 전처리 (DeepPurpose의 `data_process` 함수 사용)

In [8]:
# Choose the encodings for the MLP model (it takes 1-D vectors as input
# features): 'Morgan' for drugs and 'AAC' for target proteins.
drug_encoding, target_encoding = 'Morgan', 'AAC'

# Random 80/10/10 split into train / validation / test.
split_options = {'split_method': 'random', 'frac': [0.8, 0.1, 0.1]}
train, val, test = data_process(
    X_drug, X_target, y, drug_encoding, target_encoding, **split_options
)

# Show the first three rows of the training split.
train[:3]

Drug Target Interaction Prediction Mode...
in total: 118254 drug-target pairs
encoding drug...
unique drugs: 2068
encoding protein...
unique target sequence: 229
-- Encoding AAC takes time. Time Reference: 24s for ~100 sequences in a CPU.				 Calculate your time by the unique target sequence #, instead of the entire dataset.
splitting dataset...
Done.


Unnamed: 0,SMILES,Target Sequence,Label,drug_encoding,target_encoding
0,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[6.497, 3.944, 5.8, 4.408, 1.16, 6.265, 3.248,..."
1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[5.95, 4.959, 5.455, 5.041, 4.959, 6.364, 4.05..."
2,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[6.614, 5.657, 3.267, 5.179, 4.701, 6.135, 4.9..."


## CNN 모델 입력에 맞도록 전처리 (DeepPurpose의 `data_process` 함수 사용)

In [9]:
# Use the 'CNN' encoding for both the drug and the target protein inputs.
drug_encoding, target_encoding = ('CNN', 'CNN')

# Re-run preprocessing with the new encodings; this rebinds train / val / test.
# The split is random with fractions 0.8 / 0.1 / 0.1.
train, val, test = data_process(
    X_drug,
    X_target,
    y,
    drug_encoding,
    target_encoding,
    split_method='random',
    frac=[0.8, 0.1, 0.1],
)

# Show the first three rows of the training split.
train[:3]

Drug Target Interaction Prediction Mode...
in total: 118254 drug-target pairs
encoding drug...
unique drugs: 2068
encoding protein...
unique target sequence: 229
splitting dataset...
Done.


Unnamed: 0,SMILES,Target Sequence,Label,drug_encoding,target_encoding
0,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MTVKTEAAKGTLTYSRMRGMVAILIAFMKQRRMGLNDFIQKIANNS...,0,"[C, O, C, 1, =, C, (, C, =, C, 2, C, (, =, C, ...","[M, T, V, K, T, E, A, A, K, G, T, L, T, Y, S, ..."
1,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,0,"[C, O, C, 1, =, C, (, C, =, C, 2, C, (, =, C, ...","[M, R, P, S, G, T, A, G, A, A, L, L, A, L, L, ..."
2,COC1=C(C=C2C(=C1)CCN=C2C3=CC(=C(C=C3)Cl)Cl)Cl,MELAALCRWGLLLALLPPGAASTQVCTGTDMKLRLPASPETHLDML...,0,"[C, O, C, 1, =, C, (, C, =, C, 2, C, (, =, C, ...","[M, E, L, A, A, L, C, R, W, G, L, L, L, A, L, ..."
