# Deepchem을 활용한 Tox21 분자독성예측 예제

- Deepchem라이브러리를 활용하여 Tox21 데이터세트에 포함된 분자의 독성을 예측해보는 예제
- Deepchem은 텐서플로우 기반으로 신약개발 분야에 활용되는 머신러닝/딥러닝 라이브러리(=패키지)
- 예제를 통하여 머신러닝/딥러닝을 어떻게 실제 세계의 문제에 적용하는지 접근법과 딥러닝 모델의 구체적인 활용법을 이해

## 데이터세트와 도메인 확인

- Deepchem 라이브러리에는 Tox21데이터셋과 이에 사용할 수 있는 딥러닝 모델을 제공함
- Tox21데이터셋이란 약물(분자)의 독성예측과 관련된 표적 단백질의 실험 데이터
- dc.molnet.load_tox21()을 사용하여 아래와 같이 Task, Dataset, Transformer의 3가지 값을 불러올 수 있음

In [1]:
import numpy as np
import deepchem as dc
import pandas as pd
import pickle

tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()
train_dataset, valid_dataset, test_dataset = tox21_datasets

2023-10-09 16:46:21.349570: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-09 16:46:21.446044: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-09 16:46:21.470730: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-09 16:46:21.942834: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

In [22]:
with open("./real_coconut_smiles.pkl","rb") as f:
    coconut_smiles = pickle.load(f)
    
with open("./real_coconut_cid.pkl","rb") as f:
    coconut_cid = pickle.load(f)

print('cid_names :\t', coconut_cid[:3])
print('smiles :\t\t', coconut_smiles[:3])
print('cid_len :\t', len(coconut_cid))
print('smiles_len :\t', len(coconut_smiles))

cid_names :	 ['CNP0000002', 'CNP0000003', 'CNP0000003.1']
smiles :		 ['O=C(O)C=1C(=O)C(O)(CC(=O)C1N)C2OC(COC(=O)C)C(OC(=O)C(N=CS)=CC)C(OC3OC(C)C(O)C(OC)C3)C2O', 'O=C1OC2C(O)C=CC3C4=C5C(=O)C=6C(OC)=CC=C(OC)C6C(=O)C5=C(OC)C=C4CC32C(O)C7=CC(=CC(OC)=C17)C', 'CC1=CC(=C2C(=C1)[C@@H]([C@@]34CC5=C([C@H]4C=C[C@H]([C@H]3OC2=O)O)C6=C(C(=C5)OC)C(=O)C7=C(C(=CC=C7OC)OC)C6=O)O)OC']
cid_len :	 895068
smiles_len :	 895068


### 데이터 확인
- Task는 아래 12가지 표적 단백질로 구성
- 표적 단백질은 잠재적으로 신약에 활용될 수 있는 분자와 결합시 독성 반응을 보이는 것으로 여겨짐
- tox21_datasets는 train, valid, test의 3가지 데이터셋으로 구성됨
- X벡터 = 학습 또는 추론에 사용할 feature(특징), 각 샘플은 분자의 FingerPrint
- y벡터 = 학습 또는 추론결과인 정답, 참값등, 각 샘플의 레이블 12개는 표적 단백질 12종과의 결합 정도를 의미
- w벡터 = 가중치(weight)값

In [23]:
# DeepChem의 tox21 데이터셋 불러오기
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()
# tox21_datasets은 train, valid, test 데이터를 가지고 있기에 
train_dataset, valid_dataset, test_dataset = tox21_datasets

print('tasks : ', tox21_tasks, end='\n\n')
print('X : ', test_dataset.X, end='\n\n')
print('y : ', test_dataset.y, end='\n\n')
print('w : ', test_dataset.w, end='\n\n')

tasks :  ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']

X :  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

y :  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

w :  [[1.04502242 1.03632599 1.12502653 ... 1.05576503 1.17464996 1.05288369]
 [1.04502242 1.03632599 1.12502653 ... 1.05576503 1.17464996 1.05288369]
 [1.04502242 1.03632599 1.12502653 ... 0.         6.7257384  0.        ]
 ...
 [1.04502242 1.03632599 8.99830221 ... 0.         0.         0.        ]
 [1.04502242 1.03632599 1.12502653 ... 1.05576503 1.17464996 0.        ]
 [1.04502242 1.03632599 1.12502653 ... 0.         0.         1.05288369]]



### COCONUT 데이터와 겹치는 요소 확인

In [24]:
# set을 이용하여 두 리스트에서 겹치는 요소 찾기
train_common_elements = list(set(train_dataset.ids) & set(coconut_smiles))

# 결과 출력
print("Train common elements:", len(train_common_elements))

# train_dataset에 train_common_elements와 겹치는 값이 있으면 False, 없으면 True로 표시되는 array
not_in_array = ~np.isin(train_dataset.ids, train_common_elements)

# 각 X, y, w로 저장
MLP_train_X = train_dataset.X[not_in_array]
MLP_train_y = train_dataset.y[not_in_array]
MLP_train_w = train_dataset.w[not_in_array]

Train common elements: 190


### mlp 모델 지정 및 학습

In [25]:
mlp_train_dataset = dc.data.DiskDataset.from_numpy(MLP_train_X,
                                                   MLP_train_y,
                                                   MLP_train_w,
                                                   train_dataset.ids,
                                                   tox21_tasks)
print(mlp_train_dataset )

<DiskDataset X.shape: (6074, 1024), y.shape: (6074, 12), w.shape: (6074, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>


In [26]:
# DeepChem MLP 모델 생성 (hidden layer는 크기가 1000인 layer 하나 사용)
model_MLP = dc.models.MultitaskClassifier(n_tasks=12, n_features=1024, layer_sizes=[1000])

# 성능 지표(roc-auc score)를 저장할 리스트
train_roc_list = []
val_roc_list = []
# 최고 성능을 저장할 변수
best_score = 0
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
for i in range(30):
    # 학습
    model_MLP.fit(mlp_train_dataset, nb_epoch=1)
    # 학습 및 검증 데이터 성능 평가
    train_roc = model_MLP.evaluate(mlp_train_dataset, [metric])
    val_roc = model_MLP.evaluate(valid_dataset, [metric])
    # 최고 성능 달성 시 모델 저장 및 최고 성능 갱신
    if val_roc['mean-roc_auc_score'] > best_score:
        best_mlp_model = model_MLP
        best_score = val_roc['mean-roc_auc_score']
    #　진행중인 에폭 및 roc-auc score 출력
    print('Epoch %d:' % (i), end=' ')
    print('train : %05f   , valid : %05f'%(train_roc['mean-roc_auc_score'],                                               val_roc['mean-roc_auc_score']))
    # roc-auc score 각 리스트에 저장
    train_roc_list.append(train_roc['mean-roc_auc_score'])
    val_roc_list.append(val_roc['mean-roc_auc_score'])

Epoch 0: train : 0.860023   , valid : 0.698223
Epoch 1: train : 0.900940   , valid : 0.711668
Epoch 2: train : 0.920182   , valid : 0.714193
Epoch 3: train : 0.930559   , valid : 0.717356
Epoch 4: train : 0.938516   , valid : 0.716262
Epoch 5: train : 0.944641   , valid : 0.713024
Epoch 6: train : 0.948299   , valid : 0.712943
Epoch 7: train : 0.952686   , valid : 0.709055
Epoch 8: train : 0.956144   , valid : 0.711393
Epoch 9: train : 0.958689   , valid : 0.712812
Epoch 10: train : 0.960936   , valid : 0.711902
Epoch 11: train : 0.963475   , valid : 0.711343
Epoch 12: train : 0.965143   , valid : 0.709232
Epoch 13: train : 0.967541   , valid : 0.709173
Epoch 14: train : 0.969063   , valid : 0.710263
Epoch 15: train : 0.970627   , valid : 0.707556
Epoch 16: train : 0.971802   , valid : 0.706731
Epoch 17: train : 0.973238   , valid : 0.706409
Epoch 18: train : 0.974583   , valid : 0.705516
Epoch 19: train : 0.975734   , valid : 0.704756
Epoch 20: train : 0.976794   , valid : 0.705586
Ep

### GCN 모델 지정 및 학습

In [27]:
tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets

train_common_elements = list(set(train_dataset.ids) & set(coconut_smiles))
not_in_array = ~np.isin(train_dataset.ids, train_common_elements)

graph_train_X = train_dataset.X[not_in_array]
graph_train_y = train_dataset.y[not_in_array]
graph_train_w = train_dataset.w[not_in_array]

graph_train_dataset = dc.data.DiskDataset.from_numpy(graph_train_X,
                                                     graph_train_y,
                                                     graph_train_w,
                                                     train_dataset.ids,
                                                     tox21_tasks)
model_Graph = dc.models.GraphConvModel(n_tasks=12, mode='classification', dropout=0.2)
train_roc_list = []
val_roc_list = []
best_score = 0
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
for i in range(50):
    model_Graph.fit(graph_train_dataset, nb_epoch=1)
    train_roc = model_Graph.evaluate(graph_train_dataset, [metric])
    val_roc = model_Graph.evaluate(valid_dataset, [metric])
    if val_roc['mean-roc_auc_score'] > best_score:
        best_graph_model = model_Graph
        best_score = val_roc['mean-roc_auc_score']
    print('Epoch %d:' % (i), end=' ')
    print('train : %05f   , valid : %05f'%(train_roc['mean-roc_auc_score'],                                               val_roc['mean-roc_auc_score']))
    train_roc_list.append(train_roc['mean-roc_auc_score'])
    val_roc_list.append(val_roc['mean-roc_auc_score'])







Epoch 0: train : 0.691769   , valid : 0.623161
Epoch 1: train : 0.735129   , valid : 0.661081
Epoch 2: train : 0.755791   , valid : 0.672553
Epoch 3: train : 0.779365   , valid : 0.674252
Epoch 4: train : 0.790591   , valid : 0.692688
Epoch 5: train : 0.811023   , valid : 0.704774
Epoch 6: train : 0.818076   , valid : 0.690308
Epoch 7: train : 0.834399   , valid : 0.724566
Epoch 8: train : 0.838915   , valid : 0.715603
Epoch 9: train : 0.841670   , valid : 0.726452
Epoch 10: train : 0.851500   , valid : 0.736718
Epoch 11: train : 0.850516   , valid : 0.723931
Epoch 12: train : 0.860079   , valid : 0.725931
Epoch 13: train : 0.860222   , valid : 0.732505
Epoch 14: train : 0.865498   , valid : 0.732945
Epoch 15: train : 0.866532   , valid : 0.726281
Epoch 16: train : 0.868487   , valid : 0.732767
Epoch 17: train : 0.872967   , valid : 0.744671
Epoch 18: train : 0.876447   , valid : 0.739368
Epoch 19: train : 0.880069   , valid : 0.745061
Epoch 20: train : 0.883110   , valid : 0.743447
Ep

### 앞서 제외한 COCONUT 데이터 준비

In [35]:
# fingerprint 및 Graph 데이터 불러오기
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()
MLP_train_dataset, valid_dataset, test_dataset = tox21_datasets

tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='GraphConv')
GRAPH_train_dataset, valid_dataset, test_dataset = datasets

# COCONUT 데이터와 겹치는 물질 추출
in_array = np.isin(MLP_train_dataset.ids, train_common_elements)

# 겹치는 물질의 X 데이터 추출
np_MLP_X = MLP_train_dataset.X[in_array]

# COCONUT 데이터와 겹치는 물질 추출
in_array = np.isin(GRAPH_train_dataset.ids, train_common_elements)

# 겹치는 물질의 X 데이터 추출
np_GRAPH_X = GRAPH_train_dataset.X[in_array]

# 겹치는 물질의 y값 추출
np_graph_y = GRAPH_train_dataset.y[in_array]

# DiskDataset으로 저장
mlp_dataset = dc.data.DiskDataset.from_numpy(np_MLP_X, np.zeros(shape=(190, 12)))
graph_dataset = dc.data.DiskDataset.from_numpy(np_GRAPH_X, np.zeros(shape=(190, 12)))

### 모델 추론 및 후처리 결과 출력

In [30]:
# 각 데이터를 학습된 모델로 추론
np_mlp_predictions = model_MLP.predict(mlp_dataset)
np_graph_predictions = best_graph_model.predict(graph_dataset)

# 출력 값을 
np_mlp_predictions = np.argmax(np_mlp_predictions, axis=2)
print(np_mlp_predictions)

np_graph_predictions = np.argmax(np_graph_predictions, axis=2)
print(np_graph_predictions)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 ...
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 1 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 1 ... 0 0 0]]


### 모델 추련 결과 dataframe으로 출력

In [32]:
# mlp 모델 추론 결과 dataframe으로 변환 후 출력
pd.DataFrame(np_mlp_predictions, columns=tox21_tasks)[:10]

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,1,1,1,1,0,1
5,0,0,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,1,0,1,0,1
7,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,1,1,0,0,0,0,0


In [33]:
# graph 모델 추론 결과 dataframe으로 변환 후 출력
pd.DataFrame(np_graph_predictions, columns=tox21_tasks)[:10]

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,1,1,1,1,1,1
5,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,0,0


In [36]:
# 실제 y값을 dataframe으로 변환 후 출력
pd.DataFrame(np_graph_y.astype(int), columns=tox21_tasks)[:10]

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,1,0,1,0,0,1
5,1,1,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0
