# Deepchem을 활용한 Tox21 분자독성예측 예제

- Deepchem라이브러리를 활용하여 Tox21 데이터세트에 포함된 분자의 독성을 예측해보는 예제
- Deepchem은 텐서플로우 기반으로 신약개발 분야에 활용되는 머신러닝/딥러닝 라이브러리(=패키지)
- 예제를 통하여 머신러닝/딥러닝을 어떻게 실제 세계의 문제에 적용하는지 접근법과 딥러닝 모델의 구체적인 활용법을 이해

## 데이터세트와 도메인 확인

- Deepchem 라이브러리에는 Tox21데이터셋과 이에 사용할 수 있는 딥러닝 모델을 제공함
- Tox21데이터셋이란 약물(분자)의 독성예측과 관련된 표적 단백질의 실험 데이터
- dc.molnet.load_tox21()을 사용하여 아래와 같이 Task, Dataset, Transformer의 3가지 값을 불러올 수 있음

In [1]:
#
!pip install deepchem
!pip install tensorflow==2.9



In [2]:
import numpy as np
import deepchem as dc
import pandas as pd
import pickle

tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()
train_dataset, valid_dataset, test_dataset = tox21_datasets



In [3]:
# coconut database 다운로드 후 파일 압축 해제 및 파일명 확인
! wget https://coconut.s3.uni-jena.de/prod/downloads/2024-09/coconut-09-2024.csv.zip
!unzip ./coconut-09-2024.csv.zip

--2024-09-25 15:09:24--  https://coconut.s3.uni-jena.de/prod/downloads/2024-09/coconut-09-2024.csv.zip
Resolving coconut.s3.uni-jena.de (coconut.s3.uni-jena.de)... 141.35.104.25, 141.35.104.26, 2001:638:1558:2368::8d23:681a, ...
Connecting to coconut.s3.uni-jena.de (coconut.s3.uni-jena.de)|141.35.104.25|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 88935618 (85M) [application/zip]
Saving to: ‘coconut-09-2024.csv.zip.4’


2024-09-25 15:09:30 (16.7 MB/s) - ‘coconut-09-2024.csv.zip.4’ saved [88935618/88935618]

Archive:  ./coconut-09-2024.csv.zip
replace coconut-09-2024.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: coconut-09-2024.csv     
replace __MACOSX/._coconut-09-2024.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: __MACOSX/._coconut-09-2024.csv  


In [4]:
coconut = pd.read_csv('./coconut-09-2024.csv')
coconut

Unnamed: 0,standard_inchi,standard_inchi_key,canonical_smiles,identifier
0,InChI=1S/C43H53N9O14S2.Na/c1-5-22(3)35-36(57)4...,DRKUXFLLRIKRHH-QDVYGYDXSA-M,CC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CNC(...,CNP0437004.1
1,InChI=1S/C21H32O12/c1-9-14(23)16(25)18(27)21(3...,OXHVQSRYUNGYOK-NUASCYGXSA-N,COC1=CC=C(CCO[C@@H]2O[C@H](CO[C@@H]3O[C@@H](C)...,CNP0243002.1
2,InChI=1S/C36H61N5O7/c1-21(2)18-27-35(47)48-28(...,NEGZFRNAAJQQEG-NOFCQABOSA-N,C/C1=C\[C@@H](C(C)(C)C)OC(=O)[C@H](CC(C)C)N(C)...,CNP0458114.1
3,InChI=1S/C22H22O9/c1-28-12-4-2-11(3-5-12)15-9-...,DQIVYFNWBDHNFD-WHCFWRGISA-N,COC1=CC=C(C2=CC(=O)OC3=CC(O[C@@H]4O[C@H](CO)[C...,CNP0252086.2
4,InChI=1S/C32H41N5O4/c1-6-18(4)28-32(41)36-12-8...,HKVSEIVDIONNKB-QWNGKRCASA-N,CC[C@H](C)[C@H]1C(=O)N2CCC[C@H]2C(=O)N1C(=O)[C...,CNP0107934.1
...,...,...,...,...
695128,InChI=1S/C19H19N3O4/c1-26-13-8-6-12(7-9-13)20-...,HHSNDFVMRMIDBG-INIZCTEOSA-N,COC1=CC=C(NC(=O)CC[C@@H]2NC(=O)C3=CC=CC=C3NC2=...,CNP0395779.1
695129,InChI=1S/C30H30N2O10/c1-12-23(34)27(38)28(39)3...,VVPODVCQSZKNKL-RLOKSPFPSA-N,CC(=O)OC1=CC=C2C(=O)C3=C(O)C(CC4=CC=CC(C(N)N)=...,CNP0097600.1
695130,InChI=1S/C21H22O7/c1-11(2)4-5-13-15(23)7-6-14(...,LMFCHRAKSGPODM-OAQYLSRUSA-N,COC1=C([C@]2(O)COC3=CC(O)=CC(O)=C3C2=O)C=CC(O)...,CNP0212403.1
695131,InChI=1S/C20H30O7/c1-17(2)4-3-12(23)18-8-27-20...,IJWNAKYUVUUYTE-HMBONYETSA-N,CC1(C)CC[C@H](O)[C@]23COC(O)([C@@H](O)[C@H]12)...,CNP0494455.1


In [5]:
# SMILES와 cid를 저장할 리스트 생성
coconut_smiles = list(coconut['canonical_smiles'])
coconut_cid = list(coconut['identifier'])

print('cid_names :\t', coconut_cid[:3])
print('smiles :\t\t', coconut_smiles[:3])
print('cid_len :\t', len(coconut_cid))
print('smiles_len :\t', len(coconut_smiles))

cid_names :	 ['CNP0437004.1', 'CNP0243002.1', 'CNP0458114.1']
smiles :		 ['CC[C@@H]1NC(=O)[C@@H](NC(=O)[C@H](C)NC(=O)CNC(=O)C2=CC=C(O)C=C2)CNC(=O)[C@H](CS(=O)(=O)[O-])NC(=O)/C=C/C2=CSC(=N2)[C@H](CC2=CC=C(O)C=C2)NC(=O)C(=O)[C@H]([C@@H](C)CC)NC1=O.[Na+]', 'COC1=CC=C(CCO[C@@H]2O[C@H](CO[C@@H]3O[C@@H](C)[C@H](O)[C@@H](O)[C@H]3O)[C@@H](O)[C@H](O)[C@H]2O)C=C1O', 'C/C1=C\\[C@@H](C(C)(C)C)OC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H](C)N(C)C(=O)CNC(=O)[C@H](C(C)C)NC(=O)[C@@H]2CCCN2C(=O)[C@H](C)CC1']
cid_len :	 695133
smiles_len :	 695133


### 데이터 확인
- Task는 아래 12가지 표적 단백질로 구성
- 표적 단백질은 잠재적으로 신약에 활용될 수 있는 분자와 결합시 독성 반응을 보이는 것으로 여겨짐
- tox21_datasets는 train, valid, test의 3가지 데이터셋으로 구성됨
- X벡터 = 학습 또는 추론에 사용할 feature(특징), 각 샘플은 분자의 FingerPrint
- y벡터 = 학습 또는 추론결과인 정답, 참값등, 각 샘플의 레이블 12개는 표적 단백질 12종과의 결합 정도를 의미
- w벡터 = 가중치(weight)값

In [6]:
# DeepChem의 tox21 데이터셋 불러오기
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()
# tox21_datasets은 train, valid, test 데이터를 가지고 있기에
train_dataset, valid_dataset, test_dataset = tox21_datasets

print('tasks : ', tox21_tasks, end='\n\n')
print('X : ', test_dataset.X, end='\n\n')
print('y : ', test_dataset.y, end='\n\n')
print('w : ', test_dataset.w, end='\n\n')

tasks :  ['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']

X :  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

y :  [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

w :  [[1.04502242 1.03632599 1.12502653 ... 1.05576503 1.17464996 1.05288369]
 [1.04502242 1.03632599 1.12502653 ... 1.05576503 1.17464996 1.05288369]
 [1.04502242 1.03632599 1.12502653 ... 0.         6.7257384  0.        ]
 ...
 [1.04502242 1.03632599 8.99830221 ... 0.         0.         0.        ]
 [1.04502242 1.03632599 1.12502653 ... 1.05576503 1.17464996 0.        ]
 [1.04502242 1.03632599 1.12502653 ... 0.         0.         1.05288369]]



### COCONUT 데이터와 겹치는 요소 확인

In [7]:
# set을 이용하여 두 리스트에서 겹치는 요소 찾기
train_common_elements = list(set(train_dataset.ids) & set(coconut_smiles))

# 결과 출력
print("Train common elements:", len(train_common_elements))

# train_dataset에 train_common_elements와 겹치는 값이 있으면 False, 없으면 True로 표시되는 array
not_in_array = ~np.isin(train_dataset.ids, train_common_elements)

# 각 X, y, w로 저장
MLP_train_X = train_dataset.X[not_in_array]
MLP_train_y = train_dataset.y[not_in_array]
MLP_train_w = train_dataset.w[not_in_array]

Train common elements: 996


### mlp 모델 지정 및 학습

In [8]:
mlp_train_dataset = dc.data.DiskDataset.from_numpy(MLP_train_X,
                                                   MLP_train_y,
                                                   MLP_train_w,
                                                   train_dataset.ids,
                                                   tox21_tasks)
print(mlp_train_dataset )

<DiskDataset X.shape: (5268, 1024), y.shape: (5268, 12), w.shape: (5268, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>


In [9]:
# DeepChem MLP 모델 생성 (hidden layer는 크기가 1000인 layer 하나 사용)
model_MLP = dc.models.MultitaskClassifier(n_tasks=12, n_features=1024, layer_sizes=[1000])

# 성능 지표(roc-auc score)를 저장할 리스트
train_roc_list = []
val_roc_list = []
# 최고 성능을 저장할 변수
best_score = 0
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
for i in range(50):
    # 학습
    model_MLP.fit(mlp_train_dataset, nb_epoch=1)
    # 학습 및 검증 데이터 성능 평가
    train_roc = model_MLP.evaluate(mlp_train_dataset, [metric])
    val_roc = model_MLP.evaluate(valid_dataset, [metric])
    # 최고 성능 달성 시 모델 저장 및 최고 성능 갱신
    if val_roc['mean-roc_auc_score'] > best_score:
        best_mlp_model = model_MLP
        best_score = val_roc['mean-roc_auc_score']
    #　진행중인 에폭 및 roc-auc score 출력
    print('Epoch %d:' % (i), end=' ')
    print('train : %05f   , valid : %05f'%(train_roc['mean-roc_auc_score'],                                               val_roc['mean-roc_auc_score']))
    # roc-auc score 각 리스트에 저장
    train_roc_list.append(train_roc['mean-roc_auc_score'])
    val_roc_list.append(val_roc['mean-roc_auc_score'])

Epoch 0: train : 0.853300   , valid : 0.698084
Epoch 1: train : 0.898619   , valid : 0.714052


### GCN 모델 지정 및 학습

In [14]:
tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets

train_common_elements = list(set(train_dataset.ids) & set(coconut_smiles))
not_in_array = ~np.isin(train_dataset.ids, train_common_elements)

graph_train_X = train_dataset.X[not_in_array]
graph_train_y = train_dataset.y[not_in_array]
graph_train_w = train_dataset.w[not_in_array]

graph_train_dataset = dc.data.DiskDataset.from_numpy(graph_train_X,
                                                     graph_train_y,
                                                     graph_train_w,
                                                     train_dataset.ids,
                                                     tox21_tasks)
model_Graph = dc.models.GraphConvModel(n_tasks=12, mode='classification', dropout=0.2, batch_normalize=False)

train_roc_list = []
val_roc_list = []
best_score = 0
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
for i in range(50):
    model_Graph.fit(graph_train_dataset, nb_epoch=1)
    train_roc = model_Graph.evaluate(graph_train_dataset, [metric])
    val_roc = model_Graph.evaluate(valid_dataset, [metric])
    if val_roc['mean-roc_auc_score'] > best_score:
        best_graph_model = model_Graph
        best_score = val_roc['mean-roc_auc_score']
    print('Epoch %d:' % (i), end=' ')
    print('train : %05f   , valid : %05f'%(train_roc['mean-roc_auc_score'],                                               val_roc['mean-roc_auc_score']))
    train_roc_list.append(train_roc['mean-roc_auc_score'])
    val_roc_list.append(val_roc['mean-roc_auc_score'])



Epoch 0: train : 0.679370   , valid : 0.625171
Epoch 1: train : 0.719695   , valid : 0.653752
Epoch 2: train : 0.736445   , valid : 0.677514
Epoch 3: train : 0.765760   , valid : 0.690746
Epoch 4: train : 0.777632   , valid : 0.701286
Epoch 5: train : 0.799545   , valid : 0.716273
Epoch 6: train : 0.804349   , valid : 0.712068
Epoch 7: train : 0.817194   , valid : 0.721737
Epoch 8: train : 0.827660   , valid : 0.721887
Epoch 9: train : 0.831288   , valid : 0.728758
Epoch 10: train : 0.839956   , valid : 0.729105
Epoch 11: train : 0.844617   , valid : 0.734961
Epoch 12: train : 0.847211   , valid : 0.740401
Epoch 13: train : 0.851404   , valid : 0.736935
Epoch 14: train : 0.859623   , valid : 0.748383
Epoch 15: train : 0.861101   , valid : 0.737429
Epoch 16: train : 0.863292   , valid : 0.743845
Epoch 17: train : 0.865270   , valid : 0.738416
Epoch 18: train : 0.870357   , valid : 0.740081
Epoch 19: train : 0.876131   , valid : 0.743966
Epoch 20: train : 0.875150   , valid : 0.737085
Ep

### 앞서 제외한 COCONUT 데이터 준비

In [15]:
# fingerprint 및 Graph 데이터 불러오기
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()
MLP_train_dataset, valid_dataset, test_dataset = tox21_datasets

tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='GraphConv')
GRAPH_train_dataset, valid_dataset, test_dataset = datasets

# COCONUT 데이터와 겹치는 물질 추출
in_array = np.isin(MLP_train_dataset.ids, train_common_elements)

# 겹치는 물질의 X 데이터 추출
np_MLP_X = MLP_train_dataset.X[in_array]

# COCONUT 데이터와 겹치는 물질 추출
in_array = np.isin(GRAPH_train_dataset.ids, train_common_elements)

# 겹치는 물질의 X 데이터 추출
np_GRAPH_X = GRAPH_train_dataset.X[in_array]

# 겹치는 물질의 y값 추출
np_graph_y = GRAPH_train_dataset.y[in_array]

# DiskDataset으로 저장
mlp_dataset = dc.data.DiskDataset.from_numpy(np_MLP_X, np.zeros(shape=(996, 12)))
graph_dataset = dc.data.DiskDataset.from_numpy(np_GRAPH_X, np.zeros(shape=(996, 12)))

### 모델 추론 및 후처리 결과 출력

In [16]:
model_MLP

<deepchem.models.fcnet.MultitaskClassifier at 0x7dd2a1a37e50>

In [17]:
# 각 데이터를 학습된 모델로 추론
np_mlp_predictions = model_MLP.predict(mlp_dataset)
np_graph_predictions = best_graph_model.predict(graph_dataset)

# 출력 값을
np_mlp_predictions = np.argmax(np_mlp_predictions, axis=2)
print(np_mlp_predictions)

np_graph_predictions = np.argmax(np_graph_predictions, axis=2)
print(np_graph_predictions)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [1 1 0 ... 0 1 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 1 0]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]]


### 모델 추련 결과 dataframe으로 출력

In [18]:
# mlp 모델 추론 결과 dataframe으로 변환 후 출력
pd.DataFrame(np_mlp_predictions, columns=tox21_tasks)[:10]

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,1,1,0,0,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,0,0,0


In [19]:
# graph 모델 추론 결과 dataframe으로 변환 후 출력
pd.DataFrame(np_graph_predictions, columns=tox21_tasks)[:10]

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0


In [20]:
# 실제 y값을 dataframe으로 변환 후 출력
pd.DataFrame(np_graph_y.astype(int), columns=tox21_tasks)[:10]

Unnamed: 0,NR-AR,NR-AR-LBD,NR-AhR,NR-Aromatase,NR-ER,NR-ER-LBD,NR-PPAR-gamma,SR-ARE,SR-ATAD5,SR-HSE,SR-MMP,SR-p53
0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,0,0
