<a href="https://colab.research.google.com/github/yoonjihwan402/medical-data/blob/main/02_toxicity_prediction_model_practice_ipynb%EC%9D%98_%EC%82%AC%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DeepChem 설치

Colab 노트북에 DeepChem 설치

tensorflow 기반 모델을 사용할 것이므로 pip install 명령에 [tensorflow]를 추가하여 필요한 종속성도 설치

In [1]:
!pip install deepchem[tensorflow]

Collecting deepchem[tensorflow]
  Downloading deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdkit (from deepchem[tensorflow])
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting tensorflow-addons (from deepchem[tensorflow])
  Downloading tensorflow_addons-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons->deepchem[tensorflow])
  Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)
Downloading deepchem-2.8.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorflow_addons-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinu

## 기타 모듈 설치
- 레거시 옵티마이저 사용을 위한 tf_keras 설치(deepchem ver 2.4.0일 때만)
- rdkit 설치

In [2]:
#!pip install tf_keras
!pip install rdkit



## deepchem 버전 확인

In [3]:
# Import DeepChem and display the installed version as the cell output.
import deepchem as dc
dc.__version__

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


'2.8.0'

## numpy 버전 확인

In [4]:
# Import NumPy and display the installed version as the cell output.
import numpy as np
np.__version__

'2.0.2'

## warning 메시지 생략

In [5]:
import warnings
warnings.filterwarnings(action='ignore')

## 데이터셋 다루기

In [6]:
# Toy data drawn uniformly from [0, 1): a 4x5 array and a 4x1 array,
# used below as X and y of a small example dataset.
x1 = np.random.random(size=(4, 5))
y1 = np.random.random(size=(4, 1))

In [7]:
# Display the 4x5 random array created above.
x1

array([[0.23927803, 0.41357497, 0.62167895, 0.89028286, 0.46169129],
       [0.77927194, 0.67241022, 0.51071412, 0.74797033, 0.82705775],
       [0.65564578, 0.3404343 , 0.27280618, 0.30468269, 0.32167141],
       [0.66524717, 0.67294143, 0.10427685, 0.47559093, 0.41765676]])

In [8]:
# Display the 4x1 random array created above.
y1

array([[0.25762576],
       [0.90376443],
       [0.90158531],
       [0.46700351]])

In [9]:
# Wrap the two NumPy arrays in an in-memory DeepChem dataset (x1 as X, y1 as y).
dataset_sample = dc.data.NumpyDataset(X=x1, y=y1)

In [10]:
# Print the dataset's X and y arrays, one after the other.
print(dataset_sample.X, dataset_sample.y, sep="\n")

[[0.23927803 0.41357497 0.62167895 0.89028286 0.46169129]
 [0.77927194 0.67241022 0.51071412 0.74797033 0.82705775]
 [0.65564578 0.3404343  0.27280618 0.30468269 0.32167141]
 [0.66524717 0.67294143 0.10427685 0.47559093 0.41765676]]
[[0.25762576]
 [0.90376443]
 [0.90158531]
 [0.46700351]]


In [11]:
# Check that the dataset's X has the same shape and values as the source array x1.
np.array_equal(x1, dataset_sample.X)

True

In [12]:
# Check that the dataset's y has the same shape and values as the source array y1.
np.array_equal(y1, dataset_sample.y)

True

# 독성 분자 예측 모델 만들기
DeepChem을 사용해 분자의 독성을 예측하는 모델을 훈련하는 실습 진행

Tox21 독성 데이터 : Tox21 데이터 챌린지 대회에 사용된 데이터로 약물의 독성 예측과 관련된 표적 실험 데이터

## 데이터 불러오기 및 확인

In [13]:
# Load the Tox21 data and featurize it (convert molecules to matrices/vectors).
# The call returns three things: the task names, the datasets, and the
# transformers associated with the data.
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
[02:03:16] Explicit valence for atom # 4 Al, 6, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[02:03:17] Explicit valence for atom # 9 Al, 6, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[02:03:17] Explicit valence for atom # 5 Al, 6, is greater than permitted
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, boo

In [14]:
# Unpack the loaded datasets into the training, validation and test splits.
(train_dataset, valid_dataset, test_dataset) = tox21_datasets

In [15]:
# Targets (classes): proteins believed to be involved in toxic responses to
# potential new drugs.
# Each target carries an experimentally obtained value; the author describes it
# as the binding strength between the molecule and that protein.
# NOTE(review): confirm the exact label semantics against the Tox21 description.
tox21_tasks

['NR-AR',
 'NR-AR-LBD',
 'NR-AhR',
 'NR-Aromatase',
 'NR-ER',
 'NR-ER-LBD',
 'NR-PPAR-gamma',
 'SR-ARE',
 'SR-ATAD5',
 'SR-HSE',
 'SR-MMP',
 'SR-p53']

In [16]:
# Number of prediction tasks (targets) in the Tox21 task list.
len(tox21_tasks)

12

In [17]:
# Display the loaded datasets (the object unpacked into train/valid/test above).
tox21_datasets

(<DiskDataset X.shape: (np.int64(6258), np.int64(1024)), y.shape: (np.int64(6258), np.int64(12)), w.shape: (np.int64(6258), np.int64(12)), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>,
 <DiskDataset X.shape: (np.int64(782), np.int64(1024)), y.shape: (np.int64(782), np.int64(12)), w.shape: (np.int64(782), np.int64(12)), ids: ['CN(C)C(=O)NC1(c2ccccc2)CCN(CCC[C@@]2(c3ccc(Cl)c(Cl)c3)CCCN(C(=O)c3ccccc3)C2)CC1'
  'CSc1nnc(C(C)(C)C)c(=O)n1N'
  'C=C1/C(=C\\C=C2/CCC[C@@]3(C)[C@H]2CC[C@@H]3[C@H](C)/C=C/[C@@H](O)C2CC2)C[C@@H](O)C[C@@H]1O'
  ... 'O=C(O[C@H]1CN2CCC1CC2)N1CCc2ccccc2[C@@H]1c1ccccc1'
  'C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@H]3C(=C)C[C@@]21CC'
  'NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3c(c2)CCO3)C1'], task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>,
 <DiskDataset X.shape: (np.int64(783), np.int64(1024)), y.shape: (np.int64(783), np.int64(12)), w.shape: (np.int64(783), np.int64(12)), ids: ['CC1(C)S[C@@H]2[C@H](NC

In [18]:
# Shape of the training feature matrix X.
train_dataset.X.shape
# To inspect the other splits, replace train_dataset with valid_dataset or
# test_dataset in the expression above.

(6258, 1024)

In [19]:
# Shape of the training label matrix y.
# To inspect the other splits, swap in valid_dataset or test_dataset.
train_dataset.y.shape

(6258, 12)

## 데이터 전처리
분자 데이터의 대부분은 표적과 결합하지 않으므로, 전체 데이터의 90% 이상이 Label 0이다.
따라서 항상 0을 예측하는 모델만 만들어도 정확도가 90%를 넘는다.
즉 데이터셋이 한쪽 클래스로 편향(불균형)되어 있으므로, 이를 보완하기 위해 가중치 행렬을 조정해야 한다.

In [20]:
# Compensating for the imbalanced dataset: display the transformers that were
# returned together with the Tox21 data.
transformers

[<deepchem.trans.transformers.BalancingTransformer at 0x7998e7998110>]

## 모델 생성 및 훈련

In [21]:
# Build the multitask classifier and train it on the training split.
# The number of tasks and the input width are taken from the loaded data
# instead of being hard-coded, so this cell stays correct if the task list or
# the featurization changes.
model = dc.models.MultitaskClassifier(
    n_tasks=len(tox21_tasks),
    n_features=train_dataset.X.shape[1],
    layer_sizes=[1000],
)
# Last expression of the cell: the value returned by fit() is displayed.
model.fit(train_dataset, nb_epoch=10)

0.48779637018839517

## 모델 평가

In [22]:
# Score the trained model with ROC AUC, combined across tasks with np.mean.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

# Evaluate on the training split first, then on the test split.
train_scores, test_scores = (
    model.evaluate(split, [metric], transformers)
    for split in (train_dataset, test_dataset)
)

# One score dict per line: training first, test second.
print(train_scores, test_scores, sep="\n")

{'mean-roc_auc_score': np.float64(0.9582588000541991)}
{'mean-roc_auc_score': np.float64(0.6819906785888935)}
