<a href="https://colab.research.google.com/github/yoonjihwan402/medical-data/blob/main/10__predict_PDBBind_with_rf_and_nn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# DeepChem 설치

Colab 노트북에 DeepChem 설치

tensorflow 기반 모델을 사용할 것이므로 pip install 명령에 [tensorflow]를 추가하여 필요한 종속성도 설치

In [None]:
# To keep DeepChem's Keras dependency versions consistent, set the environment
# variable below before importing DeepChem (currently disabled).
# import os
# os.environ["TF_USE_LEGACY_KERAS"] = '1'

In [None]:
!pip install deepchem[tensorflow]

Collecting deepchem[tensorflow]
  Downloading deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdkit (from deepchem[tensorflow])
  Downloading rdkit-2025.3.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting tensorflow-addons (from deepchem[tensorflow])
  Downloading tensorflow_addons-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons->deepchem[tensorflow])
  Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)
Downloading deepchem-2.8.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdkit-2025.3.2-cp311-cp311-manylinux_2_28_x86_64.whl (35.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.2/35.2 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorflow_addons-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinu

In [None]:
!pip install deepchem[pytorch]



In [None]:
# Disabled: pins TensorFlow to 2.14.1.
# NOTE(review): presumably only needed together with the older DeepChem pin in the next cell - confirm before enabling.
#!pip install tensorflow==2.14.1

In [None]:
# Disabled alternative: install the older DeepChem 2.4 release.
#!pip install deepchem==2.4

## 기타 모듈 설치
- rdkit 설치
- 레거시 옵티마이저 사용을 위한 tf_keras 설치(deepchem ver 2.4.0일 때만)

In [None]:
# Disabled: standalone RDKit install. The DeepChem install log above already
# shows rdkit being pulled in as a dependency.
#!pip install rdkit



In [None]:
# Disabled: tf_keras supplies the legacy optimizers; per the section note it is
# only needed when running DeepChem 2.4.0.
#!pip install tf_keras



## deepchem 버전 확인

In [None]:
# Import DeepChem and show the installed version as the cell output.
import deepchem as dc
dc.__version__

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


'2.8.0'

## warning 메시지 생략

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

# 결합력 예측 모델 만들기
DeepChem 라이브러리를 사용하여 랜덤 포레스트와 다층 퍼셉트론 모델을 학습시켜 단백질-리간드 결합력(결합 친화도)을 예측하는 실습 진행

PDBBind 데이터 : 단백질-리간드 복합체의 3차원 구조와 실험으로 측정된 결합 친화도 값을 포함함

- random forest model

- multilayer perceptron model (neural network)

## 랜덤 포레스트 모델

### 데이터 불러오기 및 확인

In [None]:
import deepchem as dc

# Grid featurizer for protein-ligand complexes: the listed interaction / fingerprint
# feature types are computed with a 2.0 voxel width, molecules are sanitized first,
# and the result is flattened.
featurizer = dc.feat.RdkitGridFeaturizer(
    feature_types=['hbond', 'salt_bridge', 'pi_stack', 'cation_pi', 'ecfp', 'splif'],
    voxel_width=2.0,
    sanitize=True,
    flatten=True,
)

In [None]:
# Load the PDBBind "core" subset with the featurizer built above and unpack the
# randomly split train / validation / test datasets.
pdbbind_tasks, pdbbind_datasets, transformers = dc.molnet.load_pdbbind(
    featurizer=featurizer,
    splitter="random",
    subset="core"
)
train_dataset, valid_dataset, test_dataset = pdbbind_datasets

# Fail fast on degenerate features instead of silently training on them.
# NOTE(review): the recorded run of this notebook shows X.shape == (154, 1) and NaN
# scores, which presumably means every complex failed to featurize and placeholder
# values were stored - check the DeepChem log and its optional dependencies.
loaded_X = train_dataset.X
if loaded_X.ndim != 2 or loaded_X.shape[1] <= 1 or not loaded_X.any():
    raise ValueError(
        f"PDBBind featurization looks degenerate (train X.shape={loaded_X.shape}). "
        "Every complex probably failed to featurize; inspect the DeepChem log "
        "before training a model on this data."
    )



In [None]:
# Print the training split summary (array shapes, ids, task names).
# NOTE(review): a feature width of 1 in X.shape is suspicious for a flattened grid
# featurizer - presumably featurization failed; verify before training.
print(train_dataset)

<DiskDataset X.shape: (np.int64(154), np.int64(1)), y.shape: (np.int64(154),), w.shape: (np.int64(154),), ids: ['3l7b' '2cet' '2weg' ... '2wbg' '3vd4' '3uex'], task_names: [0]>


### 모델 생성 및 훈련

In [None]:
# Build and train the random forest model.
from sklearn.ensemble import RandomForestRegressor

# A fixed random_state makes the forest (and therefore the reported scores)
# reproducible across Restart & Run All.
sklearn_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Wrap the scikit-learn estimator so it can be fit on / evaluated against DeepChem datasets.
model = dc.models.SklearnModel(sklearn_model, model_dir="pdbbind_rf")
model.fit(train_dataset)

### 모델 평가

In [None]:
# Score with pearson_r2_score, i.e. the squared Pearson correlation coefficient.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

# Evaluate on the training and test splits with the loader's transformers applied.
train_scores = model.evaluate(train_dataset, [metric], transformers)
test_scores = model.evaluate(test_dataset, [metric], transformers)

# Report each score dictionary under its heading.
for label, scores in (("Train scores", train_scores), ("Test scores", test_scores)):
    print(label)
    print(scores)

Train scores
{'pearson_r2_score': np.float64(nan)}
Test scores
{'pearson_r2_score': np.float64(nan)}


## 다층 퍼셉트론 (인공 신경망)

### 데이터 불러오기 및 확인
- sanitize=True : 분자를 RDKit으로 sanitize(정제)하여 3차원 구조 파일의 형식 오류를 걸러냄 (pi_stack 등 일부 특성 계산에는 sanitize된 분자가 필요)

In [None]:
import deepchem as dc

# Same grid featurizer configuration as in the random forest section.
featurizer = dc.feat.RdkitGridFeaturizer(voxel_width=2.0, sanitize=True, flatten=True,
                                         feature_types=['hbond', 'salt_bridge', 'pi_stack', 'cation_pi', 'ecfp', 'splif'])

# Reload the PDBBind "core" subset so this section starts from fresh datasets.
pdbbind_tasks, pdbbind_datasets, transformers = dc.molnet.load_pdbbind(featurizer=featurizer, splitter="random", subset="core")
train_dataset, valid_dataset, test_dataset = pdbbind_datasets

# Fail fast on degenerate features: a network trained on a single placeholder
# column cannot learn anything meaningful.
# NOTE(review): the recorded run shows a feature width of 1 and r^2 close to 0, which
# presumably means featurization failed for every complex - check the DeepChem log.
loaded_X = train_dataset.X
if loaded_X.ndim != 2 or loaded_X.shape[1] <= 1 or not loaded_X.any():
    raise ValueError(
        f"PDBBind featurization looks degenerate (train X.shape={loaded_X.shape}). "
        "Every complex probably failed to featurize; inspect the DeepChem log "
        "before training a model on this data."
    )

### 모델 생성 및 훈련

In [None]:
# DiskDataset 데이터를 메모리로 로드하고 float32로 변환
train_X = train_dataset.X.astype('float32')
train_y = train_dataset.y.astype('float32')
valid_X = valid_dataset.X.astype('float32')
valid_y = valid_dataset.y.astype('float32')
test_X = test_dataset.X.astype('float32')
test_y = test_dataset.y.astype('float32')

# NumpyDataset으로 변환
train_dataset = dc.data.NumpyDataset(train_X, train_y, train_dataset.w, train_dataset.ids)
valid_dataset = dc.data.NumpyDataset(valid_X, valid_y, valid_dataset.w, valid_dataset.ids)
test_dataset = dc.data.NumpyDataset(test_X, test_y, test_dataset.w, test_dataset.ids)

In [None]:
# Build and train the multilayer perceptron.
n_features = train_dataset.X.shape[1]  # input width = number of feature columns

# Hyperparameters gathered in one place so they are easy to inspect and tune.
regressor_options = dict(
    n_tasks=len(pdbbind_tasks),
    n_features=n_features,
    layer_sizes=[2000, 1000],
    dropouts=0.5,
    learning_rate=0.0003,
)
model = dc.models.MultitaskRegressor(**regressor_options)

# Kept as the last expression on purpose: the value returned by fit() is the cell output.
model.fit(train_dataset, nb_epoch=250)

0.7799971771240234

### 모델 평가

In [None]:
# Score with pearson_r2_score, i.e. the squared Pearson correlation coefficient.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

# Evaluate on the training and test splits with the loader's transformers applied.
train_scores = model.evaluate(train_dataset, [metric], transformers)
test_scores = model.evaluate(test_dataset, [metric], transformers)

# Report each score dictionary under its heading.
for label, scores in (("Train scores", train_scores), ("Test scores", test_scores)):
    print(label)
    print(scores)

Train scores
{'pearson_r2_score': np.float64(0.000409806162900601)}
Test scores
{'pearson_r2_score': np.float64(0.018822942235091802)}
