<a href="https://colab.research.google.com/github/yoonjihwan402/medical-data/blob/main/cell_counting_and_segmentation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DeepChem 설치

In [1]:
# Set this environment variable before the imports below so that the Keras
# dependency version matches what the DeepChem installation expects.
import os
os.environ["TF_USE_LEGACY_KERAS"] = '1'

In [2]:
!pip install deepchem[tensorflow]

Collecting deepchem[tensorflow]
  Downloading deepchem-2.8.0-py3-none-any.whl.metadata (2.0 kB)
Collecting rdkit (from deepchem[tensorflow])
  Downloading rdkit-2025.3.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting tensorflow-addons (from deepchem[tensorflow])
  Downloading tensorflow_addons-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting typeguard<3.0.0,>=2.7 (from tensorflow-addons->deepchem[tensorflow])
  Downloading typeguard-2.13.3-py3-none-any.whl.metadata (3.6 kB)
Downloading deepchem-2.8.0-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdkit-2025.3.2-cp311-cp311-manylinux_2_28_x86_64.whl (35.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.2/35.2 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tensorflow_addons-0.23.0-cp311-cp311-manylinux_2_17_x86_64.manylinu

In [None]:
# Alternative install with the PyTorch extras; intentionally left disabled.
# !pip install deepchem[pytorch]

In [3]:
import warnings
warnings.filterwarnings(action='ignore')

# BBBC 현미경 데이터셋

## 파일 다운로드

In [4]:
!wget https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_images.zip
!unzip BBBC005_v1_images.zip

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
  inflating: BBBC005_v1_images/SIMCEPImages_I14_C57_F26_s13_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_E18_C74_F14_s15_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_P12_C48_F48_s05_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_K15_C61_F32_s01_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_L12_C48_F35_s12_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_N09_C35_F42_s18_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_I08_C31_F26_s07_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_M18_C74_F39_s13_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_E21_C87_F14_s14_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_A01_C1_F1_s25_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_J15_C61_F29_s23_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_N02_C5_F42_s07_w2.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_I11_C44_F26_s15_w1.TIF  
  inflating: BBBC005_v1_images/SIMCEPImages_A16_C66_F1_s

## 라이브러리 불러오기

In [5]:
import deepchem as dc
import tensorflow as tf
import tensorflow.keras.layers as layers
import numpy as np
import os
import re

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


In [48]:
# Whether to train the models in this notebook. When False, the previously
# trained checkpoints are restored instead. Training is very expensive, so only
# the False path has been checked.
RETRAIN = False

In [7]:
os.getcwd() # check the current working directory

'/content'

## 파일 불러오기

In [8]:
# Collect every .TIF image path together with its label. The label is the
# integer that follows "_C" in the file name (e.g. "..._P22_C91_F48_..." -> 91).
image_dir = 'BBBC005_v1_images'
files = []
labels = []

# Raw string literal: in a normal string '\d' is an invalid escape sequence,
# which Python flags with a warning. Compiled once instead of on every file.
count_pattern = re.compile(r'\d{2}_C(.*?)_')

# NOTE(review): os.listdir returns entries in arbitrary order, so the position of
# a given file in `files` is not guaranteed to be the same on every machine.
for f in os.listdir(image_dir):
  if f.endswith('.TIF'):
    files.append(os.path.join(image_dir, f))
    labels.append(int(count_pattern.findall(f)[0]))

In [9]:
# Sanity check: the two lists are filled in lockstep, so both counts should match.
n_files, n_labels = len(files), len(labels)
print(n_files, n_labels)

19200 19200


In [10]:
# Spot-check one collected image path.
files[100]

'BBBC005_v1_images/SIMCEPImages_P22_C91_F48_s07_w1.TIF'

In [11]:
# Label parsed from the file name at the same index as the path shown above.
labels[100]

91

In [12]:
# Wrap the image paths and their integer labels in a DeepChem ImageDataset.
label_array = np.array(labels)
dataset = dc.data.ImageDataset(files, label_array)

# Random train/validation/test split with a fixed seed and the splitter's
# default fractions.
splitter = dc.splits.RandomSplitter()
splits = splitter.train_valid_test_split(dataset, seed=42)
train_dataset, valid_dataset, test_dataset = splits

In [13]:
# Display the training split (shapes of X, y and w).
train_dataset

<ImageDataset X.shape: (np.int64(15360), np.int64(520), np.int64(696)), y.shape: (15360,), w.shape: (15360,), task_names: [0]>

# 학습 모델 불러오기

In [14]:
!mkdir models

In [15]:
cd models

/content/models


In [16]:
# Confirm the working directory after the change above.
!pwd

/content/models


In [17]:
!wget https://s3-us-west-1.amazonaws.com/deepchem.io/featurized_datasets/microscopy_models.zip
!unzip microscopy_models.zip

--2025-06-02 02:03:41--  https://s3-us-west-1.amazonaws.com/deepchem.io/featurized_datasets/microscopy_models.zip
Resolving s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)... 52.219.116.184, 52.219.193.40, 16.15.4.128, ...
Connecting to s3-us-west-1.amazonaws.com (s3-us-west-1.amazonaws.com)|52.219.116.184|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 88952487 (85M) [application/zip]
Saving to: ‘microscopy_models.zip’


2025-06-02 02:03:42 (81.4 MB/s) - ‘microscopy_models.zip’ saved [88952487/88952487]

Archive:  microscopy_models.zip
   creating: model/
  inflating: model/model-6999.data-00000-of-00001  
  inflating: model/model-5999.index  
  inflating: model/model-3999.meta   
  inflating: model/model-4999.index  
  inflating: model/model-7700.data-00000-of-00001  
  inflating: model/model-6999.index  
  inflating: model/model-5999.data-00000-of-00001  
  inflating: model/model-4999.data-00000-of-00001  
  inflating: model/model-5999.meta   
  inf

# 예제 1 : 세포 수 측정

## 모델 생성

In [18]:
# Check the working directory; the relative model_dir used below resolves against it.
os.getcwd()

'/content/models'

In [19]:
# Regression CNN: a 520x696 single-channel image goes in, one scalar comes out.
features = tf.keras.Input(shape=(520, 696, 1))
prev_layer = features
# Five stacked 5x5 convolutions with stride 2 and a growing number of filters.
for num_outputs in [16, 32, 64, 128, 256]:
  prev_layer = layers.Conv2D(num_outputs, kernel_size=5, strides=2, activation=tf.nn.relu)(prev_layer)
# Flatten the last feature map and map it to a single output value.
output = layers.Dense(1)(layers.Flatten()(prev_layer))
keras_model = tf.keras.Model(inputs=features, outputs=output)
# NOTE(review): arguments are presumably (initial rate, decay rate, decay steps);
# confirm against the DeepChem ExponentialDecay documentation.
learning_rate = dc.models.optimizers.ExponentialDecay(0.001, 0.9, 250)

# Make sure the checkpoint directory exists before the model is pointed at it.
# makedirs(exist_ok=True) replaces the separate exists()/mkdir() pair and is a
# no-op when the directory is already there. The path is relative to the
# current working directory.
os.makedirs('model', exist_ok=True)

model = dc.models.KerasModel(
    keras_model,
    loss=dc.models.losses.L2Loss(),
    learning_rate=learning_rate,
    model_dir='model')

# Unless retraining was requested, load the checkpoint stored in model_dir.
if not RETRAIN:
  model.restore()

Instructions for updating:
Restoring a name-based tf.train.Saver checkpoint using the object-based restore API. This mode uses global names to match variables, and so is somewhat fragile. It also adds new restore ops to the graph each time it is called when graph building. Prefer re-encoding training checkpoints in the object-based format: run save() on the object-based saver (the same one this message is coming from) and use that checkpoint in the future.


In [20]:
# Display the DeepChem KerasModel wrapper created above.
model

<deepchem.models.keras_model.KerasModel at 0x7fc699913810>

In [21]:
# Train the counting model only when retraining was requested; with
# RETRAIN = False this cell does nothing.
if RETRAIN:
  print("About to fit model for 50 epochs")
  model.fit(train_dataset, nb_epoch=50)

## 모델 평가

In [38]:
import math
import numpy as np

# Compatibility shim: recent NumPy releases no longer provide the `np.math`
# alias of the standard-library math module. Restore it only when it is
# missing, so an existing attribute on older NumPy versions is left untouched.
# NOTE(review): presumably needed by library code called during evaluation below;
# confirm which call still references np.math.
if not hasattr(np, 'math'):
  np.math = math

In [39]:
# Leave the models/ directory again so that the dataset paths, which are
# relative to its parent, resolve correctly.
# Guarded on the current directory name: re-running the cell must not climb one
# level higher each time.
if os.path.basename(os.getcwd()) == 'models':
  os.chdir('../')

In [40]:
# Root-mean-square error between predicted and true labels on the test split.
y_pred = model.predict(test_dataset).flatten()
squared_error = (y_pred - test_dataset.y) ** 2
rmse = np.sqrt(np.mean(squared_error))
print(rmse)

59.5666687752442


# 예제 2 : 세포 세그멘테이션

## mask 불러오기

In [49]:
os.getcwd() # current working directory

'/content'

In [50]:
!wget https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_ground_truth.zip
!unzip BBBC005_v1_ground_truth.zip

--2025-06-02 02:18:48--  https://data.broadinstitute.org/bbbc/BBBC005/BBBC005_v1_ground_truth.zip
Resolving data.broadinstitute.org (data.broadinstitute.org)... 69.173.68.137
Connecting to data.broadinstitute.org (data.broadinstitute.org)|69.173.68.137|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 12158428 (12M) [application/zip]
Saving to: ‘BBBC005_v1_ground_truth.zip’


2025-06-02 02:18:49 (7.83 MB/s) - ‘BBBC005_v1_ground_truth.zip’ saved [12158428/12158428]

Archive:  BBBC005_v1_ground_truth.zip
   creating: synthetic_2_ground_truth/
  inflating: synthetic_2_ground_truth/SIMCEPImages_A14_C57_F1_s21_w1.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A04_C14_F1_s15_w2.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A22_C91_F1_s25_w1.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A13_C53_F1_s14_w2.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A23_C96_F1_s23_w1.TIF  
  inflating: synthetic_2_ground_truth/SIMCEPImages_A07

## mask 파일 읽기

In [51]:
# Check the working directory; the image and mask directories below are relative to it.
os.getcwd()

'/content'

In [52]:
# Build two parallel lists: input image paths and the mask path for each of them.
# Every mask file is reused for 16 images, one per (row letter, blur level)
# pair; the image name is derived from the mask name by substituting "_F1" and
# "_A" with that pair.
image_dir = 'BBBC005_v1_images'
label_dir = 'BBBC005_v1_ground_truth'
rows = tuple('ABCDEFGHIJKLMNOP')
blurs = (1, 4, 7, 10, 14, 17, 20, 23, 26, 29, 32, 35, 39, 42, 45, 48)
mask_files = []
mask_labels = []
for f in os.listdir(label_dir):
  if not f.endswith('.TIF'):
    continue
  mask_path = os.path.join(label_dir, f)
  for row, blur in zip(rows, blurs):
    fname = f.replace('_F1', f'_F{blur}').replace('_A', f'_{row}')
    mask_files.append(os.path.join(image_dir, fname))
    mask_labels.append(mask_path)

In [53]:
# First input image path of the segmentation dataset.
mask_files[0]

'BBBC005_v1_images/SIMCEPImages_A13_C53_F1_s05_w1.TIF'

In [54]:
# Mask path paired with the first input image.
mask_labels[0]

'BBBC005_v1_ground_truth/SIMCEPImages_A13_C53_F1_s05_w1.TIF'

## 데이터셋 분리

In [55]:
# Segmentation dataset: both the inputs and the labels are image file paths.
# Note that this rebinds dataset / train_dataset / valid_dataset / test_dataset
# from the counting example.
dataset = dc.data.ImageDataset(mask_files, mask_labels)

# Random train/validation/test split with a fixed seed.
splitter = dc.splits.RandomSplitter()
splits = splitter.train_valid_test_split(dataset, seed=123)
train_dataset, valid_dataset, test_dataset = splits

In [32]:
# Display the training split of the segmentation dataset.
train_dataset

<ImageDataset X.shape: (np.int64(15360), np.int64(520), np.int64(696)), y.shape: (15360,), w.shape: (15360,), task_names: [0]>

## 모델 생성

In [31]:
# Input of the segmentation network: 520x696 images with a single channel.
# NOTE: every run of this cell creates a new Input tensor. Layers built in the
# following cells from an earlier `features` stay attached to the old tensor,
# and tf.keras.Model(inputs=features, ...) then fails with
# "ValueError: Graph disconnected". After re-running this cell, re-run all
# model-building cells below it in order.
features = tf.keras.Input(shape=(520, 696, 1))

### 다운샘플링

In [30]:
# Downsampling path: three 5x5 convolutions with stride 2, ReLU and 'same'
# padding (16 -> 32 -> 64 filters). The input is divided by 255.0 first.
down_kwargs = dict(kernel_size=5, strides=2, activation=tf.nn.relu, padding='same')
scaled_features = features/255.0
conv1 = layers.Conv2D(16, **down_kwargs)(scaled_features)
conv2 = layers.Conv2D(32, **down_kwargs)(conv1)
conv3 = layers.Conv2D(64, **down_kwargs)(conv2)

### 1×1 conv

In [57]:
# 1x1 convolution with 64 filters applied to the most downsampled feature map;
# no activation argument is passed.
conv4 = layers.Conv2D(64, kernel_size=1, strides=1)(conv3)

### 업샘플링

In [58]:
# Upsampling path: each step concatenates the previous result with the
# downsampling feature map of matching depth along the channel axis (axis=3)
# and applies a 5x5 transposed convolution with stride 2 (32 -> 16 -> 1 filters).
up_kwargs = dict(kernel_size=5, strides=2, activation=tf.nn.relu, padding='same')
concat1 = layers.Concatenate(axis=3)([conv3, conv4])
deconv1 = layers.Conv2DTranspose(32, **up_kwargs)(concat1)
concat2 = layers.Concatenate(axis=3)([conv2, deconv1])
deconv2 = layers.Conv2DTranspose(16, **up_kwargs)(concat2)
concat3 = layers.Concatenate(axis=3)([conv1, deconv2])
deconv3 = layers.Conv2DTranspose(1, **up_kwargs)(concat3)

### 결과값 및 모델 생성

In [35]:
# Check the working directory before the segmentation model is created.
os.getcwd()

'/content/models'

In [59]:
# Final stage: concatenate the raw input with the last upsampled map along the
# channel axis, reduce to one channel with a 5x5 convolution (the logits), and
# apply a sigmoid to obtain the per-pixel output.
# NOTE: `features` must be the same Input tensor the earlier layer cells were
# built from; otherwise tf.keras.Model raises "Graph disconnected".
concat4 = layers.Concatenate(axis=3)([features, deconv3])
logits = layers.Conv2D(1, kernel_size=5, strides=1, padding='same')(concat4)
output = layers.Activation(tf.math.sigmoid)(logits)

# The Keras model exposes two outputs: the sigmoid output and the raw logits.
keras_model = tf.keras.Model(inputs=features, outputs=[output, logits])
# NOTE(review): arguments are presumably (initial rate, decay rate, decay steps);
# confirm against the DeepChem ExponentialDecay documentation.
learning_rate = dc.models.optimizers.ExponentialDecay(0.01, 0.9, 250)

# output_types labels the two model outputs in order: the sigmoid output as
# 'prediction' and the logits as 'loss'.
# NOTE(review): presumably the loss is then computed from the logits while
# predictions use the sigmoid output; confirm in the KerasModel documentation.
# model_dir is an absolute path, so it does not depend on the working directory.
seg_model = dc.models.KerasModel(
    keras_model,
    loss=dc.models.losses.SigmoidCrossEntropy(),
    output_types=['prediction', 'loss'],
    learning_rate=learning_rate,
    model_dir='/content/models/segmentation')

ValueError: Graph disconnected: cannot obtain value for tensor KerasTensor(type_spec=TensorSpec(shape=(None, 520, 696, 1), dtype=tf.float32, name='input_1'), name='input_1', description="created by layer 'input_1'") at layer "tf.math.truediv". The following previous layers were accessed without issue: []

In [45]:
# Show the checkpoint directory the segmentation model was configured with.
seg_model.model_dir

NameError: name 'seg_model' is not defined

In [46]:
# Ensure the segmentation checkpoint directory exists. Use the path the model
# was actually configured with rather than a second, cwd-relative spelling of
# it, so the two cannot drift apart. makedirs also creates a missing parent
# directory and is a no-op when the directory already exists.
os.makedirs(seg_model.model_dir, exist_ok=True)

# Unless retraining was requested, load the checkpoint stored in model_dir.
if not RETRAIN:
  seg_model.restore()

NameError: name 'seg_model' is not defined

In [47]:
# Train the segmentation model only when retraining was requested; with
# RETRAIN = False this cell does nothing.
if RETRAIN:
  print("About to fit model for 50 epochs")
  seg_model.fit(train_dataset, nb_epoch=50, checkpoint_interval=100)

## 모델 평가

In [None]:
# Pixel accuracy on the test split: for each sample, the fraction of pixels
# where the thresholded prediction (> 0.5) agrees with the mask (> 0); the
# printed value is the mean over all samples.
scores = []
# The last tuple element is named `sample_id`, not `id`: loop variables stay
# bound in the notebook namespace after the loop, and `id` would shadow the
# built-in id() for every later cell.
for x, y, w, sample_id in test_dataset.itersamples():
  # Predict on a batch containing this one sample, then drop size-1 dimensions.
  y_pred = seg_model.predict_on_batch([x]).squeeze()
  scores.append(np.mean((y>0) == (y_pred>0.5)))
print(np.mean(scores))

0.7019057647408268
