In [None]:
import sys,platform,os
import numpy as np, pandas as pd
import deepchem as dc
import rdkit
from rdkit import Chem

print("Python:",sys.version)
print("Platform:",platform.platform())
print("DeepChem:", dc.__version__)
print("RDKit:", rdkit.__version__)
print("NumPy:", np.__version__)
print("Pandas:", pd.__version__)

Python: 3.10.19 | packaged by conda-forge | (main, Oct 13 2025, 14:05:01) [MSC v.1944 64 bit (AMD64)]
Platform: Windows-10-10.0.26100-SP0
DeepChem: 2.8.0
RDKit: 2024.09.1
NumPy: 1.24.4
Pandas: 2.3.3


In [23]:
import deepchem as dc
import numpy as np
from sklearn.metrics import roc_auc_score
import shutil
import os
# 壊れたキャッシュを削除
cache_dir = r"C:\Users\unine\AppData\Local\Temp\tox21-featurized"
if os.path.exists(cache_dir):
    print("古いキャッシュを削除中...")
    shutil.rmtree(cache_dir, ignore_errors=True)
    print("削除完了")

古いキャッシュを削除中...
削除完了


In [25]:
import deepchem as dc
import numpy as np
from sklearn.metrics import roc_auc_score

# データ読み込み
print("データ読み込み中...")
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21(
    featurizer='ECFP',
    splitter='random',
    reload=False  # キャッシュを使わず再読み込み
)

train_dataset, valid_dataset, test_dataset = tox21_datasets

print(f"訓練: {len(train_dataset)}, 検証: {len(valid_dataset)}, テスト: {len(test_dataset)}")
print(f"タスク数: {len(tox21_tasks)}")

# モデル構築
print("\nモデル構築中...")
model = dc.models.MultitaskClassifier(
    n_tasks=len(tox21_tasks),
    n_features=1024,
    layer_sizes=[1000],
    dropouts=0.25,
    learning_rate=0.001
)

# 学習
print("\n学習中...")
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

for epoch in range(1, 51):
    model.fit(train_dataset, nb_epoch=1)
    if epoch % 10 == 0:
        valid_scores = model.evaluate(valid_dataset, [metric])
        print(f"Epoch {epoch}: Valid AUC = {valid_scores['mean-roc_auc_score']:.4f}")

# 評価
print("\n最終評価:")
test_scores = model.evaluate(test_dataset, [metric])
print(f"Test AUC-ROC: {test_scores['mean-roc_auc_score']:.4f}")

# タスクごとの結果
print("\nタスクごとの結果:")
y_pred = model.predict(test_dataset)  # shape: (n_samples, n_tasks, 2)

for i, task in enumerate(tox21_tasks):
    # 有効なサンプル（重みwが0でないもの）
    valid_idx = test_dataset.w[:, i] != 0

    if valid_idx.sum() == 0:
        print(f"{task}: 有効サンプルなし（skip）")
        continue

    y_true = test_dataset.y[valid_idx, i]                 # shape: (n_valid,)
    y_score = y_pred[valid_idx, i, 1]                     # 陽性クラス(=1)の予測確率/スコア, shape: (n_valid,)

    # 片側クラスのみだとAUCは定義できないのでスキップ
    uniq = np.unique(y_true[~np.isnan(y_true)])
    if uniq.size < 2:
        print(f"{task}: 片側クラスのみでAUC不可（skip）")
        continue

    auc = roc_auc_score(y_true, y_score)
    print(f"{task}: {auc:.4f}")


# モデル保存
model.save_checkpoint(model_dir='tox21_model')
print("\nモデル保存完了")

データ読み込み中...


[16:04:00] Explicit valence for atom # 8 Al, 6, is greater than permitted
Failed to featurize datapoint 1322, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(class RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[16:04:01] Explicit valence for atom # 3 Al, 6, is greater than permitted
Failed to featurize datapoint 2290, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(class RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True, bool includeAtomMaps=True, bool includeChiralPresence=False)
[16:04:01] Explicit valence for atom # 4 Al, 6, is greater than permitted
Failed to featurize datapo

訓練: 6258, 検証: 782, テスト: 783
タスク数: 12

モデル構築中...

学習中...
Epoch 10: Valid AUC = 0.7958
Epoch 20: Valid AUC = 0.7884
Epoch 30: Valid AUC = 0.7851
Epoch 40: Valid AUC = 0.7807
Epoch 50: Valid AUC = 0.7790

最終評価:
Test AUC-ROC: 0.7710

タスクごとの結果:
NR-AR: 0.7933
NR-AR-LBD: 0.8380
NR-AhR: 0.8473
NR-Aromatase: 0.8128
NR-ER: 0.7041
NR-ER-LBD: 0.8113
NR-PPAR-gamma: 0.7986
SR-ARE: 0.6963
SR-ATAD5: 0.7520
SR-HSE: 0.7535
SR-MMP: 0.8013
SR-p53: 0.7905

モデル保存完了
