# Tox21 dataset

In [1]:
import numpy as np
import pandas as pd
import deepchem as dc
from sklearn.ensemble import RandomForestClassifier
from deepchem.models.multitask import SingletaskToMultitask
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.models.sklearn_models import SklearnModel
from deepchem.splits import RandomStratifiedSplitter, RandomSplitter

In [2]:
from deepchem.molnet.load_function.tox21_datasets import load_tox21

In [3]:
tasks_tox21, datasets_tox21, transformers_tox21 = dc.molnet.load_tox21(featurizer='ECFP')

In [4]:
train_dataset_tox21, valid_dataset_tox21, test_dataset_tox21 = datasets_tox21

In [5]:
train_dataset_tox21

<DiskDataset X.shape: (6264, 1024), y.shape: (6264, 12), w.shape: (6264, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>

In [7]:
test_dataset_tox21

<DiskDataset X.shape: (784, 1024), y.shape: (784, 12), w.shape: (784, 12), ids: ['CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O.CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O.c1ccc(CNCCNCc2ccccc2)cc1'
 'CC(C)(c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1)c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1'
 'Cc1cc(C(C)(C)C)c(O)c(C)c1Cn1c(=O)n(Cc2c(C)cc(C(C)(C)C)c(O)c2C)c(=O)n(Cc2c(C)cc(C(C)(C)C)c(O)c2C)c1=O'
 ... 'CN[C@@H]1C[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21'
 'Cl/C=C\\C[N+]12CN3CN(CN(C3)C1)C2'
 'NC(=O)c1ccc[n+]([C@@H]2O[C@H](COP(=O)([O-])OP(=O)(O)OC[C@H]3O[C@@H](n4cnc5c(N)ncnc54)[C@H](O)[C@@H]3O)[C@@H](O)[C@H]2O)c1'], task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>

MultitaskClassifier

In [6]:
model_tox21 = dc.models.MultitaskClassifier(n_tasks=12, n_features=1024, layer_sizes=[1000])

In [7]:
model_tox21.fit(train_dataset_tox21, nb_epoch=10)

0.4995523452758789

In [8]:
metric_model_tox21 = dc.metrics.Metric(dc.metrics.roc_auc_score)

In [9]:
print('train set score:', model_tox21.evaluate(train_dataset_tox21, [metric_model_tox21], transformers_tox21))

train set score: {'roc_auc_score': 0.9580265853779287}


In [10]:
print('test set score:', model_tox21.evaluate(test_dataset_tox21, [metric_model_tox21], transformers_tox21))

test set score: {'roc_auc_score': 0.6822617813929436}


MultitaskFitTransformRegressor

In [13]:
model_Trans_tox21 = dc.models.MultitaskFitTransformRegressor(n_tasks=12, n_features=1024)

In [14]:
model_Trans_tox21.fit(train_dataset_tox21, nb_epoch=10)

0.20931127866109211

In [17]:
print('train set score:', model_Trans_tox21.evaluate(train_dataset_tox21, [metric_model_tox21], transformers_tox21))

train set score: {'roc_auc_score': 0.04007922279375813}


In [16]:
print('test set score:', model_Trans_tox21.evaluate(test_dataset_tox21, [metric_model_tox21], transformers_tox21))

test set score: {'roc_auc_score': 0.32543379182124055}


GraphConvModel

In [56]:
import warnings
warnings.filterwarnings("ignore") 

In [24]:
tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='GraphConv')
train_dataset, valid_dataset, test_dataset = datasets

In [25]:
n_tasks = len(tasks)
model = dc.models.GraphConvModel(n_tasks, mode='classification')
model.fit(train_dataset, nb_epoch=50)

0.27615962982177733

In [26]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
print('Training set score:', model.evaluate(train_dataset, [metric], transformers))
print('Test set score:', model.evaluate(test_dataset, [metric], transformers))

Training set score: {'roc_auc_score': 0.9715475735659913}
Test set score: {'roc_auc_score': 0.6983408032545796}


Weave

In [38]:
model_weave = dc.models.WeaveModel(n_tasks=12, n_weave=2, fully_connected_layer_sizes=[2000, 1000], mode="classification")

In [35]:
tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='weave')
train_dataset, valid_dataset, test_dataset = datasets

In [36]:
model_weave.fit(train_dataset, nb_epoch=10)



0.8516061147054036

In [39]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
print('Training set score:', model_weave.evaluate(train_dataset, [metric], transformers))
print('Test set score:', model_weave.evaluate(test_dataset, [metric], transformers))

Training set score: {'roc_auc_score': 0.5450440570812667}
Test set score: {'roc_auc_score': 0.5280923748222247}


GAT

In [14]:
tasks, datasets, transformers = dc.molnet.load_tox21(featurizer=dc.feat.MolGraphConvFeaturizer())
train_dataset, valid_dataset, test_dataset = datasets

In [15]:
from deepchem.models import GATModel
model_GAT_tox21 = dc.models.GATModel(mode='classification', n_tasks=12,batch_size=16, learning_rate=0.001)

In [60]:
model_GAT_tox21.fit(train_dataset, nb_epoch=10)

0.7953201293945312

In [10]:
model_GAT_tox21.fit(test_dataset, nb_epoch=10)



1.0192049662272136

In [11]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
print('Training set score:', model_GAT_tox21.evaluate(train_dataset, [metric], transformers))
print('Test set score:', model_GAT_tox21.evaluate(test_dataset, [metric], transformers))

Training set score: {'roc_auc_score': 0.674109564704637}
Test set score: {'roc_auc_score': 0.7381721056185421}


GCNModel

In [20]:
from deepchem.models import GCNModel
model = dc.models.GCNModel(mode='classification', n_tasks=12,batch_size=16, learning_rate=0.001)

In [21]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
print('Train set score:', model.evaluate(train_dataset, [metric], transformers))
print('Test set score:', model.evaluate(test_dataset, [metric], transformers))

Train set score: {'roc_auc_score': 0.44005264407981715}
Test set score: {'roc_auc_score': 0.47255847285854197}


调参

In [42]:
splitters = ['random', 'scaffold', 'butina']
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
for splitter in splitters:
    tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='ECFP', splitter=splitter)
    train_dataset, valid_dataset, test_dataset = datase
    model = dc.models.MultitaskClassifier(n_tasks=len(tasks), n_features=1024, layer_sizes=[1000])
    model.fit(train_dataset, nb_epoch=10)
    print('splitter:', splitter)
    print('training set score:', model.evaluate(train_dataset, [metric], transformers))
    print('test set score:', model.evaluate(test_dataset, [metric], transformers))
    print()

splitter: random
training set score: {'roc_auc_score': 0.9547528213244006}
test set score: {'roc_auc_score': 0.781296534108335}

splitter: scaffold
training set score: {'roc_auc_score': 0.9575370421155002}
test set score: {'roc_auc_score': 0.6828622596141388}

splitter: butina
training set score: {'roc_auc_score': 0.95879692249371}
test set score: {'roc_auc_score': 0.6040297412788104}



In [45]:
nb= [10,20,30]
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
for nbi in nb:
    tasks, datasets, transformers = dc.molnet.load_tox21(featurizer='ECFP')
    train_dataset, valid_dataset, test_dataset = datasets
    model = dc.models.MultitaskClassifier(n_tasks=len(tasks), n_features=1024, layer_sizes=[1000])
    model.fit(train_dataset, nb_epoch=nbi)
    print('nb:', nbi)
    print('training set score:', model.evaluate(train_dataset, [metric], transformers))
    print('test set score:', model.evaluate(test_dataset, [metric], transformers))
    print()

nb: 10
training set score: {'roc_auc_score': 0.9583504776726373}
test set score: {'roc_auc_score': 0.6821396441979536}

nb: 20
training set score: {'roc_auc_score': 0.9755393704361918}
test set score: {'roc_auc_score': 0.6735055360404312}

nb: 30
training set score: {'roc_auc_score': 0.9838467669681895}
test set score: {'roc_auc_score': 0.6725095393238464}

