# ClinTox

In [1]:
import numpy as np
import pandas as pd
import deepchem as dc
from sklearn.ensemble import RandomForestClassifier
from deepchem.models.multitask import SingletaskToMultitask
from deepchem import metrics
from deepchem.metrics import Metric
from deepchem.models.sklearn_models import SklearnModel
from deepchem.splits import RandomStratifiedSplitter, RandomSplitter

In [5]:
from deepchem.molnet.load_function.clintox_datasets import load_clintox
tasks, datasets, transformers = dc.molnet.load_clintox(featurizer='ECFP')
train_dataset, valid_dataset, test_dataset = datasets

In [6]:
test_dataset

<DiskDataset X.shape: (148, 1024), y.shape: (148, 2), w.shape: (148, 2), ids: ['C[C@@]1(C(=O)N2[C@H](C(=O)N3CCC[C@H]3[C@@]2(O1)O)Cc4ccccc4)NC(=O)[C@@H]5C[C@@H]6c7cccc8c7c(c[nH]8)C[C@H]6[NH+](C5)C'
 'C[C@@]1([C@@H](N2[C@H](S1(=O)=O)CC2=O)C(=O)[O-])Cn3ccnn3'
 'C[C@@](c1ccccc1)(c2ccc(cc2)Cl)OCC[C@H]3CCC[NH+]3C' ...
 'C#CC[NH2+][C@@H]1CCc2c1cccc2'
 '[H]/[NH+]=C(\\N)/c1ccc(cc1)OCCCCCOc2ccc(cc2)/C(=[NH+]/[H])/N'
 '[H]/[NH+]=C(/C1=CC(=O)/C(=C\\C=c2ccc(=C([NH3+])N)cc2)/C=C1)\\N'], task_names: ['FDA_APPROVED' 'CT_TOX']>

In [7]:
train_dataset

<DiskDataset X.shape: (1182, 1024), y.shape: (1182, 2), w.shape: (1182, 2), task_names: ['FDA_APPROVED' 'CT_TOX']>

MultitaskClassifier

In [8]:
model = dc.models.MultitaskClassifier(n_tasks=2, n_features=1024, layer_sizes=[1000])

In [10]:
metric_model = dc.metrics.Metric(dc.metrics.roc_auc_score)
print('train set score:', model.evaluate(train_dataset, [metric_model], transformers))
print('test set score:', model.evaluate(test_dataset, [metric_model], transformers))

train set score: {'roc_auc_score': 0.992225036304071}
test set score: {'roc_auc_score': 0.6440308622667084}


MultitaskFitTransformRegressor

In [11]:
model = dc.models.MultitaskFitTransformRegressor(n_tasks=2, n_features=1024)

In [12]:
print('train set score:', model.evaluate(train_dataset, [metric_model], transformers))
print('test set score:', model.evaluate(test_dataset, [metric_model], transformers))

train set score: {'roc_auc_score': 0.5464377658543875}
test set score: {'roc_auc_score': 0.5529037639453654}


GraphConvModel

In [17]:
import warnings
warnings.filterwarnings("ignore") 

In [21]:
tasks, datasets, transformers = dc.molnet.load_clintox(featurizer=dc.feat.ConvMolFeaturizer())
train_dataset, valid_dataset, test_dataset = datasets

[13:18:59] Explicit valence for atom # 0 N, 5, is greater than permitted
Failed to featurize datapoint 7, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(class RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
[13:19:00] Can't kekulize mol.  Unkekulized atoms: 9
Failed to featurize datapoint 302, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(class RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
[13:19:00] Explicit valence for atom # 10 N, 4, is greater than permitted
Failed to featurize datapoint 983, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not matc

In [22]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
print('Train set score:', model.evaluate(train_dataset, [metric], transformers))
print('Test set score:', model.evaluate(test_dataset, [metric], transformers))

Train set score: {'roc_auc_score': 0.44913244657261575}
Test set score: {'roc_auc_score': 0.34487887950509155}


GATmodel

In [24]:
tasks, datasets, transformers = dc.molnet.load_clintox(featurizer=dc.feat.MolGraphConvFeaturizer())
train_dataset, valid_dataset, test_dataset = datasets

[14:33:21] Explicit valence for atom # 0 N, 5, is greater than permitted
Failed to featurize datapoint 7, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(class RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
Failed to featurize datapoint 12, [Se]. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
[14:33:23] Can't kekulize mol.  Unkekulized atoms: 9
Failed to featurize datapoint 302, None. Appending empty array
Exception message: Python argument types in
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(class RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
[14:33:29] Explicit valence for atom # 10 N, 4, is greater than permitted
Failed to f

In [25]:
from deepchem.models import GATModel
model = dc.models.GATModel(mode='classification', n_tasks=2,batch_size=16, learning_rate=0.001)

In [26]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
print('Train set score:', model.evaluate(train_dataset, [metric], transformers))
print('Test set score:', model.evaluate(test_dataset, [metric], transformers))

Train set score: {'roc_auc_score': 0.6401833911886865}
Test set score: {'roc_auc_score': 0.5808031835401244}


GCNModel

In [28]:
from deepchem.models import GCNModel
model = dc.models.GCNModel(mode='classification', n_tasks=2,batch_size=16, learning_rate=0.001)

In [29]:
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
print('Train set score:', model.evaluate(train_dataset, [metric], transformers))
print('Test set score:', model.evaluate(test_dataset, [metric], transformers))

Train set score: {'roc_auc_score': 0.4721913396551126}
Test set score: {'roc_auc_score': 0.4683444548708859}
