In [None]:
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

import warnings
import wandb
import os
import numpy as np
import deepchem as dc
import time
import csv
import tensorflow as tf
import pickle
import pandas as pd
from pandas import Series,DataFrame
from sklearn.metrics import accuracy_score,roc_auc_score
from deepchem.molnet import load_tox
from sklearn.ensemble import RandomForestClassifier
from deepchem.molnet.check_availability import CheckFeaturizer, CheckSplit
from deepchem.molnet.preset_hyper_parameters import hps
from deepchem.molnet import *
from deepchem.molnet.run_benchmark import run_benchmark
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics import roc_auc_score  # noqa
# Silence library warnings so they do not flood the notebook output.
warnings.filterwarnings('ignore')
# Pass the `--tf_xla_enable_xla_devices` flag to TensorFlow through the
# environment. The flag has to start with two ASCII hyphens; a typographic
# en dash (easy to pick up when copy-pasting from rich text) is not a flag
# prefix.
# NOTE(review): this is assigned after `import tensorflow` above — confirm the
# installed TensorFlow still reads TF_XLA_FLAGS at that point.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

In [None]:
"""选择分子特征"""

# Catalogue of featurizers to choose from. Every assignment in a group rebinds
# the same name, so only the LAST line of each group is in effect afterwards;
# reorder or comment out lines to select a different featurizer.

# Graph-based featurizers.
# The constructor result is used as-is: a featurizer instance must not be
# called without the molecules to featurize, so there is no trailing "()".
g_feat = dc.feat.ConvMolFeaturizer(per_atom_fragmentation=False)
g_feat = dc.feat.WeaveFeaturizer()
g_feat = dc.feat.MolGanFeaturizer()
g_feat = dc.feat.MolGraphConvFeaturizer(use_edges=True)

# Array-based featurizers (fingerprints, descriptors, one-hot encoding, images).
feat = dc.feat.MACCSKeysFingerprint()
feat = dc.feat.CircularFingerprint(size=2048, radius=4)
feat = dc.feat.RDKitDescriptors()
feat = dc.feat.MordredDescriptors()
feat = dc.feat.OneHotFeaturizer()
feat = dc.feat.SmilesToImage(img_size=80, img_spec='std')

# Available dataset splitting strategies, looked up by a short name.
splitters = dict(
    index=dc.splits.IndexSplitter(),
    random=dc.splits.RandomSplitter(),
    scaffold=dc.splits.ScaffoldSplitter(),
    butina=dc.splits.ButinaSplitter(),
    fingerprint=dc.splits.FingerprintSplitter(),
    task=dc.splits.TaskSplitter(),
    stratified=dc.splits.RandomStratifiedSplitter(),
)

# Available dataset transformers, looked up by a short name. Each entry wraps
# a transformer class (plus its constructor keyword arguments) in a
# TransformerGenerator; every entry except 'balancing' is configured with
# transform_y=True.
# NOTE: the data-loading cell further down rebinds the name `transformers` to
# the value returned by the dataset loader, so this catalogue is only
# reachable until that cell runs.
transformers = dict(
    balancing=TransformerGenerator(dc.trans.BalancingTransformer),
    normalization=TransformerGenerator(
        dc.trans.NormalizationTransformer, transform_y=True),
    minmax=TransformerGenerator(
        dc.trans.MinMaxTransformer, transform_y=True),
    clipping=TransformerGenerator(
        dc.trans.ClippingTransformer, transform_y=True),
    log=TransformerGenerator(
        dc.trans.LogTransformer, transform_y=True),
)
# ---------------------------------------------------------------------------
# Reference notes: logging DeepChem training to Weights & Biases (wandb).
# This is a template, not runnable code: the shell commands and the "..."
# placeholders are not valid Python, so everything is kept as comments to let
# the rest of this cell execute.
# ---------------------------------------------------------------------------
#
# Install wandb (in a shell):
#     pip install wandb
#
# Log in from a shell (required only once):
#     wandb login
#
# Or log in from the notebook (required only once):
#     import wandb
#     wandb.login()
#
# Initialize a WandbLogger:
#     logger = dc.models.WandbLogger(...)
#
# Set `wandb_logger` when creating a `KerasModel`; the training loss is then
# logged to wandb:
#     model = dc.models.KerasModel(..., wandb_logger=logger)
#     model.fit(...)
#
# Log validation metrics to wandb using ValidationCallback:
#     vc = dc.models.ValidationCallback(...)
#     model = dc.models.KerasModel(..., wandb_logger=logger)
#     model.fit(..., callbacks=[vc])
#     logger.finish()
#
# For PyTorch models only:
#     wandb.watch(model)

In [None]:
"""" 一.加载数据集和表征数据集"""
# Step 1: featurize the molecules as ConvMol graphs and load the dataset with
# an index-based split.
g_feat = dc.feat.ConvMolFeaturizer()
tox_tasks, tox_datasets, transformers = load_tox(
    featurizer=g_feat,
    split='index',
)
# The loader's second return value is unpacked into the three splits.
(train_dataset, valid_dataset, test_dataset) = tox_datasets

In [None]:
# Preview the first rows of the test split instead of rendering the whole frame.
test_dataset.to_dataframe().head()

In [None]:
"""二.定义模型可迭代的参数集合"""
# Search space for the grid search: one list of candidate values per
# GraphConvModel constructor argument. The keys have to be the constructor's
# own parameter names (`graph_conv_layers`, `dense_layer_size`, `dropout`).
# NOTE(review): keys the constructor does not know appear to be swallowed by
# its **kwargs instead of being rejected, which would leave those dimensions
# of the grid without any effect — confirm against the installed DeepChem.
params_dict = {
    'n_tasks': [len(tox_tasks)],
    'batch_size': [64, 128],
    # Width of each graph-convolution layer (two layers per candidate).
    'graph_conv_layers': [[64, 64], [128, 128]],
    # Width of the dense layer that follows the graph convolutions.
    'dense_layer_size': [128, 256],
    'dropout': [0.1, 0.2, 0.3, 0.5],
}

# Step 3: run the grid search. Every combination in `params_dict` is used to
# build a model, which is then scored on the validation split with ROC-AUC.
optimizer = dc.hyper.GridHyperparamOpt(dc.models.GraphConvModel)
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
best_model, best_hyperparams, all_results = optimizer.hyperparam_search(
    params_dict,
    train_dataset,
    valid_dataset,
    metric,
    transformers,
)

In [None]:
'''查看最优参数'''
# Print the winning combination followed by two empty lines, then display the
# scores of every combination that was tried as the cell output.
print(best_hyperparams, end="\n\n\n")
all_results

In [None]:
""""四.定义可视化模型Wandb及模型参数"""
# Create the Weights & Biases logger that the model receives as
# `wandb_logger` and on which `finish()` is called after training.
# This has to be a DeepChem `WandbLogger`: `wandb.config` is only the run's
# settings container, not a logger. Extra keyword arguments (here `config`)
# are forwarded to `wandb.init`, so the hyperparameters below are still saved
# with the run.
# NOTE(review): these values are only recorded as run metadata; the
# model-building cell does not read them back.
logger = dc.models.WandbLogger(
    project="model",
    config={
        'n_tasks': [len(tox_tasks)],
        'batch_size': 64,
        'graph_conv_layers': [256, 256],
        'n_fully_connected_nodes': 512,
        'dropouts': 0.2,
    },
)


In [None]:
"""五.建立模型，并寻找最优参数的周期"""
# Learning-rate schedule: exponential decay
# (initial rate 0.0002, decay factor 0.8, applied every 1000 steps).
learning_rate = dc.models.optimizers.ExponentialDecay(0.0002, 0.8, 1000)

# Evaluation metric: ROC-AUC.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)

# Model definition. No architecture arguments are passed, so the library
# defaults apply. A previously tried explicit configuration was
# graph_conv_layers=[256, 256], dense_layer_size=128, batch_size=64,
# dropout=0.1.
# NOTE(review): `seed` is not among the documented GraphConvModel arguments —
# confirm that it has any effect. `wandb=True` is presumably redundant once
# `wandb_logger` is given — confirm against the installed DeepChem version.
model = dc.models.GraphConvModel(
    len(tox_tasks),
    wandb_logger=logger,
    wandb=True,
    learning_rate=learning_rate,
    number_atom_features=75,
    seed=123,
    mode='classification',
)

# Score the validation split every 1000 training steps. The metrics are passed
# as a list, the same way `model.evaluate` is called in the evaluation cell.
callback = dc.models.ValidationCallback(valid_dataset, 1000, [metric])
model.fit(train_dataset, nb_epoch=50, callbacks=[callback])

In [None]:
# Call `finish()` on the `logger` object created in the wandb setup cell
# (presumably this flushes and closes the logging run — confirm).
logger.finish()

In [None]:
'''打印模型评估分数'''
# Score the trained model on all three splits first, then print one labelled
# result per split.
print("Evaluating model")
train_scores, valid_scores, test_scores = (
    model.evaluate(split, [metric], transformers)
    for split in (train_dataset, valid_dataset, test_dataset)
)

for heading, scores in (
    ("Train scores", train_scores),
    ("Validation scores", valid_scores),
    ("test scores", test_scores),
):
    print(heading)
    print(scores)

In [None]:
"""加载外部数据集"""
# CSV file holding the external hepatotoxicity dataset.
# NOTE(review): absolute, machine-specific path — adjust it for your
# environment.
dataset_file = "/home/deepchem/datasets/Hepatotoxicity.csv"

# Single prediction target; the CSV is expected to contain this column plus a
# "smiles" column.
task = ['Hepatotoxicity']
g_feat = dc.feat.ConvMolFeaturizer()

# NOTE(review): `smiles_field` may be deprecated in favour of `feature_field`
# in newer DeepChem releases — confirm before upgrading.
loader = dc.data.data_loader.CSVLoader(tasks=task, smiles_field="smiles", featurizer=g_feat)
dataset = loader.create_dataset(dataset_file)

# `transformers` is what the dataset loader returned and is passed as a whole
# to `model.evaluate` / `model.predict` elsewhere, i.e. it is a collection of
# transformers. A collection has no `.transform` method of its own, so each
# transformer is applied in turn (a single transformer is accepted as well).
transformer_list = (
    transformers if isinstance(transformers, (list, tuple)) else [transformers]
)
for transformer in transformer_list:
    dataset = transformer.transform(dataset)
dataset

In [None]:
"""预测"""
# Predict on the external dataset. `transformers` is handed to `predict`
# (presumably so that output transformations are undone — confirm).
predict = model.predict(dataset, transformers)

# Optional follow-ups, kept for reference:
#   evaluate on the external dataset
#     pred_scores = model.evaluate(dataset, [metric], transformers)
#     print(pred_scores)
#   export the predictions together with the labels
#     df = pd.DataFrame.from_records(predict, index=dataset.ids)
#     df.insert(0, 'labels', dataset.y)
#     df.to_csv("/home/deepchem/pred/pred_file.csv")
#   AUC of the predictions, once `y_true` / `y_pred` are built in the next cell
#     roc_auc_score(y_true, y_pred)

# Keep the shape printout as the last statement so that the cell shows only
# this line rather than echoing a stray string literal.
print(predict.shape)

In [None]:
# Take index 1 on the last axis of the predictions (presumably the
# positive-class probability — confirm) and flatten predictions and labels to
# 1-D. `reshape(-1)` adapts to the dataset size instead of hard-coding the
# number of samples.
y_pred = predict[:, :, 1].reshape(-1)
y_true = dataset.y.reshape(-1)

# Flattening with -1 never fails on its own, so make sure both vectors still
# line up (e.g. the model and the dataset must agree on the number of tasks).
assert y_pred.shape == y_true.shape, (
    "prediction/label length mismatch: %s vs %s" % (y_pred.shape, y_true.shape)
)
y_true

In [None]:
"""精确度评分数据处理"""
# Turn the predicted scores into hard labels: strictly above 0.5 -> 1,
# otherwise 0. `a` keeps the labels as a plain Python list because the
# following cell builds its own array from it.
y_pred_int = (np.asarray(y_pred) > 0.5).astype(int)
a = y_pred_int.tolist()

# Accuracy of the hard labels against the ground truth (DeepChem metric).
met = dc.metrics.accuracy_score(y_true, y_pred_int)
met

In [None]:
# Cross-check the accuracy with scikit-learn's implementation, using an array
# built from the thresholded label list `a`.
b = np.array(a)
accuracy_score(y_true, b)