In [1]:
! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')
! conda install -c rdkit rdkit -y
! pip install --pre deepchem


--2022-04-07 13:19:25--  https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.130.3, 104.16.131.3, 2606:4700::6810:8203, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.130.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85055499 (81M) [application/x-sh]
Saving to: ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’


2022-04-07 13:19:25 (134 MB/s) - ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh’ saved [85055499/85055499]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | done
Solving environment: - \ done

## Package Plan ##

  environment location: /usr/local

  added / updated specs:
    - _libgcc_mutex==0.1=main
    - asn1crypto==1.3.0=py37_0
    - ca-certificates==2020.1.1=0
    - certifi==2019.11.28=py37_0
    - cffi==1.14.0=py37h2e261b9_0
    - chardet==3.0.4=py37_1003
    - conda-package-handling==1.6.0=py37h7b6447c_0


方法：Keras

In [2]:
import numpy as np
import deepchem as dc
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys
import tensorflow as tf
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [9]:
def create_dataset(data_x, data_y):
  mols = [Chem.MolFromSmiles(smi) for smi in data_x]
  morgan_fps = [AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048) for mol in mols]
  morgan_fps_array = np.asarray(morgan_fps, dtype=float)
  maccs_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]
  maccs_fps_array = np.asarray(maccs_fps, dtype=float)
  x = np.concatenate([morgan_fps_array, maccs_fps_array],axis=1)

  y = np.asarray(data_y, dtype=float).reshape(-1, 1)
  dataset = dc.data.NumpyDataset(X=x, y=y)
  return dataset

读取数据

In [5]:
data = pd.read_csv('chembl_acetylcholinesterase_bioactivity_data_processed.csv')
train_x, test_x, train_y, test_y = train_test_split(data['canonical_smiles'], data['pIC50'], test_size=0.2, random_state=1)

提取特征

In [10]:
train_dataset = create_dataset(train_x, train_y)
test_dataset = create_dataset(test_x, test_y)

建立模型

In [13]:
keras_model = tf.keras.Sequential([
tf.keras.layers.Dense(1000, activation='relu'),
tf.keras.layers.Dropout(rate=0.5),
tf.keras.layers.Dense(1)])
model = dc.models.KerasModel(keras_model, dc.models.losses.L2Loss())

模型评估

In [14]:
res_list = []
for epoch in [5, 10, 15, 20, 25, 30, 40, 50]:
      model.fit(train_dataset, nb_epoch=epoch)

      train_predict = model.predict(train_dataset)    
      train_predict_list = train_predict.reshape(-1).tolist()
      train_y_list = train_dataset.y.tolist()

      train_mae = mean_absolute_error(train_predict_list, train_y_list)
      train_mse = mean_squared_error(train_predict_list, train_y_list)
      train_mape = mean_absolute_percentage_error(train_predict_list, train_y_list)
      train_r2 = r2_score(train_predict_list, train_y_list)

      test_predict = model.predict(test_dataset)
      test_predict_list = test_predict.reshape(-1).tolist()
      test_y_list = test_dataset.y.tolist()   

      test_mae = mean_absolute_error(test_predict_list, test_y_list)
      test_mse = mean_squared_error(test_predict_list, test_y_list)
      test_mape = mean_absolute_percentage_error(test_predict_list, test_y_list)
      test_r2 = r2_score(test_predict_list, test_y_list)

      keras_perf = {'mae':[train_mae, test_mae],
                'mse':[train_mse, test_mse],
                'mape':[train_mape, test_mape],
                'r2':[train_r2, test_r2]}
      res_list.append(pd.DataFrame.from_dict(keras_perf))
final_res = pd.concat(res_list, keys = [5, 10, 15, 20, 25, 30, 40, 50])

In [15]:
final_res

Unnamed: 0,Unnamed: 1,mae,mse,mape,r2
5,0,0.48288,0.408089,0.082465,0.7469
5,1,0.642183,0.844092,0.116081,0.452266
10,0,0.335923,0.206026,0.057445,0.882807
10,1,0.590366,0.736288,0.107697,0.559499
15,0,0.262036,0.136351,0.044629,0.9265
15,1,0.572534,0.741678,0.105122,0.57689
20,0,0.256858,0.126882,0.045041,0.931439
20,1,0.586152,0.745292,0.11219,0.567149
25,0,0.232829,0.109165,0.040128,0.941722
25,1,0.579771,0.744753,0.109654,0.571052
