In [None]:
! wget https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
! chmod +x Miniconda3-py37_4.8.2-Linux-x86_64.sh
! bash ./Miniconda3-py37_4.8.2-Linux-x86_64.sh -b -f -p /usr/local
import sys
sys.path.append('/usr/local/lib/python3.7/site-packages/')
! conda install -c rdkit rdkit -y

! pip insatll pytorch
! pip install sklearn
! pip install pandas

--2022-04-05 08:11:13--  https://repo.anaconda.com/miniconda/Miniconda3-py37_4.8.2-Linux-x86_64.sh
Resolving repo.anaconda.com (repo.anaconda.com)... 104.16.130.3, 104.16.131.3, 2606:4700::6810:8303, ...
Connecting to repo.anaconda.com (repo.anaconda.com)|104.16.130.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 85055499 (81M) [application/x-sh]
Saving to: ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh.1’


2022-04-05 08:11:14 (169 MB/s) - ‘Miniconda3-py37_4.8.2-Linux-x86_64.sh.1’ saved [85055499/85055499]

PREFIX=/usr/local
Unpacking payload ...
Collecting package metadata (current_repodata.json): - \ | / - done
Solving environment: | / - \ | / 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/linux-64::urllib3==1.25.8=py37_0
  - defaults/linux-64::jpeg==9d=h7f8727e_0
  - defaults/linux-64::conda-package-handling==1.6.0=py37h7b6447c_0
  - defaults/linux-

## 方法：sklearn

Ref: https://michael-fuchs-python.netlify.app/2021/02/10/nn-multi-layer-perceptron-regressor-mlpregressor/

In [22]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MACCSkeys

from sklearn import metrics
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler


1. 获取数据

In [23]:
# Read the processed bioactivity table, then hold out 20% of the rows as an
# external test set. Inputs are the SMILES strings, the target is pIC50;
# the fixed random_state keeps the split reproducible.
data = pd.read_csv('chembl_acetylcholinesterase_bioactivity_data_processed.csv')
train_x, test_x, train_y, test_y = train_test_split(
    data['canonical_smiles'],
    data['pIC50'],
    test_size=0.2,
    random_state=1,
)

2. 提取特征

In [24]:
# Parse every training SMILES string into an RDKit molecule.
train_mols = list(map(Chem.MolFromSmiles, train_x))

# Morgan fingerprints (radius 2, 2048 bits) as a float matrix, one row per molecule.
train_morgan_fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048)
    for mol in train_mols
]
train_morgan_fps_array = np.array(train_morgan_fps, dtype=float)

# MACCS keys as a float matrix, one row per molecule.
train_maccs_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in train_mols]
train_maccs_fps_array = np.array(train_maccs_fps, dtype=float)

# Final feature matrix: Morgan bits followed by MACCS bits, joined column-wise.
train_fps_array = np.concatenate(
    (train_morgan_fps_array, train_maccs_fps_array),
    axis=1,
)

In [26]:
# Parse every held-out SMILES string into an RDKit molecule.
test_mols = list(map(Chem.MolFromSmiles, test_x))

# Morgan fingerprints (radius 2, 2048 bits) as a float matrix, one row per molecule.
test_morgan_fps = [
    AllChem.GetMorganFingerprintAsBitVect(mol, 2, 2048)
    for mol in test_mols
]
test_morgan_fps_array = np.array(test_morgan_fps, dtype=float)

# MACCS keys as a float matrix, one row per molecule.
test_maccs_fps = [MACCSkeys.GenMACCSKeys(mol) for mol in test_mols]
test_maccs_fps_array = np.array(test_maccs_fps, dtype=float)

# Final feature matrix: Morgan bits followed by MACCS bits, joined column-wise.
test_fps_array = np.concatenate(
    (test_morgan_fps_array, test_maccs_fps_array),
    axis=1,
)


3. 建立模型

In [27]:
# Base estimator handed to the grid search. It was previously referenced
# without ever being defined in the notebook (NameError on a fresh kernel).
# random_state pins the weight initialisation so the search is reproducible.
mlp_reg = MLPRegressor(random_state=1)

# Hyper-parameter grid explored by the search; every combination is tried.
param_grid = {
    'hidden_layer_sizes': [(150,100,50), (524, 20, 10)],
    'max_iter': [50, 100],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.01, 0.05],
    'learning_rate': ['constant','adaptive'],
}

# Exhaustive 5-fold cross-validated search on the training fingerprints,
# using all available cores (n_jobs=-1).
grid = GridSearchCV(mlp_reg, param_grid, n_jobs=-1, cv=5)
grid.fit(train_fps_array, train_y)

print(grid.best_params_)

{'activation': 'relu', 'alpha': 0.05, 'hidden_layer_sizes': (524, 20, 10), 'learning_rate': 'adaptive', 'max_iter': 100, 'solver': 'adam'}




4. 模型评估

In [28]:
# Evaluate the best model from the grid search in three ways:
#   train - mean score on the training folds of a 5-fold cross-validation
#   cv    - mean score on the held-out folds of that cross-validation
#   ext   - score on the external test set
# NOTE: relies on `cross_validate` being imported from sklearn.model_selection
# in the notebook's import cell.
mlp_reg_model = grid.best_estimator_

# make_scorer with its defaults returns the raw metric value (no sign flip),
# which is what we want for reporting.
score_dict = {
    'mse': make_scorer(mean_squared_error),
    'mae': make_scorer(mean_absolute_error),
    'mape': make_scorer(mean_absolute_percentage_error),
    'r2': make_scorer(r2_score),
}
# Column order of the final report.
metric_names = ('mae', 'mse', 'mape', 'r2')

# cross validation
# n_jobs=-1 matches the grid-search cell instead of a hard-coded worker count.
mlp_reg_cv = cross_validate(
    mlp_reg_model,
    train_fps_array,
    train_y,
    cv=5,
    n_jobs=-1,
    scoring=score_dict,
    return_train_score=True,
)
# Average each metric over the folds; result keys are '<split>_<metric>'.
(mlp_reg_cv_train_mae,
 mlp_reg_cv_train_mse,
 mlp_reg_cv_train_mape,
 mlp_reg_cv_train_r2) = (np.mean(mlp_reg_cv[f'train_{name}']) for name in metric_names)
(mlp_reg_cv_test_mae,
 mlp_reg_cv_test_mse,
 mlp_reg_cv_test_mape,
 mlp_reg_cv_test_r2) = (np.mean(mlp_reg_cv[f'test_{name}']) for name in metric_names)

# external test
mlp_reg_ext_pred = mlp_reg_model.predict(test_fps_array)
mlp_reg_ext_mae = mean_absolute_error(test_y, mlp_reg_ext_pred)
mlp_reg_ext_mse = mean_squared_error(test_y, mlp_reg_ext_pred)
mlp_reg_ext_mape = mean_absolute_percentage_error(test_y, mlp_reg_ext_pred)
mlp_reg_ext_r2 = r2_score(test_y, mlp_reg_ext_pred)

# One column per metric; the three values are ordered train / cv / ext.
mlp_reg_perf = {
    'mae': [mlp_reg_cv_train_mae, mlp_reg_cv_test_mae, mlp_reg_ext_mae],
    'mse': [mlp_reg_cv_train_mse, mlp_reg_cv_test_mse, mlp_reg_ext_mse],
    'mape': [mlp_reg_cv_train_mape, mlp_reg_cv_test_mape, mlp_reg_ext_mape],
    'r2': [mlp_reg_cv_train_r2, mlp_reg_cv_test_r2, mlp_reg_ext_r2],
}

mlp_reg_perf_df = pd.DataFrame.from_dict(mlp_reg_perf)
mlp_reg_perf_df.index = ['train','cv', 'ext']

In [29]:
# Show the performance summary: rows are train / cv / ext,
# columns are the mae, mse, mape and r2 metrics.
mlp_reg_perf_df

Unnamed: 0,mae,mse,mape,r2
train,0.163328,0.068855,0.028534,0.965939
cv,0.591493,0.660539,0.10828,0.673883
ext,0.555335,0.702841,0.101058,0.63316


## End