# 第11章　実践編6：AI創薬へのはじめの一歩
- 清水 秀幸

編集部注：2023年5月29日最終更新．コードの一部がお手元の書籍と異なる可能性がございます．正誤・更新情報は弊社ウェブサイトの[本書詳細ページ](https://www.yodosha.co.jp/jikkenigaku/book/9784758122634/index.html)をご参照ください．

##### 入力11-1

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

##### 入力11-2

In [None]:
!pip install rdkit

##### 入力11-3

In [None]:
import rdkit
print(rdkit.__version__)

##### 入力11-4◆→1刷11-5←◆

In [None]:
!wget https://raw.githubusercontent.com/dataprofessor/data/master/delaney.csv

##### 入力11-5◆→1刷11-6←◆

In [None]:
sol = pd.read_csv('delaney.csv')
sol

##### 入力11-6

In [None]:
sol['SMILES']

##### 入力11-7

In [None]:
# データの最初にある化合物のSMILES表記
sol['SMILES'][0]

##### 入力11-8

In [None]:
from rdkit import Chem
m = Chem.MolFromSmiles(sol['SMILES'][0])
m # 生成された化合物オブジェクト

##### 入力11-9

In [None]:
m.GetNumAtoms()

##### 入力11-10

In [None]:
mol_list = [Chem.MolFromSmiles(element) for element in sol['SMILES']]
len(mol_list)

##### 入力11-11

In [None]:
from rdkit.Chem import Descriptors

def calculate_three_descriptors(smiles, verbose=False):
    """Compute MolLogP, MolWt and NumRotatableBonds for each SMILES string.

    Parameters
    ----------
    smiles : iterable of str
        SMILES strings; each one is parsed with ``Chem.MolFromSmiles``.
    verbose : bool, optional
        Accepted for backward compatibility; currently unused.

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES, in input order, with float columns
        ``MolLogP``, ``MolWt`` and ``NumRotatableBonds``. Empty input gives
        an empty frame that still has these three columns.

    Raises
    ------
    ValueError
        If ``Chem.MolFromSmiles`` returns ``None`` for an input string.
    """
    columnNames = ["MolLogP", "MolWt", "NumRotatableBonds"]

    # Collect rows in a list; stacking arrays one by one is quadratic.
    rows = []
    for elem in smiles:
        mol = Chem.MolFromSmiles(elem)
        if mol is None:
            raise ValueError(f"Could not parse SMILES: {elem!r}")
        rows.append((Descriptors.MolLogP(mol),
                     Descriptors.MolWt(mol),
                     Descriptors.NumRotatableBonds(mol)))

    # reshape(-1, 3) keeps the data 2-D even for zero or one molecule.
    baseData = np.array(rows, dtype=float).reshape(-1, len(columnNames))
    descriptors = pd.DataFrame(data=baseData, columns=columnNames)

    return descriptors

##### 入力11-12

In [None]:
df = calculate_three_descriptors(sol['SMILES'])
df

##### 入力11-13

In [None]:
m = Chem.MolFromSmiles('CNC(=O)Oc1cc(C)cc(C)c1')

aromatic_atoms = [m.GetAtomWithIdx(i).GetIsAromatic() for i in range(m.GetNumAtoms())]
aromatic_atoms

##### 入力11-14

In [None]:
def AromaticAtoms(m):
    """Return the number of atoms in molecule ``m`` flagged as aromatic."""
    n_aromatic = 0
    for idx in range(m.GetNumAtoms()):
        if m.GetAtomWithIdx(idx).GetIsAromatic():
            n_aromatic += 1
    return n_aromatic


AromaticAtoms(m)

##### 入力11-15

In [None]:
desc_AromaticAtoms = [AromaticAtoms(element) for element in mol_list]
desc_AromaticAtoms

##### 入力11-16

In [None]:
m = Chem.MolFromSmiles('CNC(=O)Oc1cc(C)cc(C)c1')
AromaticAtoms(m)/Descriptors.HeavyAtomCount(m)

##### 入力11-17

In [None]:
desc_AromaticProportion = [AromaticAtoms(element)/Descriptors.HeavyAtomCount(element) for element in mol_list]
df_desc_AromaticProportion = pd.DataFrame(desc_AromaticProportion, columns=['AromaticProportion'])
df_desc_AromaticProportion

##### 入力11-18

In [None]:
X = pd.concat([df, df_desc_AromaticProportion], axis=1)
X

##### 入力11-19

In [None]:
y = sol.iloc[:,1]
y = y.rename('logS')
y

##### 入力11-20

In [None]:
y.hist()

##### 入力11-21

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

##### 入力11-22

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

model = linear_model.LinearRegression()
model.fit(X_train, y_train)

##### 入力11-23

In [None]:
# Predict on the training split, then report the fitted parameters and fit quality
y_pred_train = model.predict(X_train)
train_mse = mean_squared_error(y_train, y_pred_train)
train_r2 = r2_score(y_train, y_pred_train)

print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)
print('Mean squared error (MSE): %.2f' % train_mse)
print('Coefficient of determination (R^2): %.2f' % train_r2)

##### 入力11-24

In [None]:
# Predict on the held-out split, then report the fitted parameters and fit quality
y_pred_test = model.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred_test)
test_r2 = r2_score(y_test, y_pred_test)

print('Coefficients:', model.coef_)
print('Intercept:', model.intercept_)
print('Mean squared error (MSE): %.2f' % test_mse)
print('Coefficient of determination (R^2): %.2f' % test_r2)

##### 入力11-25

In [None]:
# Render the fitted linear model as a readable equation.
# Each coefficient term uses a signed format ('%+...') so that positive
# coefficients are printed with an explicit '+' instead of no operator.
yintercept = '%.2f' % model.intercept_
LogP = '%+.2f LogP' % model.coef_[0]
MW = '%+.4f MW' % model.coef_[1]
RB = '%+.4f RB' % model.coef_[2]
AP = '%+.2f AP' % model.coef_[3]

print(' '.join(['LogS =', yintercept, LogP, MW, RB, AP]))

##### 入力11-26

In [None]:
## Left panel: experimental vs. predicted logS for the training data
plt.figure(figsize=(11,5))

plt.subplot(1, 2, 1)
plt.scatter(x=y_train, y=y_pred_train, c="#7CAE00", alpha=0.3)

# Degree-1 least-squares trend line, fitted to and drawn over the training
# points (the original drew it over the test x-values by mistake)
z = np.polyfit(y_train, y_pred_train, 1)
p = np.poly1d(z)
plt.plot(y_train,p(y_train),"#F8766D")

plt.ylabel('Predicted LogS')
plt.xlabel('Experimental LogS')


## Right panel: experimental vs. predicted logS for the test data
plt.subplot(1, 2, 2)
plt.scatter(x=y_test, y=y_pred_test, c="#619CFF", alpha=0.3)

# Trend line fitted to and drawn over the test points
z = np.polyfit(y_test, y_pred_test, 1)
p = np.poly1d(z)
plt.plot(y_test,p(y_test),"#F8766D")

plt.xlabel('Experimental LogS')

plt.show()

##### 入力11-27

In [None]:
!wget https://raw.githubusercontent.com/rdkit/rdkit/master/Docs/Book/data/solubility.train.sdf
!wget https://raw.githubusercontent.com/rdkit/rdkit/master/Docs/Book/data/solubility.test.sdf

##### 入力11-28

In [None]:
!ls

##### 入力11-29

In [None]:
!head -n 40 solubility.train.sdf

##### 入力11-30

In [None]:
train = [ mol for mol in Chem.SDMolSupplier('solubility.train.sdf')]
test = [ mol for mol in Chem.SDMolSupplier('solubility.test.sdf')]

print('training: ', len(train))
print('test: ', len(test))

##### 入力11-31

In [None]:
train[0].GetPropsAsDict()["SOL"]

##### 入力11-32

In [None]:
train[0].GetPropsAsDict()["smiles"]

##### 入力11-33

In [None]:
from rdkit.ML.Descriptors import MoleculeDescriptors

descs = [desc_name[0] for desc_name in Descriptors._descList]
descs

##### 入力11-34

In [None]:
len(descs)

##### 入力11-35

In [None]:
desc_calc = MoleculeDescriptors.MolecularDescriptorCalculator(descs)

##### 入力11-36

In [None]:
X = [desc_calc.CalcDescriptors(mol) for mol in train]
y = [mol.GetPropsAsDict()["SOL"] for mol in train]

test_X = [desc_calc.CalcDescriptors(mol) for mol in test]
test_y = [mol.GetPropsAsDict()["SOL"] for mol in test]

##### 入力11-37

In [None]:
len(X)

##### 入力11-38

In [None]:
X[0]

##### 入力11-39

In [None]:
y[:5]

##### 入力11-40

In [None]:
from sklearn.model_selection import train_test_split
# NOTE: this rebinds test_X / test_y, discarding the values computed from `test`
# earlier; from here on they hold a random 20% hold-out of the training molecules.
# Fixed seed so the split is reproducible across runs.
(train_X, test_X, train_y, test_y) = train_test_split(X, y, test_size=0.2, random_state=0)

##### 入力11-41

In [None]:
import lightgbm as lgb

##### 入力11-42

In [None]:
lgb_train = lgb.Dataset(train_X,
                        label=train_y,
                        free_raw_data=False)
lgb_test = lgb.Dataset(test_X,
                       label=test_y,
                       free_raw_data=False)

##### 入力11-43

In [None]:
 params = {'num_leaves': 31,
           # 'min_child_samples' was also set (to 20) in the original; it is an
           # alias of 'min_data_in_leaf', so only one of the two may be given.
           'min_data_in_leaf': 30,
           'objective': 'regression',
           'max_depth': -1,
           'learning_rate': 0.01,
           "boosting": "gbdt",
           "feature_fraction": 0.9,
           "bagging_freq": 1,
           "bagging_fraction": 0.9,
           "bagging_seed": 11,
           "metric": 'rmse',
           "lambda_l1": 0.1,
           "verbosity": -1,
           "nthread": 4,
           "random_state": 0}

##### 入力11-44

In [None]:
# The verbose_eval argument was removed from lgb.train() in LightGBM 4.0;
# the log_evaluation callback reports the validation metrics every 100 rounds.
reg = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_train, lgb_test],
    valid_names=['train', 'test'],
    num_boost_round=1000,
    callbacks=[lgb.log_evaluation(period=100)],
)

##### 入力11-45

In [None]:
plt.figure()
plt.scatter(train_y, reg.predict(train_X, num_iteration=reg.best_iteration), label = 'Train', c = 'blue')
plt.title('lightgbm Predictor')
plt.xlabel('Measured Solubility')
plt.ylabel('Predicted Solubility')
plt.scatter(test_y, reg.predict(test_X, num_iteration=reg.best_iteration), c = 'lightgreen', label = 'Test', alpha = 0.8)
plt.legend(loc = 4)
plt.show()

##### 入力11-46

In [None]:
!pip install dgl dgllife

##### 入力11-47

In [None]:
## 訓練データの読み込みとデータフレーム化
X = [mol.GetPropsAsDict()["smiles"] for mol in train]
y = [mol.GetPropsAsDict()["SOL_classification"] for mol in train]

df = pd.DataFrame({'SMILES': X, 'classification': y})
df

##### 入力11-48

In [None]:
set(df['classification'])

##### 入力11-49

In [None]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from torch.nn import CrossEntropyLoss

##### 入力11-50

In [None]:
import dgl
from dgl.data import DGLDataset

##### 入力11-51

In [None]:
from dgllife.utils import mol_to_graph
from dgllife.utils import mol_to_complete_graph
from dgllife.utils.featurizers import CanonicalAtomFeaturizer
from dgllife.utils.featurizers import CanonicalBondFeaturizer

##### 入力11-52

In [None]:
atom_featurizer = CanonicalAtomFeaturizer()
bond_featurizer = CanonicalBondFeaturizer()

##### 入力11-53

In [None]:
g = mol_to_complete_graph(train[0],
                          add_self_loop=False,
                          node_featurizer=atom_featurizer,
                          #edge_featurizer= bond_featurizer
                          )

n_feats = atom_featurizer.feat_size('h')
print(n_feats)

##### 入力11-54


In [None]:
g

##### 入力11-55

In [None]:
train[0].GetPropsAsDict()["smiles"]

##### 入力11-56

In [None]:
train_g = [mol_to_complete_graph(m, node_featurizer=atom_featurizer) for m in train]
test_g = [mol_to_complete_graph(m, node_featurizer=atom_featurizer) for m in test]

##### 入力11-57

In [None]:
# Map the textual solubility class stored in each molecule to an integer class id
prop_dict = {"(A) low": 0, "(B) medium": 1, "(C) high": 2}

train_y = np.array(
    [prop_dict[mol.GetProp('SOL_classification')] for mol in train],
    dtype=np.int64,
)
test_y = np.array(
    [prop_dict[mol.GetProp('SOL_classification')] for mol in test],
    dtype=np.int64,
)
train_y

##### 入力11-58

In [None]:
from dgllife.model import GCNPredictor

##### 入力11-59

In [None]:
ncls = 3

gcn_net = GCNPredictor(in_feats=n_feats,
                       hidden_feats=[60,20],
                       n_tasks=ncls,
                       predictor_hidden_feats=10,
                       dropout=[0.1,0.1])

##### 入力11-60

In [None]:
def collate(sample):
    """Combine (graph, label) pairs into one batched graph and a label tensor."""
    graph_list, label_list = (list(column) for column in zip(*sample))
    merged = dgl.batch(graph_list)
    # Register zero initializers for node and edge features on the batch
    merged.set_n_initializer(dgl.init.zero_initializer)
    merged.set_e_initializer(dgl.init.zero_initializer)
    return merged, torch.tensor(label_list)

##### 入力11-61

In [None]:
train_data = list(zip(train_g, train_y))
train_loader = DataLoader(train_data, batch_size=128, shuffle=True, collate_fn=collate, drop_last=True)

##### 入力11-62

In [None]:
loss_fn = CrossEntropyLoss()
optimizer = torch.optim.Adam(gcn_net.parameters(), lr=0.001)
gcn_net.train()

##### 入力11-63

In [None]:
# Per-epoch history of the mean batch loss and mean batch accuracy
epoch_losses, epoch_accuracies = [], []
for epoch in range(1, 201):
    loss_total = 0
    acc_total = 0
    for n_batches, (bg, labels) in enumerate(train_loader, start=1):
        # Node features are stored under key 'h' on the batched graph
        atom_feats = bg.ndata.pop('h')
        logits = gcn_net(bg, atom_feats)
        loss = loss_fn(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_total += loss.detach().item()
        # Fraction of this batch whose arg-max class equals the label
        hits = labels.numpy() == logits.argmax(-1).detach().numpy()
        acc_total += sum(hits) / hits.shape[0]

    mean_acc = acc_total / n_batches
    mean_loss = loss_total / n_batches
    if epoch % 20 == 0:
        print(f"epoch: {epoch}, Loss: {mean_loss:.3f}, ACC: {mean_acc:.3f}")
    epoch_accuracies.append(mean_acc)
    epoch_losses.append(mean_loss)

##### 入力11-64

In [None]:
# Plot the recorded loss (blue) and accuracy (red) against the epoch number
plt.style.use('ggplot')
epochs = list(range(1, 201))
plt.plot(epochs, epoch_losses, c='b', alpha=0.6, label='loss')
plt.legend()
plt.plot(epochs, epoch_accuracies, c='r', alpha=0.6, label='acc')
plt.legend()
plt.xlabel('epoch')
plt.ylabel('loss/acc')
print('損失を青で，正解率を赤で表示')

##### 入力11-65

In [None]:
test_data = list(zip(test_g, test_y))
# Keep the final partial batch: with drop_last=True any molecules beyond a
# multiple of batch_size would be silently excluded from the evaluation.
test_loader = DataLoader(test_data, batch_size=128, collate_fn=collate, drop_last=False)

##### 入力11-66

In [None]:
# Put the network in evaluation mode and count correct predictions per batch
gcn_net.eval()

correct = 0
total = 0

with torch.no_grad():
    for batch_graph, batch_labels in test_loader:
        node_feats = batch_graph.ndata.pop('h')
        logits = gcn_net(batch_graph, node_feats)
        # torch.max over dim 1 returns (values, indices); the indices are the classes
        predicted = torch.max(logits.data, 1)[1]
        total += batch_labels.size(0)
        correct += (predicted == batch_labels).sum().item()

accuracy_pct = int(correct) / total * 100
print('正解率', accuracy_pct)

##### 入力11-67

In [None]:
from sklearn.metrics import confusion_matrix

# Gather true and predicted class ids over every test batch
true_list, pred_list = [], []

with torch.no_grad():
    for batch_graph, batch_labels in test_loader:
        node_feats = batch_graph.ndata.pop('h')
        logits = gcn_net(batch_graph, node_feats)
        predicted = torch.max(logits.data, 1)[1]
        pred_list.extend(predicted.detach().numpy().tolist())
        true_list.extend(batch_labels.detach().numpy().tolist())

# First argument is the true labels, second the predictions
cm = confusion_matrix(true_list, pred_list)
print(cm)

##### 入力11-68

In [None]:
import seaborn as sns

cm = pd.DataFrame(data=cm, index=prop_dict.keys(),
                  columns=prop_dict.keys())

sns.set(rc = {'figure.figsize':(10,6)})
sns.heatmap(cm, square=True, cbar=True, annot=True, cmap='Blues', fmt='d')
plt.yticks(rotation=0)
plt.xlabel("Prediction", fontsize=13, rotation=0)
plt.ylabel("Ground Truth", fontsize=13)

print('溶解度予測モデルのテストデータにおける混同行列')

##### 入力11-69

In [None]:
!pip install chembl_webresource_client

##### 入力11-70

In [None]:
from chembl_webresource_client.new_client import new_client

target = new_client.target
target_query = target.search('coronavirus')
targets = pd.DataFrame.from_dict(target_query)
targets

##### 入力11-71

In [None]:
selected_target = targets.target_chembl_id[4]
selected_target

##### 入力11-72

In [None]:
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type='IC50')

res

##### 入力11-73

In [None]:
df = pd.DataFrame.from_dict(res)
df

##### 入力11-74

In [None]:
df.iloc[:,30:34]

##### 入力11-75

In [None]:
selection = ['molecule_chembl_id', 'canonical_smiles', 'standard_value']
df2 = df[selection]
df2

##### 入力11-76

In [None]:
df3 = df2[df2['standard_value'].notna()]
df3

##### 入力11-77

In [None]:
def norm_value(input_df):
    """Return a copy of ``input_df`` with a capped ``standard_value_norm`` column.

    The ``standard_value`` column is converted to float and every value above
    1,000,000,000 is replaced by 1,000,000,000.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a ``standard_value`` column convertible to float.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``standard_value_norm`` column. The input
        frame is left unmodified; it may be a filtered slice of another frame,
        where in-place assignment triggers pandas' SettingWithCopy warning.
    """
    capped = input_df['standard_value'].astype(float).clip(upper=1000000000)

    output_df = input_df.copy()
    # Assign positionally, as the original list-based assignment did
    output_df['standard_value_norm'] = capped.to_numpy()

    return output_df

    
df_norm = norm_value(df3)
df_norm

##### 入力11-78

In [None]:
def pIC50(input_df):
    """Return a copy of ``input_df`` with a ``pIC50`` column added.

    ``pIC50`` is computed as ``-log10(standard_value_norm * 1e-9)``, i.e. the
    ``standard_value_norm`` values are treated as nM and converted to M first.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing a numeric ``standard_value_norm`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the extra ``pIC50`` column; the input frame is left
        unmodified.
    """
    # Convert from nM to M
    molar = input_df['standard_value_norm'].to_numpy(dtype=float) * (10**-9)

    output_df = input_df.copy()
    output_df['pIC50'] = -np.log10(molar)

    return output_df

df_final = pIC50(df_norm)
df_final

##### 入力11-79

In [None]:
df_final['pIC50'].describe()

##### 入力11-80

In [None]:
df_final['canonical_smiles']

##### 入力11-81


In [None]:
# One descriptor tuple per compound, in the order of df_final['canonical_smiles']
X = [
    desc_calc.CalcDescriptors(Chem.MolFromSmiles(smi))
    for smi in df_final['canonical_smiles']
]

##### 入力11-82

In [None]:
# The column count follows the descriptor list used by desc_calc; the number of
# RDKit descriptors varies between versions, so a hard-coded width can fail or
# silently misalign the rows.
X = pd.DataFrame(np.array(X).reshape(-1, len(descs)))
X

##### 入力11-83

In [None]:
y = df_final['pIC50']
y

##### 入力11-84

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

##### 入力11-85

In [None]:
X_train.shape, y_train.shape


##### 入力11-86

In [None]:
X_test.shape, y_test.shape

##### 入力11-87

In [None]:
# PLS回帰を行うメソッドをインポート
from sklearn.cross_decomposition import PLSRegression

# 回帰器の生成: n_componentsのデフォルトは2なのでPLSRegression()のみでも可
pls = PLSRegression(n_components=2)

# 学習
pls.fit(X_train, y_train)

##### 入力11-88


In [None]:
# テストデータを使った予測
y_pred = pls.predict(X_test)
y_pred

##### 入力11-90

In [None]:
from scipy.stats import spearmanr

# 予測と実測のスピアマン相関係数を算出
correlation, pvalue = spearmanr(y_pred, y_test)
print(correlation)

##### 入力11-91

In [None]:
import seaborn as sns

# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally
ax = sns.regplot(x=y_test, y=y_pred)
ax.set_xlabel('Experimental pIC50', fontsize='large', fontweight='bold')
ax.set_ylabel('Predicted pIC50', fontsize='large', fontweight='bold')
ax.set_xlim(0, 8)
ax.set_ylim(0, 8)
ax.figure.set_size_inches(5, 5)
# plt.show must be called; a bare `plt.show` only references the function
plt.show()