回帰-標的構造の生物学的パラメーターを予測-

In [1]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Draw, PandasTools, Descriptors
from rdkit.Chem.Draw import IPythonConsole
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Ridge, Lasso, LinearRegression, SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
from math import log10
from IPython.display import HTML
import pandas as pd
from rdkit.Chem.AtomPairs import Pairs, Utils
import seaborn as sns
import matplotlib.pyplot as plt
# Suppress warnings of Python library
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("ERb_IC50.csv")
df

Unnamed: 0.1,Unnamed: 0,SMILES,Standard Value,pIC50
0,CHEMBL81,O=C(c1ccc(OCCN2CCCCC2)cc1)c1c(-c2ccc(O)cc2)sc2...,470.0,6.327902
1,CHEMBL4278269,Cc1cc(-c2c[se]cc2-c2ccc(OCCN3CCCC3)c(C)c2)ccc1O,5.0,8.301030
2,CHEMBL188957,CCCc1cc(O)cc2nc(-c3ccc(O)cc3)oc12,11.0,7.958607
3,CHEMBL189706,CCOC(=O)c1cc(O)cc2nc(-c3ccc(O)cc3)oc12,190.0,6.721246
4,CHEMBL188230,O=Cc1cc(O)cc2nc(-c3ccc(O)cc3)oc12,59.0,7.229148
...,...,...,...,...
1551,CHEMBL3360846,O=C1[C@@H](c2ccc3ccccc3c2)[C@H](c2ccc(O)cc2)N1...,1100.0,5.958607
1552,CHEMBL4637118,COC(=O)C1=C(N)Oc2cc(O)ccc2C1c1ccccc1,1590.0,5.798603
1553,CHEMBL370458,N#CC1=C(N)Oc2cc(O)ccc2C1c1ccccc1,8390.0,5.076238
1554,CHEMBL4643970,COC(=O)C1=C(N)Oc2cc(O)ccc2C1c1ccc(OCCN2CCOCC2)cc1,630.0,6.200659


In [3]:
#　dfのSMILES列を参照してMolオブジェクト列をデータフレームに加える #https://insilico-notebook.com/smiles-to-desc-df/
PandasTools.AddMoleculeColumnToFrame(df,'SMILES')
#　Molオブジェクトが作成できたか確認
print(df.shape)
print(df.isnull().sum()) 
#　ROMolが作成できなかったものを確認
print(df[df.ROMol.isnull()])

#　欠損行の除去
df = df.dropna() 


(1556, 5)
Unnamed: 0        0
SMILES            0
Standard Value    0
pIC50             0
ROMol             0
dtype: int64
Empty DataFrame
Columns: [Unnamed: 0, SMILES, Standard Value, pIC50, ROMol]
Index: []


In [4]:
for i,j in Descriptors.descList:
    df[i] = df.ROMol.map(j)

df.shape

(1556, 213)

In [5]:
df['MW'] = df.ROMol.map(Descriptors.MolWt)
df['MW'].describe()

count    1556.000000
mean      366.029582
std        96.613238
min       142.110000
25%       285.255000
50%       340.949500
75%       449.572000
max       778.987000
Name: MW, dtype: float64

In [6]:
df.head
df.shape

(1556, 214)

In [7]:
df = df.dropna(how='any', axis=0)
df.head

<bound method NDFrame.head of          Unnamed: 0                                             SMILES  \
0          CHEMBL81  O=C(c1ccc(OCCN2CCCCC2)cc1)c1c(-c2ccc(O)cc2)sc2...   
2      CHEMBL188957                  CCCc1cc(O)cc2nc(-c3ccc(O)cc3)oc12   
3      CHEMBL189706             CCOC(=O)c1cc(O)cc2nc(-c3ccc(O)cc3)oc12   
4      CHEMBL188230                  O=Cc1cc(O)cc2nc(-c3ccc(O)cc3)oc12   
5      CHEMBL188528                   Oc1ccc(-c2nc3cc(O)cc(Br)c3o2)cc1   
...             ...                                                ...   
1551  CHEMBL3360846  O=C1[C@@H](c2ccc3ccccc3c2)[C@H](c2ccc(O)cc2)N1...   
1552  CHEMBL4637118               COC(=O)C1=C(N)Oc2cc(O)ccc2C1c1ccccc1   
1553   CHEMBL370458                   N#CC1=C(N)Oc2cc(O)ccc2C1c1ccccc1   
1554  CHEMBL4643970  COC(=O)C1=C(N)Oc2cc(O)ccc2C1c1ccc(OCCN2CCOCC2)cc1   
1555  CHEMBL4647883            COC(=O)C1=C(N)Oc2cc(O)ccc2C1c1ccc(O)cc1   

      Standard Value     pIC50  \
0              470.0  6.327902   
2            

In [None]:
#HTML(df_corr.to_html(escape=False))

In [None]:
#df_corr.to_csv('0202df_corr.csv')

In [None]:
rdkit_fp = []
for mol in df.ROMol:
    fp = [x for x in Chem.RDKFingerprint(mol)]
    rdkit_fp.append(fp)
rdkit_fp = np.array(rdkit_fp, dtype = object)
print(rdkit_fp.shape, type(rdkit_fp)) 
print(rdkit_fp)

#Chem.RDKFingerprint(mol)
#AllChem.GetMACCSKeysFingerprint(mol)
#AllChem.GetMorganFingerprintAsBitVect(mol, radius, bitvect)

In [None]:
# １列にfingerprintのリストを追加する場合
df['FP'] = df.apply(lambda x: rdkit_fp, axis=1) #df.apply lambda : function process against x
df.head
#fps = df.apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x.ROMol, 2, 1024), axis=1)
#fps


In [None]:
# fingerprintの各値を各列に格納する場合, https://insilico-notebook.com/smiles-to-desc-df/
# 個別に０１をデータフレームに格納する
FP = [AllChem.GetMACCSKeysFingerprint(mol) for mol in df.ROMol]
df_FP = pd.DataFrame(np.array(FP))
# フィンガープリントをもとのデータフレームに結合
df_FP.index = df.index
df = pd.concat([df, df_FP], axis=1)

In [8]:
#dfa = df.drop(df.columns[[0,1,2,3,4,214]], axis = 1)
dfa = df.drop(df.columns[[0,1,2,3,4]], axis = 1)
print(dfa)

      MaxEStateIndex  MinEStateIndex  MaxAbsEStateIndex  MinAbsEStateIndex  \
0          13.646561       -0.083443          13.646561           0.083443   
2           9.739204        0.200469           9.739204           0.200469   
3          11.947139       -0.586291          11.947139           0.096195   
4          10.931167       -0.041522          10.931167           0.041522   
5           9.482571        0.124936           9.482571           0.124936   
...              ...             ...                ...                ...   
1551       13.248370       -0.307452          13.248370           0.017635   
1552       12.148264       -0.536965          12.148264           0.014997   
1553        9.552529       -0.272639           9.552529           0.082827   
1554       12.468625       -0.562905          12.468625           0.040937   
1555       12.165818       -0.597870          12.165818           0.030115   

           qed    MolWt  HeavyAtomMolWt  ExactMolWt  NumValence

In [9]:
dfa = dfa.iloc[:, 0:376].apply(lambda x: (x-x.mean())/x.std(ddof=1), axis=0)
dfa.head


<bound method NDFrame.head of       MaxEStateIndex  MinEStateIndex  MaxAbsEStateIndex  MinAbsEStateIndex  \
0           1.198892        0.419192           1.198892          -0.284867   
2          -1.008182        0.672968          -1.008182           0.707316   
3           0.238972       -0.030282           0.238972          -0.176750   
4          -0.334901        0.456663          -0.334901          -0.640288   
5          -1.153141        0.605453          -1.153141           0.066926   
...              ...             ...                ...                ...   
1551        0.973973        0.218960           0.973973          -0.842810   
1552        0.352577        0.013808           0.352577          -0.865175   
1553       -1.113626        0.250077          -1.113626          -0.290091   
1554        0.533534       -0.009379           0.533534          -0.645245   
1555        0.362493       -0.040633           0.362493          -0.737001   

           qed     MolWt  HeavyAt

In [10]:
dfa = dfa.dropna(how='any', axis=1)
dfa.head

<bound method NDFrame.head of       MaxEStateIndex  MinEStateIndex  MaxAbsEStateIndex  MinAbsEStateIndex  \
0           1.198892        0.419192           1.198892          -0.284867   
2          -1.008182        0.672968          -1.008182           0.707316   
3           0.238972       -0.030282           0.238972          -0.176750   
4          -0.334901        0.456663          -0.334901          -0.640288   
5          -1.153141        0.605453          -1.153141           0.066926   
...              ...             ...                ...                ...   
1551        0.973973        0.218960           0.973973          -0.842810   
1552        0.352577        0.013808           0.352577          -0.865175   
1553       -1.113626        0.250077          -1.113626          -0.290091   
1554        0.533534       -0.009379           0.533534          -0.645245   
1555        0.362493       -0.040633           0.362493          -0.737001   

           qed     MolWt  HeavyAt

In [None]:
#dfa.to_csv('0207df_scaled.csv')

In [11]:
pIC50s = df.loc[:,['pIC50']]
#fs = dfa.loc[:,['MinAbsEStateIndex', 'FP']].values
#fs1 = [rdkit_fp, dfaa]
#x_train, x_test, y_train, y_test = train_test_split(dfa, pIC50s, random_state = 0)
#print(train_test_split(dfa, pIC50s))

In [12]:
#https://horomary.hatenablog.com/entry/2019/03/10/190919
from deap import algorithms
from deap import base
from deap import creator
from deap import tools
from sklearn.linear_model import RidgeCV

In [13]:
X = dfa
y = pIC50s

In [None]:
# 多項式特徴量を追加した場合
scores = []
for _ in range(300):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
    model = SVR()
    model.fit(X_train, y_train)
    scores.append(cross_val_score(model, X_train, y_train, cv=5))

print(np.array(scores).mean())    # 0.82

In [14]:

def evalIndividual(individual):
    """ 個体評価のための関数
        Prameter:
        individual: 要素が0or1のリスト  長さは入力変数の数と同じ
        Return:
        n_features: 使った特徴量の数
        score: モデルの精度
    """
    n_features = sum(individual)

    if n_features == 0:
        return 9999, -9999

    X_temp = X.iloc[:, [bool(val) for val in individual]]
    
    scores = []
    for _ in range(30):
        X_train, X_test, y_train, y_test = train_test_split(X_temp, y, test_size=0.4) 
        model = SVR()
        model.fit(X_train, y_train)
        scores.append(cross_val_score(model, X_train, y_train, cv=5))

    score = np.array(scores).mean()

    return n_features, score


creator.create("Fitness", base.Fitness, weights=(1.0, 0))   
creator.create("Individual", list, fitness=creator.Fitness)


In [15]:
import random
toolbox = base.Toolbox()
toolbox.register("attr_bool", random.randint, 0, 1)
toolbox.register("individual", tools.initRepeat, creator.Individual, 
                 toolbox.attr_bool, X.shape[1])
toolbox.register("population", tools.initRepeat, list, toolbox.individual)


toolbox.register("evaluate", evalIndividual)
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selNSGA2)

In [27]:

"""遺伝的アルゴリズム設定
5世代まで
1世代の個体数10
次世代に引き継ぐ個体数2
交叉確率0.7
突然変異確率0.1
"""
NGEN = 5
MU = 2
LAMBDA = 10
CXPB = 0.7
MUTPB = 0.1

pop = toolbox.population(n=MU)
hof = tools.ParetoFront()
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean, axis=0)
stats.register("std", np.std, axis=0)
stats.register("min", np.min, axis=0)
stats.register("max", np.max, axis=0)

pop, log = algorithms.eaMuPlusLambda(pop, toolbox, MU, LAMBDA, CXPB,
                                     MUTPB, NGEN, stats, halloffame=hof)
df_log = pd.DataFrame(log)
df_log

gen	nevals	avg        	std      	min      	max        
0  	2     	[91.5  nan]	[9.5 nan]	[82. nan]	[101.  nan]
1  	7     	[101.  nan]	[ 0. nan]	[101.  nan]	[101.  nan]
2  	8     	[101.  nan]	[ 0. nan]	[101.  nan]	[101.  nan]
3  	10    	[101.  nan]	[ 0. nan]	[101.  nan]	[101.  nan]
4  	8     	[102.5   nan]	[1.5 nan]	[101.  nan]	[104.  nan]
5  	6     	[104.  nan]  	[ 0. nan]	[104.  nan]	[104.  nan]


Unnamed: 0,gen,nevals,avg,std,min,max
0,0,2,"[91.5, nan]","[9.5, nan]","[82.0, nan]","[101.0, nan]"
1,1,7,"[101.0, nan]","[0.0, nan]","[101.0, nan]","[101.0, nan]"
2,2,8,"[101.0, nan]","[0.0, nan]","[101.0, nan]","[101.0, nan]"
3,3,10,"[101.0, nan]","[0.0, nan]","[101.0, nan]","[101.0, nan]"
4,4,8,"[102.5, nan]","[1.5, nan]","[101.0, nan]","[104.0, nan]"
5,5,6,"[104.0, nan]","[0.0, nan]","[104.0, nan]","[104.0, nan]"


In [17]:
pd_hof = pd.DataFrame(hof)
pd_hof

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,179,180,181,182,183,184,185,186,187,188
0,1,1,0,1,1,0,1,1,1,1,...,0,0,1,1,1,0,1,1,1,1


In [25]:
hof

<deap.tools.support.ParetoFront at 0x2502c67eb30>

In [18]:
pd_pop = pd.DataFrame(pop)
pd_pop

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,179,180,181,182,183,184,185,186,187,188
0,1,1,0,1,1,0,1,1,1,1,...,0,0,1,1,1,0,1,1,1,1
1,1,1,0,1,1,0,1,1,0,1,...,1,1,0,1,1,1,1,1,1,1
2,1,1,0,1,1,0,1,1,1,1,...,0,0,1,1,1,0,1,1,1,1
3,1,1,0,1,1,0,1,1,1,1,...,0,0,1,1,1,0,1,1,1,1
4,1,1,0,1,1,0,1,1,0,1,...,1,0,0,1,1,1,1,1,1,1
5,1,1,0,1,1,0,1,1,0,1,...,1,1,0,1,1,1,1,1,1,1
6,1,1,0,1,1,0,1,1,1,1,...,0,0,1,1,1,0,1,1,1,1
7,1,1,0,1,1,0,1,1,0,1,...,1,1,0,1,1,1,1,1,1,1
8,1,1,0,1,1,0,1,1,0,1,...,1,1,0,1,1,1,1,1,1,1
9,0,0,1,1,1,0,1,1,0,1,...,1,1,0,1,1,1,1,1,1,1


In [24]:
best_individual = tools.selBest(pop, 1)[0]
best_individual_array = np.array(best_individual)
print(best_individual_array)
print('C : 2 ** {0}'.format(round(best_individual_array[-3])))
print('Epsilon : 2 ** {0}'.format(round(best_individual_array[-2])))
print('Gamma : 2 ** {0}'.format(round(best_individual_array[-1])))

[1 1 0 1 1 0 1 1 1 1 0 0 1 1 1 0 0 1 0 0 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 0 0
 0 0 0 0 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 0 0 1 0 0 1 1 1 1 1 1 1 0 1 1 0 0 0
 0 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 0 0 1 1 1 1 1 1 1 0 0 0 1 1 1 1 0 0
 1 1 1 1 0 1 1 0 1 1 1 0 1 0 0 1 1 1 1 0 1 1 0 1 1 0 1 0 0 1 0 1 0 1 1 1 0
 1 1 1 1 0 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 0 0 1 1 1 0
 1 1 1 1]
C : 2 ** 1
Epsilon : 2 ** 1
Gamma : 2 ** 1


In [None]:
y_pred = rf.predict(x_test)

In [None]:
# prediction against pIC50
%matplotlib inline
import matplotlib.pyplot as plt
plt.scatter(y_test, y_pred)
plt.xlabel('exp')
plt.ylabel('pred')
r2_score(y_test, y_pred)
score = rf_fit.score(x_test, y_test)
print('Test set score: {}'.format(score))

In [None]:
test_mol = Chem.MolFromSmiles('C[C@]12CC[C@@H]3c4ccc(O)cc4CC[C@H]3[C@@H]1CC[C@@H]2O') 
#予測したい化合物のcanonical smilesを入力してください。
test_mol #入力したcanonical smilesの構造確認用


In [None]:
fps_test = []
fp_test = Chem.RDKFingerprint(test_mol, 2)
arr_test = np.zeros((1,))
DataStructs.ConvertToNumpyArray(fp_test, arr_test)
fps_test.append(arr_test)

fps_test = np.array(fps_test)

y_pred_test = rf.predict(fps_test)
y_pred_test