In [14]:
import pandas as pd
import numpy as np
from glob import glob
from padelpy import padeldescriptor
import os

In [11]:
# Load the ligand table (indexed by LigandID) and reduce it to the columns
# needed for fingerprint calculation.
df = pd.read_csv('test.csv', index_col="LigandID")
#select the first 5 rows
df = df.head(5)
# Drop every column that has a missing value in those five rows.
df = df.dropna(axis = 1)
# errors='ignore': the dropna above may already have removed some of these
# columns (any of them containing a NaN in the first five rows); without it
# drop() raises KeyError for the column that is no longer present.
df = df.drop(
    columns = ['Molecule Max Phase', 'Standard Relation', 'pIC50', 'MW', 'LogP', 'HBA', 'HBD', 'Pass', 'class'],
    errors='ignore',
)
df

Unnamed: 0_level_0,canonical_smiles
LigandID,Unnamed: 1_level_1
CHEMBL3971505,CC(C)(O)[C@@H](NC(=O)Nc1cc2[nH]nc(N3CCC3)c2c(C...
CHEMBL4114320,C[C@@H](O)[C@@H](NC(=O)Nc1cc2[nH]nc(N3CCOCC3)c...
CHEMBL4106855,COc1n[nH]c2cc(NC(=O)N[C@@H](c3ccccc3)[C@H](O)C...
CHEMBL3654689,Cc1cc(-c2n[nH]c3cc(NC(=O)NC=C4CC4)ncc23)ccn1
CHEMBL4107477,CCCN1C[C@H](NC(=O)Nc2cc3[nH]nc(OCC(F)F)c3cn2)[...


In [12]:
# Write the SMILES strings, one per line, with no header and no index column.
df.loc[:, 'canonical_smiles'].to_csv(
    path_or_buf='smiles.smi',
    sep='\t',
    header=False,
    index=False,
)

In [16]:
#calculate 12 fingerprints
# Descriptor-definition XML files found in the working directory, sorted by
# file name.
xml_files = glob("*.xml")
xml_files.sort()
print(xml_files)
#set fingerprint list
# The order here must match the sorted order of the XML file names, because
# names and files are paired positionally below.
FP_list = [
 'AtomPairs2DCount',
 'AtomPairs2D',
 'EState',
 'CDKextended',
 'CDK',
 'CDKgraphonly',
 'KlekotaRothCount',
 'KlekotaRoth',
 'MACCS',
 'PubChem',
 'SubstructureCount',
 'Substructure']

# zip() silently truncates (or mis-pairs, if an unrelated .xml is present),
# so fail loudly when the file count is not what the name list expects.
if len(xml_files) != len(FP_list):
    raise ValueError(
        'Expected %d fingerprint XML files, found %d: %s'
        % (len(FP_list), len(xml_files), xml_files)
    )

fp = dict(zip(FP_list, xml_files))
print(fp)

# Make sure the output folder exists so the cell does not depend on it having
# been created by hand.
os.makedirs('fingerprints', exist_ok=True)

#Calculate fingerprints
for i in FP_list:
    fingerprint = i
    fingerprint_output_file = os.path.join('fingerprints', fingerprint + '.csv')
    fingerprint_descriptortypes = fp[fingerprint]
    padeldescriptor(mol_dir='smiles.smi',
                d_file=fingerprint_output_file,
                descriptortypes= fingerprint_descriptortypes,
                retainorder=True, 
                removesalt=True,
                threads=2,
                detectaromaticity=True,
                standardizetautomers=True,
                standardizenitro=True,
                fingerprints=True
                )
    # Re-key the output rows by ligand ID. This is positional, so it assumes
    # the output rows are in the same order as smiles.smi (presumably what
    # retainorder=True guarantees -- confirm against the padelpy docs).
    Fingerprint = pd.read_csv(fingerprint_output_file).set_index(df.index)
    # Drop the 'Name' column from the descriptor output; LigandID replaces it.
    Fingerprint = Fingerprint.drop('Name', axis=1)
    Fingerprint.to_csv(fingerprint_output_file)
    print(fingerprint_output_file, 'done')

['AtomPairs2DFingerprintCount.xml', 'AtomPairs2DFingerprinter.xml', 'EStateFingerprinter.xml', 'ExtendedFingerprinter.xml', 'Fingerprinter.xml', 'GraphOnlyFingerprinter.xml', 'KlekotaRothFingerprintCount.xml', 'KlekotaRothFingerprinter.xml', 'MACCSFingerprinter.xml', 'PubchemFingerprinter.xml', 'SubstructureFingerprintCount.xml', 'SubstructureFingerprinter.xml']
{'AtomPairs2DCount': 'AtomPairs2DFingerprintCount.xml', 'AtomPairs2D': 'AtomPairs2DFingerprinter.xml', 'EState': 'EStateFingerprinter.xml', 'CDKextended': 'ExtendedFingerprinter.xml', 'CDK': 'Fingerprinter.xml', 'CDKgraphonly': 'GraphOnlyFingerprinter.xml', 'KlekotaRothCount': 'KlekotaRothFingerprintCount.xml', 'KlekotaRoth': 'KlekotaRothFingerprinter.xml', 'MACCS': 'MACCSFingerprinter.xml', 'PubChem': 'PubchemFingerprinter.xml', 'SubstructureCount': 'SubstructureFingerprintCount.xml', 'Substructure': 'SubstructureFingerprinter.xml'}
fingerprints/AtomPairs2DCount.csv done
fingerprints/AtomPairs2D.csv done
fingerprints/EState.cs

In [28]:
#load fingerprints
#matching fingerprints with Lipinski filtered dataset
# Copy each fingerprint table to fingerprints/fp_test_<name>.csv.
for i in FP_list:
    Fingerprint = pd.read_csv(os.path.join('fingerprints', i + '.csv'), index_col='LigandID')
    Fingerprint.to_csv(os.path.join('fingerprints', 'fp_test_' + i + '.csv'))
#create dictionary for fingerprints
#list
fp_list = [
 'AtomPairs2D',
 'AtomPairs2DCount',
 'CDK',
 'CDKextended',
 'CDKgraphonly',
 'EState',
 'KlekotaRoth',
 'KlekotaRothCount',
 'MACCS',
 'PubChem',
 'Substructure',
 'SubstructureCount']
# Build each path from the fingerprint name instead of zipping the names
# against a sorted glob: the glob-based pairing silently shifts whenever an
# extra fp_test_*.csv file is present in the folder.
fp_test_list = {
    fp_name: os.path.join('fingerprints', 'fp_test_' + fp_name + '.csv')
    for fp_name in fp_list
}
testlist = [fp_test_list[fp_name] for fp_name in fp_list]
# Fingerprint name -> DataFrame indexed by LigandID.
fp_test  = {}
for i in FP_list:
    fp_test[i]  = pd.read_csv(fp_test_list[i], index_col='LigandID')
    print(fp_test[i])

               APC2D1_C_C  APC2D1_C_N  APC2D1_C_O  APC2D1_C_S  APC2D1_C_P  \
LigandID                                                                    
CHEMBL3971505        18.0        11.0         3.0         0.0         0.0   
CHEMBL4114320        17.0        11.0         5.0         0.0         0.0   
CHEMBL4106855        14.0         8.0         5.0         0.0         0.0   
CHEMBL3654689        15.0        10.0         1.0         0.0         0.0   
CHEMBL4107477        18.0        11.0         3.0         0.0         0.0   

               APC2D1_C_F  APC2D1_C_Cl  APC2D1_C_Br  APC2D1_C_I  APC2D1_C_B  \
LigandID                                                                      
CHEMBL3971505         1.0          0.0          0.0         0.0         0.0   
CHEMBL4114320         1.0          0.0          0.0         0.0         0.0   
CHEMBL4106855         0.0          0.0          0.0         0.0         0.0   
CHEMBL3654689         0.0          0.0          0.0         0.0  

In [40]:
#load models
model_list = [
'LR-L2',
'MLP',
'NB',
'RF',
'SVC-linear',
'SVC-poly',
'SVC-rbf',
'XGB',
'kNN']
from joblib import load

# Make sure the output folder exists before the first to_csv call.
os.makedirs('predictions', exist_ok=True)

Model = {}
y_test_predict = {}
for name in model_list:
    for i in FP_list:
        # NOTE: joblib.load unpickles the file and can execute arbitrary code;
        # only load model files from a trusted source.
        Model[i] = load(os.path.join('models','models-fp', name+'_reg_'+i+'.joblib'))
        y_test_predict[i] = Model[i].predict(fp_test[i].values)
    # Output column order: <model>_<fingerprint>, fingerprints alphabetical.
    columns_list = [name + '_' + fp_name for fp_name in sorted(FP_list)]
    #save data for next meta-training
    # Label every prediction column by the fingerprint that produced it.
    # Previously the columns were created in FP_list order and then renamed
    # positionally with the alphabetical list, which attached the wrong
    # fingerprint name to most columns (e.g. AtomPairs2DCount predictions were
    # saved as <model>_AtomPairs2D).
    labelled_predictions = {
        name + '_' + fp_name: y_test_predict[fp_name] for fp_name in FP_list
    }
    # All fingerprint tables are read with index_col='LigandID'; use the last
    # one for the row labels, as before, but without relying on the leaked
    # loop variable.
    ligand_index = fp_test[FP_list[-1]].index
    df_test = pd.DataFrame(labelled_predictions).set_index(ligand_index)
    # Reorder to the established column layout.
    df_test = df_test[columns_list]
    df_test.to_csv(os.path.join('predictions', name+'_reg_test.csv'))


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-

In [47]:
#load Predictive features
# Base-model names; each has a predictions/<name>_reg_test.csv file.
fp_list = [
'LR-L2',
'MLP',
'NB',
'RF',
'SVC-linear',
'SVC-poly',
'SVC-rbf',
'XGB',
'kNN']
# Build each path from the model name instead of zipping the names against a
# sorted glob of predictions/*.csv: any additional CSV in that folder would
# silently pair model names with the wrong files.
fp_test_list = {
    model_name: os.path.join('predictions', model_name + '_reg_test.csv')
    for model_name in fp_list
}
testlist = list(fp_test_list.values())
print(fp_test_list)
# Model name -> DataFrame of that model's predictions, indexed by LigandID.
fp_test = {}
for i in fp_list:
    fp_test[i] = pd.read_csv(fp_test_list[i],index_col='LigandID')
# Side-by-side concatenation: one column per (model, fingerprint) pair.
x_test = pd.concat([fp_test[i] for i in fp_list], axis=1)
print(x_test.shape)
# Subset of the prediction columns to keep.
select_features = ['LR-L2_AtomPairs2D', 'LR-L2_AtomPairs2DCount', 'LR-L2_CDK',
       'LR-L2_CDKextended', 'LR-L2_CDKgraphonly', 'LR-L2_EState',
       'LR-L2_KlekotaRoth', 'LR-L2_KlekotaRothCount', 'LR-L2_MACCS',
       'LR-L2_PubChem', 'LR-L2_Substructure', 'LR-L2_SubstructureCount',
       'MLP_AtomPairs2D', 'MLP_AtomPairs2DCount', 'MLP_CDK', 'MLP_CDKextended',
       'MLP_CDKgraphonly', 'MLP_EState', 'MLP_KlekotaRoth',
       'MLP_KlekotaRothCount', 'MLP_MACCS', 'MLP_PubChem', 'MLP_Substructure',
       'MLP_SubstructureCount', 'NB_AtomPairs2D', 'NB_AtomPairs2DCount',
       'NB_CDK', 'NB_CDKextended', 'NB_CDKgraphonly', 'NB_EState',
       'NB_KlekotaRoth', 'NB_KlekotaRothCount', 'NB_MACCS', 'NB_PubChem',
       'NB_Substructure', 'NB_SubstructureCount', 'RF_AtomPairs2D',
       'RF_AtomPairs2DCount', 'RF_CDK', 'RF_CDKextended', 'RF_CDKgraphonly',
       'RF_EState', 'RF_KlekotaRoth', 'RF_KlekotaRothCount', 'RF_MACCS',
       'RF_PubChem', 'RF_Substructure', 'RF_SubstructureCount',
       'SVC-linear_AtomPairs2D', 'SVC-linear_CDK', 'SVC-linear_CDKextended',
       'SVC-linear_CDKgraphonly', 'SVC-linear_KlekotaRoth',
       'SVC-linear_KlekotaRothCount', 'SVC-linear_MACCS', 'SVC-linear_PubChem',
       'SVC-linear_SubstructureCount', 'SVC-poly_AtomPairs2D',
       'SVC-poly_AtomPairs2DCount', 'SVC-poly_CDK', 'SVC-poly_CDKextended',
       'SVC-poly_CDKgraphonly', 'SVC-poly_KlekotaRoth',
       'SVC-poly_KlekotaRothCount', 'SVC-poly_MACCS', 'SVC-poly_PubChem',
       'SVC-poly_SubstructureCount', 'SVC-rbf_AtomPairs2DCount', 'SVC-rbf_CDK',
       'SVC-rbf_CDKextended', 'SVC-rbf_CDKgraphonly', 'SVC-rbf_KlekotaRoth',
       'SVC-rbf_KlekotaRothCount', 'SVC-rbf_PubChem', 'SVC-rbf_Substructure',
       'XGB_AtomPairs2D', 'XGB_AtomPairs2DCount', 'XGB_CDK', 'XGB_CDKextended',
       'XGB_CDKgraphonly', 'XGB_EState', 'XGB_KlekotaRoth',
       'XGB_KlekotaRothCount', 'XGB_MACCS', 'XGB_PubChem', 'XGB_Substructure',
       'XGB_SubstructureCount', 'kNN_AtomPairs2D', 'kNN_AtomPairs2DCount',
       'kNN_CDK', 'kNN_CDKextended', 'kNN_CDKgraphonly', 'kNN_EState',
       'kNN_KlekotaRoth', 'kNN_KlekotaRothCount', 'kNN_MACCS', 'kNN_PubChem',
       'kNN_Substructure', 'kNN_SubstructureCount']
x_test_select = x_test[select_features]
print(x_test_select.shape)
name = 'Stack'
fp = 'All'
from joblib import load
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load model files from a trusted source.
# NOTE(review): the saved output of this cell shows the load failing with
# "node array from the pickle has an incompatible dtype", which points to the
# model having been pickled under a different scikit-learn version than the one
# installed here -- load it with the version used for training, or re-export
# the model.
Model = load('models/'+name+fp+'.joblib')
Model

{'LR-L2': 'predictions/LR-L2_reg_test.csv', 'MLP': 'predictions/MLP_reg_test.csv', 'NB': 'predictions/NB_reg_test.csv', 'RF': 'predictions/RF_reg_test.csv', 'SVC-linear': 'predictions/SVC-linear_reg_test.csv', 'SVC-poly': 'predictions/SVC-poly_reg_test.csv', 'SVC-rbf': 'predictions/SVC-rbf_reg_test.csv', 'XGB': 'predictions/XGB_reg_test.csv', 'kNN': 'predictions/kNN_reg_test.csv'}
(5, 108)
(5, 99)


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


ValueError: node array from the pickle has an incompatible dtype:
- expected: {'names': ['left_child', 'right_child', 'feature', 'threshold', 'impurity', 'n_node_samples', 'weighted_n_node_samples', 'missing_go_to_left'], 'formats': ['<i8', '<i8', '<i8', '<f8', '<f8', '<i8', '<f8', 'u1'], 'offsets': [0, 8, 16, 24, 32, 40, 48, 56], 'itemsize': 64}
- got     : [('left_child', '<i8'), ('right_child', '<i8'), ('feature', '<i8'), ('threshold', '<f8'), ('impurity', '<f8'), ('n_node_samples', '<i8'), ('weighted_n_node_samples', '<f8')]

In [None]:
# Run the loaded stacked model on the concatenated base-model predictions.
# NOTE(review): the preceding cell also builds x_test_select (the
# select_features subset of x_test) but it is not used here -- confirm which of
# the two matrices the stacked model was trained on.
y_test_predict  = Model.predict(x_test)
# Bare expression: display the predictions as the cell output.
y_test_predict