In [4]:
import pandas as pd
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, Descriptors
from rdkit.Chem import PandasTools

In [41]:
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, cohen_kappa_score, matthews_corrcoef
import joblib

In [7]:
# Load the CHEMBL234 Ki dataset from CSV and export it as an SDF file.
# Requires `pd` (pandas) and `PandasTools` (rdkit.Chem) from the import cell.
csv_path = "C:\\Users\\vswen\\Documents\\1. Biomedische Technologie\\BMT JAAR 5\\Kwart 4\\4. Data\\CHEMBL234_Ki.csv"
sdf_path = "C:\\Users\\vswen\\Documents\\1. Biomedische Technologie\\BMT JAAR 5\\Kwart 4\\4. Data\\CHEMBL234_Ki.sdf"

df = pd.read_csv(csv_path)

# Build an RDKit molecule from each entry of the "smiles" column; the
# molecules are stored in the "ROMol" column that the next line reads.
PandasTools.AddMoleculeColumnToFrame(df, "smiles")
df["Mol_H"] = df["ROMol"].apply(Chem.AddHs)

# Export only the plain data columns as SD properties. The molecule-object
# columns are left out so that their Python object repr is not written into
# the file as a text property.
mol_columns = ("ROMol", "Mol_H")
sdf_properties = [col for col in df.columns if col not in mol_columns]
PandasTools.WriteSDF(
    df,
    sdf_path,
    molColName="ROMol",
    idName=None,
    properties=sdf_properties,
    allNumeric=False,
)

In [9]:
# Path of the SDF file that the molecule-reading cell below iterates over.
# NOTE(review): the export cell above writes "CHEMBL234_Ki.sdf", whereas this
# path points to "SDF_File.sdf" — confirm that this is the intended input file.
filename = "C:\\Users\\vswen\\Documents\\1. Biomedische Technologie\\BMT JAAR 5\\Kwart 4\\4. Data\\SDF_File.sdf"

In [19]:
# Read the molecules and their "cliff_mol" labels from the SDF file.
# Records that the supplier yields as None are dropped, so `mols` and `y`
# always have the same length and order.

mols = [entry for entry in Chem.SDMolSupplier(filename) if entry is not None]
y = [entry.GetIntProp("cliff_mol") for entry in mols]

In [21]:
# Compute the descriptors: one binary Morgan fingerprint per molecule.
# They are converted into a numpy array in a later cell.

## Radius 2; nBits=2048 is the RDKit default, spelled out here for clarity
fp = [
    AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=2048)
    for molecule in mols
]

In [22]:
def rdkit_numpy_convert(fp):
    """Convert an iterable of RDKit fingerprints into one numpy array.

    Each fingerprint is copied into its own numpy array with
    ``DataStructs.ConvertToNumpyArray``; the per-fingerprint arrays are then
    combined with ``np.asarray``.

    Parameters
    ----------
    fp : iterable
        Fingerprint objects accepted by ``DataStructs.ConvertToNumpyArray``.

    Returns
    -------
    numpy.ndarray
        The result of ``np.asarray`` over the converted fingerprints, in
        input order.
    """

    def _as_array(bit_vector):
        # Start from a 1-element float placeholder and let RDKit fill it in.
        target = np.zeros((1,))
        DataStructs.ConvertToNumpyArray(bit_vector, target)
        return target

    return np.asarray([_as_array(bit_vector) for bit_vector in fp])

In [23]:
# Convert all fingerprints into the feature array and display its dimensions
x = rdkit_numpy_convert(fp)
np.shape(x)

(3657, 2048)

In [24]:
# Mean of the labels in y (the fraction of 1s if the labels are binary 0/1)
label_sum = sum(y)
label_sum / len(y)

0.3940388296417829

In [25]:
# Random seed passed as random_state to the train/test split and to the
# cross-validation splitter below, so that both are reproducible

seed = 42

In [26]:
# Hold-out split into training and test sets
## 20% of the compounds are selected at random as the test set.
## No `stratify` argument is passed, so the split is not stratified by label.

x_tr, x_ts, y_tr, y_ts = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=seed,
)

In [35]:
# Define the 5-fold stratified splitter used for cross-validation
## Original note (translated from Dutch): "Originally it said random_state=seed
## but this gave an error!" — presumably the error came from passing
## random_state without shuffle=True; the call below sets both. TODO confirm.

cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=seed,
)

In [36]:
# Show which training-set row indices fall into the train/test part of each fold
for fold_number, (train_index, test_index) in enumerate(cv.split(x_tr, y_tr), start=1):
    print(f"\nFold_{fold_number}")
    print("TRAIN:", train_index)
    print("TEST:", test_index)


Fold_1
TRAIN: [   0    1    2 ... 2922 2923 2924]
TEST: [  14   21   23   28   29   30   31   33   35   37   43   44   47   49
   52   57   60   62   73   74   78   84   97  104  123  124  130  131
  135  154  161  169  171  175  190  191  192  209  211  216  217  221
  223  224  227  228  229  232  241  243  248  253  256  258  268  273
  274  275  281  291  296  303  314  319  328  329  334  335  345  349
  357  359  360  365  367  372  374  375  380  384  388  392  395  401
  407  412  413  414  429  433  436  441  451  452  460  463  465  469
  473  474  477  479  481  501  512  515  516  518  523  525  533  537
  544  558  561  567  570  576  583  593  595  608  609  611  619  623
  638  642  643  648  649  650  653  654  656  668  671  709  710  711
  718  725  728  734  735  736  738  740  742  759  760  762  767  771
  772  791  803  809  819  821  828  832  838  847  857  859  861  865
  868  877  887  892  900  906  918  923  925  934  936  938  941  944
  948  953  958  959

In [37]:
# Fit a StandardScaler on the training features and replace x_tr by its
# scaled version.
## The fitted `scale` object can be applied to any other data so that it is
## scaled consistently with the training set.
## NOTE: x_tr is overwritten here, so re-running this cell without first
## re-running the train/test split fits the scaler on already-scaled data.

scale = StandardScaler()
x_tr = scale.fit_transform(x_tr)

In [43]:
# Persist the fitted scaler to disk (compress=3) so it can be reloaded later

scaler_path = "C:\\Users\\vswen\\Documents\\1. Biomedische Technologie\\BMT JAAR 5\\Kwart 4\\4. Data\\CHEMBL234_Ki_scale.pkl"
joblib.dump(scale, scaler_path, compress=3)

['C:\\Users\\vswen\\Documents\\1. Biomedische Technologie\\BMT JAAR 5\\Kwart 4\\4. Data\\CHEMBL234_Ki_scale.pkl']