This file trains and compares simple models from scikit-learn.<br>
Given the number of molecules in the dataset, they are likely to perform better than neural networks would.

<p>
The task is to distinguish whether a molecule has a pIC50>8 or not. Therefore, we are interested in binary classifiers, like SVM, random forest, and naive Bayes methods. Loss for them is typically measured with cross entropy, while their validation is often scored on accuracy, recall and derived measures (F1), as well as the area under the ROC curve.

# Load the libraries

In [1]:
import pickle
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from tqdm import tqdm, trange

from CustomMolDataset import CustomMolDataset, dataBlocks

# Load the data

In [12]:
# Unpickle the molecule collection used to build the datasets.
with open("molecules_3D.pickle", "rb") as mol_file:
    ligs = pickle.load(mol_file)

# Unpickle the two pre-computed feature filters: one for the feature set
# that includes 3D descriptors and one for the 2D-only feature set.
with open("X_filt_w3D.pickle", "rb") as filt_file:
    X_filt_w3D = pickle.load(filt_file)
with open("X_filt_no3D.pickle", "rb") as filt_file:
    X_filt_no3D = pickle.load(filt_file)


# Names of the descriptor blocks that make up the 2D-only feature set.
DescriptorBlocknames_no3D = [
    "MACCS",
    "rdkitFP",
    "MorganFP2",
    "MorganFP3",
    "Descriptors",
    "EState_FP",
    "Graph_desc",
    "MOE",
    "MQN",
    "AUTOCORR2D",
    "PEOE_VSA",
    "SMR_VSA",
    "SlogP_VSA",
]
# The 2D+3D feature set is the 2D-only set plus these additional blocks.
DescriptorBlocknames_w3D = DescriptorBlocknames_no3D + [
    "AUTOCORR3D",
    "WHIM",
    "RDF",
    "USR",
    "USRCUT",
    "MORSE",
]

# One 0/1 entry per dataBlocks member (looked up by integer value
# 0 .. len(dataBlocks)-1): 1 if the member's name is in the chosen list.
# These lists are later passed to CustomMolDataset as representation_flags.
flags_no3D = [
    1 if dataBlocks(block_id).name in DescriptorBlocknames_no3D else 0
    for block_id in range(len(dataBlocks))
]
flags_w3D = [
    1 if dataBlocks(block_id).name in DescriptorBlocknames_w3D else 0
    for block_id in range(len(dataBlocks))
]

# Dataset restricted to the 2D-only descriptor blocks.
DB_no3D = CustomMolDataset(
    ligs,
    representation_flags=flags_no3D,
    X_filter=X_filt_no3D,         # feature filter for the 2D-only set
    normalize_x=True,             # features are normalized
    use_hdf5_cache="read-only",   # read the cache from file
    name="EGFR_set_all_features",
)
# Finding the normalization factors also builds a faster in-memory cache.
DB_no3D.find_normalization_factors()
# Close the file handle so that the next dataset can also access the cache.
DB_no3D.cache_fp.close()


# Dataset using the 2D + 3D descriptor blocks.
DB_w3D = CustomMolDataset(ligs,
                  representation_flags = flags_w3D,
                  X_filter=X_filt_w3D, # feature filter
                  normalize_x = True,   # now we normalize the features
                  use_hdf5_cache = "read-only", # read cache from file
                  name = "EGFR_set_all_features"
                 )
# These two calls must target DB_w3D. They previously repeated the DB_no3D
# calls (copy-paste slip), which left DB_w3D without normalization factors
# and without its in-memory cache, and never closed its cache file handle.
DB_w3D.find_normalization_factors() # will also build a faster in-memory cache
DB_w3D.cache_fp.close() # release this dataset's cache file handle

# Train/Validation and cross-validation splits
Some models have meta-parameters that need tuning. This should not be done on the validation set, so we will do this tuning on cross-validation splits within the training set.

In [13]:
from sklearn.model_selection import KFold, train_test_split

# Split configuration.
split_seed = 123450   # random_state used for the train/test split
test_frac = 0.2       # fraction of the samples held out as the test set
xval_folds = 5        # number of cross-validation folds

# Train/test split of the 2D-only dataset. The two cache entries are passed
# in order, so the first one ends up in the *_x outputs and the second one
# in the *_y outputs.
no3D_train_x, no3D_test_x, no3D_train_y, no3D_test_y = train_test_split(
    DB_no3D.internal_filtered_cache[0],
    DB_no3D.internal_filtered_cache[1],
    test_size=test_frac,
    random_state=split_seed,
)

# Train/test split of the 2D+3D dataset.
# This must read from DB_w3D: it previously split DB_no3D's cache again
# (copy-paste slip), so the "w3D" arrays were identical to the 2D-only ones.
# The same test_size and random_state as for the 2D-only split are used.
# NOTE(review): presumably requires DB_w3D.find_normalization_factors() to
# have been called beforehand so that internal_filtered_cache exists - confirm.
(
    w3D_train_x,
    w3D_test_x,
    w3D_train_y,
    w3D_test_y,
) = train_test_split(DB_w3D.internal_filtered_cache[0],
                     DB_w3D.internal_filtered_cache[1],
                     test_size=test_frac, random_state=split_seed)

# Cross-validation splitter with shuffling. Its seed is split_seed shifted by
# a fixed offset (described by the author as a prime), so it differs from the
# seed used for the train/test split.
xval_seed = split_seed + 1978727
kf = KFold(n_splits=xval_folds, shuffle=True, random_state=xval_seed)

# Set up lists of models and their performances

In [None]:
# Result accumulators, each starting out as its own empty list.
# Presumably filled in lockstep, one entry per trained model - confirm
# against the cells that append to them.
(
    model_names,
    model_AUCs,
    model_L1s,
    model_ROCs_TPR,
    model_ROCs_FPR,
) = ([] for _ in range(5))

# Functions that summarize classifier performance

In [None]:
# TODO

# Train SVMs

In [15]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and C=0.025.
# NOTE(review): per the scikit-learn docs, random_state only affects SVC when
# probability=True - confirm whether it is needed here.
svm_lin = SVC(C=0.025, kernel="linear", random_state=42)