# Baseline Methods for Molecular Property Prediction
*by: Derek Jones*

This script implements methods in order to benchmark the performance of the neural fingerprinting methods for the
    task of predicting dragon features from the smiles input

In [26]:
import multiprocessing as mp
from multiprocessing import Pool, Queue
from sklearn.linear_model import LinearRegression
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
import numpy as np
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler, Normalizer

targets = ["Hy"]

# Load data

In [6]:
def load_data(csv_file,fp_radius=2, fp_length=1024, targets=None, scaling=None, corrupt_path=None):
    cols = ["receptor", "drugID", "smiles", "label"] + targets
    print("loading data...")
    data = pd.read_csv(csv_file, usecols=cols)
    corrupt_compound_df = pd.read_csv(corrupt_path)
    data = data[~data.drugID.isin(corrupt_compound_df.drugID)]
    if scaling == "std":
        print("standardizing targets...")
        data[targets] = StandardScaler().fit_transform(data[targets])
    elif scaling == "norm":
        print("normalizing targets...")
        data[targets] = Normalizer().fit_transform(data[targets])
    elif scaling is not None:
        raise Exception("preprocessing method not implemented.")

    return data

def get_mol_job(smile):
    return Chem.MolFromSmiles(smile)

def get_fp_job(mol, fp_radius=2, nBits=1024):
    return AllChem.GetMorganFingerprintAsBitVect(mol, fp_radius, nBits=nBits)

def get_fp_data_job(fp):
    x = np.ones(fp.GetNumBits())
    DataStructs.ConvertToNumpyArray(fp,x)
    return x


def get_fp_data(data):
    return pd.DataFrame([get_fp_data_job(get_fp_job(get_mol_job(smile))) for smile in data["smiles"]])


def parallelize(data, func, workers, chunksize=10):
    data_split = np.array_split(data, chunksize)
    print("creating worker pool...")
    pool = Pool(processes=workers)
    new_data = pd.concat(pool.map(func, data_split))  # may need special function for feeding multiple args
    print("closing worker pool...")
    pool.close()
    pool.join()
    return new_data


In [4]:
data = load_data(csv_file="/scratch/wdjo224/data/deep_protein_binding/kinase_no_duplicates_with_smiles.csv",
              corrupt_path="/scratch/wdjo224/data/deep_protein_binding/corrupt_inputs.csv", targets=targets,
              scaling="std")

loading data...
standardizing targets...


# Compute Fingerprints

In [8]:
print("computing fingerprints...")
fps = parallelize(func=get_fp_data, data=data, workers=10, chunksize=4)
print("preprocessing complete.")

computing fingerprints...
creating worker pool...
closing worker pool...
preprocessing complete.


# Train on the Data

In [34]:
from sklearn.metrics import r2_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import SGDRegressor

train_idxs = np.fromfile("/scratch/wdjo224/deep_protein_binding/src/train.npy", dtype=np.int)
test_idxs = np.fromfile("/scratch/wdjo224/deep_protein_binding/src/test.npy", dtype=np.int)

# model = LinearRegression(n_jobs=2)

x_train = fps.iloc[train_idxs].as_matrix()
y_train = data[targets].iloc[train_idxs].as_matrix()
x_test = fps.iloc[test_idxs].as_matrix()
y_test = data[targets].iloc[test_idxs].as_matrix()

model.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


SGDRegressor(alpha=0.0001, average=False, epsilon=0.1, eta0=0.01,
       fit_intercept=True, l1_ratio=0.15, learning_rate='invscaling',
       loss='squared_loss', max_iter=10, n_iter=None, penalty='l2',
       power_t=0.25, random_state=None, shuffle=True, tol=None, verbose=0,
       warm_start=False)

# Evaluate Results

In [35]:
preds = model.predict(x_test)

print("test r2: {}".format(r2_score(y_pred=preds, y_true=y_test)))

test r2: 0.6922587539242938
