In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import normalize
from deepchem.utils.vina_utils import prepare_inputs
import deepchem as dc
from rdkit import Chem

from deepchem.utils.evaluate import Evaluator
from sklearn.ensemble import RandomForestRegressor

In [19]:
# Read the structure table and pull its 'smiles' column out as a NumPy array;
# later cells parse each entry with RDKit.
structure_df = pd.read_csv("../data/structures.csv")
smiles_column = structure_df['smiles']
structures = np.array(smiles_column)

In [3]:
# Read the label table. NOTE: the column key is ' G' with a leading space,
# exactly as written here; presumably it matches the CSV header -- confirm.
affinity_df = pd.read_csv("../data/test.csv")
affinities = np.array(affinity_df[' G'])

# Rescale the whole label vector by its norm (np.linalg.norm over all values),
# so the scaled vector has unit length rather than unit variance.
affinity_norm = np.linalg.norm(affinities)
affinities_normalized = affinities / affinity_norm

In [22]:
# Circular fingerprints of length 2048 are the model's input features.
fp_featurizer = dc.feat.CircularFingerprint(size=2048)

# Parse every SMILES string with RDKit. Chem.MolFromSmiles returns None
# (instead of raising) when a string cannot be parsed.
mols = [Chem.MolFromSmiles(smiles) for smiles in structures]

# Fail fast on unparsable input: a None molecule cannot be fingerprinted, and
# silently passing it on would surface later as a much less obvious error.
invalid_smiles = [smiles for smiles, mol in zip(structures, mols) if mol is None]
if invalid_smiles:
    raise ValueError(
        "RDKit could not parse %d SMILES string(s); first offender: %r"
        % (len(invalid_smiles), invalid_smiles[0])
    )

features = fp_featurizer.featurize(mols)

In [27]:
# Every sample carries the same id string, repeated once per structure.
pdbid = '1O7S'
sample_ids = [pdbid] * len(structures)

# Bundle fingerprints and scaled labels, then split randomly with a fixed
# seed so the train/test partition is repeatable.
dataset = dc.data.NumpyDataset(X=features, y=affinities_normalized, ids=sample_ids)
splitter = dc.splits.RandomSplitter()
train_dataset, test_dataset = splitter.train_test_split(dataset, seed=42)

In [29]:
# Random forest with 100 trees and sqrt feature sub-sampling; the seed is
# passed to the constructor so the fitted forest is repeatable.
seed = 42
sklearn_model = RandomForestRegressor(
    n_estimators=100,
    max_features='sqrt',
    random_state=seed,
)

# Wrap the scikit-learn estimator so it can be fitted on a DeepChem dataset.
model = dc.models.SklearnModel(sklearn_model)
model.fit(train_dataset)

In [30]:
# Score with DeepChem's pearson_r2_score (squared Pearson correlation), which
# is never negative.
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

# Evaluate the fitted model on each split with an empty transformer list,
# collecting the per-split metric dicts by name.
split_scores = {}
for split_name, split_data in (("train", train_dataset), ("test", test_dataset)):
    evaluator = Evaluator(model, split_data, [])
    split_scores[split_name] = evaluator.compute_model_performance([metric])

train_r2score = split_scores["train"]
test_r2score = split_scores["test"]

print("RF Train set R^2 %f" % (train_r2score["pearson_r2_score"]))
print("RF Test set R^2 %f" % (test_r2score["pearson_r2_score"]))

RF Train set R^2 0.938019
RF Test set R^2 0.230368


In [33]:
# Pair each test-set prediction with its true (scaled) label for inspection.
test_predictions = model.predict(test_dataset)
list(zip(test_predictions, test_dataset.y))


[(0.019739692715119404, 0.0),
 (0.03852270427029297, 0.0030860682578898635),
 (0.06496516579331262, 0.06506460577051129),
 (0.04640332246328785, 0.0),
 (0.011773350403849825, 0.0013715858923954948),
 (0.05062266356476944, 0.005572067687856698),
 (0.060593235761302, 0.006086412397505008),
 (0.0650551761175011, 0.11401307730537551),
 (0.05949853877093383, 0.10938397491854071),
 (0.043432124523886104, 0.0),
 (0.07139147431977694, 0.0),
 (0.0054203359985104446, 0.0),
 (0.025645227223064768, 0.0),
 (0.02984056557142948, 0.0586352968999074),
 (0.04781862765600343, 0.11649907673534234),
 (0.05508460392096857, 0.08983887595190491),
 (0.01647874725594912, 0.0014573100106702133),
 (0.0457141005523591, 0.10929825080026599),
 (0.013923311290179763, 0.004714826505109513)]

In [35]:
# Show the full metric dict computed for the test split (keyed by metric name).
test_r2score

{'pearson_r2_score': 0.23036782003583386}