In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
import rdkit, rdkit.Chem, rdkit.Chem.Draw
from rdkit.Chem.Draw import IPythonConsole
import numpy as np
import mordred, mordred.descriptors
import counterstone as cs

IPythonConsole.ipython_useSVG = True
sns.set_context('notebook')
sns.set_style('dark',  {'xtick.bottom':True, 'ytick.left':True, 'xtick.color': '#666666', 'ytick.color': '#666666',
                        'axes.edgecolor': '#666666', 'axes.linewidth':     0.8 , 'figure.dpi': 300})
color_cycle = ['#1BBC9B', '#F06060', '#5C4B51', '#F3B562', '#6e5687']
mpl.rcParams['axes.prop_cycle'] = mpl.cycler(color=color_cycle) 
np.random.seed(0)
toxdata = pd.read_csv('https://github.com/whitead/dmol-book/raw/master/data/clintox.csv.gz')
toxdata.head()

Unnamed: 0,smiles,FDA_APPROVED,CT_TOX
0,*C(=O)[C@H](CCCCNC(=O)OCCOC)NC(=O)OCCOC,1,0
1,[C@@H]1([C@@H]([C@@H]([C@H]([C@@H]([C@@H]1Cl)C...,1,0
2,[C@H]([C@@H]([C@@H](C(=O)[O-])O)O)([C@H](C(=O)...,1,0
3,[H]/[NH+]=C(/C1=CC(=O)/C(=C\C=c2ccc(=C([NH3+])...,1,0
4,[H]/[NH+]=C(\N)/c1ccc(cc1)OCCCCCOc2ccc(cc2)/C(...,1,0


In [22]:
# make object that can compute descriptors
calc = mordred.Calculator(mordred.descriptors, ignore_3D=True)
# make subsample from pandas df
molecules = [rdkit.Chem.MolFromSmiles(smi) for smi in toxdata.smiles]

# the invalid molecules were None, so we'll just
# use the fact the None is False in Python
valid_mol_idx = [bool(m) for m in molecules]
valid_mols = [m for m in molecules if m]
raw_features = calc.pandas(valid_mols)
labels = toxdata[valid_mol_idx].CT_TOX

  2%|▏         | 24/1478 [00:02<03:41,  6.58it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


  8%|▊         | 124/1478 [00:08<04:59,  4.52it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 19%|█▉        | 287/1478 [00:16<02:03,  9.63it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 20%|█▉        | 289/1478 [00:16<01:44, 11.43it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 22%|██▏       | 328/1478 [00:18<02:12,  8.66it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 29%|██▉       | 432/1478 [00:25<04:08,  4.20it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 34%|███▍      | 499/1478 [00:36<06:03,  2.69it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 34%|███▍      | 501/1478 [00:37<05:22,  3.03it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 35%|███▌      | 520/1478 [00:38<03:56,  4.05it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 50%|█████     | 743/1478 [00:51<03:58,  3.09it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 72%|███████▏  | 1057/1478 [01:13<01:59,  3.51it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 75%|███████▍  | 1108/1478 [01:17<01:35,  3.89it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|██████████| 1478/1478 [01:40<00:00, 14.75it/s]


In [24]:
fm = raw_features.mean()
fs = raw_features.std()
def feature_convert(f):
    f -= fm
    f /= fs
    return f

features = feature_convert(raw_features)

# we have some nans in features, likely because std was 0
features = features.values.astype(float)
features_select = np.all(np.isfinite(features), axis=0)
features = features[:, features_select]

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X_train, X_test, y_train, y_test = train_test_split(
    features.values, labels, test_size=0.2, shuffle=True)

clf = RandomForestClassifier(max_depth=3, random_state=0)
clf.fit(X_train, y_train)
predicted = clf.predict(X_test)
roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])

0.7759307359307359

In [15]:
example = toxdata.smiles.iloc[valid_mol_idx][0]
example_y = clf.predict(features.values[0:1])

def model_eval(smiles, selfies):
    molecules = [rdkit.Chem.MolFromSmiles(smi) for smi in smiles]
    # input wrangling. Get some weird values from weird smiles
    features = calc.pandas(molecules, quiet=True).astype(float, copy=False)
    features = feature_convert(features.values)
    print('isnan', np.any(np.isnan(features.values)))
    print('isinf', np.all(np.isfinite(features.values)))
    labels = clf.predict(features.values)
    return (example_y - labels).astype(bool)

In [16]:
exps = cs.explain(example[1:], model_eval)

ValueError: operands could not be broadcast together with shapes (1000,1613) (483,) (1000,1613) 

In [19]:
fs.shape

(483,)