# Property calculations

RDKit can calculate many quantitative chemical properties (descriptors) as well as structural fingerprints. These in turn can be used as features for machine learning. 
The code below uses RDKit to calculate fingerprints and properties for all of the chemicals labeled in the [descriptor_clustering](descriptor_clustering.ipynb) notebook. 

In [1]:
import os.path as path
import pickle

# Load merged FEMA-JECFA database
# Root of the project's data dumps, resolved under the current user's home directory.
BASE_DATA_PATH = path.join(path.expanduser('~'),
                           'Dropbox',
                           'bymt',
                           'data_dumps',
                           'chem_project')

# Pickle of labeled chemicals (presumably written by the descriptor_clustering notebook).
labeled_chemicals_path = path.join(BASE_DATA_PATH,
                                  'descriptor_clustering',
                                  'labeled_chemicals_2.pkl')
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(labeled_chemicals_path, 'rb') as f:
    labeled_chemicals = pickle.load(f)
    
# Output directory for this notebook; the train/test dataset is written here further down.
DATA_PATH = path.join(BASE_DATA_PATH, 'fp', 'property_calculations')

In [2]:
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem.AtomPairs import Pairs
import numpy as np

# Pull the RDKit molecule and the label out of every chemical dictionary.
mols = [chemical['rdkit mol'] for chemical in labeled_chemicals]
# NOTE: `labels` is rebound later when the label column is sliced out of `full_array`.
labels = np.array([chemical['label'] for chemical in labeled_chemicals])

# Fingerprint name -> the RDKit function that generates it and the keyword
# arguments passed to that function for every molecule.
fp_types = {
                'topological': {'function': AllChem.rdmolops.RDKFingerprint,
                                'kwargs':{'fpSize':2048, 'minSize':2048}},
                'morgan':{'function': AllChem.GetMorganFingerprintAsBitVect,
                          'kwargs': {'radius': 2}},
                'maccs':{'function': Chem.MACCSkeys.GenMACCSKeys,
                         'kwargs': {}}
            }

# One 2-D array per fingerprint type, in the iteration order of `fp_types`.
fp_list = []
for key in fp_types:
    dicto = fp_types[key]
    # generate fingerprints
    fps = [dicto['function'](m, **dicto['kwargs']) for m in mols]

    # convert the RDKit explicit vectors into numpy arrays
    np_fps = []
    for fp in fps:
        # ConvertToNumpyArray fills `arr` in place rather than returning a new array
        arr = np.array([])
        DataStructs.ConvertToNumpyArray(fp, arr)
        # add a leading axis so each fingerprint becomes a single row
        arr = arr[None,:]
        np_fps.append(arr)

    # stack the rows: one row per molecule for this fingerprint type
    fp_list.append(np.concatenate(np_fps, axis=0))

In [3]:
# Join the per-type fingerprint matrices column-wise: one long row per molecule.
fp_features = np.concatenate(fp_list, axis=1)
fp_features.shape

(2170, 4263)

In [4]:
from rdkit.Chem import Descriptors
import types

# Collect every plain Python function found in the Descriptors module namespace
# as (name, callable) tuples, in the order returned by dir().
functions = [(a, Descriptors.__dict__.get(a)) for a in dir(Descriptors)\
             if isinstance(Descriptors.__dict__.get(a), types.FunctionType)]

# Names excluded from the list; presumably module-internal helpers rather than
# molecular descriptors (TODO confirm against the installed RDKit version).
to_remove = ['_isCallable', '_setupDescriptors', '_test', '_ChargeDescriptors']

functions = [tup for tup in functions if tup[0] not in to_remove]

# Number of descriptor functions that remain.
print(len(functions))

206


In [5]:
from copy import deepcopy

def property_calculator(dicto_list, function_list):
    """
    Applies functions in the function list to rdkit molecules in the chemical dictionaries
    in dicto_list

    dicto_list should have dictos with an 'rdkit mol' value
    function_list is a list of tuples with:
    -tup[0] = function name
    -tup[1] = callable function

    If a function raises for a given molecule (including a missing 'rdkit mol'
    entry, which is passed to the function as None), the string 'NaN' is stored
    for that property instead, so one failing descriptor does not abort the run.

    returns: copy of dicto_list with calculated properties added to each dicto
             (the input dictionaries are not modified)
    """
    # Work on a deep copy so the caller's dictionaries are left untouched.
    new_list = deepcopy(dicto_list)
    for dicto in new_list:
        mol = dicto.get('rdkit mol')
        # Iterate over the function_list argument, not a notebook-level global.
        for function in function_list:
            name, func = function[0], function[1]
            try:
                dicto[name] = func(mol)
            except Exception:
                # Catch Exception only, so KeyboardInterrupt/SystemExit still propagate.
                dicto[name] = 'NaN'
    return new_list

In [6]:
# Compute every collected descriptor for each labeled chemical.
propertied_chemicals = property_calculator(labeled_chemicals, functions)

### Convert the `propertied_chemicals` list of dictionaries into a numpy array usable by sklearn.

In [7]:
import numpy as np

# 'label' goes first so that it becomes column 0 of the arrays built from these keys.
function_keys = ['label']
# followed by one key per descriptor function, in the same order as `functions`
function_keys += [tup[0] for tup in functions]
print (function_keys[:5])

['label', 'Asphericity', 'BalabanJ', 'BertzCT', 'Chi0']


In [8]:
def array_maker(dicto_list, key_list):
    """
    Builds a 2-D float32 numpy array from the chemicals in dicto_list.

    Each chemical becomes one row; the columns follow the order of key_list.
    A key that is absent from a chemical is filled with 'NaN', which turns
    into a NaN entry once the row is cast to float32.
    """
    def _as_row(chemical):
        # Look up every requested key, then cast and add a leading row axis
        values = [chemical.get(key, 'NaN') for key in key_list]
        return np.array(values).astype(np.float32)[None, :]

    rows = [_as_row(chemical) for chemical in dicto_list]
    return np.concatenate(rows, axis=0)

In [9]:
# First pass over all keys; this array is only used to find the empty columns.
test_array = array_maker(propertied_chemicals, function_keys)

Remove properties that are either all zeros or all NaNs.

In [10]:
def empty_finder(property_array, function_keys):
    """
    Returns a new version of function keys with the empty keys removed.

    A key is "empty" when every value in its column of property_array is
    either zero or NaN.

    property_array: 2-D array with one column per entry of function_keys
    function_keys: list of column names, in column order

    returns: list of the keys whose column holds at least one non-zero,
             non-NaN value. property_array itself is not modified.
    """
    values = np.asarray(property_array)
    # np.where builds a new array, so the caller's NaNs are left in place
    # (slicing with [:] would only create a view onto the same data).
    cleaned = np.where(np.isnan(values), 0, values)
    # Test each element rather than the column sum: a column such as
    # [-1, 1] sums to zero but is not empty.
    non_zero_mask = np.any(cleaned != 0, axis=0)
    new_keys = np.array(function_keys)[non_zero_mask]
    return list(new_keys)

In [11]:
# Keys that survive the empty-column filter.
non_empty_keys = empty_finder(test_array, function_keys)

In [12]:
# Report how many keys the empty-column filter dropped.
print('{} functions were removed because they provided either all zeros or all NaN values'
      .format(len(function_keys)-len(non_empty_keys)))

47 functions were removed because they provided either all zeros or all NaN values


Remake array with only the `non_empty_keys`

In [13]:
# Rebuild the array from the chemical dictionaries, this time with the surviving keys only.
property_array = array_maker(propertied_chemicals, non_empty_keys)

In [14]:
# Append the fingerprint columns to the right of the property columns (rows stay per molecule).
full_array = np.concatenate((property_array, fp_features), axis=1)
full_array.shape

(2170, 4423)

### Break data into train and test sets so that overfitting can be detected when evaluating models down the line

In [15]:
# Column 0 is taken as the label and every other column as a feature.
# Assumes 'label' (placed first in function_keys) survived the empty-column filter.
labels, features = full_array[:,0], full_array[:,1:]
print(labels.shape, features.shape)

(2170,) (2170, 4422)


In [16]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples; random_state is fixed so the split is reproducible.
# NOTE(review): no `stratify` argument is passed, so the share of label-1 samples
# may differ between the two splits (compare the proportions printed below).
train_features, test_features, train_labels, test_labels =\
train_test_split(features, labels, test_size=0.3, random_state=42)

print('Samples in training set: {}, in test set: {}'
      .format(train_labels.shape[0], test_labels.shape[0]))
# Fraction of samples whose label equals 1 in each split, rounded to two decimals.
print('Proportion of label 1 samples in training set: {}, in test set: {}'
      .format(round(sum(train_labels == 1)/train_labels.shape[0],2),
              round(sum(test_labels == 1)/test_labels.shape[0],2)))

Samples in training set: 1519, in test set: 651
Proportion of label 1 samples in training set: 0.13, in test set: 0.15


In [17]:
# Bundle the four split arrays under descriptive keys so they can be reloaded together.
dataset = {'train features': train_features,
          'train labels': train_labels,
          'test features': test_features,
          'test labels': test_labels}

dataset_path = path.join(DATA_PATH, 'dataset.pkl')

# open() does not create missing directories, so DATA_PATH must already exist.
with open(dataset_path, 'wb') as f:
    pickle.dump(dataset, f, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed so the cross-validation scores are reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# NOTE(review): `clf` is instantiated but not used by any cell shown here.
clf = BernoulliNB()
# 5-fold cross-validated F1 on the training split only; the test split is not touched.
scores = cross_val_score(rf, train_features, train_labels, cv=5, scoring='f1')

In [19]:
import scipy.stats as st

def mean_delta(percent, scores):
    """
    Computes the mean of scores and the half-width of its t-based confidence interval.

    percent: confidence level expressed as a percentage (e.g. 95)
    scores: sequence of sample values

    returns: (mean, delta), where mean - delta is the lower bound of the
             interval from st.t.interval (and mean + delta is therefore the
             upper bound, since the interval is centred on the mean)
    """
    center = np.mean(scores)
    std_err = st.sem(scores)
    # Student t interval with n - 1 degrees of freedom, centred on the sample mean
    lower, _upper = st.t.interval(percent/float(100), len(scores)-1,
                                  loc=center, scale=std_err)
    return center, center - lower

In [20]:
# 95% confidence interval for the mean cross-validated F1 score, printed as "low-high".
mean, delta = mean_delta(95, scores)
print('{:.2f}-{:.2f}' .format(mean-delta, mean+delta))

0.37-0.61
