# Property calculations

RDKit can calculate a large number of quantitative chemical properties (molecular descriptors). These in turn can be used as features for machine learning.
The code below uses RDKit to calculate these properties for all of the chemicals labeled in the [descriptor_clustering](descriptor_clustering.ipynb) notebook.

In [1]:
import os.path as path
import pickle

# Root directory holding the data dumps for this project.
BASE_DATA_PATH = path.join(
    path.expanduser('~'), 'Dropbox', 'bymt', 'data_dumps', 'chem_project'
)

# Pickle of labeled chemicals (merged FEMA-JECFA database) written by the
# descriptor_clustering notebook.
labeled_chemicals_path = path.join(
    BASE_DATA_PATH, 'descriptor_clustering', 'labeled_chemicals.pkl'
)

# Directory that receives this notebook's own outputs.
DATA_PATH = path.join(BASE_DATA_PATH, 'property_calculations')

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(labeled_chemicals_path, 'rb') as f:
    labeled_chemicals = pickle.load(f)

In [2]:
from rdkit.Chem import Descriptors
import types

# Private helpers in the Descriptors module that are not descriptor functions.
to_remove = ['_isCallable', '_setupDescriptors', '_test', '_ChargeDescriptors']

# (name, callable) pairs for every plain Python function found in the
# Descriptors module namespace, minus the helpers listed in to_remove.
functions = [
    (name, obj)
    for name, obj in ((n, Descriptors.__dict__.get(n)) for n in dir(Descriptors))
    if isinstance(obj, types.FunctionType) and name not in to_remove
]

print(len(functions))

206


In [3]:
from copy import deepcopy

def property_calculator(dicto_list, function_list):
    """
    Applies functions in the function list to rdkit molecules in the chemical dictionaries
    in dicto_list
    
    dicto_list should have dictos with an 'rdkit mol' value
    function_list is an iterable of tuples with:
    -tup[0] = function name
    -tup[1] = callable function
    
    If a function raises for a molecule (including when the dicto has no
    'rdkit mol' entry, in which case the function is called with None), the
    string 'NaN' is stored under that function's name instead.
    
    returns: deep copy of dicto_list with calculated properties added to each
    dicto; the input list and its dictos are left unmodified
    """
    # Materialize once so a generator argument is not exhausted after the
    # first chemical.
    function_list = list(function_list)

    new_list = deepcopy(dicto_list)
    for dicto in new_list:
        mol = dicto.get('rdkit mol')
        # Use the function_list argument rather than any module-level list.
        for name, func in function_list:
            try:
                dicto[name] = func(mol)
            except Exception:
                # Best effort: a failing descriptor must not abort the whole
                # run. KeyboardInterrupt/SystemExit still propagate.
                dicto[name] = 'NaN'
    return new_list

In [4]:
# Apply the collected descriptor functions to every labeled chemical.
# property_calculator works on a deep copy, so labeled_chemicals is unchanged.
propertied_chemicals = property_calculator(labeled_chemicals, functions)

Convert the `propertied_chemicals` list of dictionaries into a NumPy array usable by scikit-learn.

In [5]:
import numpy as np

# Column order for the output array: the cluster label first, followed by
# each descriptor name in the same order as `functions`.
function_keys = ['label', *(name for name, _ in functions)]
print(function_keys[:5])

['label', 'Asphericity', 'BalabanJ', 'BertzCT', 'Chi0']


In [6]:
def array_maker(dicto_list, key_list):
    """
    Converts the chemicals in dicto_list into a numpy array based on the 
    keys listed in key_list
    
    dicto_list is a list of dictionaries, one per chemical
    key_list gives the keys to extract, in column order
    
    A key missing from a dicto is filled with 'NaN', which becomes nan after
    the float32 conversion. Values that cannot be converted to float32 raise
    the underlying numpy error.
    
    returns: float32 array of shape (len(dicto_list), len(key_list)); an empty
    dicto_list yields an array of shape (0, len(key_list))
    """
    key_list = list(key_list)

    rows = []
    for dicto in dicto_list:
        values = [dicto.get(key, 'NaN') for key in key_list]
        # Convert per row, as before: a row mixing numbers with the 'NaN'
        # placeholder is parsed into floats by astype.
        rows.append(np.array(values).astype(np.float32))

    if not rows:
        # Joining zero arrays raises ValueError; return an empty matrix with
        # the expected number of columns instead.
        return np.empty((0, len(key_list)), dtype=np.float32)

    return np.stack(rows, axis=0)

In [7]:
# Build the float32 matrix: one row per chemical, one column per entry in
# function_keys (so the first column is 'label').
property_array = array_maker(propertied_chemicals, function_keys)

Save `property_array` for use in model training, along with `function_keys`, which lists the column names in order. Note that the first column holds the label for each chemical as determined in [descriptor_clustering](descriptor_clustering.ipynb).

In [9]:
# Output locations inside this notebook's data directory.
property_array_path = path.join(DATA_PATH, 'property_array.pkl')
function_keys_path = path.join(DATA_PATH, 'function_keys.pkl')

# Persist the feature matrix (labels in the first column).
with open(property_array_path, 'wb') as f:
    pickle.dump(property_array, f, pickle.HIGHEST_PROTOCOL)

# Persist the matching column names so the array can be interpreted later.
with open(function_keys_path, 'wb') as g:
    pickle.dump(function_keys, g, pickle.HIGHEST_PROTOCOL)