# Matching the chemicals found on the FEMA and JECFA websites with rdkit chemical representations

In [28]:
import os.path as path
import pickle

# Root folder under the user's home directory that holds the project's data dumps
BASE_DATA_PATH = path.join(
    path.expanduser('~'), 'Dropbox', 'bymt', 'data_dumps', 'chem_project'
)

# Load merged FEMA-JECFA database
merged_chemicals_path = path.join(
    BASE_DATA_PATH, 'fema_jecfa_merge', 'merged_chemicals.pkl'
)
with open(merged_chemicals_path, 'rb') as f:
    merged_chemicals = pickle.load(f)

# Folder that the rdkit matching results are written to
DATA_PATH = path.join(BASE_DATA_PATH, 'rdkit_chemical_matching')

In [3]:
from chemspipy import ChemSpider
import os

# The ChemSpider security token is taken from the CHEMSPIDER_API_KEY environment
# variable when it is set, so the credential does not have to live in this file.
# NOTE(review): the literal fallback below is a credential committed to source;
# rotate it and drop the fallback once the environment variable is in use.
cs = ChemSpider(os.environ.get('CHEMSPIDER_API_KEY',
                               '0201ba66-585d-4135-9e6b-d28ba4724fcf'))
from rdkit import Chem
from rdkit.Chem import Descriptors
from inspect import getmembers, isfunction

In [15]:
def same_chemical(results, mw):
    '''
    Picks the entry of a chemspipy result list that can be trusted to be the
    chemical that was searched for.

    results: search results exposing ``count`` and iteration; every item is
             read through its ``molecular_weight``, ``smiles`` and ``csid``
    mw: reference molecular weight (already rounded to one decimal), or a
        falsy value (None / 0) when no reference weight is known

    returns a (rdkit molecule, chemspider id) tuple when either:
    -mw is known and, for some result, the ChemSpider weight and the weight
     rdkit computes from its smiles (both rounded to one decimal) equal mw;
     the first such result is returned, or
    -mw is not known and all results that could be processed share the same
     rdkit smiles representation and the same rdkit molecular weight;
     the first processed result is returned
    returns (None, None) otherwise
    '''
    if results.count == 0:
        return None, None

    # Bookkeeping for the "no reference weight" case. The consistency check
    # can only be made once every result has been seen, so it happens after
    # the loop rather than inside it.
    first_candidate = None
    seen_smiles = set()
    seen_mws = set()

    for chemical in results:
        try:
            cs_mw = round(chemical.molecular_weight, 1)
            chem_base = Chem.MolFromSmiles(chemical.smiles)
            # MolFromSmiles gives None for unparsable smiles; MolWt then
            # raises and the result is skipped by the handler below
            rd_mw_exact = Chem.Descriptors.MolWt(chem_base)
            rd_mw = round(rd_mw_exact, 1)

            if mw == cs_mw and cs_mw == rd_mw:
                return chem_base, chemical.csid

            if not mw:
                candidate = (chem_base, chemical.csid)
                seen_smiles.add(Chem.MolToSmiles(chem_base))
                seen_mws.add(rd_mw_exact)
                if first_candidate is None:
                    first_candidate = candidate
        except Exception:
            # Best effort: a result that cannot be processed is reported and
            # skipped. Exception (not a bare except) keeps Ctrl-C working.
            print(' MW EX', end=' ')
            continue

    # No reference weight: accept only if all processed results agree
    if (first_candidate is not None and
            len(seen_smiles) == 1 and
            len(seen_mws) == 1):
        return first_candidate

    return None, None

def chem_search(dicto):
    '''
    returns an rdkit molecule and its chemspider id
    after searching the chemspider database based on the items
    in the priority list.

    dicto: dictionary describing one chemical. The keys 'fema', 'jecfa',
           'cas' and 'name' are tried in that order as search terms, and
           'molecular weight' (if numeric) is used to validate the results.

    Identifiers that are missing, None or blank are skipped instead of being
    searched for. Returns (None, None) when no identifier yields an accepted
    match.
    '''
    priority_list = ['fema', 'jecfa', 'cas', 'name']

    # The reference weight is the same for every identifier, so work it out once.
    # A missing or non-numeric weight (TypeError) means "unknown".
    try:
        mw = round(dicto.get('molecular weight'), 1)
    except (TypeError, AttributeError):
        mw = None

    for string in priority_list:
        try:
            val = dicto.get(string)
        except AttributeError:
            continue

        # str(None) would be the truthy text 'None', so test before converting
        if val is None:
            continue
        val = str(val)
        if not val.strip():
            continue

        search_string = string + ' ' + val
        results = cs.search(search_string)
        rd, csid = same_chemical(results, mw)
        if rd is not None:
            return rd, csid

    return None, None

In [29]:
from copy import deepcopy

def rdkit_printed_pairer(dicto_list):
    """
    Runs chem_search on a deep copy of every chemical dictionary in dicto_list
    and returns the copied list; the input list is left untouched.

    A matched dictionary gains the keys 'rdkit mol' and 'csid'. For an
    unmatched one its 'name' is printed followed by 'failed'.
    While running, one dot is printed per chemical, with the percentage
    completed shown whenever a new multiple of 5% is reached.
    """
    total = len(dicto_list)
    paired = deepcopy(dicto_list)

    # Percentage shown for the previous chemical, used to avoid repeating it
    previous_pct = 0

    for done, entry in enumerate(paired, start=1):
        mol, csid = chem_search(entry)
        if not mol:
            print(' {} failed' .format(entry['name']), end=' ')
        else:
            entry['rdkit mol'] = mol
            entry['csid'] = csid

        # Progress readout
        pct = round((done / total) * 100)
        at_new_milestone = pct % 5 == 0 and pct != previous_pct
        if at_new_milestone:
            print('{:2.0f}%' .format(pct), end = '.')
        else:
            print('.', end='')
        previous_pct = pct

    return paired

In [36]:
import pickle

def match_chunker(chunkable, filename='rdkit_chemicals.pkl', splits=10, chunk_list=None):
    """
    Splits the matching of individual chemical information into separate chunks.
    As each chunk is completed the results gathered so far in this call are
    pickled to DATA_PATH/filename, overwriting the previous contents.

    chunkable: list of chemical dictionaries (sliced and passed to rdkit_printed_pairer)
    filename: name of the pickle file written inside DATA_PATH
    splits: approximate number of chunks to divide chunkable into
    chunk_list: optional list of chunk indices; when given, only those chunks
                are processed (and only their results are saved and returned)

    Returns the list of processed chemical dictionaries.
    """

    total = len(chunkable)

    # determine chunk size
    chunk_size, mod = total//(splits), total%splits
    if mod != 0:
        # With fewer items than splits-1 the integer division yields 0, which
        # would make the modulo below divide by zero; use chunks of one item.
        chunk_size = max(1, total//(splits-1))
        mod = total%chunk_size
        if (mod == 0 or
           mod < chunk_size/2): # This makes sure that the remainder is not too large
            chunk_size -= round(chunk_size/(2*splits))
    print('Chunk size: {}' .format(chunk_size))

    # Generate a list with the chunk indices so that if a specific chunk number is specified
    # it can be found and generated consistently
    start = 0
    end = min(chunk_size, total)  # never start past the end, so the loop below terminates
    start_end_list = []
    while end != total:
        start_end_list.append((start, end))
        start += chunk_size
        end += chunk_size
        if end > total:
            end = total
    start_end_list.append((start,end))
    print('Number of chunks: {}' .format(len(start_end_list)))


    # This part does the actual matching
    rdkit_chemicals = []
    rdkit_chemicals_path = path.join(DATA_PATH, filename)

    # Pair every selected chunk with its real index in start_end_list so that
    # the readout names the same chunk numbers that chunk_list refers to
    if not chunk_list:
        selected = list(enumerate(start_end_list))
    else:
        selected = [(i, start_end_list[i]) for i in chunk_list]

    for i, (chunk_start, chunk_end) in selected:
        print ('\nChunk number {}, start: {}, end: {}' .format(i, chunk_start, chunk_end))
        chunk = chunkable[chunk_start:chunk_end]
        rdkit_chemicals += rdkit_printed_pairer(chunk)

        # Save after every chunk
        with open(rdkit_chemicals_path, 'wb') as f:
            pickle.dump(rdkit_chemicals, f, protocol=pickle.HIGHEST_PROTOCOL)

    return rdkit_chemicals

In [37]:
# Match every chemical of the merged database using match_chunker's defaults
# (10 splits, results pickled to 'rdkit_chemicals.pkl' inside DATA_PATH)
rdkit_chemicals = match_chunker(merged_chemicals)

Chunk size: 1
Number of chunks: 10

Chunk number 0, start: 0, end: 1
100%.
Chunk number 1, start: 1, end: 2
100%.
Chunk number 2, start: 2, end: 3
100%.
Chunk number 3, start: 3, end: 4
100%.
Chunk number 4, start: 4, end: 5
100%.
Chunk number 5, start: 5, end: 6
100%.
Chunk number 6, start: 6, end: 7
100%.
Chunk number 7, start: 7, end: 8
100%.
Chunk number 8, start: 8, end: 9
100%.
Chunk number 9, start: 9, end: 10
100%.