# Matching the chemicals from the FEMA and JECFA websites to RDKit molecule representations (via ChemSpider)

In [1]:
import os.path as path
import pickle

# Merged FEMA-JECFA database: a pickle stored in the local Dropbox data dump
BASE_DATA_PATH = path.join(path.expanduser('~'), 'Dropbox', 'bymt', 'data_dumps', 'chem_project')

merged_chemicals_path = path.join(
    path.join(BASE_DATA_PATH, 'fema_jecfa_merge'),
    'merged_chemicals.pkl',
)
# NOTE: unpickling can execute code embedded in the file, so only load pickles
# that were produced locally and are trusted.
with open(merged_chemicals_path, mode='rb') as f:
    merged_chemicals = pickle.loads(f.read())

In [3]:
from chemspipy import ChemSpider
# The ChemSpider API token is a credential and must not live in the notebook:
# it is read from the CHEMSPIDER_API_KEY environment variable instead (export
# it before starting the kernel; a missing variable raises KeyError here).
# NOTE: a token that was ever committed in plain text should be treated as
# leaked and regenerated.
import os
cs = ChemSpider(os.environ['CHEMSPIDER_API_KEY'])
from rdkit import Chem
from rdkit.Chem import Descriptors
from inspect import getmembers, isfunction

In [2]:
# Inspect the first merged record to see which fields a chemical entry carries
merged_chemicals[0]

{'acid value max': 'NaN',
 'boiling point (°c)': 'NaN',
 'both merge': True,
 'cas': '77-53-2',
 'chemical formula': 'c15h26o',
 'chemical name': '(3r,3as,6r,7r,8as)-3,6,8,8-tetramethyloctahydro-1h-3a,7-methanoazulen-6-ol',
 'coe': 'NaN',
 'fema': 4503,
 'fema link': 'http://www.femaflavor.org/flavor/library/cedrol',
 'flavis': 2.12,
 'jecfa': '2030',
 'jecfa link': 'http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/details/en/c/2008/',
 'link': 'http://www.femaflavor.org/flavor/library/cedrol',
 'merged descriptors': 'sweet fruiti cedar like aroma',
 'molecular weight': 222.37,
 'name': '(+)-cedrol',
 'odor': 'sweet fruiti cedar like aroma',
 'other requirements': 'm.p. = 74-77°',
 'physical': 'pale yellow to yellow green solid',
 'physical form/odour': 'pale yellow to yellow green solid; sweet fruity cedar-like aroma',
 'refractive index': 'NaN',
 'solubility': 'slightly soluble in water',
 'solubility in ethanol': 'soluble',
 'specific gravity': 'NaN',
 

In [11]:
# Trial ChemSpider query by CAS number ('77-53-2' is the 'cas' value of merged_chemicals[0])
results = cs.search('cas 77-53-2')

In [16]:
# Sanity check: for every hit, print whether its molecular weight rounded to
# two decimals equals the 'molecular weight' stored in the first merged record.
for chemical in results:
    print(round(chemical.molecular_weight,2) == merged_chemicals[0]['molecular weight'])

True


In [38]:
def same_chemical(results):
    '''
    Decide whether all hits of a chemspipy result list are one and the same
    chemical.

    Every hit is parsed with rdkit and reduced to its canonical smiles and its
    molecular weight. Hits whose smiles cannot be fetched or parsed are
    skipped (best effort).

    results: iterable of chemspipy compounds (objects exposing .smiles and
             .csid), e.g. the return value of cs.search(...)

    returns a tuple (rdkit molecule, chemspider id) if the parsed hits have:
    -the same molecular weight, and
    -the same smiles representation
    The molecule is rebuilt from the canonical smiles and the id is the one of
    the last hit that parsed successfully.

    returns None otherwise (no hits, nothing parseable, or hits that
    disagree), so the result can be used directly in a truth test.
    '''
    if results is None:
        return None

    smiles = set()
    mws = set()
    matched_smiles = None
    matched_csid = None

    for chemical in results:
        try:
            mol = Chem.MolFromSmiles(chemical.smiles)
            if mol is None:
                # rdkit reports an unparseable smiles by returning None
                continue
            canonical = Chem.MolToSmiles(mol)
            weight = Chem.Descriptors.MolWt(mol)
            csid = chemical.csid
        except Exception:
            # Best effort: a hit that cannot be fetched or parsed is ignored.
            # Exception (not a bare except) keeps Ctrl-C working during long runs.
            continue

        smiles.add(canonical)
        mws.add(weight)
        # Remember values of a hit that really parsed, instead of relying on
        # whatever the loop variables hold after the loop has finished.
        matched_smiles, matched_csid = canonical, csid

    if len(smiles) == 1 and len(mws) == 1:
        return Chem.MolFromSmiles(matched_smiles), matched_csid

    return None

def chem_search(dicto):
    '''
    returns an rdkit molecule and its chemspider id
    after searching the chemspider database based on the items
    in the priority list.

    dicto: one chemical record (dict). The keys 'fema', 'jecfa', 'cas' and
           'name' are tried in that order; the query sent to chemspider is
           '<key> <value>'.

    Keys that are absent or only hold a missing-value placeholder (None, an
    empty string, 'NaN' / nan) are skipped without querying chemspider.

    returns (rdkit molecule, chemspider id) for the first key whose hits all
    describe the same chemical (see same_chemical), or (None, None) when no
    key gives an unambiguous match or dicto is not dict-like.
    '''
    priority_list = ['fema', 'jecfa', 'cas', 'name']
    missing_markers = ('', 'nan', 'none')

    for string in priority_list:
        try:
            val = dicto.get(string)
        except AttributeError:
            # dicto has no .get (not a dict): nothing to search with
            continue

        if val is None:
            continue
        val = str(val).strip()
        if val.lower() in missing_markers:
            # placeholders such as 'NaN' would only produce meaningless queries
            continue

        search_string = string + ' ' + val
        results = cs.search(search_string)

        # Evaluate the hits once; every evaluation re-parses all of them.
        match = same_chemical(results)
        if match and match[0] is not None:
            return match

    return None, None

In [41]:
from copy import deepcopy

def rdkit_pairer(dicto_list):
    '''
    Attach rdkit molecules and chemspider ids to a list of chemical records.

    The work is done on a deep copy, so dicto_list and the dicts inside it are
    left untouched. Each record is looked up with chem_search; on a hit the
    molecule is stored under 'rdkit mol' and the id under 'csid'. Records
    without a hit stay unchanged and are reported on stdout by their 'name'.

    returns the annotated copy of the list
    '''
    paired = deepcopy(dicto_list)
    for record in paired:
        mol, chemspider_id = chem_search(record)
        if not mol:
            print('{} failed' .format(record['name']))
            continue
        record['rdkit mol'] = mol
        record['csid'] = chemspider_id
    return paired

In [42]:
# Smoke-test the pairing on a one-element list holding the first merged record
test = rdkit_pairer([merged_chemicals[0]])

In [43]:
# Display the paired record; a successful match adds the 'csid' and 'rdkit mol' keys
test

[{'acid value max': 'NaN',
  'boiling point (°c)': 'NaN',
  'both merge': True,
  'cas': '77-53-2',
  'chemical formula': 'c15h26o',
  'chemical name': '(3r,3as,6r,7r,8as)-3,6,8,8-tetramethyloctahydro-1h-3a,7-methanoazulen-6-ol',
  'coe': 'NaN',
  'csid': 59018,
  'fema': 4503,
  'fema link': 'http://www.femaflavor.org/flavor/library/cedrol',
  'flavis': 2.12,
  'jecfa': '2030',
  'jecfa link': 'http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/details/en/c/2008/',
  'link': 'http://www.femaflavor.org/flavor/library/cedrol',
  'merged descriptors': 'sweet fruiti cedar like aroma',
  'molecular weight': 222.37,
  'name': '(+)-cedrol',
  'odor': 'sweet fruiti cedar like aroma',
  'other requirements': 'm.p. = 74-77°',
  'physical': 'pale yellow to yellow green solid',
  'physical form/odour': 'pale yellow to yellow green solid; sweet fruity cedar-like aroma',
  'rdkit mol': <rdkit.Chem.rdchem.Mol at 0x10d21f7b0>,
  'refractive index': 'NaN',
  'solubility': '

In [31]:
# Look a ChemSpider id up again and print the common name of every hit.
# NOTE(review): test_csid is not defined in any visible cell and this cell's
# execution count (31) is out of order - presumably it held the 'csid' of the
# paired record (test[0]['csid']); confirm and define it explicitly so the
# notebook survives Restart & Run All.
search = cs.search(test_csid)
for chemical in search:
    print(chemical.common_name)

(+)-Cedrol
