# Merge FEMA and JECFA datasets

Chemical information from the FEMA website was generated in [fema_extraction](fema_extraction.ipynb).
Chemical information from the JECFA website was generated in [jecfa_extraction](jecfa_extraction.ipynb). 

Here the two datasets are merged into a single one, making sure that the merged chemicals are in fact the same.

In [2]:
import os.path as path
import pickle

# Load FEMA database
BASE_DATA_PATH = path.join(path.expanduser('~'),
                           'Dropbox',
                           'bymt',
                           'data_dumps',
                           'chem_project')

extracted_fema_path = path.join(BASE_DATA_PATH, 'fema_extraction', 'extracted_fema.pkl')

with open(extracted_fema_path, 'rb') as f:
    extracted_fema = pickle.load(f)

# Load JECFA database
extracted_jecfa_path = path.join(BASE_DATA_PATH, 'jecfa_extraction', 'extracted_jecfa.pkl')

with open(extracted_jecfa_path, 'rb') as f:
    extracted_jecfa = pickle.load(f)
    
# Data path for things made in this notebook

DATA_PATH = path.join(BASE_DATA_PATH, 'fema_jecfa_merge')

In [109]:
from copy import deepcopy
def fema_jecfa_merge(fema_d_list, jecfa_d_list):
    """
    Merges the individual chemical dictionaries from the FEMA and JECFA websites.
    Merges FEMA info onto JECFA info.
    Does not modify original dictionary lists
    """
    # everything will be merged onto the jecfa_c_list
    jecfa_c_list = deepcopy(jecfa_d_list)
    
    fema_c_list = deepcopy(fema_d_list)
    overlap = 0
    fema_only = 0
    jecfa_only = 0
    
    print('Length before merge: {}' .format(len(jecfa_c_list)))
    # Merge all overlapping chemicals
    for jecfa_c in jecfa_c_list:
        for fema_c in fema_c_list:
            if fema_c['fema'] == jecfa_c['fema']:
                # Capture the FEMA name under the synonym key if it's different from JECFA
                if fema_c['name'] != jecfa_c['name']:
                    if jecfa_c['synonym(s)'] == 'NaN':
                           jecfa_c['synonym(s)'] = fema_c['name']
                    else:
                        jecfa_c['synonym(s)'] += ', ' + fema_c['name']
                
                jecfa_c['fema link'] = fema_c['link']
                jecfa_c['jecfa link'] = jecfa_c['link']
                del jecfa_c['link']
                fema_c['both merge'] = True
                
                jecfa_c.update(fema_c)
                
                overlap += 1
    
    print('Length after overlap merge: {}' .format(len(jecfa_c_list)))
    
    # Copy non-overlapping FEMA into JECFA
    for fema_c in fema_c_list:
        if 'both merge' not in fema_c:
            fema_c['fema merge'] = True
            #make link key name consistent
            fema_c['fema link'] = fema_c['link']
            del fema_c['link']
            # add fema only link to merged list
            jecfa_c_list.append(fema_c)
            fema_only += 1
    
    print('Length after overlap and FEMA only merge: {}' .format(len(jecfa_c_list)))
    # Count jecfa only
    
    merges = ['both merge', 'fema merge']
    for jecfa_c in jecfa_c_list:
        if not any(merge in merges for merge in jecfa_c):
            #make link key name consistent
            jecfa_c['jecfa link'] = jecfa_c['link']
            del jecfa_c['link']
            jecfa_c['jecfa merge'] = True
            jecfa_only += 1
    
    print('{} overlaping chemicals, {} FEMA only, {} JECFA only' 
         .format(overlap, fema_only, jecfa_only))
    
    expected_length = overlap+fema_only+jecfa_only
    
    print('Total length: {}, overlap counts indicate it should be: {}' 
          .format(len(jecfa_c_list), expected_length))
    
    return jecfa_c_list

In [110]:
merged_list = fema_jecfa_merge(extracted_fema, extracted_jecfa)

Length before merge: 2183
Length after overlap merge: 2183
Length after overlap and FEMA only merge: 2833
2153 overlaping chemicals, 650 FEMA only, 31 JECFA only
Total length: 2833, overlap counts indicate it should be: 2834


Some record is not adding up. Figure it out

In [108]:
counts = [0, 0, 0]
for dicto in merged_list:
    if 'both merge' in dicto:
        counts[0] += 1
    elif 'fema merge' in dicto:
        counts [1] += 1
    elif 'jecfa merge' in dicto:
        counts[2] += 1
print(counts)

[2152, 650, 31]


The independent `both merge` count is 2152 although when the merge script says it was 2153, which suggests that some FEMA record got merged twice onto the same JECFA record, which explains why the expected length is 1 more than the actual length. Confirm this:

In [114]:
fema_nums = []
for dicto in extracted_fema:
    fema_nums.append(dicto['fema'])

dups = []
for num in fema_nums:
    count = fema_nums.count(num)
    if count > 1:
        dups.append((num, count))

dups = set(dups)

for tup in dups:
    print('{} is found {} times in extracted_fema'
      .format(tup[0], tup[1]))
    for dicto in extracted_fema:
        if dicto['fema'] == tup[0]:
            print(dicto)

3077 is found 2 times in extracted_fema
{'fema': 3077, 'link': 'http://www.femaflavor.org/flavor/library/p-tolyl-phenylacetate-0', 'jecfa': '705', 'descriptors': 'Floral', 'name': 'p-tolyl phenylacetate', 'cas': '101-94-0', 'cfr': '21CFR172.515', 'stems': 'floral'}
{'fema': 3077, 'link': 'http://www.femaflavor.org/flavor/library/p-tolyl-phenylacetate', 'jecfa': '705', 'descriptors': 'Floral', 'name': 'p-tolyl phenylacetate', 'cas': '101-94-0', 'cfr': '21CFR172.515', 'stems': 'floral'}


In [116]:
for dicto in merged_list:
    if dicto['fema'] == 3077:
        print(dicto)

{'name': 'p-tolyl phenylacetate', 'cas': '101-94-0', 'chemical formula': 'c15h14o2', 'both merge': True, 'jecfa link': 'http://www.femaflavor.org/flavor/library/p-tolyl-phenylacetate-0', 'link': 'http://www.femaflavor.org/flavor/library/p-tolyl-phenylacetate', 'chemical name': 'p-tolyl phenylacetate', 'cfr': '21CFR172.515', 'fema link': 'http://www.femaflavor.org/flavor/library/p-tolyl-phenylacetate', 'specific gravity': 'NaN', 'coe': 236, 'synonym(s)': 'p-cresyl phenylacetate;p-cresyl alpha-toluate;narcissin;p-tolyl alpha-toluate', 'fema': 3077, 'molecular weight': 226.27, 'stems': 'floral', 'acid value max': 1, 'solubility in ethanol': 'moderately soluble', 'solubility': 'insoluble in water', 'other requirements': 'mp: 71° (minimum)', 'jecfa': '705', 'descriptors': 'Floral', 'physical form/odour': 'white to off white crystals, faint lily, hyacinth, narcissus odour', 'flavis': 'NaN', 'boiling point (°c)': '310°', 'refractive index': 'NaN'}


FEMA chemical 3077 was merged twice, but that does not seem to affect the quality of the record, so will leave as is.

Save the merged records:

In [117]:
merged_chemicals_path = path.join(DATA_PATH, 'merged_chemicals.pkl')
with open(merged_chemicals_path, 'wb') as f:
    pickle.dump(merged_list, f, protocol=pickle.HIGHEST_PROTOCOL)