# Merge FEMA and JECFA datasets

Chemical information from the FEMA website was generated in [fema_extraction](fema_extraction.ipynb).
Chemical information from the JECFA website was generated in [jecfa_extraction](jecfa_extraction.ipynb). 

Here the two datasets are merged into a single one, making sure that the merged chemicals are in fact the same.

In [1]:
import os.path as path
import pickle
from data_paths import BASE_PATH

extracted_fema_path = path.join(BASE_PATH, 'fema_extraction', 'extracted_fema.pkl')

with open(extracted_fema_path, 'rb') as f:
    extracted_fema = pickle.load(f)

# Load JECFA database
extracted_jecfa_path = path.join(BASE_PATH, 'jecfa_extraction', 'extracted_jecfa.pkl')

with open(extracted_jecfa_path, 'rb') as f:
    extracted_jecfa = pickle.load(f)
    
# Data path for things made in this notebook

DATA_PATH = path.join(BASE_PATH, 'fema_jecfa_merge')

In [21]:
from copy import deepcopy

def fema_jecfa_merge(fema_d_list, jecfa_d_list):
    """
    Merges the individual chemical dictionaries from the FEMA and JECFA websites.
    Merges FEMA info onto JECFA info.
    Does not modify original dictionary lists
    """
    # everything will be merged onto the jecfa_c_list
    jecfa_c_list = deepcopy(jecfa_d_list)
    
    fema_c_list = deepcopy(fema_d_list)
    overlap = 0
    fema_only = 0
    jecfa_only = 0
    
    print('Length before merge: {}' .format(len(jecfa_c_list)))
    # Merge all overlapping chemicals
    for jecfa_c in jecfa_c_list:
        for fema_c in fema_c_list:
            if fema_c['fema'] == jecfa_c['fema']:
                # Capture the FEMA name under the synonym key if it's different from JECFA
                if fema_c['name'] != jecfa_c['name']:
                    if jecfa_c['synonym(s)'] == 'NaN':
                           jecfa_c['synonym(s)'] = fema_c['name']
                    else:
                        jecfa_c['synonym(s)'] += ', ' + fema_c['name']
                
                jecfa_c['fema link'] = fema_c['link']
                jecfa_c['jecfa link'] = jecfa_c['link']
                del jecfa_c['link']
                fema_c['both merge'] = True
                
                jecfa_c.update(fema_c)
                
                overlap += 1
    
    print('Length after overlap merge: {}' .format(len(jecfa_c_list)))
    
    # Copy non-overlapping FEMA into JECFA
    for fema_c in fema_c_list:
        if 'both merge' not in fema_c:
            fema_c['fema merge'] = True
            #make link key name consistent
            fema_c['fema link'] = fema_c['link']
            del fema_c['link']
            # add fema only link to merged list
            jecfa_c_list.append(fema_c)
            fema_only += 1
    
    print('Length after overlap and FEMA only merge: {}' .format(len(jecfa_c_list)))
    # Count jecfa only
    
    merges = ['both merge', 'fema merge']
    for jecfa_c in jecfa_c_list:
        if not any(merge in merges for merge in jecfa_c):
            #make link key name consistent
            jecfa_c['jecfa link'] = jecfa_c['link']
            del jecfa_c['link']
            jecfa_c['jecfa merge'] = True
            jecfa_only += 1
    
    print('{} overlaping chemicals, {} FEMA only, {} JECFA only' 
         .format(overlap, fema_only, jecfa_only))
    
    expected_length = overlap+fema_only+jecfa_only
    
    print('Total length: {}, overlap counts indicate it should be: {}' 
          .format(len(jecfa_c_list), expected_length))
    
    return jecfa_c_list

In [22]:
merged_list = fema_jecfa_merge(extracted_fema, extracted_jecfa)

Length before merge: 2183
Length after overlap merge: 2183
Length after overlap and FEMA only merge: 2833
2153 overlaping chemicals, 650 FEMA only, 31 JECFA only
Total length: 2833, overlap counts indicate it should be: 2834


Some record is not adding up. Figure it out

In [23]:
counts = [0, 0, 0]
for dicto in merged_list:
    if 'both merge' in dicto:
        counts[0] += 1
    elif 'fema merge' in dicto:
        counts [1] += 1
    elif 'jecfa merge' in dicto:
        counts[2] += 1
print(counts)

[2152, 650, 31]


The independent `both merge` count is 2152 although when the merge script says it was 2153, which suggests that some FEMA record got merged twice onto the same JECFA record, which explains why the expected length is 1 more than the actual length. Confirm this:

In [4]:
fema_nums = []
for dicto in extracted_fema:
    fema_nums.append(dicto['fema'])

dups = []
for num in fema_nums:
    count = fema_nums.count(num)
    if count > 1:
        dups.append((num, count))

dups = set(dups)

for tup in dups:
    print('{} is found {} times in extracted_fema'
      .format(tup[0], tup[1]))
    for dicto in extracted_fema:
        if dicto['fema'] == tup[0]:
            print(dicto)

3077 is found 2 times in extracted_fema
{'link': 'http://www.femaflavor.org/flavor/library/p-tolyl-phenylacetate-0', 'cas': '101-94-0', 'descriptors': 'Floral', 'name': 'p-tolyl phenylacetate', 'fema': 3077, 'stems': 'floral', 'jecfa': '705', 'cfr': '21CFR172.515'}
{'link': 'http://www.femaflavor.org/flavor/library/p-tolyl-phenylacetate', 'cas': '101-94-0', 'descriptors': 'Floral', 'name': 'p-tolyl phenylacetate', 'fema': 3077, 'stems': 'floral', 'jecfa': '705', 'cfr': '21CFR172.515'}


In [5]:
for dicto in merged_list:
    if dicto['fema'] == 3077:
        print(dicto)

{'both merge': True, 'solubility in ethanol': 'moderately soluble', 'coe': 236, 'link': 'http://www.femaflavor.org/flavor/library/p-tolyl-phenylacetate', 'jecfa link': 'http://www.femaflavor.org/flavor/library/p-tolyl-phenylacetate-0', 'chemical name': 'p-tolyl phenylacetate', 'boiling point (°c)': '310°', 'acid value max': 1, 'fema': 3077, 'refractive index': 'NaN', 'name': 'p-tolyl phenylacetate', 'molecular weight': 226.27, 'physical form/odour': 'white to off white crystals, faint lily, hyacinth, narcissus odour', 'flavis': 'NaN', 'chemical formula': 'c15h14o2', 'other requirements': 'mp: 71° (minimum)', 'jecfa': '705', 'fema link': 'http://www.femaflavor.org/flavor/library/p-tolyl-phenylacetate', 'cas': '101-94-0', 'descriptors': 'Floral', 'synonym(s)': 'p-cresyl phenylacetate;p-cresyl alpha-toluate;narcissin;p-tolyl alpha-toluate', 'cfr': '21CFR172.515', 'solubility': 'insoluble in water', 'stems': 'floral', 'specific gravity': 'NaN'}


FEMA chemical 3077 was merged twice, but that does not seem to affect the quality of the record, so will leave as is.

Merge the FEMA `stems` with the JECFA `odor`, into `merged descriptors` which can be used for downstream clustering:

In [48]:
def merge_descriptors(merged_list):
    """
    Returns a new dictionaty list with the FEMA 'stems' merged with the JECFA 'odor' values
    into 'merged descriptors'
    Removes commas from the stems
    """
    count = 0 #To keep track of chemicals that don't have any useful descriptors
    cop = deepcopy(merged_list)
    for dicto in cop:
        stems = dicto.get('stems', '')
        stems = stems.replace(',', '')
        merge = stems + ' ' + dicto.get('odor', '')
        merge = merge.strip()
        if merge:
            dicto['merged descriptors'] = merge
        else:
            count += 1
    return cop, count

In [49]:
merged_list, useless = merge_descriptors(merged_list)

In [51]:
print('Number of chemicals with no useful descriptors: {}' .format(useless))

Number of chemicals with no useful descriptors: 640


Remove chemicals with no useful descriptors so they don't bog down downstream analysis

In [52]:
def remove_useless(dicto_list):
    """
    Returns a new dicto_list without the dictos missing a 'merged descriptors' key
    """
    new_list = []
    for dicto in dicto_list:
        if 'merged descriptors' in dicto:
            new_list.append(dicto)
    return new_list

In [53]:
useful_chemicals = remove_useless(merged_list)
print('Lengths make sense: {}' .format(len(useful_chemicals) == len(merged_list) - useless))

Lengths make sense: True


Save the list of useful chemicals:

In [54]:
merged_chemicals_path = path.join(DATA_PATH, 'merged_chemicals.pkl')
with open(merged_chemicals_path, 'wb') as f:
    pickle.dump(useful_chemicals, f, protocol=pickle.HIGHEST_PROTOCOL)