# JECFA website information extraction

### The alphabetical index

The JECFA website contains an [index](http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/browse-alphabetically/en/) with all of the chemicals for which it has information. The page's JavaScript uses this [JSON database](http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/browse-alphabetically/jsonlist/en/) to display each of those links.

The code below extracts all of the links available in the JSON database.

In [6]:
import json
import os.path as path

# Folder that holds the files used by the JECFA extraction.
DATA_PATH = path.join(
    path.expanduser('~'),
    'Dropbox', 'bymt', 'data_dumps', 'chem_project', 'jecfa_extraction',
)

# Local copy of the JSON database behind the alphabetical index page.
json_path = path.join(DATA_PATH, 'index_links.json')

# Parse the whole index into memory.
with open(json_path) as f:
    data = json.load(f)

In [145]:
data[0]

{'flavour_name': "<a  href='food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/details/en/c/2008/'  title=''>(+)-Cedrol</a>",
 'sortfield1': 'C',
 'sortfield2': 'Ced',
 'sortfield3': 'Cedro'}

In [146]:
from bs4 import BeautifulSoup

def jecfa_link_finder(data):
    """
    Build a list of {'name': ..., 'link': ...} dicts from the json database
    used by the JECFA website index page.

    Each entry of data is expected to carry a 'flavour_name' field holding an
    HTML anchor; the anchor text (lower-cased) becomes the name and its href,
    prefixed with the site address, becomes the link.
    """
    site_root = 'http://www.fao.org/'

    def to_entry(record):
        # The anchor is the only element of interest in the snippet.
        anchor = BeautifulSoup(record['flavour_name'], 'lxml').a
        return {'name': anchor.text.lower(), 'link': site_root + anchor['href']}

    return [to_entry(record) for record in data]

In [147]:
jecfa_links = jecfa_link_finder(data)

In [149]:
jecfa_links[0]

{'link': 'http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/details/en/c/2008/',
 'name': '(+)-cedrol'}

### Individual chemical pages

Each chemical then has its own page (for example, [acetic acid](http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/details/en/c/3/)) from which I will extract:
- Odor
- Physical form
- Synonyms
- JECFA, CAS, FEMA numbers

The functions below will extract the data

Bonuses:
- COE, FLAVIS numbers
- Molecular weight
- Chemical formula
- Solubility
- Solubility in ethanol
- Boiling point
- Acid value max
- Refractive index
- Specific gravity

In [54]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup, SoupStrainer

def link_to_soup(link, strainer=None):
    '''
    support function makes a beautiful soup object from link. Disguises itself
    as a browser (via the User-Agent header) so its not confused for a bot

    inputs:
    -link: address of the page to download
    -strainer: optional object handed to BeautifulSoup as parse_only, so that
     only part of the page is parsed

    returns:
    -Soup object if one can be made
    -None otherwise (bad address, download failure or parsing failure)
    '''
    try:
        req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the HTTP response once the body is read
        with urlopen(req) as response:
            page = response.read()
        return BeautifulSoup(page, 'lxml', parse_only=strainer)
    except Exception:
        # Deliberately best-effort: any ordinary failure yields None.
        # KeyboardInterrupt / SystemExit are NOT caught, so a long scraping
        # run can still be interrupted by the user.
        return None

In [184]:
# Row labels that link_info skips when reading a chemical's page
# (used as the default for its reject_labels argument).
REJECT_LABELS = [
    'latest jecfa evaluation',
    'status of specification',
    'information required',
    'assay min %',
    'id test',
    'spectrum',
]


def link_info(dicto, reject_labels=REJECT_LABELS):
    """
    Extract and add all available information from the JECFA website to dicto,
    based on the link provided within dicto

    inputs:
    -dicto: dict with (at least) a 'link' key pointing at the chemical's page
     and a 'name' key used to double check that the right page was reached
    -reject_labels: lower-case row labels that should not be extracted

    returns:
    -a copy of dicto with extracted information added (dicto is not modified)
    -None if the page cannot be downloaded or parsed, or if the name shown on
     the page does not match dicto['name']
    """

    def is_float(s):
        try:
            float(s)
            return True
        except ValueError:
            return False

    def convert(value):
        # Convert numbers to ints or floats, EMPTYS TO 'NaN'
        if value.isdigit():
            return int(value)
        # This might cause an issue for flavis but is worth it for molecular weight
        if is_float(value):
            return float(value)
        if not value or value == 'na':
            return 'NaN'
        return value

    mod_dicto = dicto.copy()
    link = dicto['link']
    strainer = SoupStrainer('div', class_='tx-dynafef-pi4')

    # A failed download is reported explicitly instead of relying on an
    # AttributeError raised on None further down.
    soup = link_to_soup(link, strainer)
    if soup is None:
        print('ERROR', end='')
        return None

    try:
        for row in soup.find_all('tr'):
            label = row.find('td', class_='label').text.lower()
            # remove 'number' from the labels for consistency with FEMA data
            label = label.replace('number', '').strip()

            # Check if original name and name on website match
            if label == 'flavouring':
                check_value = row.find('td', class_='value').text.lower()
                if check_value != mod_dicto['name']:
                    print("NAMES DON'T MATCH")
                    return None
            elif label not in reject_labels:
                value = row.find('td', class_='value').text.lower()
                mod_dicto[label] = convert(value)
    except Exception:
        # Best-effort: a page with an unexpected layout is reported and
        # skipped. Exception (rather than a bare except) lets
        # KeyboardInterrupt stop a long extraction run.
        print('ERROR', end='')
        return None

    return mod_dicto

In [189]:
test = link_info(jecfa_links[8])
test

{'acid value max': 'NaN',
 'boiling point (°c)': '60-70° (1-2 mm hg)',
 'cas': '437770-28-0',
 'chemical formula': 'c12h24o2',
 'chemical name': '2,4,8-trimethyl-7-nonen-2-ol',
 'coe': 'NaN',
 'fema': 4212,
 'flavis': 'NaN',
 'jecfa': 1644,
 'link': 'http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/details/en/c/1633/',
 'molecular weight': 184.32,
 'name': '(+/-)-2,4,8-trimethyl-7-nonen-2-ol',
 'other requirements': 'NaN',
 'physical form/odour': 'clear, colourless liquid; fruity aroma',
 'refractive index': '1.448-1.455',
 'solubility': 'insoluble in water; soluble in non-polar organic solvents',
 'solubility in ethanol': 'soluble',
 'specific gravity': '0.846-0.853',
 'synonym(s)': 'NaN'}

In [186]:
def printed_jecfa_extraction(dicto_list):
    """
    Run link_info on every entry of dicto_list and collect the results in a
    list, in the same order.
    Prints a progress readout while working: a percentage whenever a new
    multiple of 5% is reached, a dot otherwise.
    """
    total = len(dicto_list)
    results = []
    previous_pct = 0

    for position, entry in enumerate(dicto_list, start=1):
        results.append(link_info(entry))

        # Everything below only drives the progress display
        pct = round((position / total) * 100)
        at_new_milestone = pct % 5 == 0 and pct != previous_pct
        if at_new_milestone:
            print('{:2.0f}%' .format(pct), end = '.')
        else:
            print('.', end='')
        previous_pct = pct

    return results

In [190]:
test = printed_jecfa_extraction(jecfa_links[:10])

10%.20%.30%.40%.50%.60%.70%.80%.90%.100%.

In [278]:
test[2]

{'acid value max': 1,
 'boiling point (°c)': '110-115° (3.5 mm hg)',
 'cas': '67663-01-8',
 'chemical formula': 'c11h20o2',
 'chemical name': '5-hexyldihydro-4-methylfuran-2(3h)-one',
 'coe': 'NaN',
 'fema': 3999,
 'flavis': 'NaN',
 'jecfa': 1158,
 'link': 'http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/details/en/c/1149/',
 'molecular weight': 184.28,
 'name': '(+/-) 3-methyl-gamma-decalactone',
 'other requirements': 'sc: heptan-1-ol',
 'physical form/odour': 'clear liquid; floral aroma',
 'refractive index': '1.446-1.452',
 'solubility': 'insoluble in water',
 'solubility in ethanol': 'soluble',
 'specific gravity': '0.938-0.944',
 'synonym(s)': 'NaN'}

In [284]:
import pickle

def jecfa_chunker(chunkable, splits=10, chunk_list=None):
    """
    Run printed_jecfa_extraction over chunkable in chunks, saving the results
    gathered so far after every chunk.

    inputs:
    -chunkable: list of dicts accepted by printed_jecfa_extraction
    -splits: approximate number of chunks to divide chunkable into
    -chunk_list: optional list of chunk numbers to process; every chunk is
     processed when it is None or empty

    returns:
    -list with the extraction results of the processed chunks, in the order
     in which they were processed

    side effects:
    -prints the chunk size, the number of chunks and a header per chunk
    -overwrites 'extracted_jecfa.pkl' inside DATA_PATH after every chunk
    """
    total = len(chunkable)

    # determine chunk size
    chunk_size, mod = divmod(total, splits)
    if mod != 0:
        chunk_size = total // (splits - 1)
        # chunk_size is 0 when there are fewer items than splits - 1; the
        # remainder adjustment is skipped then to avoid dividing by zero
        if chunk_size:
            mod = total % chunk_size
            if (mod == 0 or
                    mod < chunk_size / 2):  # This makes sure that the remainder is not too large
                chunk_size -= round(chunk_size / (2 * splits))
    # A chunk always holds at least one item
    chunk_size = max(chunk_size, 1)
    print('Chunk size: {}' .format(chunk_size))

    # Generate a list with the chunk indices so that if a specific chunk number is specified
    # it can be found and generated consistently
    start_end_list = [(start, min(start + chunk_size, total))
                      for start in range(0, total, chunk_size)]
    if not start_end_list:
        # Empty input still yields one (empty) chunk so that the pickle is written
        start_end_list = [(0, 0)]
    print('Number of chunks: {}' .format(len(start_end_list)))

    # Pair every selected chunk with its real chunk number so that the
    # readout stays meaningful when only some chunks are requested
    if not chunk_list:
        selected = list(enumerate(start_end_list))
    else:
        selected = [(i, start_end_list[i]) for i in chunk_list]

    # This part does the actual extraction from the JECFA website
    extracted_jecfa_path = path.join(DATA_PATH, 'extracted_jecfa.pkl')
    extracted_jecfa = []

    for i, tup in selected:
        print ('\nChunk number {}, start: {}, end: {}' .format(i, tup[0], tup[1]))
        chunk = chunkable[tup[0]:tup[1]]
        extracted_jecfa += printed_jecfa_extraction(chunk)

        # Save after every chunk
        with open(extracted_jecfa_path, 'wb') as f:
            pickle.dump(extracted_jecfa, f, protocol=pickle.HIGHEST_PROTOCOL)

    return extracted_jecfa

In [None]:
test = jecfa_chunker(jecfa_links)

Chunk size: 230
Number of chunks: 10

Chunk number 0, start: 0, end: 230
.......... 5%...........10%............15%...........20%............25%.

In [286]:
test

[{'acid value max': 'NaN',
  'boiling point (°c)': 'NaN',
  'cas': '77-53-2',
  'chemical formula': 'c15h26o',
  'chemical name': '(3r,3as,6r,7r,8as)-3,6,8,8-tetramethyloctahydro-1h-3a,7-methanoazulen-6-ol',
  'coe': 'NaN',
  'fema': 4503,
  'flavis': 2.12,
  'jecfa': 2030,
  'link': 'http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/details/en/c/2008/',
  'molecular weight': 222.37,
  'name': '(+)-cedrol',
  'other requirements': 'm.p. = 74-77°',
  'physical form/odour': 'pale yellow to yellow green solid; sweet fruity cedar-like aroma',
  'refractive index': 'NaN',
  'solubility': 'slightly soluble in water',
  'solubility in ethanol': 'soluble',
  'specific gravity': 'NaN',
  'synonym(s)': '8-beta, h-cedran-8-ol'},
 {'acid value max': 'NaN',
  'boiling point (°c)': '234-237°',
  'cas': '877-60-1',
  'chemical formula': 'c11h18o',
  'chemical name': '(+/-)(e,z)-5-(2,2-dimethylcyclopropyl)-3-methyl-2-pentenal',
  'coe': 'NaN',
  'fema': 4105,
  'flavis': '

Intermediate step data dump

In [8]:
import pickle
import os.path as path

# Project-level dump folder (replaces any earlier DATA_PATH value).
DATA_PATH = path.join(
    path.expanduser('~'),
    'Dropbox', 'bymt', 'data_dumps', 'chem_project',
)

fema_links_path = path.join(DATA_PATH, 'fema_links.pkl')

# NOTE(review): fema_links is not defined in the visible code; presumably it
# comes from an earlier notebook session - confirm before re-running.
with open(fema_links_path, 'wb') as f:
    pickle.dump(fema_links, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import nltk

import re

from chemspipy import ChemSpider
# Shared ChemSpider client used by chem_search.
# NOTE(review): the string looks like a personal API token hard-coded in the
# notebook; consider loading it from an environment variable or an untracked
# config file instead - confirm with the owner.
cs = ChemSpider('0201ba66-585d-4135-9e6b-d28ba4724fcf')
from rdkit import Chem
from rdkit.Chem import Descriptors
from inspect import getmembers, isfunction

In [None]:
def search_and_filter(number,
                      search_prefix='http://www.femaflavor.org/search/apachesolr_search/',
                      substring='/flavor/library/'):
    '''
    support function for dictionary_maker
    searches the Fema website for the number given, follows the search
    results whose link contains the substring, and returns the details of the
    first page whose title ends with that number.

    Inputs:
    -number: Fema number to search for
    -search_prefix: web address prefix to search in
    -substring: to filter results

    Returns a tuple with:
    -page_headings: the 'field field-type-header' divs of the matching page
    -name: first '|'-separated part of the page title
    -link: address of the matching page

    or
    -None if the search fails or no matching page is found
    '''
    soup = link_to_soup(search_prefix + str(number))
    if not soup:
        return None

    search_block = soup.find_all('dl', class_='search-results apachesolr_search-results')
    if not search_block:
        # the search returned no results block at all
        return None

    # extract the search result links, keeping only links with the flavor
    # compound substring; results without a usable link are skipped
    links_checked = []
    for result_title in search_block[0].find_all('dt', class_='title'):
        anchor = result_title.find('a')
        href = anchor.get('href') if anchor is not None else None
        if href and substring in href:
            links_checked.append(href)

    for link in links_checked:
        print(link)
        soup = link_to_soup(link)
        if not soup:
            continue
        page_title = soup.find('h2', class_='pageTitle')
        if page_title is None:
            # pages without a title heading cannot be matched to a number
            continue
        title = [word.strip() for word in page_title.text.split('|')]
        name = title[0]  # compound name
        title_num = title[-1]  # compound number
        if title_num == str(number):
            page_headings = soup.find_all('div', class_='field field-type-header')
            return page_headings, name, link

    return None

In [None]:
def same_chemical(results):
    '''
    returns an rdkit chemical object if the chemicals in a chemspipy result
    list that rdkit can parse all have:
    -the same molecular weight, and
    -the same smiles representation (after the rdkit round trip)
    returns None otherwise, including when there are no results or when none
    of them can be parsed
    '''
    if results.count == 0:
        return None

    smiles = set()
    mws = set()

    for chemical in results:
        try:
            chem_base = Chem.MolFromSmiles(chemical.smiles)
            smiles_temp = Chem.MolToSmiles(chem_base)
            mw_temp = Chem.Descriptors.MolWt(chem_base)
        except Exception:
            # results that cannot be parsed are ignored
            continue
        smiles.add(smiles_temp)
        mws.add(mw_temp)

    if len(smiles) == 1 and len(mws) == 1:
        # Rebuild the molecule from the single agreed smiles string rather
        # than from the last result looked at, which may have failed to parse
        return Chem.MolFromSmiles(next(iter(smiles)))

    return None

In [None]:
def chem_search(dict_entry, priotity_list):
    '''
    returns a rdkit molecule after searching the chemspider database based on the items
    in the priority list.

    inputs:
    -dict_entry: dictionary describing one chemical
    -priotity_list: sequence of (search prefix, dict_entry key) tuples, tried
     in order; entries whose key is missing or empty are skipped

    returns:
    -the molecule given by same_chemical for the first search whose results
     agree with each other
    -None if no search produces such a molecule
    '''

    for tup in priotity_list:
        prefix, key = tup[0], tup[1]
        try:
            tup_string = dict_entry.get(key)
        except AttributeError:
            # dict_entry is not dict-like, nothing to search for
            continue

        if not tup_string:
            continue

        results = cs.search(prefix + tup_string)
        # same_chemical is evaluated once and its result reused
        molecule = same_chemical(results)
        if molecule:
            return molecule

    return None

In [None]:
def _descriptor_tokens(content):
    '''
    support function for dictionary_maker
    lowercases content, replaces runs of non-word characters with a space,
    reduces every word to its stem and returns the stems tagged by nltk as
    'NN' or 'JJ'
    '''
    cleaned = re.sub(r'[\W_]+', ' ', content.lower())
    stemmer = nltk.stem.SnowballStemmer('english')
    stems = ' '.join(stemmer.stem(word) for word in cleaned.split())
    tagged = nltk.pos_tag(nltk.word_tokenize(stems))
    return [word for word, tag in tagged if tag in ('NN', 'JJ')]


def dictionary_maker(num_iterator):
    '''
    returns a dictionary of chemicals found in the femaflavor.org website with FEMA numbers in
    the given num_iterator

    inputs:
    -num_iterator: an iterable object with the fema numbers to be searched

    returns:
    dictionary with fema number as primary key and the following subkeys:
    'link', 'name', 'FEMA', and, when available on the page or from the
    search, 'descriptors', 'tokens', 'CAS', 'JECFA', 'CFR', 'rdkit Mol'

    side effects:
    prints a progress readout, flagging numbers with no page ('nLink') or no
    molecule ('nMol')
    '''

    dictionary = {}
    count = 0
    priority_list = [('fema ', 'FEMA'), ('jecfa ', 'JECFA'), ('', 'CAS'), ('', 'name')]

    # A percentage can only be shown when the number of items is known
    try:
        total = len(num_iterator)
    except TypeError:
        total = None

    for number in num_iterator:
        # page_name_link is (page_headings, name, link) if there is a FEMA website for number.
        # None otherwise
        page_name_link = search_and_filter(number)

        if page_name_link:
            # Add all information from FEMA webpage to dictionary[number][subentries]
            entry = {
                'link': page_name_link[2],
                'name': page_name_link[1],
                'FEMA': str(number),
            }
            dictionary[number] = entry

            for item in page_name_link[0]:
                try:
                    label = item.find('h3', class_='field-label').stripped_strings
                    label = list(label)[0]
                    content = item.find('div', class_='field-item').stripped_strings
                    content = list(content)[0]
                except (AttributeError, IndexError):
                    # heading without a label or without content
                    continue

                if label == 'FLAVOR PROFILE':
                    entry['descriptors'] = content
                    entry['tokens'] = _descriptor_tokens(content)
                elif label == 'CAS':
                    entry['CAS'] = content
                elif label == 'JECFA NUMBER':
                    entry['JECFA'] = content
                elif label == 'CFR':
                    entry['CFR'] = content

            # Add rdkit molecule to dictionary[number]['rdkit Mol']
            molecule = chem_search(entry, priority_list)
            if molecule:
                entry['rdkit Mol'] = molecule
            else:
                print(' {}nMol' .format(number), end='')

        else:
            print(' {}nLink' .format(number), end='')

        count += 1
        if count % 10 == 0:
            if total:
                print(' {:.2f}%' .format(count / total * 100), end='')
            else:
                print(' {}' .format(count), end='')
        else:
            print('.', end='')
    return dictionary