# JECFA website information extraction

### The alphabetical index

The JECFA website contains an [index](http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/browse-alphabetically/en/) of all the chemicals for which it has information. The page's JavaScript uses this [JSON database](http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/browse-alphabetically/jsonlist/en/) to display each of those links.

The scripts below extract all of the links available from the JSON database.

In [6]:
import json
import os.path as path

# Local folder (inside the user's Dropbox) that holds the JECFA data dumps
DATA_PATH = path.join(path.expanduser('~'),
                     'Dropbox',
                     'bymt',
                     'data_dumps',
                     'chem_project',
                     'jecfa_extraction')

# Saved copy of the JSON database behind the alphabetical index page
json_path = path.join(DATA_PATH, 'index_links.json')

# JSON text is UTF-8 by specification, so do not depend on the platform's
# default encoding when reading it
with open(json_path, encoding='utf-8') as f:
    data = json.load(f)

In [145]:
# Inspect the first record to see how each index entry is structured
data[0]

{'flavour_name': "<a  href='food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/details/en/c/2008/'  title=''>(+)-Cedrol</a>",
 'sortfield1': 'C',
 'sortfield2': 'Ced',
 'sortfield3': 'Cedro'}

In [146]:
from bs4 import BeautifulSoup

def jecfa_link_finder(data):
    """
    Builds one {'name': ..., 'link': ...} dict per record of the JSON database
    behind the JECFA index page.

    Each record's 'flavour_name' holds an HTML anchor; the lower-cased anchor
    text becomes the name and the anchor's href, prefixed with the site
    address, becomes the link.
    """
    site_root = 'http://www.fao.org/'

    def to_entry(record):
        # The anchor is the only element of interest in the snippet
        anchor = BeautifulSoup(record['flavour_name'], 'lxml').a
        return {'name': anchor.text.lower(),
                'link': site_root + anchor['href']}

    return [to_entry(record) for record in data]

In [147]:
# Turn every index record into a {'name', 'link'} dict
jecfa_links = jecfa_link_finder(data)

In [149]:
# Inspect the first extracted entry
jecfa_links[0]

{'link': 'http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/details/en/c/2008/',
 'name': '(+)-cedrol'}

### Individual chemical pages

Each chemical then has its own page (for example, [acetic acid](http://www.fao.org/food/food-safety-quality/scientific-advice/jecfa/jecfa-flav/details/en/c/3/)) from which I will extract:
- Odor
- Physical form
- Synonyms
- JECFA, CAS, FEMA, COE, and FLAVIS numbers
- Molecular weight
- Chemical formula
- Solubility
- Solubility in ethanol
- Boiling point
- Acid value max
- Refractive index
- Specific gravity

In [54]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup, SoupStrainer

def link_to_soup(link, strainer=None):
    '''
    support function makes a beautiful soup object from link. Disguises itself
    as a browser so its not confused for a bot

    link: address of the page to download
    strainer: optional SoupStrainer passed to BeautifulSoup as parse_only so
    that only the matching part of the page is parsed

    returns:
    -Soup object if one can be made
    -None otherwise (bad address, failed download, failed parse)
    '''
    try:
        req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the HTTP response once it has been read
        with urlopen(req) as response:
            page = response.read()
        soup = BeautifulSoup(page, 'lxml', parse_only=strainer)
    except Exception:
        # Best effort: any ordinary failure yields None. KeyboardInterrupt and
        # SystemExit are deliberately not caught so a long run can be stopped.
        return None

    return soup

In [184]:
# Row labels of a chemical's page (lower case, with 'number' removed) whose
# values are not copied into the extracted record
REJECT_LABELS = [
    'latest jecfa evaluation',
    'status of specification',
    'information required',
    'assay min %',
    'id test',
    'spectrum',
]


def jecfa_link_info(dicto, reject_labels=REJECT_LABELS):
    """
    Create a copy of dicto with extracted information from the JECFA website added
    reject_labels specifies which information not to include.

    dicto: dict with at least a 'link' (page address) and a 'name' entry
    reject_labels: row labels (lower case, 'number' removed) to leave out

    returns:
    -the extended copy of dicto; dicto itself is not modified
    -None if the page could not be downloaded or parsed ('ERROR' is printed),
     or if the 'flavouring' row does not match dicto['name']
    """

    def is_float(s):
        """
        Determines if string represents a float-type number
        """
        try:
            float(s)
            return True
        except ValueError:
            return False

    mod_dicto = dicto.copy()
    link = dicto['link']
    strainer = SoupStrainer('div', class_='tx-dynafef-pi4')
    try:
        soup = link_to_soup(link, strainer)
        if soup is None:
            # link_to_soup reports a failed download or parse with None
            print('ERROR', end='')
            return None

        for row in soup.find_all('tr'):
            # A row without a label cell raises here and fails the whole page
            label = row.find('td', class_='label').text.lower()
            #remove 'number' from the labels for consistency with FEMA data
            label = label.replace('number', '').strip()

            # Check if original name and name on website match
            if label == 'flavouring':
                check_value = row.find('td', class_='value').text.lower()
                if check_value != mod_dicto['name']:
                    print("NAMES DON'T MATCH")
                    return None
            elif label not in reject_labels:
                value = row.find('td', class_='value').text.lower()
                #Convert numbers to ints or floats, empties to the string 'NaN'
                if value.isdigit():
                    value = int(value)
                #This might cause an issue for flavis but is worth it for molecular weight
                elif is_float(value):
                    value = float(value)
                elif (not value or
                      value == 'na'):
                    value = 'NaN'
                mod_dicto[label] = value
    except Exception:
        # Best effort per page: report and carry on with the next chemical.
        # KeyboardInterrupt is not caught, so a long run can still be stopped.
        print('ERROR', end='')
        return None

    return mod_dicto

In [186]:
def printed_jecfa_extraction(dicto_list):
    """
    Runs jecfa_link_info on every chemical dictionary of dicto_list and returns
    the results in the same order.

    While running, a dot is printed per chemical; whenever the rounded
    percentage completed reaches a new multiple of 5 the percentage is
    printed instead.
    """
    results = []
    n_items = len(dicto_list)
    previous_pct = 0

    for position, entry in enumerate(dicto_list, start=1):
        results.append(jecfa_link_info(entry))

        # Progress readout only; it has no effect on the results
        percent = round((position / n_items) * 100)
        at_new_milestone = percent % 5 == 0 and percent != previous_pct
        if at_new_milestone:
            print('{:2.0f}%' .format(percent), end = '.')
        else:
            print('.', end='')
        previous_pct = percent

    return results

In [284]:
import pickle

def jecfa_chunker(chunkable, file_name='pre_odor_jecfa.pkl', splits=10, chunk_list=None):
    """
    Splits the extraction of individual chemical information into separate chunks.
    As each chunk is completed, everything extracted so far in this call is
    saved to a pickle file, overwriting the previous save.

    chunkable: list of chemical dictionaries (as taken by printed_jecfa_extraction)
    file_name: name of the pickle file, created inside DATA_PATH
    splits: approximate number of chunks to aim for, must be at least 1
    chunk_list: optional indices of the chunks to process. When None or empty
    every chunk is processed. Note that the pickle file then only contains the
    chunks processed in this call.

    returns: list with the results of all processed chunks, in processing order
    raises: ValueError if splits is smaller than 1
    """
    if splits < 1:
        raise ValueError('splits must be at least 1')

    total = len(chunkable)

    # determine chunk size
    chunk_size, mod = divmod(total, splits)
    if mod != 0:
        chunk_size = total // (splits - 1)
        # chunk_size is 0 when there are fewer items than splits - 1; the
        # remainder tuning is then meaningless (and would divide by zero)
        if chunk_size:
            mod = total % chunk_size
            if (mod == 0 or
               mod < chunk_size/2): # This makes sure that the remainder is not too large
                chunk_size -= round(chunk_size/(2*splits))
    # Never use an empty chunk size: small inputs get one item per chunk
    chunk_size = max(chunk_size, 1)
    print('Chunk size: {}' .format(chunk_size))

    # Generate a list with the chunk indices so that if a specific chunk number is specified
    # it can be found and generated consistently
    start_end_list = [(start, min(start + chunk_size, total))
                      for start in range(0, total, chunk_size)]
    if not start_end_list:
        # Empty input: keep a single empty chunk so the pickle file is still written
        start_end_list = [(0, 0)]
    print('Number of chunks: {}' .format(len(start_end_list)))

    # This part does the actual extraction from the JECFA website
    extracted_jecfa_path = path.join(DATA_PATH, file_name)
    extracted_jecfa = []

    if not chunk_list:
        numbered_chunks = list(enumerate(start_end_list))
    else:
        # Keep the real chunk index so the readout names the requested chunk
        numbered_chunks = [(i, start_end_list[i]) for i in chunk_list]

    for i, (start, end) in numbered_chunks:
        print ('\nChunk number {}, start: {}, end: {}' .format(i, start, end))
        chunk = chunkable[start:end]
        extracted_jecfa += printed_jecfa_extraction(chunk)

        # Save after every chunk
        with open(extracted_jecfa_path, 'wb') as f:
            pickle.dump(extracted_jecfa, f, protocol=pickle.HIGHEST_PROTOCOL)

    return extracted_jecfa

In [287]:
# Run the extraction over every link with the default chunking settings
pre_odor_jecfa = jecfa_chunker(jecfa_links)

Chunk size: 230
Number of chunks: 10

Chunk number 0, start: 0, end: 230
.......... 5%...........10%............15%...........20%............25%...........30%............35%...........40%............45%...........50%............55%...........60%............65%...........70%............75%...........80%............85%...........90%............95%...........100%..
Chunk number 1, start: 230, end: 460
.......... 5%...........10%............15%...........20%............25%...........30%............35%...........40%............45%...........50%............55%...........60%............65%...........70%............75%...........80%............85%...........90%............95%...........100%..
Chunk number 2, start: 460, end: 690
.......... 5%...........10%............15%...........20%............25%...........30%............35%...........40%............45%...........50%............55%...........60%............65%...........70%............75%...........80%............85%...........90%..........

In [289]:
# Sanity check: the extraction should hold one result per link
print('Length extracted dictionary: {} compared to length of links: {}' 
      .format(len(pre_odor_jecfa), len(jecfa_links)))

Length extracted dictionary: 2183 compared to length of links: 2183


The JECFA chemical pages combine the physical form with the odor characteristics. I only want the odor descriptions, so I will split off the physical description.

In [1]:
# Reload intermediate data-dump

# import os.path as path
# import pickle

# DATA_PATH = path.join(path.expanduser('~'),
#                      'Dropbox',
#                      'bymt',
#                      'data_dumps',
#                      'chem_project',
#                      'jecfa_extraction')

# pre_odor_jecfa_path = path.join(DATA_PATH, 'pre_odor_jecfa.pkl')

# with open(pre_odor_jecfa_path, 'rb') as f:
#     pre_odor_jecfa = pickle.load(f)

Look at the strings to find a pattern

In [7]:
import random

# Pick 30 distinct random positions and print the combined
# physical form / odour text found at each of them
rand_is = random.sample(range(len(pre_odor_jecfa)), 30)
for i in rand_is:
    print (pre_odor_jecfa[i]['physical form/odour'])

colourless clear liquid; fatty floral aroma
colourless to pale yellow liquid with cabbage odour
colourless liquid
colorless mobile liquid; camphor like aroma
colourless liquid with repulsive, mercaptan-like odour
colourless to yellow liquid; pepper-like aroma
white crystals, heavy balsamic-like rosy odour
colourless or yellowish slighty oily liquid; very sweet odour reminiscent of red rose with a fruity undertone
yellow to green liquid with a  penetrating, buttery odour on dilution
colourless to yellow liquid with strong odour
yellowish liquid
colourless liquid; celery, herbacous, spicy aroma
colourless liquid with a fruity odour
brownish liquid; roasted meat aroma
colourless to pale yellow liquid with metallic fruity odour
white to colourless liquid or crystals at room temperature; powerful floral, fruity aroma
white crystals
colourless liquid; fruity aroma
colourless liquid; floral fruity aroma
colourless to pale straw-yellow viscous liquid; faint, orris-like, green, sweet, woody aro

The most common patterns are:

1) A semicolon divides the physical characteristics from the odor

2) The physical descriptor ('liquid', 'solid', 'crystal', 'flake') immediately, or almost immediately, precedes the odor

In [115]:
import nltk
import re

def jecfa_odor_cleaner(dicto_list, stemmer=None):
    """
    Splits the combined 'physical form/odour' text of each chemical into a
    physical description and an odor description.

    The text is lower-cased, commas and hyphens are replaced by spaces and every
    word is stemmed. The stemmed text is stored under 'stems'. It is then cut
    right after the last occurrence of a splitter (';', 'liquid', 'solid',
    'crystal' or 'flake'): the first part is stored under 'physical' and the
    rest (possibly an empty string) under 'odor'. Entries without any splitter
    only get 'stems'.

    dicto_list: list of chemical dictionaries; None entries and entries whose
    'physical form/odour' is missing or not a string are passed through as is
    stemmer: optional object with a stem(word) method; defaults to the nltk
    English Snowball stemmer

    returns: a new list holding copies of the dictionaries with the keys
    above added. The dictionaries in dicto_list are not modified.
    Prints how many entries contained a splitter.
    """
    if stemmer is None:
        stemmer = nltk.stem.SnowballStemmer('english')
    # Built once, they are the same for every entry
    punctuation = re.compile(r'[,-]')
    splitters = re.compile(r'(;|liquid|solid|crystal|flake)')

    new_list = []
    count = 0
    for dicto in dicto_list:
        if dicto is None:
            # Failed extractions are stored as None; keep their position
            new_list.append(None)
            continue

        entry = dicto.copy()
        new_list.append(entry)

        text = entry.get('physical form/odour')
        if not isinstance(text, str):
            # Nothing to split (label missing on the page, or not text)
            continue

        new_text = punctuation.sub(' ', text.lower())
        stems = ' '.join(stemmer.stem(word) for word in new_text.split(' '))
        entry['stems'] = stems

        matches = list(splitters.finditer(stems))
        if matches:
            # Everything up to and including the last splitter is physical
            cut = matches[-1].end()
            entry['physical'] = stems[:cut]
            entry['odor'] = stems[cut:]
            count += 1

    print('splits: {} out of {}' .format(count, len(dicto_list)))
    return new_list

In [116]:
# Try the cleaner on the full extraction
test = jecfa_odor_cleaner(pre_odor_jecfa)

splits: 2102 out of 2183


In [119]:
# For the sampled entries, show the stemmed text of those with no odor part
# (the 'odor' key is missing or holds an empty string)
for i in rand_is:
    if not test[i].get('odor'):
        print(test[i]['stems'])
#     print ('original: {}\nphysical: {} ////// odor: {}'
#            .format(test[i]['physical form/odour'],
#                    test[i].get('physical'),
#                    test[i].get('odor')))

colourless liquid
yellowish liquid
white crystal
clear  colourless to yellowish liquid


In [105]:
# Scratch check: run the splitter pattern over a text without any splitter
splitters = re.compile(r'(;|liquid|solid|crystal|flake)')
stems = 'pale'
m = splitters.finditer(stems)

In [106]:
# Materialise the match iterator into a list
m = list(m)

In [109]:
# Scratch check: only prints when the list of matches is not empty
if m:
    print('yeah!')

In [104]:
# Scratch check: look at the first 34 characters of a stems string
stems[:34]

'pale yellow to yellow green solid;'