# FEMA website information extraction

### The library pages

The FEMA website contains a series of [library pages](https://www.femaflavor.org/flavor/library?page=) that list all of the FEMA chemicals.

The functions below extract all of the links available:

In [4]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup, SoupStrainer

def link_to_soup(link, strainer=None):
    '''
    Makes a beautiful soup object from link. Disguises itself
    as a browser so its not confused for a bot

    strainer: limits the html to be parsed

    returns:
    -Soup object if one can be made
    -None otherwise
    '''
    try:
        req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
        page = urlopen(req).read()
        soup = BeautifulSoup(page, 'lxml', parse_only=strainer)
    except:
        return None

    return soup

In [5]:
def fema_link_finder():
    """
    Creates a list of dictionaries with compound names, links and FEMA numbers
    based on the FEMA website library pages
    """
    fema_library_link = 'http://www.femaflavor.org/flavor/library?page='
    fema_base_link = 'http://www.femaflavor.org'
    strainer = SoupStrainer('tbody')
    ret_list = []
    for i in range(28):
        new_link = fema_library_link + str(i)
        soup = link_to_soup(new_link, strainer=strainer)
        rows = soup.findAll('tr')
        
        for row in rows:
            columns = row.find_all('td')
            for col in columns:
                if col.string:
                    num = int(col.string)
                    #print(num)
                elif col.a:
                    name = str(col.a.string).lower()
                    full_link = fema_base_link + col.a.get('href')
                    #print(name, full_link)
            dicto = {'name': name, 'link': full_link, 'fema': num}
            ret_list.append(dicto)
            print('.', end='')
    
    return ret_list

In [None]:
fema_links = fema_link_finder()

In [6]:
fema_links[0]

{'fema': 2001,
 'link': 'http://www.femaflavor.org/acacia-gum-acacia-senegal-l-willd-2',
 'name': 'acacia gum (acacia senegal (l.) willd.)'}

Intermediate data dump

In [3]:
import pickle
import os.path as path

DATA_PATH = path.join(path.expanduser('~'),
                     'Dropbox',
                     'bymt',
                     'data_dumps',
                     'chem_project',
                     'fema_extraction')

fema_links_path = path.join(DATA_PATH, 'fema_links.pkl')

# with open(fema_links_path, 'wb') as f:
#     pickle.dump(fema_links, f, protocol=pickle.HIGHEST_PROTOCOL)

with open(fema_links_path, 'rb') as f:
    fema_links = pickle.load(f)

Each chemical then has its own page (for example, [acetic acid](https://www.femaflavor.org/acetic-acid-2)) from which I will extract:
- Flavor descriptors
- Chemical Abstracts Service (CAS) registry number
- JECFA number
- US Government's Code of Ferderal Regulations (CFR) citation

The folowing functions take the data from `fema_links` to get the data from each individual chemical page:

In [7]:
import nltk
import re
from chemspipy import ChemSpider
cs = ChemSpider('0201ba66-585d-4135-9e6b-d28ba4724fcf')
from rdkit import Chem

def fema_link_info(dicto):
    """
    Create a copy of dicto with extracted information from the FEMA website added
    """
    link_dict = dicto.copy()
    soup = link_to_soup(link_dict['link'])
    if soup: 
        # Get the page title fema number and confirm it matches the number from link_dict
        page_titles = soup.find_all('h2', class_='pageTitle')
        for res in page_titles:
            if len(res.text) > 0:
                title = res.text.split('|')
                title = [word.strip() for word in title]
                title_num = int(title[-1]) #compound number
        if title_num != link_dict['fema']:
            print('FEMA # from link does not match page title', end=' ')
            return None
        
        # Get the page headings and extract their information
        page_headings = soup.find_all('div', class_='field field-type-header')
        for item in page_headings:
            try:
                label = item.find('h3', class_='field-label').stripped_strings
                label = list(label)[0]
                content = item.find('div', class_='field-item').stripped_strings
                content = list(content)[0]
            except:
                continue

            if label == 'FLAVOR PROFILE':
                link_dict['descriptors'] = content
                #lowercase, remove non-word characters (function1), and reduce words
                # to their stem (function2)
                content.lower()
                pattern = re.compile('[\W_]+')
                pattern.sub(' ', content)
                stemmer = nltk.stem.SnowballStemmer('english')
                stems = [stemmer.stem(word) for word in content.split(' ')]
                stems = ' '.join(stems)
                link_dict['stems'] = stems
            elif label == 'CAS':
                link_dict['cas'] = content
            elif label == 'JECFA NUMBER':
                link_dict['jecfa'] = content
            elif label == 'CFR':
                link_dict['cfr'] = content
        
        return link_dict
    
    else:
        print('No soup could be make from the link found')
        return None

In [8]:
def printed_fema_extraction(dicto_list):
    """
    Applies fema_link_info function to a list of individual chemical dictionaries.
    Displays a readout so that progress is known.
    """
    
    out = []
    total = len(dicto_list)
    count = 0
    last_displayed = 0
    
    for dicto in dicto_list:
        out.append(fema_link_info(dicto))
        
        # This noise is all about a nice display with percentage completed
        count += 1
        val = round((count / total) * 100)
        if (val % 5 == 0 and
            val != last_displayed):
            print('{:2.0f}%' .format(val), end = '.')
        else:
            print('.', end='')
        last_displayed = val
    
    return out

In [15]:
def fema_chunker(chunkable, file_name='extracted_fema.pkl', splits=10, chunk_list=None):
    """
    Splits the extraction of individual chemical information into separate chunks.
    As each chunk is completed it is saved into an updated pickle file. 
    chunk_list can specify particular chunks to be processed. 
    """
    total = len(chunkable)
    
    # determine chunk size
    chunk_size, mod = total//(splits), total%splits
    if mod != 0:
        chunk_size = total//(splits-1)
        mod = total%chunk_size
        if (mod == 0 or
           mod < chunk_size/2): # This makes sure that the remainder is not too large
            chunk_size -= round(chunk_size/(2*splits))
    print('Chunk size: {}' .format(chunk_size))
    
    # Generate a list with the chunk indices so that if a specific chunk number is specified
    # it can be found and generated consistently
    start = 0
    end = chunk_size
    start_end_list = []
    while end != total:
        start_end_list.append((start, end))
        start += chunk_size
        end += chunk_size
        if end > total:
            end = total
    start_end_list.append((start,end))
    print('Number of chunks: {}' .format(len(start_end_list)))
    
    # This part dues the actual extraction from the FEMA website
    extracted_fema_path = path.join(DATA_PATH, file_name)
    extracted_fema = []
    
    if not chunk_list:
        iterable = enumerate(start_end_list)
    else:
        sub_is = [start_end_list[i] for i in chunk_list]
        iterable = enumerate(sub_is)
    
    for i, tup in iterable:
        print ('\nChunk number {}, start: {}, end: {}' .format(i, tup[0], tup[1]))
        chunk = chunkable[tup[0]:tup[1]]
        extracted_fema += printed_fema_extraction(chunk)
        
        # Save after every chunk
        with open(extracted_fema_path, 'wb') as f:
            pickle.dump(extracted_fema, f, protocol=pickle.HIGHEST_PROTOCOL)
       
    return extracted_fema

In [10]:
extracted_fema = fema_chunker(fema_links)

Chunk size: 294
Number of chunks: 10

Chunk number 0, start: 0, end: 294
............. 5%..............10%...............15%...............20%...............25%..............30%...............35%...............40%..............45%...............50%...............55%..............60%...............65%...............70%...............75%..............80%...............85%...............90%..............95%...............100%..
Chunk number 1, start: 294, end: 588
............. 5%..............10%...............15%...............20%...............25%..............30%...............35%...............40%..............45%...............50%...............55%..............60%...............65%...............70%...............75%..............80%...............85%...............90%..............95%...............100%..
Chunk number 2, start: 588, end: 882
............. 5%..............10%...............15%...............20%...............25%..............30%...............35%...............40%.

There was an error in chunk # 3 so I will retry it and add it back to extracted_fema

In [14]:
chunk_3 = fema_chunker(fema_links, file_name='chunk3.pkl', chunk_list=[3])

Chunk size: 294
Number of chunks: 10

Chunk number 0, start: 882, end: 1176
............. 5%..............10%...............15%...............20%...............25%..............30%...............35%...............40%..............45%...............50%...............55%..............60%...............65%...............70%...............75%..............80%...............85%...............90%..............95%...............100%..

In [None]:
print(extracted_fema.index(None))
extracted_fema[882:1176] = chunk_3

In [21]:
print(extracted_fema.index(None))

ValueError: None is not in list

In [22]:
print('Length extracted dictionary: {} compared to length of links: {}' 
      .format(len(extracted_fema), len(fema_links)))

Length extracted dictionary: 2795 compared to length of links: 2795


Data dump of the final list of dictionaries with the FEMA website information

In [23]:
extracted_fema_path = path.join(DATA_PATH, 'extracted_fema.pkl')
with open(extracted_fema_path, 'wb') as f:
    pickle.dump(extracted_fema, f, protocol=pickle.HIGHEST_PROTOCOL)