# FEMA website information extraction

### The library pages

The FEMA website contains a series of [library pages](https://www.femaflavor.org/flavor/library?page=) that list all of the FEMA chemicals.

The functions below extract all of the links available:

In [1]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup, SoupStrainer

def link_to_soup(link, strainer=None):
    '''
    support function that downloads a page and makes a beautiful soup object
    from it. Sends a browser-like User-Agent header so it is not confused
    for a bot

    input:
    link: URL to use as the source for the Beautiful soup object
    strainer: optional SoupStrainer, passed on as `parse_only` to limit the
              output soup object to a specific type of content

    returns:
    -Soup object if one can be made
    -None if the request or the parsing fails for any reason
    '''
    try:
        req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
        # The context manager closes the HTTP response as soon as the body
        # has been read, instead of leaving the connection open.
        with urlopen(req) as response:
            page = response.read()
        soup = BeautifulSoup(page, 'lxml', parse_only=strainer)
    except Exception:
        # Best-effort helper: callers test the result for None. Catching
        # Exception (rather than a bare except) still lets KeyboardInterrupt
        # and SystemExit stop a long-running scrape.
        return None

    return soup

In [None]:
def fema_link_finder(pages=28):
    """
    Creates a list of dictionaries with compound names, links and FEMA numbers
    based on the FEMA website library pages

    input:
    pages: number of library pages to visit (page indices 0 .. pages-1).
           Defaults to 28, the value that was previously hard-coded

    returns:
    list of dictionaries, one per table row, with the keys:
    -'name': lower-cased text of the row's link
    -'link': base site address + the href of the row's link
    -'fema': integer read from the row's plain-text cell

    Pages that cannot be loaded and rows that lack either a number or a link
    are skipped. One '.' is printed per stored row as a progress indicator.
    """
    fema_library_link = 'http://www.femaflavor.org/flavor/library?page='
    fema_base_link = 'http://www.femaflavor.org'
    strainer = SoupStrainer('tbody')
    ret_list = []
    for i in range(pages):
        new_link = fema_library_link + str(i)
        soup = link_to_soup(new_link, strainer=strainer)
        if soup is None:
            # link_to_soup returns None on any failure; skip this page
            # instead of crashing on None.find_all
            continue

        for row in soup.find_all('tr'):
            # Reset per row so an incomplete row can never inherit values
            # from the row before it.
            name = full_link = num = None
            for col in row.find_all('td'):
                if col.string:
                    num = int(col.string)
                elif col.a:
                    name = str(col.a.string).lower()
                    full_link = fema_base_link + col.a.get('href')

            if name is None or full_link is None or num is None:
                continue

            ret_list.append({'name': name, 'link': full_link, 'fema': num})
            print('.', end='')

    return ret_list

In [None]:
# Scrape the library pages and keep one dictionary (name, link, fema) per compound.
fema_links = fema_link_finder()

In [None]:
# Look at the first scraped entry as a quick sanity check.
fema_links[0]

Intermediate data dump

In [2]:
import pickle
import os.path as path

# Directory, under the user's home folder, that holds the intermediate pickles.
DATA_PATH = path.join(path.expanduser('~'),
                     'Dropbox',
                     'bymt',
                     'data_dumps',
                     'chem_project',
                     'fema_extraction')

# File used to store the `fema_links` list between sessions.
fema_links_path = path.join(DATA_PATH, 'fema_links.pkl')

# Write step, left commented out: uncomment it to save a freshly scraped
# `fema_links` list to fema_links_path.
# with open(fema_links_path, 'wb') as f:
#     pickle.dump(fema_links, f, protocol=pickle.HIGHEST_PROTOCOL)

# Read step: load `fema_links` from the pickle file at fema_links_path.
with open(fema_links_path, 'rb') as f:
    fema_links = pickle.load(f)

Each chemical then has its own page (for example, [acetic acid](https://www.femaflavor.org/acetic-acid-2)) from which I will extract:
- Flavor descriptors
- Chemical Abstracts Service (CAS) registry number
- JECFA number
- US Government's Code of Federal Regulations (CFR) citation

The following functions use the links stored in `fema_links` to extract the data from each individual chemical page:

In [3]:
import nltk
import re
from chemspipy import ChemSpider
# NOTE(review): the ChemSpider API token is hard-coded in the notebook; consider
# loading it from an environment variable or a config file kept out of version control.
cs = ChemSpider('0201ba66-585d-4135-9e6b-d28ba4724fcf')
from rdkit import Chem

def link_info(dicto):
    """
    Extract and add descriptors, stems, CAS, JECFA, and CFR numbers to a copy
    of dicto, based on the link stored in dicto['link']

    input:
    dicto: dictionary with at least the keys 'link' (address of the compound
           page) and 'fema' (number compared against the one in the page
           title). It is not modified

    returns:
    -a copy of dicto extended with whichever of the keys 'descriptors',
     'stems', 'cas', 'jecfa' and 'cfr' could be read from the page
    -None if the page cannot be loaded, or if the number in the page title is
     missing, unreadable or different from dicto['fema']
    """
    link_dict = dicto.copy()
    soup = link_to_soup(link_dict['link'])
    if not soup:
        print('No soup could be make from the link found')
        return None

    # Get the page title fema number and confirm it matches the number from
    # link_dict. The title is split on '|' and its last field is the number.
    title_num = None
    for res in soup.find_all('h2', class_='pageTitle'):
        if len(res.text) == 0:
            continue
        title = [word.strip() for word in res.text.split('|')]
        try:
            title_num = int(title[-1])  # compound number
        except ValueError:
            # Title without a numeric last field: treat it as "no number".
            continue
    # title_num stays None when no usable title exists, which is reported as
    # a mismatch instead of raising an unbound-variable error.
    if title_num != link_dict['fema']:
        print('FEMA # from link does not match page title', end=' ')
        return None

    # Get the page headings and extract their information
    page_headings = soup.find_all('div', class_='field field-type-header')
    for item in page_headings:
        try:
            label = item.find('h3', class_='field-label').stripped_strings
            label = list(label)[0]
            content = item.find('div', class_='field-item').stripped_strings
            content = list(content)[0]
        except (AttributeError, IndexError):
            # Heading without a label or without content: nothing to store.
            continue

        if label == 'FLAVOR PROFILE':
            link_dict['descriptors'] = content
            # Lowercase, replace runs of non-word characters with a space and
            # reduce each word to its stem. str.lower() and re.sub() return
            # new strings, so their results must be kept.
            cleaned = re.sub(r'[\W_]+', ' ', content.lower())
            stemmer = nltk.stem.SnowballStemmer('english')
            stems = [stemmer.stem(word) for word in cleaned.split()]
            link_dict['stems'] = ' '.join(stems)
        elif label == 'CAS':
            link_dict['cas'] = content
        elif label == 'JECFA NUMBER':
            link_dict['jecfa'] = content
        elif label == 'CFR':
            link_dict['cfr'] = content

    return link_dict

In [4]:
def printed_fema_extraction(dicto_list):
    """
    Run link_info on every dictionary of dicto_list and collect the results.

    While working, a progress readout is printed: one '.' per processed item,
    replaced by the rounded percentage whenever that percentage is a multiple
    of 5 and differs from the one computed for the previous item.

    input:
    dicto_list: sequence of dictionaries accepted by link_info

    returns:
    list with the link_info result of each item, in the same order
    """
    total = len(dicto_list)
    results = []
    previous_pct = 0

    for position, entry in enumerate(dicto_list, start=1):
        results.append(link_info(entry))

        # Progress display only; it has no influence on the collected data.
        pct = round((position / total) * 100)
        if pct % 5 != 0 or pct == previous_pct:
            print('.', end='')
        else:
            print('{:2.0f}%' .format(pct), end = '.')
        previous_pct = pct

    return results

In [12]:
# Print the multiples of 280 that serve as batch boundaries when slicing fema_links.
for i in range(1, 11):
    print(280 * i)

In [8]:
# Batch 1: the first 280 entries of fema_links (indices 0-279).
extracted_fema_1 = printed_fema_extraction(fema_links[:280])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%..

In [10]:
# Batch 2: entries 280-559 of fema_links.
extracted_fema_2 = printed_fema_extraction(fema_links[280:560])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%..

In [11]:
# Batch 3: entries 560-839 of fema_links.
extracted_fema_3 = printed_fema_extraction(fema_links[560:840])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%..

In [14]:
# Concatenate batches 1-3 into a single list; later batches are not included here.
extracted_fema = extracted_fema_1 + extracted_fema_2 + extracted_fema_3

In [15]:
# Persist the combined results as 'extracted_fema.pkl' inside DATA_PATH.
extracted_fema_path = path.join(DATA_PATH, 'extracted_fema.pkl')
with open(extracted_fema_path, 'wb') as f:
    pickle.dump(extracted_fema, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Batch 4: entries 840-1119 of fema_links.
extracted_fema_4 = printed_fema_extraction(fema_links[840:1120])

In [None]:
# Batch 5: entries 1120-1399 of fema_links.
extracted_fema_5 = printed_fema_extraction(fema_links[1120:1400])

In [None]:
# Batch 6: entries 1400-1679 of fema_links.
extracted_fema_6 = printed_fema_extraction(fema_links[1400:1680])

In [None]:
# Batch 7: entries 1680-1959 of fema_links.
extracted_fema_7 = printed_fema_extraction(fema_links[1680:1960])

In [None]:
import nltk

import re

from chemspipy import ChemSpider
# NOTE(review): same hard-coded ChemSpider API token as in the earlier setup cell;
# consider loading it once from an environment variable or an untracked config file.
cs = ChemSpider('0201ba66-585d-4135-9e6b-d28ba4724fcf')
from rdkit import Chem
from rdkit.Chem import Descriptors
from inspect import getmembers, isfunction

In [None]:
def search_and_filter(number,
                      search_prefix='http://www.femaflavor.org/search/apachesolr_search/',
                      substring='/flavor/library/'):
    '''
    support function for dictionary_maker
    searches the Fema website for the number given, keeps the result links
    that contain the substring, and opens them one by one until it finds a
    page whose title ends with that number.

    Inputs:
    -number: Fema number to search for
    -search_prefix: web address prefix to search in
    -substring: to filter results

    Returns a tuple for the first matching page:
    -page_headings: the 'field field-type-header' div elements of the page
    -name: first '|'-separated field of the page title
    -link: address of the page

    or
    -None if the search page cannot be loaded, has no usable results, or no
     result page carries the requested number

    Every candidate link is printed before it is opened.
    '''
    search_link = search_prefix + str(number)
    soup = link_to_soup(search_link)
    if not soup:
        return None
    search_block = soup.find_all('dl', class_='search-results apachesolr_search-results')

    # See if there are any results and extract only the links to flavor compounds
    try:
        titles = search_block[0].find_all('dt', class_='title')
        # extract all search result links
        links = [title.find('a').get('href') for title in titles]
        # select only links with flavor compound substring
        links_checked = [link for link in links if substring in link]
    except (IndexError, AttributeError, TypeError):
        # IndexError: no result block; AttributeError: a title without <a>;
        # TypeError: an <a> without href (None cannot be searched).
        return None

    for link in links_checked:
        print(link)
        soup = link_to_soup(link)
        if not soup:
            continue
        page_title = soup.find('h2', class_='pageTitle')
        if page_title is None:
            # Page without the expected title element: it cannot be verified,
            # so move on instead of failing on None.text
            continue
        page_headings = soup.find_all('div', class_='field field-type-header')
        title = [word.strip() for word in page_title.text.split('|')]
        name = title[0]  # compound name
        title_num = title[-1]  # compound number
        if title_num == str(number):
            return page_headings, name, link

    return None

In [None]:
def chem_search(dict_entry, priotity_list):
    '''
    returns a rdkit molecule after searching the chemspider database based on
    the items in the priority list.

    inputs:
    -dict_entry: dictionary holding the values to search with
    -priotity_list: sequence of tuples (prefix, key), tried in order. For each
     tuple the search string is prefix + dict_entry[key]; keys that are
     missing or hold an empty value are skipped

    returns:
    -the result of same_chemical for the first search that yields one
    -None if no search does
    '''
    for tup in priotity_list:
        try:
            tup_string = dict_entry.get(tup[1])
        except AttributeError:
            # dict_entry has no .get (e.g. it is None): nothing to look up
            continue

        if not tup_string:
            continue

        results = cs.search(tup[0] + tup_string)
        # Evaluate the results once and reuse the molecule, instead of
        # running same_chemical a second time just to return its value.
        molecule = same_chemical(results)
        if molecule:
            return molecule
    return None

In [None]:
def dictionary_maker(num_iterator):
    '''
    returns a dictionary of chemicals found in the femaflavor.org website with
    FEMA numbers in the given num_iterator

    inputs:
    -num_iterator: an iterable object with the fema numbers to be searched
     (lists, ranges and generators are all accepted)

    returns:
    dictionary with fema number as primary key and the following subkeys:
    'link', 'name', 'FEMA', and, when available on the page or from the
    chemical search, 'descriptors', 'tokens', 'CAS', 'JECFA', 'CFR' and
    'rdkit Mol'

    A progress readout is printed: '.' per number, the percentage done every
    10 numbers, ' <number>nLink' when no page is found and ' <number>nMol'
    when no molecule is found.
    '''
    # Materialise the input so the percentage readout also works for
    # iterables without len(), such as generators.
    numbers = list(num_iterator)
    total = len(numbers)

    dictionary = {}
    priority_list = [('fema ', 'FEMA'), ('jecfa ', 'JECFA'), ('', 'CAS'), ('', 'name')]

    for count, number in enumerate(numbers, start=1):
        # page_name_link is (page_headings, name, link) if there is a FEMA
        # web page for number. None otherwise
        page_name_link = search_and_filter(number)

        if page_name_link:
            # Add all information from FEMA webpage to dictionary[number]
            entry = {
                'link': page_name_link[2],
                'name': page_name_link[1],
                'FEMA': str(number),
            }
            dictionary[number] = entry
            for item in page_name_link[0]:
                try:
                    label = item.find('h3', class_='field-label').stripped_strings
                    label = list(label)[0]
                    content = item.find('div', class_='field-item').stripped_strings
                    content = list(content)[0]
                except (AttributeError, IndexError):
                    # Heading without a label or without content
                    continue

                if label == 'FLAVOR PROFILE':
                    entry['descriptors'] = content
                    # Lowercase, replace runs of non-word characters with a
                    # space and reduce each word to its stem. str.lower() and
                    # re.sub() return new strings, so their results are kept.
                    cleaned = re.sub(r'[\W_]+', ' ', content.lower())
                    stemmer = nltk.stem.SnowballStemmer('english')
                    stems = ' '.join(stemmer.stem(word) for word in cleaned.split())
                    # Keep only the stems tagged as noun ('NN') or adjective ('JJ')
                    tagged = nltk.pos_tag(nltk.word_tokenize(stems))
                    entry['tokens'] = [token[0] for token in tagged
                                       if token[1] in ['NN', 'JJ']]
                elif label == 'CAS':
                    entry['CAS'] = content
                elif label == 'JECFA NUMBER':
                    entry['JECFA'] = content
                elif label == 'CFR':
                    entry['CFR'] = content

            # Add rdkit molecule to dictionary[number]['rdkit Mol']
            test = chem_search(entry, priority_list)
            if test:
                entry['rdkit Mol'] = test
            else:
                print(' {}nMol' .format(number), end='')

        else:
            print(' {}nLink' .format(number), end='')

        if count % 10 == 0:
            print(' {:.2f}%' .format(count / total * 100), end='')
        else:
            print('.', end='')
    return dictionary

In [None]:
def same_chemical(results):
    '''
    returns an rdkit chemical object if the chemicals in a chemspipy result
    list all have:
    -the same molecular weight, and
    -the same smiles representation (as written back by rdkit)
    returns None otherwise, including when the result list is empty or none of
    its entries can be converted

    input:
    results: object with a `count` attribute that yields items carrying a
             `smiles` attribute when iterated
    '''
    if results.count < 1:
        return None

    smiles = []
    mws = []

    for chemical in results:
        try:
            chem_base = Chem.MolFromSmiles(chemical.smiles)
            smiles_temp = Chem.MolToSmiles(chem_base)
            mw_temp = Chem.Descriptors.MolWt(chem_base)
        except Exception:
            # Entry that cannot be read or converted: leave it out
            continue
        # Append only after both values exist so the two lists stay in step.
        smiles.append(smiles_temp)
        mws.append(mw_temp)

    if len(set(smiles)) == 1 and len(set(mws)) == 1:
        # Build the molecule from the agreed smiles string rather than from
        # whatever the last loop iteration left behind, which may belong to
        # an entry that failed to convert.
        return Chem.MolFromSmiles(smiles[0])

    return None