# FEMA website information extraction

### The library pages

The FEMA website contains a series of [library pages](https://www.femaflavor.org/flavor/library?page=) that list all of the FEMA chemicals.

The functions below extract all of the links available:

In [1]:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup, SoupStrainer

def link_to_soup(link, strainer=None):
    '''
    Support function that builds a Beautiful Soup object from the page at
    `link`. The request carries a browser-like User-Agent header so that it
    is not mistaken for a bot.

    input:
    link: URL to download and use as the source for the Beautiful Soup object
    strainer: optional SoupStrainer that limits the parsed output to a
              specific type of content

    returns:
    -Soup object if the page could be downloaded and parsed
    -None otherwise
    '''
    try:
        req = Request(link, headers={'User-Agent': 'Mozilla/5.0'})
        # Use the response as a context manager so the connection is closed
        # as soon as the body has been read.
        with urlopen(req) as response:
            page = response.read()
        soup = BeautifulSoup(page, 'lxml', parse_only=strainer)
    except Exception:
        # Best-effort by design: any download/parse failure yields None.
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate so a long scrape can still be interrupted.
        return None

    return soup

In [None]:
def fema_link_finder(n_pages=28):
    """
    Creates a list of dictionaries with compound names, links and FEMA numbers
    based on the FEMA website library pages.

    input:
    n_pages: number of library pages to walk (page indices 0 .. n_pages - 1).
             Defaults to 28, the value that was previously hard-coded.

    returns:
    list of dicts with the keys 'name' (lower-cased link text), 'link'
    (absolute URL of the compound page) and 'fema' (int). Pages that cannot
    be downloaded, and table rows that lack any of the three fields, are
    skipped.
    """
    fema_library_link = 'http://www.femaflavor.org/flavor/library?page='
    fema_base_link = 'http://www.femaflavor.org'
    strainer = SoupStrainer('tbody')
    ret_list = []
    for i in range(n_pages):
        new_link = fema_library_link + str(i)
        soup = link_to_soup(new_link, strainer=strainer)
        if soup is None:
            # link_to_soup returns None when the page could not be fetched or
            # parsed; skip the page instead of failing on None.
            print('No soup could be made from library page {}'.format(i))
            continue

        for row in soup.find_all('tr'):
            # Reset for every row so that an incomplete row can neither raise
            # a NameError nor silently reuse values from the previous row.
            name = full_link = num = None
            for col in row.find_all('td'):
                if col.string:
                    # A cell with a single plain string holds the FEMA number.
                    num = int(col.string)
                elif col.a:
                    # A cell with a link holds the compound name and its URL.
                    name = str(col.a.string).lower()
                    full_link = fema_base_link + col.a.get('href')

            if name is None or full_link is None or num is None:
                continue

            ret_list.append({'name': name, 'link': full_link, 'fema': num})
            print('.', end='')

    return ret_list

In [None]:
# Scrape the library pages (needs network access) and collect one dict per
# compound with its name, link and FEMA number.
fema_links = fema_link_finder()

In [None]:
fema_links[0]

Intermediate data dump

In [2]:
import pickle
import os.path as path

# Directory under the user's home folder where the intermediate pickles of
# this extraction are stored.
DATA_PATH = path.join(path.expanduser('~'),
                     'Dropbox',
                     'bymt',
                     'data_dumps',
                     'chem_project',
                     'fema_extraction')

# Pickle file holding the list of link dictionaries.
fema_links_path = path.join(DATA_PATH, 'fema_links.pkl')

# Dump of the scraped links; presumably left commented out so that re-running
# this cell does not overwrite the stored file (confirm before re-enabling).
# with open(fema_links_path, 'wb') as f:
#     pickle.dump(fema_links, f, protocol=pickle.HIGHEST_PROTOCOL)

# Reload the stored links instead of scraping the library pages again.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that were created locally by this notebook.
with open(fema_links_path, 'rb') as f:
    fema_links = pickle.load(f)

Each chemical then has its own page (for example, [acetic acid](https://www.femaflavor.org/acetic-acid-2)) from which I will extract:
- Flavor descriptors
- Chemical Abstracts Service (CAS) registry number
- JECFA number
- US Government's Code of Federal Regulations (CFR) citation

The following functions take the entries in `fema_links` and extract the data from each individual chemical page:

In [3]:
import nltk
import re
from chemspipy import ChemSpider
# NOTE(review): the ChemSpider API key is hard-coded in the source; consider
# loading it from an environment variable or a config file kept out of version
# control. `cs` is not used in the visible part of this notebook.
cs = ChemSpider('0201ba66-585d-4135-9e6b-d28ba4724fcf')
from rdkit import Chem

def _stem_descriptors(text):
    """
    Lowercase `text`, replace every run of non-word characters (and
    underscores) with a single space, and reduce each word to its stem.
    """
    cleaned = re.sub(r'[\W_]+', ' ', text.lower())
    stemmer = nltk.stem.SnowballStemmer('english')
    # split() without an argument drops the empty strings that leading or
    # trailing separators would otherwise leave behind.
    return ' '.join(stemmer.stem(word) for word in cleaned.split())


def link_info(dicto):
    """
    Extract and add descriptors, stems, CAS, JECFA, and CFR numbers to a copy
    of `dicto`, based on the page found at dicto['link'].

    input:
    dicto: dictionary with at least the keys 'link' (URL of the chemical page)
           and 'fema' (int FEMA number expected in the page title)

    returns:
    -A copy of dicto extended with whichever of the keys 'descriptors',
     'stems', 'cas', 'jecfa' and 'cfr' could be found on the page
    -None if the page could not be loaded, or if its title does not carry the
     expected FEMA number (a message is printed in both cases)
    """
    link_dict = dicto.copy()
    soup = link_to_soup(link_dict['link'])
    if not soup:
        print('No soup could be make from the link found')
        return None

    # Get the page title FEMA number and confirm it matches the number from
    # link_dict. The number is the last '|'-separated part of the title.
    title_num = None
    for res in soup.find_all('h2', class_='pageTitle'):
        if len(res.text) > 0:
            title = [word.strip() for word in res.text.split('|')]
            try:
                title_num = int(title[-1])  # compound number
            except ValueError:
                # Title without a trailing number: ignore it.
                continue

    if title_num is None:
        # No usable title was found; previously this raised an unbound-name
        # error on the comparison below.
        print('No FEMA # found in page title', end=' ')
        return None
    if title_num != link_dict['fema']:
        print('FEMA # from link does not match page title', end=' ')
        return None

    # Get the page headings and extract their information
    page_headings = soup.find_all('div', class_='field field-type-header')
    for item in page_headings:
        try:
            label = item.find('h3', class_='field-label').stripped_strings
            label = list(label)[0]
            content = item.find('div', class_='field-item').stripped_strings
            content = list(content)[0]
        except (AttributeError, IndexError):
            # Heading without a label/content element, or with an empty one.
            continue

        if label == 'FLAVOR PROFILE':
            link_dict['descriptors'] = content
            link_dict['stems'] = _stem_descriptors(content)
        elif label == 'CAS':
            link_dict['cas'] = content
        elif label == 'JECFA NUMBER':
            link_dict['jecfa'] = content
        elif label == 'CFR':
            link_dict['cfr'] = content

    return link_dict

In [4]:
def printed_fema_extraction(dicto_list):
    """
    Run link_info over every dictionary in dicto_list and collect the results
    (in order) in a new list.

    While working, a progress readout is printed: one '.' per processed item,
    with the rounded percentage shown whenever it lands on a new multiple
    of 5.
    """
    n_items = len(dicto_list)
    results = []
    previous_pct = 0

    for position, entry in enumerate(dicto_list, start=1):
        results.append(link_info(entry))

        # Progress display only; it has no effect on the collected results.
        pct = round((position / n_items) * 100)
        if pct % 5 != 0 or pct == previous_pct:
            print('.', end='')
        else:
            print('{:2.0f}%' .format(pct), end = '.')
        previous_pct = pct

    return results

In [8]:
# The chemical pages are scraped in slices of 280 links, one slice per cell
# (presumably so that a failure only costs a single slice - confirm).
extracted_fema_1 = printed_fema_extraction(fema_links[:280])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%..

In [10]:
extracted_fema_2 = printed_fema_extraction(fema_links[280:560])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%..

In [11]:
extracted_fema_3 = printed_fema_extraction(fema_links[560:840])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%..

In [14]:
# Start the combined result list from the first three slices.
extracted_fema = extracted_fema_1 + extracted_fema_2 + extracted_fema_3

In [16]:
extracted_fema_4 = printed_fema_extraction(fema_links[840:1120])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%..

In [17]:
extracted_fema_5 = printed_fema_extraction(fema_links[1120:1400])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%..

In [18]:
extracted_fema_6 = printed_fema_extraction(fema_links[1400:1680])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%..

In [19]:
extracted_fema_7 = printed_fema_extraction(fema_links[1680:1960])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%..

In [20]:
# Append slices 4 to 7 to the combined result list.
extracted_fema = extracted_fema + extracted_fema_4 + extracted_fema_5 +\
                 extracted_fema_6 + extracted_fema_7

In [23]:
extracted_fema_8 = printed_fema_extraction(fema_links[1960:2240])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%..

In [24]:
extracted_fema_9 = printed_fema_extraction(fema_links[2240:2520])

............ 5%..............10%..............15%..............20%..............25%..............30%..............35%..............40%..............45%..............50%..............55%..............60%..............65%..............70%..............75%..............80%..............85%..............90%..............95%..............100%.No soup could be make from the link found
.

In [25]:
# Last slice: everything from index 2520 to the end of the list.
extracted_fema_10 = printed_fema_extraction(fema_links[2520:])

No soup could be make from the link found
............ 5%..............10%.............15%..............20%..............25%..............30%.............35%..............40%..............45%..............50%.............55%..............60%..............65%..............70%.............75%..............80%..............85%..............90%.............95%..............100%..

In [37]:
# Append the last three slices (8 to 10) to the combined result list.
extracted_fema = extracted_fema + extracted_fema_8 + extracted_fema_9 +\
                 extracted_fema_10

In [46]:
# Sanity check that every link produced one entry. Note that link_info returns
# None for pages that failed, and those None entries are counted here too, so
# equal lengths do not mean every page was extracted successfully.
print('Length extracted dictionary: {} compared to length of links: {}' 
      .format(len(extracted_fema), len(fema_links)))

Length extracted dictionary: 2795 compared to length of links: 2795


Data dump of the final list of dictionaries with the FEMA website information

In [47]:
# Write the combined list to disk. Entries are the dictionaries returned by
# link_info, or None where a page could not be processed.
extracted_fema_path = path.join(DATA_PATH, 'extracted_fema.pkl')
with open(extracted_fema_path, 'wb') as f:
    pickle.dump(extracted_fema, f, protocol=pickle.HIGHEST_PROTOCOL)