# Scraping NIH grant information 
https://exporter.nih.gov/about.aspx

In [1]:
import requests, zipfile, io
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
url = 'https://exporter.nih.gov/about.aspx'
r = requests.get(url)

#checks that the request was successful
try:
    r.raise_for_status()
except Exception as exc:
    print('There was a problem {}'.format(exc))

html_doc = r.text
soup = BeautifulSoup(html_doc, 'html.parser')

Create dataframe of column names and respective descriptions from < strong > tags. < strong > tags correspond to column names and the next_sibling to the associated description.

In [3]:
cols = []
desc = []
for strong_tag in soup.find_all('strong'):
    cols.append(strong_tag.text)
    desc.append(strong_tag.next_sibling)

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_colwidth', 5000)

col_info = pd.DataFrame()
col_info['column_name'] = cols
col_info['descriptions'] = desc

col_info.head()

Unnamed: 0,column_name,descriptions
0,Application_ID:,A unique identifier of the project record in the ExPORTER database.
1,Activity:,"A 3-character code identifying the grant, contract, or intramural activity through which a project is supported. Within each"
2,\n\r\n Administering_IC:,"Administering Institute or Center - A two-character code to designate the agency, NIH Institute, or Center administering the grant. See"
3,Application_Type:,A one-digit code to identify the type of application funded:
4,ARRA_Funded:,“Y” indicates a project supported by funds appropriated through the American Recovery and Reinvestment Act of 2009.


Perform basic text cleaning

In [4]:
col_info['column_name'] = col_info['column_name'].str.replace('\n\r\n', '')
col_info['column_name'] = col_info['column_name'].str.replace(':', '')
col_info['column_name'] = col_info['column_name'].str.lower()
col_info = col_info.drop(col_info.index[[8, 37]]) # <br/> text

#save information about publications in case needed in the future
publication_info = col_info.iloc[46:, :]
publication_info.to_csv('publication_info.csv')
col_info = col_info.drop(publication_info.index)

Not all descriptions are full descriptions due to the fact that href links are embedded along with text and are therefore not captured by next_sibling. To get the full descriptions of these columns, including the titles of the links, see below.

In [5]:
def join_list(list_of_strings, string_to_join = ' '):
    '''
    Joins a list of strings into a single string.
    '''
    text = string_to_join.join(list_of_strings)
    text = text.strip().replace('\xa0', '')
    return text

def get_full_desc(soup, index_num, gen_num):
    '''
    Scrape full column descriptions for those descriptions where
    text and html are embedded.
    '''
    gen = soup.find_all('strong')[index_num].next_siblings
    text_list = []
    for index, item in enumerate(gen):
        if index < gen_num:
            text_list.append(item.string)
    text = join_list(text_list)
    return text

col_gen_pairs = [
    (1, 7),
    (2, 2),
    (12, 4),
    (14, 3),
    (15, 3),
    (18, 3),
    (44, 3),
    (45, 3),
]

full_descs = []
for pair in col_gen_pairs:
    description = get_full_desc(soup, pair[0], pair[1])
    full_descs.append(description)

Replace partial descriptions with full descriptions.

In [6]:
indexer = [_[0] for _ in col_gen_pairs]
for i in indexer:
    col_info = col_info.replace(col_info.ix[i, 1], full_descs[indexer.index(i)])

In one description, text was bolded (< strong >) in the body of the paragraph and therefore a new row was formed. Add text to appropriate description and remove extraneous row.

In [7]:
col_info.ix[42, 1] = str(col_info.ix[42, 1]) + '04 is in its fourth year of support.'

col_info = col_info.drop(col_info.index[[41]])

Two descriptions had associated lists. Get the list information and add to the appropriate descriptions.

In [8]:
def get_desc_uls(soup, subcomponent, string_to_join = ' -'):
    uls = soup.find_all('ul')
    elements = []
    for ul in uls:
        for element in ul.find_all(subcomponent):
            elements.append(element.text.strip())
    return join_list(elements, string_to_join)

rn = '\r\n'

application_type = get_desc_uls(soup, 'p')
application_type = application_type.replace(rn, '')
application_type = application_type.replace('\t\t\t\t\t\t\t', '')

total_cost = get_desc_uls(soup, 'li')
total_cost = total_cost + soup.find_all('ul')[-1].next_sibling.string.replace(rn, '')

In [9]:
def concat_df_strings(df, index, to_concat, rn):
    df.iloc[index, 1] = ((str(df.iloc[index, 1]) + to_concat).replace(rn, ' ').strip())
    return df

col_info = concat_df_strings(col_info, -2, total_cost, rn)
col_info = concat_df_strings(col_info, 3, application_type, rn)

Final table

In [10]:
col_info.tail(10)

Unnamed: 0,column_name,descriptions
36,serial_number,A six-digit number assigned in serial number order within each administering organization.
38,study_section,A designator of the legislatively-mandated panel of subject matter experts that reviewed the research grant application for scientific and technical merit.
39,study_section_name,The full name of a regular standing Study Section that reviewed the research grant application for scientific and technical merit. Applications reviewed by panels other than regular standing study sections are designated by “Special Emphasis Panel.”
40,subproject_id,A unique numeric designation assigned to subprojects of a “parent” multi-project research grant.
41,suffix,"A suffix to the grant application number that includes the letter ""A"" and a serial number to identify an amended version of an original application and/or the letter ""S"" and serial number indicating a supplement to the project. ."
42,support_year,"The year of support for a project, as shown in the full project number. For example, a project with number 5R01GM0123456-04 is in its fourth year of support."
44,direct_cost_amt,Total indirect cost funding for a project from all NIH Institute and Centers for a given fiscal year. Costs are available only for NIH awards funded in FY 2012 and onward. Indirect cost amounts are not available for SBIR/STTR awards.
45,indirect_cost_amt,Total indirect cost funding for a project from all NIH Institute and Centers for a given fiscal year. Costs are available only for NIH awards funded in FY 2012 and onward. Indirect cost amounts are not available for SBIR/STTR awards.
46,total_cost,"Total project funding from all NIH Institute and Centers for a given fiscal year. Costs are available only for: NIH, CDC, and FDA grant awards (only the parent record of multi-project grants). -NIH intramural projects (activity codes beginning with “Z”) in FY 2007 and later fiscal years. -NIH contracts (activity codes beginning with “N”) . For multi-project grants, Total_Cost includes funding for all of the constituent subprojects. This field will be blank on subproject records; the total cost of each subproject is found in Total_Cost_Sub_Project ."
47,total_cost_sub_project,Applies to subproject records only. Total funding for a subproject from all NIH Institute and Centers for a given fiscal year. Costs are available only for NIH awards.


Write to csv

In [11]:
col_info.to_csv('grant_col_info_all.csv', index = False)