In [3]:
import os
from bs4 import BeautifulSoup
from urllib.request import urlopen, urlretrieve
import zipfile
import glob

# Get and clean NIH files

### Scrape files from NIH ExPORTER
NIH makes this easy: all the zip files for a single topic, one per year, are listed on a single webpage. I scraped two of these pages: abstracts and projects.

In [13]:
def get_zip_files(URL, OUTPUT_DIR):
    """
    Download every zip file linked from the page at URL into OUTPUT_DIR.

    The page is parsed for ``<a>`` tags; only links whose ``href`` starts
    with ``'CSVs'`` and ends with ``'.zip'`` are downloaded. Each file is
    saved under OUTPUT_DIR using the last path component of its link.

    URL: address of the page listing the zip files.
    OUTPUT_DIR: directory to download into; created if it does not exist.
    """
    # Close the HTTP response as soon as the page body has been read.
    with urlopen(URL) as response:
        soup = BeautifulSoup(response.read(), 'html.parser')

    # urlretrieve cannot create missing parent directories itself.
    if OUTPUT_DIR:
        os.makedirs(OUTPUT_DIR, exist_ok=True)

    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # Anchors without an href yield None; skip them explicitly rather
        # than catching AttributeError around the whole download, which
        # would also hide unrelated errors raised while retrieving.
        if not isinstance(href, str):
            continue
        if href.startswith('CSVs') and href.endswith('.zip'):
            file_url = 'https://exporter.nih.gov/' + href
            filename = os.path.join(OUTPUT_DIR, file_url.rsplit('/', 1)[-1])
            urlretrieve(file_url, filename)

### Unzip files into csvs and delete the zips

In [29]:
def unzip_files(directory):
    """
    Extract every ``*.zip`` archive located directly inside ``directory``.

    Archive contents are extracted into ``directory`` itself; the zip files
    are left in place.

    directory: folder to scan, with or without a trailing path separator.
    """
    # os.path.join inserts the separator when it is missing, so './data'
    # and './data/' both match the archives inside the folder.
    pattern = os.path.join(directory, '*.zip')
    for archive in glob.glob(pattern):
        # The context manager closes the archive; no explicit close needed.
        with zipfile.ZipFile(archive, mode='r') as zip_ref:
            zip_ref.extractall(path=directory)

def delete_zips(directory):
    """
    Delete every file whose name ends in ``.zip`` directly inside ``directory``.

    directory: folder to clean, with or without a trailing path separator.
    """
    for name in os.listdir(directory):
        if name.endswith('.zip'):
            # Join rather than concatenate so a missing trailing separator
            # does not produce a wrong path such as './datafile.zip'.
            os.remove(os.path.join(directory, name))
            
def get_csvs(webgroup):
    """
    Run the full download -> extract -> clean-up sequence for one group.

    webgroup is an indexable pair:
      webgroup[0] -- url of the page listing the zip files to download
      webgroup[1] -- directory in which the zips are downloaded, extracted
                     and then deleted
    """
    source_url = webgroup[0]
    work_dir = webgroup[1]

    # 1. download the zip files listed on the page
    get_zip_files(source_url, work_dir)

    # 2. extract their contents next to them
    unzip_files(work_dir)

    # 3. remove the archives, leaving only the extracted files
    delete_zips(work_dir)

### Get the files!

In [28]:
# Each group is [catalog page url, local working directory].
abstracts = [
    'https://exporter.nih.gov/ExPORTER_Catalog.aspx?sid=0&index=1',
    './abstracts/',
]

projects = [
    'https://exporter.nih.gov/ExPORTER_Catalog.aspx',
    './projects/',
]

# Download, unzip and clean up each group in turn.
get_csvs(abstracts)
get_csvs(projects)