In [2]:
import os
import sys
import time
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from collections import OrderedDict

In [8]:
paths = [str(x) for x in range(2010,2017)]

## Scraping the NSF
Use BeautifulSoup to crawl the NSF site and extract awards, award recipients (PI's), univerities and NSF internal divisions. Dump'em into JSON for analysis/warehousing...
<br>Currently only one year at a time-- how to scale up + do it all in memory?(!?!)
<br>TODO: use multiprocessing pool to speed up crawls!
<br><t> cut doops by checking if key exists

In [21]:
# Initiate globals
def main(path):
    outfiles = ["NSF_AWARDS_"+path+".json",
                "NSF_ABSTRACT_"+path+".json",
                "NSF_INSTITUTIONS_"+path+".json",
                "NSF_PI_"+path+".json",
                "NSF_DIVISION_"+path+".json"]
   
    d1_list = []
    d2_list = []
    d3_list = []
    d4_list = []
    d5_list = []

    def xml_parse(file):
        handler = open(file,'r',encoding='utf8').read()
        # get the xml file into soup
        soup = BeautifulSoup(handler,"lxml")
        soup.prettify()

        # sectional soups
        soup2= soup.find("institution")
        soup3 = soup.find("investigator")
        soup4 = soup.find("organization")
        
        
        # awards
        try:
            award = soup.find("awardid").string
            university = soup2.find("name").string
            nsf_division = soup4.find("division").find("longname").string
            email = soup3.find("emailaddress").string
            d1_list.append(
                OrderedDict(
                    [('award_id',award),
                     ('award_title',soup.find("awardtitle").string),
                     ('award_effective_date',soup.find("awardeffectivedate").string),
                     ('award_expiration_date',soup.find("awardexpirationdate").string),
                     ('award_amount',soup.find("awardamount").string),
                     ('institution_name',university),
                     ('division',nsf_division),
                     ('email_address',email)]
                )
            )
        except:
            return
        # abstract
        try:
            d2_list.append(
                OrderedDict([
                    ('award_id',award),
                    ('abstract_narration',soup.find('abstractnarration').string.strip())]
                )
            )
        except:
            return

        # institution
        try:
            d3_list.append(
                OrderedDict(
                    [('institution_name',university),
                     ('city_name',soup2.find("cityname").string),
                     ('phone_number',soup2.find("phonenumber").string),
                     ('street_address',soup2.find("streetaddress").string),
                     ('country_name',soup2.find("countryname").string),
                     ('state_name',soup2.find("statename").string),
                     ('state_code',soup2.find("statecode").string)]
                )
            )
        except:
            return

        # PI
        try:
            d4_list.append(
                OrderedDict(
                    [('first_name',soup3.find("firstname").string),
                     ('last_name',soup3.find("lastname").string),
                     ('email_address',email),
                     ('institution_name',soup2.find("name").string)]
                )
            )
        except:
            return

        # Division
        try:
            d5_list.append(
                OrderedDict(
                    [('code',soup4.find("code").string),
                     ('directorate',soup4.find("directorate").find("longname").string),
                     ('division',nsf_division)]
                )
            )
        except:
            return

    def json_dump():
        dict_NSF = [d1_list, d2_list, d3_list, d4_list, d5_list]
        for i in range(len(dict_NSF)):
            # remove duplicates
            dict_NSF[i] = [j for n, j in enumerate(dict_NSF[i]) if j not in dict_NSF[i][n + 1:]]
            with open(outfiles[i], 'w') as outfile:
                json.dump(dict_NSF[i], outfile, indent=4)

    # main script
    start = time.time()
    count = 0
    # iterate through the path directory
    for file in os.listdir('data/'+path):
        xml_parse(os.path.join('data/'+path,file))
        count += 1
    print("Scraped", count,"records in", round(time.time()-start, 2), "sec.")   
    json_dump()
    #TODO: create a SQL or noSQL dump!

# call for each year
main('2010')

'''
# iterate through each year
for path in paths:
    main(path)
'''

Scraped 13092 records in 114.8 sec.


'\n# iterate through each year\nfor path in paths:\n    main(path)\n'

## Analysis of NSF 2015 
Cool things to look at
<br> Common words in titles
<br> email each PI with a status update.
<br> Chloropleth about instituional funding.

In [None]:
award_df = pd.read_json(outfiles[0])
abstract_df = pd.read_json(outfiles[1])
org_df = pd.read_json(outfiles[2])
pi_df = pd.read_json(outfiles[3])
div_df = pd.read_json(outfiles[4])

In [None]:
award_df.merge(div_df)

Let's take a look at the columns...

In [None]:
pi_df.columns

For RDBA this works great... However, for publishing results nobody wants columns displayed_like_this! So here's a helper function to convert underscored columns into respectible, well-mannered headers. Appropriated from Stackoverflow user <a target="_blank" href='http://stackoverflow.com/a/6425628/5094480'>Siegfried Gevatter</a>

In [None]:
def beautify(underscored_word):
    # check for acronym
    if(len(underscored_word)>2):
        return ' '.join(x.capitalize() or '_' for x in underscored_word.split('_'))
    else:
        return underscored_word

A simple, flexible reporting function to find the top n funded column.

In [None]:
def top_funds(col,n):
    top_fund_col = []
    for uni in award_df[col].unique():
        top_fund_col.append(
            {beautify(col):uni, "Total Award Money" : award_df[award_df[col]==uni].award_amount.sum()})
    fund_df = pd.DataFrame(sorted(top_fund_col, key=lambda k: k['Total Award Money'], reverse=True))
    fund_df.index = fund_df.index+1
    return fund_df[:n]

In [None]:
top_funds('institution_name',5)

In [None]:
top_funds('division',5)

In [1]:
# TODO fix PI shared names with email addresses!
def top_PI():
    award_df['PI'] = award_df.first_name+" "+award_df.last_name
    pi_df['PI'] = pi_df.first_name+" "+pi_df.last_name
    top_fund_col = []
    for PI in pi_df.PI:
        top_fund_col.append(
            {'PI':PI, 'funding_division': np.asarray(award_df[award_df.PI==PI].division)[0],"Total_Award_Money" : award_df[award_df['PI']==PI].award_amount.sum()})
    fund_df = pd.DataFrame(sorted(top_fund_col, key=lambda k: k['Total_Award_Money'], reverse=True))
    fund_df = pd.merge(left=fund_df,right=pi_df,on='PI',how='inner')
    fund_df.index = fund_df.index+1
    fund_df.rename(columns=lambda x: beautify(x), inplace=True)
    return fund_df[["Total Award Money","PI","Email Address","Funding Division"]][:25].drop_duplicates()
top_PI()

NameError: name 'award_df' is not defined