In [123]:
from bs4 import BeautifulSoup
import os
import sys
from collections import OrderedDict
import time
import json
import pandas as pd
import numpy as np

In [2]:
path = '2015'

## Scraping the NSF
Use BeautifulSoup to crawl the NSF site and extract awards, award recipients (PI's), univerities and NSF internal divisions. Dump'em into JSON for analysis/warehousing...
<br>Currently only one year at a time-- how to scale up + do it all in memory?(!?!)
<br>TODO: use multiprocessing pool to speed up crawls!
<br><t> cut doops by checking if key exists

In [142]:
# Initiate globals
d1_list = []
d2_list = []
d3_list = []
d4_list = []

outfiles= ["NSF_AWARDS_"+path+".json","NSF_INSTITUTIONS_"+path+".json","NSF_PI_"+path+".json","NSF_DIVISIONS_"+path+".json"]

def xml_parse(file):
    handler = open(file,'r',encoding='utf8').read()
    soup = BeautifulSoup(handler,"lxml")
    soup.prettify()
    # sectional soups
    soup2= soup.find("institution")
    soup3 = soup.find("investigator")
    soup4 = soup.find("organization")
    
    # awards
    try:
        d1_list.append(
            OrderedDict(
                        [('award_id',soup.find("awardid").string.strip()),
                         ('award_title',soup.find("awardtitle").string.strip()),
                         ('abstract_narration',soup.find('abstractnarration').string.strip()),
                         ('award_effective_date',soup.find("awardeffectivedate").string.strip()),
                         ('award_expiration_date',soup.find("awardexpirationdate").string.strip()),
                         ('award_amount',soup.find("awardamount").string.strip()),
                         ('institution_name',soup2.find("name").string.strip()),
                         ('division',soup4.find("division").find("longname").string.strip()),
                         ('first_name',soup3.find("firstname").string.strip()),
                         ('last_name',soup3.find("lastname").string.strip())]
            )
        )
    except:
        return
    # institution
    try:
        d2_list.append(
            OrderedDict(
                        [('institution_name',soup2.find("name").string.strip()),
                         ('city_name',soup2.find("cityname").string.strip()),
                         ('phone_number',soup2.find("phonenumber").string.strip()),
                         ('street_address',soup2.find("streetaddress").string.strip()),
                         ('country_name',soup2.find("countryname").string.strip()),
                         ('state_name',soup2.find("statename").string.strip()),
                         ('state_code',soup2.find("statecode").string.strip())]
            )
        )
    except:
        return

    # PI
    try:
        d3_list.append(
            OrderedDict(
                        [('first_name',soup3.find("firstname").string.strip()),
                         ('last_name',soup3.find("lastname").string.strip()),
                         ('email_address',soup3.find("emailaddress").string.strip()),
                         ('role_code',soup3.find("rolecode").string.strip()),
                         ('institution_name',soup2.find("name").string.strip())]
            )
        )
    except:
        return
    # Division
    try:
        d4_list.append(
            OrderedDict(
                        [('code',soup4.find("code").string.strip()),
                         ('directorate',soup4.find("directorate").find("longname").string.strip()),
                         ('division',soup4.find("division").find("longname").string.strip())]
            )
        )
    except:
            return

def dump():
    dict_NSF = [d1_list,d2_list,d3_list,d4_list]
    for i in range(len(dict_NSF)):
        # remove duplicates
        dict_NSF[i] = [j for n, j in enumerate(dict_NSF[i]) if j not in dict_NSF[i][n + 1:]]
        # dump the dictionary into a JSON
        with open(outfiles[i], 'w') as outfile:
            json.dump(dict_NSF[i], outfile,indent=4)
                       
def main():
    start    = time.time()
    for file in os.listdir(path):
        current = os.path.join(path,file)
        if os.path.isfile(current):
            xml_parse(current)
    dump()
    print("Scraped and dumped in ",round(time.time() - start,2)," sec.")   

main()

107.31


## Analysis of NSF 2015 
Cool things to look at
<br> Common words in titles
<br> email each PI with a status update.
<br> Chloropleth about instituional funding.

In [145]:
award_df = pd.read_json(outfiles[0])
org_df = pd.read_json(outfiles[1])
pi_df = pd.read_json(outfiles[2])
div_df = pd.read_json(outfiles[3])

Let's take a look at the columns...

In [77]:
pi_df.columns

Index(['email_address', 'first_name', 'institution_name', 'last_name',
       'role_code'],
      dtype='object')

For RDB purposes and analysis this works great... However, for showing results to readers I don't want them displayed like that! So here's a helper function to convert underscored columns into respectible, well-mannered headers. Appropriated from Stackoverflow user <a target="_blank" href='http://stackoverflow.com/a/6425628/5094480'>Siegfried Gevatter</a>

In [109]:
def beautify(underscored_word):
    # check for acronym
    if(len(underscored_word)>2):
        return ' '.join(x.capitalize() or '_' for x in underscored_word.split('_'))
    else:
        return underscored_word

A simple and flexible reporting function to find the top n funded column.

In [110]:
def top_funds(col,n):
    top_fund_col = []
    for uni in award_df[col].unique():
        top_fund_col.append(
            {beautify(col):uni, "Total Award Money" : award_df[award_df[col]==uni].award_amount.sum()})
    fund_df = pd.DataFrame(sorted(top_fund_col, key=lambda k: k['Total Award Money'], reverse=True))
    fund_df.index = fund_df.index+1
    return fund_df[:n]

In [104]:
top_funds('institution_name',5)

Unnamed: 0,Institution Name,Total Award Money
1,University of Michigan Ann Arbor,55769241
2,Georgia Tech Research Corporation,53330266
3,University of Washington,48617063
4,University of Texas at Austin,46796453
5,Purdue University,41803128


In [73]:
top_funds('division',5)

Unnamed: 0,Division,Total Award Money
1,Division Of Computer and Network Systems,221571754
2,Division Of Undergraduate Education,196196942
3,Div Of Information & Intelligent Systems,165422557
4,"Div Of Civil, Mechanical, & Manufact Inn",143102736
5,Division Of Research On Learning,142659383


In [111]:
award_df['PI'] = award_df.first_name+" "+award_df.last_name
top_funds('PI',25)

Unnamed: 0,PI,Total Award Money
1,Robert Hamers,8151104
2,Jean-Paul Pinelli,8000000
3,Tim Turner,7251554
4,Pamela Mills,6931732
5,Richard Wolski,5875090
6,Larry Smarr,5048543
7,Douglas Swany,4918411
8,Corrinne Sande,4158633
9,Susan Lord,4010978
10,Nicholas Hud,4000000


In [149]:
# TODO fix PI shared names with email addresses!
def top_PI():
    award_df['PI'] = award_df.first_name+" "+award_df.last_name
    pi_df['PI'] = pi_df.first_name+" "+pi_df.last_name
    top_fund_col = []
    for PI in pi_df.PI:
        top_fund_col.append(
            {'PI':PI, 'funding_division': np.asarray(award_df[award_df.PI==PI].division)[0],"Total_Award_Money" : award_df[award_df['PI']==PI].award_amount.sum()})
    fund_df = pd.DataFrame(sorted(top_fund_col, key=lambda k: k['Total_Award_Money'], reverse=True))
    fund_df = pd.merge(left=fund_df,right=pi_df,on='PI',how='inner')
    fund_df.index = fund_df.index+1
    fund_df.rename(columns=lambda x: beautify(x), inplace=True)
    return fund_df[["Total Award Money","PI","Email Address","Funding Division"]][:25].drop_duplicates()
top_PI()

Unnamed: 0,Total Award Money,PI,Email Address,Funding Division
1,8151104,Robert Hamers,rjhamers@wisc.edu,Division Of Chemistry
2,8000000,Jean-Paul Pinelli,pinelli@fit.edu,"Div Of Civil, Mechanical, & Manufact Inn"
3,7251554,Tim Turner,t.turner@asee.org,Div Of Industrial Innovation & Partnersh
4,6931732,Pamela Mills,pamela.mills@lehman.cuny.edu,Division Of Research On Learning
5,5875090,Richard Wolski,rich@cs.ucsb.edu,Division of Computing and Communication Founda...
9,5048543,Larry Smarr,lsmarr@ucsd.edu,Div Of Advanced Cyberinfrastructure
10,4918411,Douglas Swany,swany@iu.edu,Div Of Advanced Cyberinfrastructure
11,4158633,Corrinne Sande,csande@whatcom.ctc.edu,Division Of Graduate Education
12,4010978,Susan Lord,slord@sandiego.edu,Div Of Engineering Education and Centers
13,4000000,Nicholas Hud,hud@chemistry.gatech.edu,Division Of Chemistry
