# NIH awards data
https://exporter.nih.gov/ExPORTER_Catalog.aspx

In [68]:
import requests, zipfile, io
import glob
from bs4 import BeautifulSoup
import os

from datetime import datetime

import pandas as pd
import numpy as np

import cleaning_strings as cln

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Download grants data from years 1985-2016

In [None]:
# Download and unpack one ExPORTER project zip per fiscal year into the
# current working directory.
years = range(1985, 2017)
for year in years:
    # Skip years that were already extracted so re-running the notebook does
    # not re-download every archive.
    # NOTE(review): assumes each archive extracts to RePORTER_PRJ_C_FY<year>.csv,
    # the naming the later cells rely on -- confirm for all years.
    if os.path.exists('RePORTER_PRJ_C_FY{}.csv'.format(year)):
        continue
    url = 'https://exporter.nih.gov/CSVs/final/RePORTER_PRJ_C_FY{}.zip'.format(year)
    response = requests.get(url, timeout=60)
    # Fail loudly on HTTP errors instead of handing an error page to ZipFile.
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall()

Import only one year to get column names/dtypes

In [16]:
# Peek at the first two rows of a single year's file to learn the column layout.
csv = 'RePORTER_PRJ_C_FY2016.csv'
pd.set_option('display.max_columns', 50)
grants_2016_raw = pd.read_csv(csv, nrows=2, encoding='latin1')
grants_2016_raw

Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,CORE_PROJECT_NUM,ED_INST_TYPE,FOA_NUMBER,FULL_PROJECT_NUM,FUNDING_ICs,FUNDING_MECHANISM,FY,IC_NAME,NIH_SPENDING_CATS,ORG_CITY,ORG_COUNTRY,ORG_DEPT,ORG_DISTRICT,ORG_DUNS,ORG_FIPS,ORG_NAME,ORG_STATE,ORG_ZIPCODE,PHR,PI_IDS,PI_NAMEs,PROGRAM_OFFICER_NAME,PROJECT_START,PROJECT_END,PROJECT_TERMS,PROJECT_TITLE,SERIAL_NUMBER,STUDY_SECTION,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT
0,9115627,K23,GM,4,N,7/27/2016,8/1/2016,7/31/2017,859,K23GM104401,SCHOOLS OF MEDICINE,PA-11-009,4K23GM104401-04,NIGMS:194460\,OTHER RESEARCH-RELATED,2016,NATIONAL INSTITUTE OF GENERAL MEDICAL SCIENCES,,NEW YORK,UNITED STATES,GENETICS,13,78861598,US,ICAHN SCHOOL OF MEDICINE AT MOUNT SINAI,NY,100296574,PUBLIC HEALTH RELEVANCE: Antiplatelet response...,10799126;,"SCOTT, STUART ALEXANDER;","LONG, ROCHELLE M.",8/1/2013,7/31/2017,ABCB1 gene; Accounting; acute coronary syndrom...,The Pharmacogenomic Control of Clopidogrel Res...,104401,GHD,Genetics of Health and Disease Study Section,,,4,180500,13960,194460,
1,9128072,R01,NS,4,N,8/15/2016,8/1/2016,7/31/2017,853,R01NS085165,SCHOOLS OF MEDICINE,PA-11-260,4R01NS085165-04,NINDS:335781\,Non-SBIR/STTR RPGs,2016,NATIONAL INSTITUTE OF NEUROLOGICAL DISORDERS A...,,BALTIMORE,UNITED STATES,ANESTHESIOLOGY,7,188435911,US,UNIVERSITY OF MARYLAND BALTIMORE,MD,212011508,PUBLIC HEALTH RELEVANCE: Activation of microgl...,7017365;,"POLSTER, BRIAN M;","MORRIS, JILL A",9/30/2013,7/31/2018,Acute; analog; Antioxidants; attenuation; Bind...,Novel Mechanisms of Microglial Neurotoxicity a...,85165,NOMD,Neural Oxidative Metabolism and Death Study Se...,,,4,218750,117031,335781,


There are three different dtypes in the grant data: str (the most common), floats and datetime. Create dictionaries/lists to specify dtypes on import.

In [72]:
# Columns that hold dates (parsed later via parse_dates) and columns that hold amounts.
dates = ['AWARD_NOTICE_DATE', 'BUDGET_START', 'BUDGET_END', 'PROJECT_START', 'PROJECT_END']
nums = ['DIRECT_COST_AMT', 'INDIRECT_COST_AMT', 'TOTAL_COST', 'TOTAL_COST_SUB_PROJECT']

# Partition the sample frame's columns into numeric and everything else.
col_nums = grants_2016_raw.loc[:, nums].columns
col_str = grants_2016_raw.drop(columns=col_nums).columns

# Column name -> dtype mapping for read_csv: str for every non-numeric column
# (the date columns included), float for the cost columns.
dtypes = dict.fromkeys(col_str, str)
dtypes.update(dict.fromkeys(col_nums, float))

Import csvs from all years and concatenate into a single df

In [75]:
# glob.glob returns matches in arbitrary order; sort so the yearly files are
# always read and concatenated in the same order.
all_csvs = sorted(glob.glob('RePORTER_PRJ_C_FY*.csv'))
if not all_csvs:
    # pd.concat on an empty list raises an opaque ValueError; say what is missing.
    raise FileNotFoundError('No RePORTER_PRJ_C_FY*.csv files found in the working directory.')

# Read every yearly file with the shared dtype mapping, then stack them once.
list_ = [
    pd.read_csv(csv, index_col=None, header=0, encoding='latin1',
                dtype=dtypes, parse_dates=dates)
    for csv in all_csvs
]
all_grants_raw = pd.concat(list_)

Re-arrange columns to original column sequence

In [81]:
# Restore the column order of the single-year sample frame.
all_grants_raw = all_grants_raw.loc[:, list(grants_2016_raw.columns)]
all_grants_raw.head(2)

Unnamed: 0,APPLICATION_ID,ACTIVITY,ADMINISTERING_IC,APPLICATION_TYPE,ARRA_FUNDED,AWARD_NOTICE_DATE,BUDGET_START,BUDGET_END,CFDA_CODE,CORE_PROJECT_NUM,ED_INST_TYPE,FOA_NUMBER,FULL_PROJECT_NUM,FUNDING_ICs,FUNDING_MECHANISM,FY,IC_NAME,NIH_SPENDING_CATS,ORG_CITY,ORG_COUNTRY,ORG_DEPT,ORG_DISTRICT,ORG_DUNS,ORG_FIPS,ORG_NAME,ORG_STATE,ORG_ZIPCODE,PHR,PI_IDS,PI_NAMEs,PROGRAM_OFFICER_NAME,PROJECT_START,PROJECT_END,PROJECT_TERMS,PROJECT_TITLE,SERIAL_NUMBER,STUDY_SECTION,STUDY_SECTION_NAME,SUBPROJECT_ID,SUFFIX,SUPPORT_YEAR,DIRECT_COST_AMT,INDIRECT_COST_AMT,TOTAL_COST,TOTAL_COST_SUB_PROJECT
0,3000011,A03,AH,1,,NaT,1985-07-01,1986-06-30,,A03AH000859,SCHOOLS OF PUBLIC HEALTH,,1A03AH000859-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,BIRMINGHAM,UNITED STATES,,7,4514360,US,UNIVERSITY OF ALABAMA AT BIRMINGHAM,AL,35294,,3700006;,"BRIDGERS, WILLIAM F;",,1985-07-01,1986-06-30 00:00:00,,PUBLIC HEALTH TRAINEESHIPS,859,STC,,,,1,,,,
1,3000012,A03,AH,1,,NaT,1985-07-01,1986-06-30,,A03AH000860,SCHOOLS OF PUBLIC HEALTH,,1A03AH000860-01,,,1985,"DIVISION OF ASSOCIATED, DENTAL HEALTH PROFESSIONS",,BERKELEY,UNITED STATES,,9,94878337,US,UNIVERSITY OF CALIFORNIA BERKELEY,CA,947045940,,2407264;,"LASHOF, JOYCE C.;",,1985-07-01,1986-06-30 00:00:00,,PUBLIC HEALTH TRAINEESHIPS,860,STC,,,,1,,,,


# Processing data

## Basic text processing of column names

In [None]:
# NOTE(review): grants_2016_raw is read with nrows = 2 above, so on a fresh
# kernel everything from here on processes two rows only -- confirm which
# frame (full 2016 file or all_grants_raw) is meant to feed this section.
grants_2016 = grants_2016_raw.copy()

# Work with lowercase column names from here on.
grants_2016.columns = [name.lower() for name in grants_2016.columns]
grants_2016.head(1)

In [None]:
# Cast every cell to str, then lowercase it (missing values become the text 'nan').
grants_2016 = grants_2016.astype(str).apply(lambda column: column.str.lower())
grants_2016.head(1)

In [None]:
# Turn the literal text 'nan' produced by the str cast back into real missing values.
grants_2016 = grants_2016.replace(to_replace=['nan'], value=np.nan)
grants_2016.head(1)

In [34]:
# Intermediate file used to clean the PI information.
# The content is gzip-compressed even though the name ends in .csv.
grants_2016.to_csv('grants_all.csv', compression='gzip', index=False)

## Selecting columns for analysis
There are a total of 45 columns, which may contain redundant or unnecessary information.

In [35]:
# Widen the column display so the long descriptions are not truncated.
pd.set_option('display.max_colwidth', 5000)

# Reload the cleaned grants and the per-column descriptions.
grants_2016 = pd.read_csv('grants_all.csv', compression='gzip')
col_info = pd.read_csv('grant_col_info_all.csv')
col_info.head(9)

Unnamed: 0,column_name,descriptions
0,application_id,A unique identifier of the project record in the ExPORTER database.
1,activity,"A 3-character code identifying the grant, contract, or intramural activity through which a project is supported. Within each funding mechanism , NIH uses 3-character activity codes (e.g., F32, K08, P01, R01, T32, etc.) to differentiate the wide variety of research-related programs NIH supports. A comprehensive list of activity codes for grants and cooperative agreements may be found on the Types of Grant Programs Web page. RePORTER also includes R&D contracts (activity codes beginning with the letter N) and intramural projects (beginning with the letter Z)."
2,administering_ic,"Administering Institute or Center - A two-character code to designate the agency,NIH Institute, or Center administering the grant. See Institute/Center code definitions"
3,application_type,"A one-digit code to identify the type of application funded: 1 = New application 2 = Competing continuation (also, competing renewal) 3 = Application for additional (supplemental) support. There are two kinds of type 3competing revisions (which are peer-reviewed and administrative supplements) 4 = Competing extension for an R37 award or first non-competing year of a Fast Track SBIR/STTR award 5 = Non-competing continuation 7 = Change of grantee institution 9 = Change of NIH awarding Institute or Division (on a competing continuation)"
4,arra_funded,“Y” indicates a project supported by funds appropriated through the American Recovery and Reinvestment Act of 2009.
5,award_notice_date,Award notice date or Notice of Grant Award (NGA) is a legally binding document stating the government has obligated funds and which defines the period of support and the terms and conditions of award.\r\n
6,budget_start,The date when a project’s funding for a particular fiscal year begins.
7,budget_end,The date when a project’s funding for a particular fiscal year ends.
8,cfda_code,"Federal programs are assigned a number in the Catalog of Federal Domestic Assistance (CFDA), which is referred to as the ""CFDA code."" The CFDA database helps the Federal government track all programs it has domestically funded. \r\n"


To remove (initial): 5-8, redundant information

In [None]:
# First pass: award/budget dates and CFDA code.
to_drop = ['award_notice_date', 'budget_start', 'budget_end', 'cfda_code']
grants_2016 = grants_2016.drop(columns=to_drop)

In [None]:
col_info[9:18]

To remove: 9 (may add in later if correlating with publications), 10-12, 16; also 2 (redundant with funding_ics)

In [None]:
# Second pass: project-number variants, institution type, FOA number and IC name/code.
to_drop2 = [
    'administering_ic',
    'core_project_num',
    'ed_inst_type',
    'foa_number',
    'full_project_num',
    'ic_name',
]
grants_2016 = grants_2016.drop(columns=to_drop2)

In [None]:
col_info[18:27]

All redundant: 18, 20-23, 25

Also remove 19, 24, 26; these will be added again later.

In [None]:
# Third pass: every org_* column.
to_drop3 = [
    'org_city',
    'org_country',
    'org_dept',
    'org_district',
    'org_duns',
    'org_fips',
    'org_name',
    'org_state',
    'org_zipcode',
]
grants_2016 = grants_2016.drop(columns=to_drop3)

In [None]:
col_info[27:36]

To remove: 27 (phr; may be added back later for text analysis, but too complex for the initial analysis), 29, 30, 34, 35

In [None]:
# Fourth pass: free-text and name columns plus the serial number.
to_drop4 = [
    'phr',
    'pi_names',
    'program_officer_name',
    'project_title',
    'serial_number',
]
grants_2016 = grants_2016.drop(columns=to_drop4)

In [None]:
col_info[36:]

To remove: 37 (redundant with 36; the name can always be looked up), and the subproject columns 38 and 39

In [None]:
#check number of subprojects
#grants_2016.shape
#subproject_cols = ['subproject_id', 'suffix', 'total_cost_sub_project']
#grants_2016[subproject_cols].isnull().sum()

In [None]:
# Final pass: study section name and the subproject identifier columns.
grants_2016 = grants_2016.drop(columns=['study_section_name', 'subproject_id', 'suffix'])

In [None]:
pd.set_option('display.max_colwidth', 50)
grants_2016.head()

### More cleaning of columns

Convert strings to numeric and dates

In [None]:
def convert_column(df, list_of_names, function):
    '''
    Convert the named columns of df to numeric or datetime values.

    df: DataFrame to convert; it is modified in place and also returned.
    list_of_names: iterable of column names to convert.
    function: 'numeric' applies pd.to_numeric() (raises on unparseable values);
              'date' applies pd.to_datetime(errors = 'coerce'), so unparseable
              values become NaT.

    Returns df.
    Raises ValueError if function is neither 'numeric' nor 'date'.
    '''
    converters = {
        'numeric': pd.to_numeric,
        'date': lambda values: pd.to_datetime(values, errors = 'coerce'),
    }
    # An unrecognised mode used to fall through and return the frame
    # unconverted; reject it so a typo cannot go unnoticed.
    if function not in converters:
        raise ValueError(
            "function must be 'numeric' or 'date', got {!r}".format(function))

    convert = converters[function]
    for name in list_of_names:
        df[name] = convert(df[name])
    return df

# Cost columns become numbers; project start/end become datetimes.
numeric_cols = [
    'direct_cost_amt',
    'indirect_cost_amt',
    'total_cost',
    'total_cost_sub_project',
]
date_cols = ['project_start', 'project_end']

grants_2016 = convert_column(grants_2016, numeric_cols, function='numeric')
grants_2016 = convert_column(grants_2016, date_cols, function='date')

grants_2016.head(1)

## Extract funding institute information 

In [42]:
# Cast to str so the later string operations work on every row; note this
# turns missing values into the literal text 'nan'.
grants_2016['funding_ics'] = grants_2016['funding_ics'].astype(str)

# Select the two columns directly. Building the frame from a list of Series
# and transposing it is very slow on large data and forces every column to
# object dtype. The copy keeps later edits from touching grants_2016.
institute_funds = grants_2016[['application_id', 'funding_ics']].copy()

## Splitting individual PIs when more than one is listed on a grant
Split grants with multiple PIs so that each row only has a single PI listed. This will allow analysis on funding per individual and per institution.

In [None]:
# Columns whose values are ';'-delimited lists.
col_list = ['nih_spending_cats', 'pi_ids', 'project_terms']
# cln is the project-local cleaning_strings module.
# NOTE(review): presumably strips leading/trailing ';' and space characters
# from each listed column -- confirm against cln.strip_series.
grants_2016 = cln.strip_series(grants_2016, col_list, strip = '; ')
# NOTE(review): presumably emits one row per ';'-separated PI id, repeating the
# other columns -- confirm against cln.split_rows.
grants_2016 = cln.split_rows(grants_2016, col_name = 'pi_ids', by = ';')

More cleaning

In [None]:
# Tidy the now single-valued pi_ids column: first with strip_series' default
# strip argument (not visible here -- see cleaning_strings), then with an
# explicit single-space strip.
grants_2016 = cln.strip_series(grants_2016, ['pi_ids'])
grants_2016 = cln.strip_series(grants_2016, ['pi_ids'], strip = ' ')
grants_2016.head()

Split grant totals by number of associated PIs (assumption is that all PIs on a grant receive the same amount of money).

In [None]:
# After the split each application_id appears once per PI, so its row count
# is the number of PIs on that grant.
pi_per_grant = (
    grants_2016['application_id']
    .value_counts()
    .rename_axis('application_id')
    .reset_index(name='num_pis')
)

# Attach num_pis to every grant row; the cost columns are divided by it in a
# later cell.
grants_2016 = grants_2016.merge(pi_per_grant, on='application_id')

In [None]:
grants_2016.head()

In [None]:
def divide_by_column(df, col_list = ('direct_cost_amt', 'indirect_cost_amt', 'total_cost'), divide_by = 'num_pis'):
    '''
    Divide each column in col_list by the divide_by column, row by row, and
    round the result to the nearest whole number.

    df: DataFrame to update; it is modified in place and also returned.
    col_list: iterable of column names to divide (immutable tuple default).
    divide_by: name of the column holding the divisor.

    Returns df.
    '''
    for col in col_list:
        # Series.round() is the explicit form of the builtin round() on a Series.
        df[col] = (df[col] / df[divide_by]).round()
    return df

# total_cost_sub_project later fills the gaps in total_cost, so it must be
# split per PI as well; otherwise subproject funds are counted in full for
# every PI on the grant.
grants_2016 = divide_by_column(
    grants_2016,
    col_list = ['direct_cost_amt', 'indirect_cost_amt', 'total_cost', 'total_cost_sub_project'],
)

In [None]:
grants_2016.head()

## Add organization information

Import pi_info.csv, which contains the necessary information.

In [None]:
# Load only the PI id and organization columns; pi_ids is kept as text.
cols_import = ['pi_ids', 'org_name', 'org_country', 'org_zipcode']
pi_info = pd.read_csv(
    'pi_info.csv',
    usecols=cols_import,
    dtype={'pi_ids': str},
    compression='gzip',
)
pi_info.head()

Merge the two dataframes so the information is listed together.

In [None]:
# Left join keeps every grant row, including PIs that have no entry in pi_info.
# NOTE(review): if pi_info holds more than one row per pi_ids this join will
# duplicate grant rows -- confirm pi_ids is unique there.
grants_2016 = grants_2016.merge(pi_info, on='pi_ids', how='left')

## Cost of grants (funds)
There are 4 cost columns. Indirect and direct costs sum to total costs or to total subproject costs. Drop indirect and direct costs columns and combine the total costs into one column (costs are either listed as total cost or total subproject cost).

In [None]:
grants_2016 = grants_2016.drop(columns=['direct_cost_amt', 'indirect_cost_amt'])

# Where total_cost is missing, fall back to the subproject total.
# Assign the result back: fillna(..., inplace=True) on a column selection acts
# on a temporary object and leaves the frame unchanged under pandas
# Copy-on-Write.
grants_2016['total_cost'] = grants_2016['total_cost'].fillna(grants_2016['total_cost_sub_project'])
grants_2016 = grants_2016.drop(columns='total_cost_sub_project')

In [None]:
# The combined cost column is called 'funds' from here on.
grants_2016 = grants_2016.rename(columns={'total_cost': 'funds'})
grants_2016.head()

Save dataframe to csv

In [None]:
# Per-PI grant table for the analysis step (gzip-compressed despite the .csv name).
grants_2016.to_csv('for_analysis.csv', compression='gzip', index=False)

## Institute funds per grant

In [43]:
institute_funds.head()

Unnamed: 0,application_id,funding_ics
0,9115627,nigms:194460\
1,9128072,ninds:335781\
2,9056435,fic:146822\nida:75000\
3,9213716,nimh:1593922\
4,8986215,nimh:294755\


Split 'funding_ics' column so that every row contains a single institute code associated with the application and the amount of money given by that institute.

In [44]:
# Trim trailing/leading backslashes and spaces. The backslash is escaped
# explicitly: '\ ' is an invalid escape sequence and triggers a warning on
# current Python versions, while '\\ ' is the same two characters.
institute_funds['funding_ics'] = institute_funds['funding_ics'].str.strip('\\ ')

# One row per backslash-separated "institute:amount" entry.
institute_funds = cln.split_rows(institute_funds, col_name = 'funding_ics', by = '\\')

# Renumber the rows, discarding the old index rather than adding it as a
# column and deleting it afterwards.
institute_funds = institute_funds.reset_index(drop = True)
institute_funds.head()

Unnamed: 0,application_id,funding_ics
0,9115627,nigms:194460
1,9128072,ninds:335781
2,9056435,fic:146822
3,9056435,nida:75000
4,9213716,nimh:1593922


Create a new column, 'funds_awarded', with the amount of money the institute awarded to the particular application.

In [45]:
# Break each "institute:amount" entry into its parts.
ics = [entry.split(':') for entry in institute_funds['funding_ics']]
to_concat = pd.DataFrame(ics, columns=['institute', 'funds_awarded'])

# Put the two new columns next to application_id and discard the combined
# column. The concat is positional by index, so it relies on institute_funds
# having been renumbered from 0 in the previous cell.
institute_funds = pd.concat([institute_funds, to_concat], axis=1)
institute_funds = institute_funds.drop(columns='funding_ics')
institute_funds.head()

Unnamed: 0,application_id,institute,funds_awarded
0,9115627,nigms,194460
1,9128072,ninds,335781
2,9056435,fic,146822
3,9056435,nida,75000
4,9213716,nimh,1593922


Save to .csv

In [46]:
# Per-institute award table (gzip-compressed despite the .csv name).
institute_funds.to_csv('institute_funds.csv', compression='gzip', index=False)